diff --git a/shim/.buckconfig b/.Package.swift/backend_coreml/dummy.swift similarity index 100% rename from shim/.buckconfig rename to .Package.swift/backend_coreml/dummy.swift diff --git a/shim/TARGETS b/.Package.swift/backend_coreml_debug/dummy.swift similarity index 100% rename from shim/TARGETS rename to .Package.swift/backend_coreml_debug/dummy.swift diff --git a/.Package.swift/backend_mps/dummy.swift b/.Package.swift/backend_mps/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/backend_mps_debug/dummy.swift b/.Package.swift/backend_mps_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/backend_xnnpack/dummy.swift b/.Package.swift/backend_xnnpack/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/backend_xnnpack_debug/dummy.swift b/.Package.swift/backend_xnnpack_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/executorch/dummy.swift b/.Package.swift/executorch/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/executorch_debug/dummy.swift b/.Package.swift/executorch_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_custom/dummy.swift b/.Package.swift/kernels_custom/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_custom_debug/dummy.swift b/.Package.swift/kernels_custom_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_optimized/dummy.swift b/.Package.swift/kernels_optimized/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_optimized_debug/dummy.swift b/.Package.swift/kernels_optimized_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_portable/dummy.swift b/.Package.swift/kernels_portable/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_portable_debug/dummy.swift b/.Package.swift/kernels_portable_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_quantized/dummy.swift b/.Package.swift/kernels_quantized/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.Package.swift/kernels_quantized_debug/dummy.swift b/.Package.swift/kernels_quantized_debug/dummy.swift new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.buckconfig b/.buckconfig index 7a4aecd9710..1995dc91946 100644 --- a/.buckconfig +++ b/.buckconfig @@ -8,15 +8,18 @@ root = . 
prelude = third-party/prelude shim = shim + shim_et = shim_et [repository_aliases] + bazel_skylib = shim config = prelude ovr_config = prelude - toolchains = shim - fbcode = shim + toolchains = shim_et + fbcode = shim_et fbcode_macros = shim - fbsource = shim + fbsource = shim_et buck = shim + gh_facebook_buck2_shims_meta = shim [cxx] cxxflags = -g -std=c++17 @@ -36,3 +39,6 @@ [buck2] restarter=true + +[oss] +folly_cxx_tests = False diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 16b62b81784..81c9c52f3f4 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -29,6 +29,10 @@ case "${IMAGE_NAME}" in LINTRUNNER="" CLANG_VERSION=12 ;; + executorch-ubuntu-22.04-gcc11-aarch64) + LINTRUNNER="" + GCC_VERSION=11 + ;; executorch-ubuntu-22.04-linter) LINTRUNNER=yes CLANG_VERSION=12 @@ -44,6 +48,7 @@ case "${IMAGE_NAME}" in executorch-ubuntu-22.04-mediatek-sdk) MEDIATEK_SDK=yes CLANG_VERSION=12 + ANDROID_NDK_VERSION=r27b ;; executorch-ubuntu-22.04-clang12-android) LINTRUNNER="" diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 2c2d910da92..b17dd3f8f95 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -27e35de6c288bffad1b4d18b393579c1d1a95547 +295f2ed4d103017f7e19a7b8263ece606cd629db diff --git a/.ci/docker/common/install_android.sh b/.ci/docker/common/install_android.sh index ab779ade8fc..e990bc6933f 100755 --- a/.ci/docker/common/install_android.sh +++ b/.ci/docker/common/install_android.sh @@ -70,6 +70,7 @@ install_sdk() { # These are the tools needed to build Android apps yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "platforms;android-34" yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "build-tools;33.0.1" + yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "build-tools;35.0.0" # And some more tools for future emulator tests yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "platform-tools" yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "tools" diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index ce51a16452f..ac3e4fc1a0e 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -13,6 +13,9 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" install_miniconda() { BASE_URL="https://repo.anaconda.com/miniconda" CONDA_FILE="Miniconda3-py${PYTHON_VERSION//./}_${MINICONDA_VERSION}-Linux-x86_64.sh" + if [[ $(uname -m) == "aarch64" ]]; then + CONDA_FILE="Miniconda3-py${PYTHON_VERSION//./}_${MINICONDA_VERSION}-Linux-aarch64.sh" + fi mkdir -p /opt/conda chown ci-user:ci-user /opt/conda @@ -36,7 +39,7 @@ install_python() { # From https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh if [[ $(uname -m) == "aarch64" ]]; then - conda_install "openblas==0.3.28=*openmp*" + conda_install "openblas==0.3.29=*openmp*" -c conda-forge else conda_install mkl=2022.1.0 mkl-include=2022.1.0 fi diff --git a/.ci/docker/common/install_java.sh b/.ci/docker/common/install_java.sh new file mode 100644 index 00000000000..a8c34112285 --- /dev/null +++ b/.ci/docker/common/install_java.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +apt-get update + +apt-get install -y --no-install-recommends openjdk-17-jdk diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt index c675b3d9f6e..292a87ace64 100644 --- a/.ci/docker/conda-env-ci.txt +++ b/.ci/docker/conda-env-ci.txt @@ -1,4 +1,4 @@ -cmake=3.22.1 +cmake=3.26.4 ninja=1.10.2 libuv llvm-openmp diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 5efa0825b96..48a89173fda 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -30,6 +30,10 @@ ARG BUCK2_VERSION COPY ./common/install_buck.sh install_buck.sh RUN bash ./install_buck.sh && rm install_buck.sh +# Install java +COPY ./common/install_java.sh install_java.sh +RUN bash ./install_java.sh && rm install_java.sh + # Setup user COPY ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh diff --git a/.ci/scripts/build_android_instrumentation.sh b/.ci/scripts/build_android_instrumentation.sh new file mode 100644 index 00000000000..5e074d9e215 --- /dev/null +++ b/.ci/scripts/build_android_instrumentation.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi +which "${PYTHON_EXECUTABLE}" + +mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources +cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources + +pushd "${BUILD_AAR_DIR}" +ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest +ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest +popd diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index d37c65aa8ec..2460f5483d9 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -14,7 +14,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then PYTHON_EXECUTABLE=python3 fi which "${PYTHON_EXECUTABLE}" -CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" install_executorch_and_backend_lib() { echo "Installing executorch and xnnpack backend" @@ -28,7 +27,6 @@ install_executorch_and_backend_lib() { -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -44,6 +42,10 @@ install_executorch_and_backend_lib() { build_llama_runner() { echo "Building llama runner for Android..." 
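[Editor's note: the added lines below, and several later scripts in this patch, initialize the tokenizers submodule's own nested submodules before configuring. A standalone equivalent run from the repo root, sketched here on the assumption that the outer submodule may not be checked out yet, is:

```bash
# Make sure the tokenizers submodule itself is checked out, then pull
# its nested submodules without recursing through the whole tree.
git submodule update --init extension/llm/tokenizers
git -C extension/llm/tokenizers submodule update --init
```
]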
+  pushd extension/llm/tokenizers
+  echo "Updating tokenizers submodule"
+  git submodule update --init
+  popd
   ANDROID_ABI=arm64-v8a
   cmake -DBUCK2="${BUCK2}" \
     -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
@@ -54,11 +56,9 @@
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -Bcmake-android-out/examples/models/llama examples/models/llama
   cmake --build cmake-android-out/examples/models/llama -j4 --config Release
 }

-install_flatc_from_source
 install_executorch_and_backend_lib
 build_llama_runner
diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index 0fe60a0d772..b2126f84e78 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -263,7 +263,8 @@ def is_valid_huggingface_model_id(model_name: str) -> bool:
 def get_benchmark_configs() -> Dict[str, Dict]:  # noqa: C901
     """
     Gather benchmark configurations for a given set of models on the target operating system and devices.
-
+    NOTE: If you change what this function returns, update extract_model_info()
+    in executorch/.github/scripts/extract_benchmark_results.py to match.
     Args:
         None
diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index d02213b9faf..b32a052026a 100755
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -5,6 +5,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# WARNING: The CI runner logic should live directly in the corresponding yml files.
+# This file will be deleted once the reference in periodic.yml is deleted.
+
 import itertools
 import json
 import os
@@ -30,6 +33,7 @@
         "dl3": "linux.4xlarge.memory",
         "emformer_join": "linux.4xlarge.memory",
         "emformer_predict": "linux.4xlarge.memory",
+        "phi-4-mini": "linux.4xlarge.memory",
     }
 }
@@ -104,8 +108,12 @@ def model_should_run_on_target_os(model: str, target_os: str) -> bool:
     For example, a big model can be disabled in macos due to the limited macos resources.
     """
     if target_os == "macos":
+        # Disabled on macOS due to limited resources, and it should stay disabled
+        # even if the global llava disable below is ever lifted.
         return model not in ["llava"]
-    return True
+    # Disabled globally because test-llava-runner-linux already does a more
+    # comprehensive E2E test of llava.
+    return model not in ["llava"]

 def export_models_for_ci() -> dict[str, dict]:
diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh
index 36fbcd72743..a090571ab49 100755
--- a/.ci/scripts/setup-linux.sh
+++ b/.ci/scripts/setup-linux.sh
@@ -10,19 +10,17 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

-BUILD_TOOL=$1
-if [[ -z "${BUILD_TOOL:-}" ]]; then
-  echo "Missing build tool (require buck2 or cmake), exiting..."
-  exit 1
-else
-  echo "Setup Linux for ${BUILD_TOOL} ..."
-fi
+read -r BUILD_TOOL BUILD_MODE EDITABLE < <(parse_args "$@")

 # As Linux job is running inside a Docker container, all of its dependencies
 # have already been installed, so we use PyTorch build from source here instead
 # of nightly.
This allows CI to test against latest commits from PyTorch -install_executorch "use-pt-pinned-commit" -build_executorch_runner "${BUILD_TOOL}" +if [[ "${EDITABLE:-false}" == "true" ]]; then + install_executorch --use-pt-pinned-commit --editable +else + install_executorch --use-pt-pinned-commit +fi +build_executorch_runner "${BUILD_TOOL}" "${BUILD_MODE}" if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then do_not_use_nightly_on_ci diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 033c2996038..4b43a730710 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -10,13 +10,7 @@ set -exu # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -BUILD_TOOL=$1 -if [[ -z "${BUILD_TOOL:-}" ]]; then - echo "Missing build tool (require buck2 or cmake), exiting..." - exit 1 -else - echo "Setup MacOS for ${BUILD_TOOL} ..." -fi +read -r BUILD_TOOL BUILD_MODE EDITABLE < <(parse_args "$@") install_buck() { if ! command -v zstd &> /dev/null; then @@ -135,8 +129,12 @@ print_cmake_info install_pytorch_and_domains # We build PyTorch from source here instead of using nightly. This allows CI to test against # the pinned commit from PyTorch -install_executorch "use-pt-pinned-commit" -build_executorch_runner "${BUILD_TOOL}" +if [[ "$EDITABLE" == "true" ]]; then + install_executorch --use-pt-pinned-commit --editable +else + install_executorch --use-pt-pinned-commit +fi +build_executorch_runner "${BUILD_TOOL}" "${BUILD_MODE}" if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then do_not_use_nightly_on_ci diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh new file mode 100644 index 00000000000..fd16c663372 --- /dev/null +++ b/.ci/scripts/test_ane_static_llama.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.." 
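[Editor's note: `dirname "${BASH_SOURCE[0]}"` yields a path relative to the caller's working directory. A common hardening of the same idiom, sketched here rather than taken from the script, resolves the root to an absolute path before the later pushd calls:

```bash
# Resolve the repo root to an absolute path so subsequent pushd calls
# work regardless of where the script is invoked from (a sketch, not
# the script's actual code).
EXECUTORCH_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
export EXECUTORCH_ROOT
```
]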
+ +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +which "${PYTHON_EXECUTABLE}" + +# Update tokenizers submodule +pushd $EXECUTORCH_ROOT/extension/llm/tokenizers +echo "Update tokenizers submodule" +git submodule update --init +popd + +pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama + +# Download stories llama110m artifacts +download_stories_model_artifacts + +python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w + +popd diff --git a/build/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh similarity index 100% rename from build/test_ios_ci.sh rename to .ci/scripts/test_ios_ci.sh diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 9735e26798d..175d02cd65c 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -154,7 +154,6 @@ cmake_install_executorch_libraries() { rm -rf cmake-out retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ @@ -174,6 +173,10 @@ cmake_install_executorch_libraries() { cmake_build_llama_runner() { echo "Building llama runner" + pushd extension/llm/tokenizers + echo "Updating tokenizers submodule" + git submodule update --init + popd dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh new file mode 100644 index 00000000000..76fabb04250 --- /dev/null +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.." + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +which "${PYTHON_EXECUTABLE}" + +# Update tokenizers submodule +pushd $EXECUTORCH_ROOT/extension/llm/tokenizers +echo "Update tokenizers submodule" +git submodule update --init +popd + +# Install ET with CMake +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . 
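[Editor's note: once a configure step like the one above has run, the chosen options persist in the build tree's CMake cache. A quick way to confirm which EXECUTORCH_BUILD_* switches took effect, using standard CMake behavior rather than anything specific to this script:

```bash
# -N skips the configure/generate steps; -LA lists cached variables,
# including advanced ones, from an existing build tree.
cmake -N -LA cmake-out | grep 'EXECUTORCH_BUILD'
```
]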
+cmake --build cmake-out -j16 --target install --config Release + +# Install llama runner with torchao +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release + +# Download stories llama110m artifacts +download_stories_model_artifacts + +echo "Creating tokenizer.bin" +$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# Export model +LLAMA_CHECKPOINT=stories110M.pt +LLAMA_PARAMS=params.json +MODEL_OUT=model.pte +TOKENIZER=tokenizer.bin + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --output_name=${MODEL_OUT} \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 + +# Test run +./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index c511942be91..15df725f9c1 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -31,7 +31,6 @@ NPROC=8 if hash nproc &> /dev/null; then NPROC=$(nproc); fi python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') -CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ @@ -48,7 +47,6 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \ cmake_install_executorch_libraries() { cmake \ ${EXECUTORCH_COMMON_CMAKE_ARGS} \ - "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \ -B${BUILD_DIR} . cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} @@ -59,7 +57,6 @@ cmake_install_executorch_libraries_for_android() { -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=arm64-v8a \ ${EXECUTORCH_COMMON_CMAKE_ARGS} \ - "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \ -B${BUILD_DIR} . 
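[Editor's note: the unquoted ${EXECUTORCH_COMMON_CMAKE_ARGS} expansion used in these scripts relies on word splitting of a single string. If any value ever needs embedded spaces, the usual shell-safe alternative is a bash array; a sketch with abridged flags, not a change this diff makes:

```bash
# Collect shared CMake flags in an array; "${ARGS[@]}" preserves each
# element as a single argument even if it contains spaces.
EXECUTORCH_COMMON_CMAKE_ARGS=(
  -DCMAKE_INSTALL_PREFIX="${BUILD_DIR}"
  -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}"
  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON
)
cmake "${EXECUTORCH_COMMON_CMAKE_ARGS[@]}" -B"${BUILD_DIR}" .
```
]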
   cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -80,7 +77,7 @@ cmake_build_llava_runner() {
   cmake \
     ${LLAVA_COMMON_CMAKE_ARGS} \
-    -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
+    -DCMAKE_PREFIX_PATH="$python_lib" \
     -B${BUILD_DIR}/${dir} \
     ${dir}
@@ -96,7 +93,7 @@ cmake_build_llava_runner_for_android() {
     -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI=arm64-v8a \
     ${LLAVA_COMMON_CMAKE_ARGS} \
-    -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
+    -DCMAKE_PREFIX_PATH="$python_lib" \
     -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
     -B${BUILD_DIR}/${dir} \
     ${dir}
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
index ef4859135c6..51e81e62a9f 100755
--- a/.ci/scripts/test_model.sh
+++ b/.ci/scripts/test_model.sh
@@ -50,12 +50,10 @@ prepare_artifacts_upload() {
 build_cmake_executor_runner() {
   echo "Building executor_runner"
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   rm -rf ${CMAKE_OUTPUT_DIR}
   cmake -DCMAKE_BUILD_TYPE=Debug \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-    -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
     -B${CMAKE_OUTPUT_DIR} .

   cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
@@ -93,21 +91,38 @@ test_model() {
     # Install requirements for llama vision.
     bash examples/models/llama3_2_vision/install_requirements.sh
   fi
-  # python3 -m examples.portable.scripts.export --model_name="llama2" should works too
+  if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
+    # Install requirements for export_llama
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    # Use a random Llama checkpoint with the Qwen 2.5 1.5b model configuration.
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/qwen2_5/1_5b_config.json
+    rm "./${MODEL_NAME}.pte"
+    return  # Skip running with the portable executor runner since portable doesn't support Qwen's biased linears.
+  fi
+  if [[ "${MODEL_NAME}" == "phi-4-mini" ]]; then
+    # Install requirements for export_llama
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
+    run_portable_executor_runner
+    rm "./${MODEL_NAME}.pte"
+    return
+  fi
+
+  # Export a basic .pte and run the model.
   "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}"
   run_portable_executor_runner
 }

 build_cmake_xnn_executor_runner() {
   echo "Building xnn_executor_runner"
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
     && retry cmake -DCMAKE_BUILD_TYPE=Release \
       -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

   cmake --build ${CMAKE_OUTPUT_DIR} -j4
@@ -158,6 +173,7 @@ test_model_with_qnn() {
   export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
   export PYTHONPATH=$EXECUTORCH_ROOT/..
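[Editor's note: the exports above assume QNN_SDK_ROOT is already set by the environment. A fail-fast guard along these lines, sketched here and not present in the script, would make the failure mode clearer:

```bash
# Bail out early with a readable error instead of failing later inside
# the export script when the QNN SDK path is unset or wrong.
if [[ -z "${QNN_SDK_ROOT:-}" || ! -d "${QNN_SDK_ROOT}/lib/x86_64-linux-clang" ]]; then
  echo "QNN_SDK_ROOT is unset or missing x86_64-linux-clang libraries" >&2
  exit 1
fi
```
]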
+ EXTRA_FLAGS="" if [[ "${MODEL_NAME}" == "dl3" ]]; then EXPORT_SCRIPT=deeplab_v3 elif [[ "${MODEL_NAME}" == "mv3" ]]; then @@ -170,6 +186,12 @@ test_model_with_qnn() { EXPORT_SCRIPT=inception_v3 elif [[ "${MODEL_NAME}" == "vit" ]]; then EXPORT_SCRIPT=torchvision_vit + elif [[ "${MODEL_NAME}" == "mb" ]]; then + EXPORT_SCRIPT=mobilebert_fine_tune + EXTRA_FLAGS="--num_epochs 1" + pip install scikit-learn + elif [[ "${MODEL_NAME}" == "w2l" ]]; then + EXPORT_SCRIPT=wav2letter elif [[ "${MODEL_NAME}" == "edsr" ]]; then EXPORT_SCRIPT=edsr # Additional deps for edsr @@ -183,13 +205,18 @@ test_model_with_qnn() { # TODO(guangyang): Make QNN chipset matches the target device QNN_CHIPSET=SM8450 - "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit) } +# Run CoreML tests. +# +# @param should_test If true, build and test the model using the coreml_executor_runner. test_model_with_coreml() { - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - echo "coreml doesn't support buck2." + local should_test="$1" + + if [[ "${BUILD_TOOL}" != "cmake" ]]; then + echo "coreml only supports cmake." exit 1 fi @@ -207,6 +234,14 @@ test_model_with_coreml() { echo "No .pte file found" exit 1 fi + + # Run the model + if [ "${should_test}" = true ]; then + echo "Testing exported model with coreml_executor_runner..." + local out_dir=$(mktemp -d) + COREML_EXECUTOR_RUNNER_OUT_DIR="${out_dir}" examples/apple/coreml/scripts/build_executor_runner.sh + "${out_dir}/coreml_executor_runner" --model_path "${EXPORTED_MODEL}" + fi } test_model_with_mps() { @@ -225,7 +260,11 @@ elif [[ "${BACKEND}" == *"qnn"* ]]; then fi elif [[ "${BACKEND}" == *"coreml"* ]]; then echo "Testing ${MODEL_NAME} with coreml..." - test_model_with_coreml + should_test_coreml=false + if [[ "${BACKEND}" == *"test"* ]]; then + should_test_coreml=true + fi + test_model_with_coreml "${should_test_coreml}" if [[ $? 
-eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh index 64dd6b829d8..40767013e23 100644 --- a/.ci/scripts/test_phi_3_mini.sh +++ b/.ci/scripts/test_phi_3_mini.sh @@ -22,10 +22,8 @@ NPROC=8 if hash nproc &> /dev/null; then NPROC=$(nproc); fi cmake_install_executorch_libraries() { - CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -41,10 +39,8 @@ cmake_install_executorch_libraries() { } cmake_build_phi_3_mini() { - CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh index 8aab21846f1..5df74bddef4 100644 --- a/.ci/scripts/test_qnn_static_llama.sh +++ b/.ci/scripts/test_qnn_static_llama.sh @@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to set +e # Compile only as weight sharing is not applicable on x86 -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only +$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only exit_code1=$? # Checks accuracy with weight sharing disabled since x86 does not support weight sharing. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64 +$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64 exit_code2=$? # Check the exit codes and print messages diff --git a/.ci/scripts/test_quantized_aot_lib.sh b/.ci/scripts/test_quantized_aot_lib.sh index 3f8ea886f5c..b522eb7b418 100755 --- a/.ci/scripts/test_quantized_aot_lib.sh +++ b/.ci/scripts/test_quantized_aot_lib.sh @@ -16,13 +16,10 @@ CMAKE_OUTPUT_DIR=cmake-out build_cmake_quantized_aot_lib() { echo "Building quantized aot lib" - SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" (rm -rf ${CMAKE_OUTPUT_DIR} \ && mkdir ${CMAKE_OUTPUT_DIR} \ && cd ${CMAKE_OUTPUT_DIR} \ && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) diff --git a/.ci/scripts/unittest-buck2.sh b/.ci/scripts/unittest-buck2.sh new file mode 100755 index 00000000000..efbf386ffa0 --- /dev/null +++ b/.ci/scripts/unittest-buck2.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# TODO: expand this to //... +# TODO: can't query cadence & vulkan backends +# TODO: can't query //kernels/prim_ops because of non-buckified stuff in OSS. +buck2 query "//backends/apple/... + //backends/example/... + \ +//backends/mediatek/... + //backends/test/... + //backends/transforms/... + \ +//backends/xnnpack/... + //configurations/... + //kernels/aten/... + \ +//kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \ +//kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..." + +UNBUILDABLE_OPTIMIZED_OPS_REGEX="gelu|fft_r2c|log_softmax" +BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX) + +# TODO: build prim_ops_test_cpp again once supported_features works in +# OSS buck. +BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -v prim_ops_test) +# TODO: expand the covered scope of Buck targets. +# //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH. +# //runtime/test/... requires Python torch, which we don't have in our OSS buck setup. +buck2 test $BUILDABLE_OPTIMIZED_OPS //kernels/portable/... \ + $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \ + //runtime/executor: //runtime/kernel/... //runtime/platform/... diff --git a/.ci/scripts/unittest-linux-cmake.sh b/.ci/scripts/unittest-linux-cmake.sh new file mode 100755 index 00000000000..7b61256eb51 --- /dev/null +++ b/.ci/scripts/unittest-linux-cmake.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# Run pytest with coverage +pytest -n auto --cov=./ --cov-report=xml +# Run gtest +LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 \ +test/run_oss_cpp_tests.sh diff --git a/.ci/scripts/unittest-linux.sh b/.ci/scripts/unittest-linux.sh new file mode 100755 index 00000000000..f8ff9df773e --- /dev/null +++ b/.ci/scripts/unittest-linux.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +read -r BUILD_TOOL BUILD_MODE EDITABLE < <(parse_args "$@") + +# The generic Linux job chooses to use base env, not the one setup by the image +eval "$(conda shell.bash hook)" +CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") +conda activate "${CONDA_ENV}" + +if [[ "$BUILD_TOOL" == "cmake" ]]; then + # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate + source .ci/scripts/setup-vulkan-linux-deps.sh + + PYTHON_EXECUTABLE=python \ + EXECUTORCH_BUILD_PYBIND=ON \ + CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ + .ci/scripts/setup-linux.sh "$@" + + # Install llama3_2_vision dependencies. 
+ PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh + + .ci/scripts/unittest-linux-cmake.sh +elif [[ "$BUILD_TOOL" == "buck2" ]]; then + # Removing this breaks sccache in the Buck build, apparently + # because TMPDIR gets messed up? Please feel free to fix this and + # speed up this CI job! + PYTHON_EXECUTABLE=python \ + .ci/scripts/setup-linux.sh "$@" + + .ci/scripts/unittest-buck2.sh +else + echo "Unknown build tool $BUILD_TOOL" + exit 1 +fi diff --git a/build/__init__.py b/.ci/scripts/unittest-macos-buck2.sh similarity index 74% rename from build/__init__.py rename to .ci/scripts/unittest-macos-buck2.sh index a9fdb3b996b..4a5dc99b4ce 100644 --- a/build/__init__.py +++ b/.ci/scripts/unittest-macos-buck2.sh @@ -1,6 +1,9 @@ -#!/usr/bin/env python3 +#!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +set -eux + +buck2 test //extension/apple:ExecuTorch diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh new file mode 100755 index 00000000000..cdb40c40244 --- /dev/null +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# Run pytest with coverage +${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# Run gtest +LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ +${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.ci/scripts/unittest-macos.sh b/.ci/scripts/unittest-macos.sh new file mode 100755 index 00000000000..d5ca97404aa --- /dev/null +++ b/.ci/scripts/unittest-macos.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +read -r BUILD_TOOL BUILD_MODE EDITABLE < <(parse_args "$@") + +bash .ci/scripts/setup-conda.sh +eval "$(conda shell.bash hook)" + +# Create temp directory for sccache shims +export TMP_DIR=$(mktemp -d) +export PATH="${TMP_DIR}:$PATH" +trap 'rm -rfv ${TMP_DIR}' EXIT + +# Setup MacOS dependencies as there is no Docker support on MacOS atm +PYTHON_EXECUTABLE=python \ +EXECUTORCH_BUILD_PYBIND=ON \ +CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ +${CONDA_RUN} --no-capture-output \ +.ci/scripts/setup-macos.sh "$@" + +if [[ "$BUILD_TOOL" == "cmake" ]]; then + # Install llama3_2_vision dependencies. 
+  PYTHON_EXECUTABLE=python \
+  ${CONDA_RUN} --no-capture-output \
+  ./examples/models/llama3_2_vision/install_requirements.sh
+
+  .ci/scripts/unittest-macos-cmake.sh
+elif [[ "$BUILD_TOOL" == "buck2" ]]; then
+  .ci/scripts/unittest-buck2.sh
+  # .ci/scripts/unittest-macos-buck2.sh
+else
+  echo "Unknown build tool $BUILD_TOOL"
+  exit 1
+fi
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index c21d0bb604e..677578ce3a4 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -20,15 +20,19 @@ clean_executorch_install_folders() {
   ./install_executorch.sh --clean
 }

+update_tokenizers_git_submodule() {
+  echo "Updating tokenizers git submodule..."
+  git submodule update --init
+  pushd extension/llm/tokenizers
+  git submodule update --init
+  popd
+}
+
 install_executorch() {
   which pip
   # Install executorch, this assumes that Executorch is checked out in the
   # current directory.
-  if [[ "${1:-}" == "use-pt-pinned-commit" ]]; then
-    ./install_executorch.sh --pybind xnnpack --use-pt-pinned-commit
-  else
-    ./install_executorch.sh --pybind xnnpack
-  fi
+  ./install_executorch.sh --pybind xnnpack "$@"
   # Just print out the list of packages for debugging
   pip list
 }
@@ -56,12 +60,46 @@ install_pytorch_and_domains() {
   # Fetch the target commit
   pushd pytorch || return
   git checkout "${TORCH_VERSION}"
-  git submodule update --init --recursive
-  export USE_DISTRIBUTED=1
-  # Then build and install PyTorch
-  python setup.py bdist_wheel
-  pip install "$(echo dist/*.whl)"
+  local system_name=$(uname)
+  if [[ "${system_name}" == "Darwin" ]]; then
+    local platform=$(python -c 'import sysconfig; import platform; v=platform.mac_ver()[0].split(".")[0]; platform=sysconfig.get_platform().split("-"); platform[1]=f"{v}_0"; print("_".join(platform))')
+  fi
+  local python_version=$(python -c 'import platform; v=platform.python_version_tuple(); print(f"{v[0]}{v[1]}")')
+  local torch_release=$(cat version.txt)
+  local torch_short_hash=${TORCH_VERSION:0:7}
+  local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
+  local torch_wheel_name="torch-${torch_release}%2Bgit${torch_short_hash}-cp${python_version}-cp${python_version}-${platform:-}.whl"
+
+  local cached_torch_wheel="https://gha-artifacts.s3.us-east-1.amazonaws.com/${torch_wheel_path}/${torch_wheel_name}"
+  # Caching the PyTorch wheel is only needed on macOS; Linux CI already has it
+  # as part of the Docker image
+  local torch_wheel_not_found=0
+  if [[ "${system_name}" == "Darwin" ]]; then
+    pip install "${cached_torch_wheel}" || torch_wheel_not_found=1
+  else
+    torch_wheel_not_found=1
+  fi
+
+  # No cached wheel was found, so build one from source
+  if [[ "${torch_wheel_not_found}" == "1" ]]; then
+    echo "No cached wheel found, continuing to build PyTorch at ${TORCH_VERSION}"
+
+    git submodule update --init --recursive
+    USE_DISTRIBUTED=1 python setup.py bdist_wheel
+    pip install "$(echo dist/*.whl)"
+
+    # Only AWS runners have access to S3
+    if command -v aws && [[ -z "${GITHUB_RUNNER:-}" ]]; then
+      for wheel_path in dist/*.whl; do
+        local wheel_name=$(basename "${wheel_path}")
+        echo "Caching ${wheel_name}"
+        aws s3 cp "${wheel_path}" "s3://gha-artifacts/${torch_wheel_path}/${wheel_name}"
+      done
+    fi
+  else
+    echo "Use cached wheel at ${cached_torch_wheel}"
+  fi

   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
@@ -76,25 +114,6 @@
   sccache --show-stats || true
 }

-install_flatc_from_source() {
-  # NB: This
function could be used to install flatbuffer from source - pushd third-party/flatbuffers || return - - cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release - if [ "$(uname)" == "Darwin" ]; then - CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) - else - CMAKE_JOBS=$(( $(nproc) - 1 )) - fi - cmake --build . -j "${CMAKE_JOBS}" - - # Copy the flatc binary to conda path - EXEC_PATH=$(dirname "$(which python)") - cp flatc "${EXEC_PATH}" - - popd || return -} - build_executorch_runner_buck2() { # Build executorch runtime with retry as this step is flaky on macos CI retry buck2 build //examples/portable/executor_runner:executor_runner @@ -107,9 +126,14 @@ build_executorch_runner_cmake() { mkdir "${CMAKE_OUTPUT_DIR}" pushd "${CMAKE_OUTPUT_DIR}" || return + if [[ $1 == "Debug" ]]; then + CXXFLAGS="-fsanitize=address,undefined" + else + CXXFLAGS="" + fi # This command uses buck2 to gather source files and buck2 could crash flakily # on MacOS - retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release .. + CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" .. popd || return if [ "$(uname)" == "Darwin" ]; then @@ -124,7 +148,7 @@ build_executorch_runner() { if [[ $1 == "buck2" ]]; then build_executorch_runner_buck2 elif [[ $1 == "cmake" ]]; then - build_executorch_runner_cmake + build_executorch_runner_cmake "$2" else echo "Invalid build tool $1. Only buck2 and cmake are supported atm" exit 1 @@ -136,7 +160,6 @@ cmake_install_executorch_lib() { clean_executorch_install_folders retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \ -DCMAKE_BUILD_TYPE=Release \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . @@ -167,3 +190,52 @@ do_not_use_nightly_on_ci() { exit 1 fi } + + +parse_args() { + local args=("$@") + local i + local BUILD_TOOL="" + local BUILD_MODE="" + local EDITABLE="" + for ((i=0; i<${#args[@]}; i++)); do + case "${args[$i]}" in + --build-tool) + BUILD_TOOL="${args[$((i+1))]}" + i=$((i+1)) + ;; + --build-mode) + BUILD_MODE="${args[$((i+1))]}" + i=$((i+1)) + ;; + --editable) + EDITABLE="${args[$((i+1))]}" + i=$((i+1)) + ;; + *) + echo "Invalid argument: ${args[$i]}" + exit 1 + ;; + esac + done + + if [ -z "$BUILD_TOOL" ]; then + echo "Missing build tool (require buck2 or cmake), exiting..." + exit 1 + elif ! [[ $BUILD_TOOL =~ ^(cmake|buck2)$ ]]; then + echo "Require buck2 or cmake for --build-tool, got ${BUILD_TOOL}, exiting..." + exit 1 + fi + BUILD_MODE="${BUILD_MODE:-Release}" + if ! [[ "$BUILD_MODE" =~ ^(Debug|Release)$ ]]; then + echo "Unsupported build mode ${BUILD_MODE}, options are Debug or Release." + exit 1 + fi + EDITABLE="${EDITABLE:-false}" + if ! [[ $EDITABLE =~ ^(true|false)$ ]]; then + echo "Require true or false for --editable, got ${EDITABLE}, exiting..." 
+ exit 1 + fi + + echo "$BUILD_TOOL $BUILD_MODE $EDITABLE" +} diff --git a/.ci/scripts/wheel/__init__.py b/.ci/scripts/wheel/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/build/packaging/env_var_script_linux.sh b/.ci/scripts/wheel/envvar_base.sh old mode 100644 new mode 100755 similarity index 100% rename from build/packaging/env_var_script_linux.sh rename to .ci/scripts/wheel/envvar_base.sh diff --git a/.ci/scripts/wheel/envvar_linux.sh b/.ci/scripts/wheel/envvar_linux.sh new file mode 100755 index 00000000000..3b24b3f7188 --- /dev/null +++ b/.ci/scripts/wheel/envvar_linux.sh @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# This file is sourced into the environment before building a pip wheel. It +# should typically only contain shell variable assignments. Be sure to export +# any variables so that subprocesses will see them. + +source "${GITHUB_WORKSPACE}/${REPOSITORY}/.ci/scripts/wheel/envvar_base.sh" diff --git a/build/packaging/env_var_script_m1.sh b/.ci/scripts/wheel/envvar_macos.sh old mode 100644 new mode 100755 similarity index 58% rename from build/packaging/env_var_script_m1.sh rename to .ci/scripts/wheel/envvar_macos.sh index da1192455f6..ad30f86d20c --- a/build/packaging/env_var_script_m1.sh +++ b/.ci/scripts/wheel/envvar_macos.sh @@ -8,16 +8,7 @@ # should typically only contain shell variable assignments. Be sure to export # any variables so that subprocesses will see them. -# Enable pybindings so that users can execute ExecuTorch programs from python. -export EXECUTORCH_BUILD_PYBIND=1 - -# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty -# if not defined. -export CMAKE_ARGS="${CMAKE_ARGS:-}" - -# Link the XNNPACK backend into the pybindings runtime so that users can execute -# ExecuTorch programs that delegate to it. -CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON" +source "${GITHUB_WORKSPACE}/${REPOSITORY}/.ci/scripts/wheel/envvar_base.sh" # When building for macOS, link additional backends into the pybindings runtime. CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_COREML=ON" diff --git a/build/packaging/post_build_script.sh b/.ci/scripts/wheel/post_build_script.sh old mode 100644 new mode 100755 similarity index 100% rename from build/packaging/post_build_script.sh rename to .ci/scripts/wheel/post_build_script.sh diff --git a/build/packaging/pre_build_script.sh b/.ci/scripts/wheel/pre_build_script.sh old mode 100644 new mode 100755 similarity index 71% rename from build/packaging/pre_build_script.sh rename to .ci/scripts/wheel/pre_build_script.sh index 74c98406d05..2bf8c7c73f0 --- a/build/packaging/pre_build_script.sh +++ b/.ci/scripts/wheel/pre_build_script.sh @@ -13,14 +13,5 @@ set -euxo pipefail # not install them. TODO(dbort): Switch to using `python -m build --wheel`, # which does install them. Though we'd need to disable build isolation to be # able to see the installed torch package. -readonly BUILD_DEPS=( - # This list must match the build-system.requires list from pyproject.toml. 
- "cmake" - "pip>=23" - "pyyaml" - "setuptools>=63" - "tomli" - "wheel" - "zstd" -) -pip install --progress-bar off "${BUILD_DEPS[@]}" + +"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" diff --git a/.ci/scripts/wheel/test_base.py b/.ci/scripts/wheel/test_base.py new file mode 100644 index 00000000000..f8a7309a6c2 --- /dev/null +++ b/.ci/scripts/wheel/test_base.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +import subprocess +import sys +from dataclasses import dataclass +from functools import cache +from typing import List + + +@cache +def _unsafe_get_env(key: str) -> str: + value = os.getenv(key) + if value is None: + raise RuntimeError(f"environment variable '{key}' is not set") + return value + + +@cache +def _repository_root_dir() -> str: + return os.path.join( + _unsafe_get_env("GITHUB_WORKSPACE"), + _unsafe_get_env("REPOSITORY"), + ) + + +# For some reason, we are unable to see the entire repo in the python path. +# So manually add it. +sys.path.append(_repository_root_dir()) +from examples.models import Backend, Model + + +@dataclass +class ModelTest: + model: Model + backend: Backend + + +def run_tests(model_tests: List[ModelTest]) -> None: + # Why are we doing this envvar shenanigans? Since we build the testers, which + # uses buck, we cannot run as root. This is a sneaky of getting around that + # test. + # + # This can be reverted if either: + # - We remove usage of buck in our builds + # - We stop running the Docker image as root: https://github.com/pytorch/test-infra/issues/5091 + envvars = os.environ.copy() + envvars.pop("HOME") + + for model_test in model_tests: + subprocess.run( + [ + os.path.join(_repository_root_dir(), ".ci/scripts/test_model.sh"), + str(model_test.model), + # What to build `executor_runner` with for testing. + "cmake", + str(model_test.backend), + ], + env=envvars, + check=True, + cwd=_repository_root_dir(), + ) diff --git a/.ci/scripts/wheel/test_linux.py b/.ci/scripts/wheel/test_linux.py new file mode 100644 index 00000000000..5e8db90c863 --- /dev/null +++ b/.ci/scripts/wheel/test_linux.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import test_base +from examples.models import Backend, Model + +if __name__ == "__main__": + test_base.run_tests( + model_tests=[ + test_base.ModelTest( + model=Model.Mv3, + backend=Backend.XnnpackQuantizationDelegation, + ) + ] + ) diff --git a/.ci/scripts/wheel/test_macos.py b/.ci/scripts/wheel/test_macos.py new file mode 100644 index 00000000000..cd4bce6c136 --- /dev/null +++ b/.ci/scripts/wheel/test_macos.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+import test_base
+from examples.models import Backend, Model
+
+if __name__ == "__main__":
+    test_base.run_tests(
+        model_tests=[
+            test_base.ModelTest(
+                model=Model.Mv3,
+                backend=Backend.XnnpackQuantizationDelegation,
+            ),
+            # Enable this once CoreML is supported out-of-the-box
+            # https://github.com/pytorch/executorch/issues/9019
+            # test_base.ModelTest(
+            #     model=Model.Mv3,
+            #     backend=Backend.CoreMlTest,
+            # )
+        ]
+    )
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 86363e7da9d..010f7c1132e 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -21,7 +21,7 @@ body:
         A clear and concise description of what the bug is.

         ```python
-        # Sample code to reproduce the problem
+        # Sample code to reproduce the problem. If applicable, also include your model export command.
         ```

         ```
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
index 76f0e533389..81b06c96c32 100755
--- a/.github/scripts/extract_benchmark_results.py
+++ b/.github/scripts/extract_benchmark_results.py
@@ -10,11 +10,12 @@
 import logging
 import os
 import re
+import sys
 import zipfile
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, Dict, List, Optional
+from typing import Any, DefaultDict, Dict, List, Optional
 from urllib import error, request
@@ -87,49 +88,25 @@ def parse_args() -> Any:
         help="the directory to keep the benchmark results",
     )
     parser.add_argument(
-        "--repo",
-        type=str,
-        required=True,
-        help="which GitHub repo this workflow run belongs to",
-    )
-    parser.add_argument(
-        "--head-branch",
-        type=str,
-        required=True,
-        help="the head branch that runs",
-    )
-    parser.add_argument(
-        "--workflow-name",
+        "--benchmark-configs",
         type=str,
         required=True,
-        help="the name of the benchmark workflow",
-    )
-    parser.add_argument(
-        "--workflow-run-id",
-        type=int,
-        required=True,
-        help="the id of the benchmark workflow",
-    )
-    parser.add_argument(
-        "--workflow-run-attempt",
-        type=int,
-        required=True,
-        help="which retry of the workflow this is",
+        action=ValidateDir,
+        help="the directory to keep the benchmark configs",
     )
+
     parser.add_argument(
-        "--benchmark-configs",
+        "--app",
         type=str,
         required=True,
-        action=ValidateDir,
-        help="the directory to keep the benchmark configs",
+        choices=["android", "ios"],
+        help="the type of app (ios or android); mainly used to generate a default record when a job fails",
     )

     return parser.parse_args()

-def extract_android_benchmark_results(
-    job_name: str, artifact_type: str, artifact_s3_url: str
-) -> List:
+def extract_android_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List:
     """
     The benchmark results from Android have already been stored in CUSTOMER_ARTIFACT
     artifact, so we will just need to get it
@@ -153,9 +130,10 @@
         # This is to handle the case where there is no benchmark results
         warning(f"Fail to load the benchmark results from {artifact_s3_url}")
         return []
+    return []

-def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
+def initialize_ios_metadata(test_name: str) -> Dict[str, Any]:
     """
     Extract the benchmark metadata from the test name, for example:
         test_forward_llama2_pte_iOS_17_2_1_iPhone15_4
@@ -229,11 +207,7 @@
         elif method == "forward":
             if metric_name == "Clock Monotonic Time, s":
-                benchmark_result["metric"] = (
-                    "generate_time(ms)"
if "llama" in test_name - else "avg_inference_latency(ms)" - ) + benchmark_result["metric"] = "avg_inference_latency(ms)" benchmark_result["actualValue"] = metric_value * 1000 elif metric_name == "Memory Peak Physical, kB": @@ -241,16 +215,19 @@ def extract_ios_metric( benchmark_result["metric"] = "peak_inference_mem_usage(mb)" benchmark_result["actualValue"] = metric_value / 1024 - elif method == "generate" and metric_name == "Tokens Per Second, t/s": - benchmark_result["metric"] = "token_per_sec" - benchmark_result["actualValue"] = metric_value + elif method == "generate": + if metric_name == "Clock Monotonic Time, s": + benchmark_result["metric"] = "generate_time(ms)" + benchmark_result["actualValue"] = metric_value * 1000 + + elif metric_name == "Tokens Per Second, t/s": + benchmark_result["metric"] = "token_per_sec" + benchmark_result["actualValue"] = metric_value return benchmark_result -def extract_ios_benchmark_results( - job_name: str, artifact_type: str, artifact_s3_url: str -) -> List: +def extract_ios_benchmark_results(artifact_type: str, artifact_s3_url: str) -> List: """ The benchmark results from iOS are currently from xcresult, which could either be parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter @@ -363,14 +340,7 @@ def transform( app_type: str, benchmark_results: List, benchmark_config: Dict[str, str], - repo: str, - head_branch: str, - workflow_name: str, - workflow_run_id: int, - workflow_run_attempt: int, job_name: str, - job_id: int, - schema_version: str, ) -> List: """ Transform the benchmark results into the format writable into the benchmark database @@ -380,145 +350,348 @@ def transform( for r in benchmark_results: r["deviceInfo"]["device"] = job_name - if schema_version == "v2": - # TODO (huydhn): Clean up this branch after ExecuTorch dashboard migrates to v3 - return [ - { - # GH-info to identify where the benchmark is run - "repo": repo, - "head_branch": head_branch, - "workflow_id": workflow_run_id, - "run_attempt": workflow_run_attempt, - "job_id": job_id, - # The model - "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(), - "dtype": ( - r["benchmarkModel"]["quantization"] - if r["benchmarkModel"]["quantization"] - else "unknown" - ), - # The metric value - "metric": r["metric"], - "actual": r["actualValue"], - "target": r["targetValue"], - # The device - "device": r["deviceInfo"]["device"], - "arch": r["deviceInfo"].get("os", ""), - # Not used here, just set it to something unique here - "filename": workflow_name, - "test_name": app_type, - "runner": job_name, - } - for r in benchmark_results - ] - elif schema_version == "v3": - v3_benchmark_results = [] - # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - return [ - { - "benchmark": { - "name": "ExecuTorch", - "mode": "inference", - "extra_info": { - "app_type": app_type, - # Just keep a copy of the benchmark config here - "benchmark_config": json.dumps(benchmark_config), - }, + # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + return [ + { + "benchmark": { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": app_type, + # Just keep a copy of the benchmark config here + "benchmark_config": json.dumps(benchmark_config), }, - "model": { - "name": benchmark_config.get("model", r["benchmarkModel"]["name"]), - "type": "OSS model", - "backend": benchmark_config.get( - "config", r["benchmarkModel"].get("backend", "") - ), 
-                },
-                "metric": {
-                    "name": r["metric"],
-                    "benchmark_values": [r["actualValue"]],
-                    "target_value": r["targetValue"],
-                    "extra_info": {
-                        "method": r.get("method", ""),
-                    },
-                },
-                "runners": [
-                    {
-                        "name": r["deviceInfo"]["device"],
-                        "type": r["deviceInfo"]["os"],
-                        "avail_mem_in_gb": r["deviceInfo"].get("availMem", ""),
-                        "total_mem_in_gb": r["deviceInfo"].get("totalMem", ""),
-                    }
-                ],
-            }
-            for r in benchmark_results
-        ]
+            },
+            "metric": {
+                "name": r["metric"],
+                "benchmark_values": [r["actualValue"]],
+                "target_value": r["targetValue"],
+                "extra_info": {
+                    "method": r.get("method", ""),
+                },
+            },
+            "runners": [
+                {
+                    "name": r["deviceInfo"]["device"],
+                    "type": r["deviceInfo"]["os"],
+                    "avail_mem_in_gb": r["deviceInfo"].get("availMem", ""),
+                    "total_mem_in_gb": r["deviceInfo"].get("totalMem", ""),
+                }
+            ],
+        }
+        for r in benchmark_results
+    ]
+
+
+def extract_model_info(git_job_name: str) -> Dict[str, str]:
+    """
+    Get model information from git_job_name.
+    KEEP IN SYNC WITH:
+    - get_benchmark_configs() in executorch/.ci/scripts/gather_benchmark_configs.py
+    - job name benchmark-on-device in executorch/.github/workflows/android-perf.yml
+    - job name benchmark-on-device in executorch/.github/workflows/apple-perf.yml
+    For example:
+        benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s24, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)
+        benchmark-on-device (llama, xnnpack_q8, apple_iphone_15, arn:aws:devicefarm:us-west-2:30853538511... / mobile-job (ios)
+    """
+    # Extract the content inside the first parentheses.
+    pattern = r"benchmark-on-device \((.+)"
+    match = re.search(pattern, git_job_name)
+    if not match:
+        raise ValueError(
+            f"regex pattern not found in git_job_name: pattern: `{pattern}`, git_job_name: `{git_job_name}`. please check if pattern is in sync with executorch/.ci/scripts/gather_benchmark_configs.py and the job name from previous step"
+        )
+
+    extracted_content = match.group(1)  # Get content after the opening parenthesis
+    items = extracted_content.split(",")
+    if len(items) < 3:
+        raise ValueError(
+            f"expected at least 3 items extracted from git_job_name {git_job_name}, but got {items}.
-def main() -> None:
-    args = parse_args()
-    # Across all devices, keeping both schemas for now until ExecuTorch dashboard migrates to v3
-    all_benchmark_results = {
-        "v2": [],
-        "v3": [],
+def transform_failure_record(
+    app_type: str,
+    level: str,
+    model_name: str,
+    model_backend: str,
+    device_name: str,
+    device_os: str,
+    result: str,
+    report: Any = None,
+) -> Any:
+    """
+    Transform a job failure into the record format writable into the benchmark database
+    """
+    # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    return {
+        "benchmark": {
+            "name": "ExecuTorch",
+            "mode": "inference",
+            "extra_info": {
+                "app_type": app_type,
+                "job_conclusion": result,
+                "failure_type": level,
+                # Default to an empty report; avoids a mutable default argument
+                "job_report": json.dumps(report or {}),
+            },
+        },
+        "model": {
+            "name": model_name,
+            "type": "OSS model",
+            "backend": model_backend,
+        },
+        "metric": {
+            "name": "FAILURE_REPORT",
+            # Keep benchmark_values a list, matching the format used in transform()
+            "benchmark_values": [0],
+            "target_value": 0,
+            "extra_info": {
+                "method": "",
+            },
+        },
+        "runners": [
+            {
+                "name": device_name,
+                "type": device_os,
+            }
+        ],
     }
-    benchmark_config = {}

-    with open(args.artifacts) as f:
-        for artifact in json.load(f):
-            app_type = artifact.get("app_type", "")
-            # We expect this to be set to either ANDROID_APP or IOS_APP
-            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
-                info(
-                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
-                )
-                continue

-            job_name = artifact["job_name"]
-            artifact_type = artifact["type"]
-            artifact_s3_url = artifact["s3_url"]
+def to_job_report_map(job_reports) -> Dict[str, Any]:
+    return {job_report["arn"]: job_report for job_report in job_reports}

-            if artifact_type == "TESTSPEC_OUTPUT":
-                benchmark_config = read_benchmark_config(
-                    artifact_s3_url, args.benchmark_configs
-                )

-            if app_type == "ANDROID_APP":
-                benchmark_results = extract_android_benchmark_results(
-                    job_name, artifact_type, artifact_s3_url
-                )
+def group_by_arn(artifacts: List) -> Dict[str, List]:
+    """
+    Group the artifacts by their job ARN
+    """
+    arn_to_artifacts = DefaultDict(list)
+    for artifact in artifacts:
+        job_arn = artifact.get("job_arn", "")
+        app_type = artifact.get("app_type", "")
+        if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
+            info(
+                f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
+            )
+            continue
+        if not job_arn:
+            info(f"missing job_arn in artifact {json.dumps(artifact)}")
+            continue
+        arn_to_artifacts[job_arn].append(artifact)
+    return arn_to_artifacts
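The grouping step is easy to sanity-check in isolation; a minimal sketch with hypothetical artifacts, trimmed to only the fields the grouping reads:

```python
from collections import defaultdict

# Hypothetical Device Farm artifacts; two belong to the same device job.
artifacts = [
    {"job_arn": "arn:job/1", "app_type": "IOS_APP", "type": "TESTSPEC_OUTPUT"},
    {"job_arn": "arn:job/1", "app_type": "IOS_APP", "type": "CUSTOMER_ARTIFACT"},
    {"job_arn": "arn:job/2", "app_type": "IOS_APP", "type": "DEVICE_LOG"},
]

# Same result shape as group_by_arn(): one bucket of artifacts per device job.
arn_to_artifacts = defaultdict(list)
for artifact in artifacts:
    arn_to_artifacts[artifact["job_arn"]].append(artifact)

assert len(arn_to_artifacts["arn:job/1"]) == 2
assert len(arn_to_artifacts["arn:job/2"]) == 1
```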
-        if app_type == "IOS_APP":
-            benchmark_results = extract_ios_benchmark_results(
-                job_name, artifact_type, artifact_s3_url
-            )
-
-        if benchmark_results:
-            for schema in all_benchmark_results.keys():
-                results = transform(
-                    app_type,
-                    benchmark_results,
-                    benchmark_config,
-                    args.repo,
-                    args.head_branch,
-                    args.workflow_name,
-                    args.workflow_run_id,
-                    args.workflow_run_attempt,
-                    job_name,
-                    extract_job_id(args.artifacts),
-                    schema,
-                )
-                all_benchmark_results[schema].extend(results)
-
-    for schema in all_benchmark_results.keys():
-        if not all_benchmark_results.get(schema):
+# Get the benchmark config from the TestSpec output file, if one exists
+def get_benchmark_config(
+    artifacts: List[Dict[str, Any]], benchmark_configs: str
+) -> Dict[str, str]:
+    result = next(
+        (artifact for artifact in artifacts if artifact["type"] == "TESTSPEC_OUTPUT"),
+        None,
+    )
+    if not result:
+        return {}
+    artifact_s3_url = result["s3_url"]
+    return read_benchmark_config(artifact_s3_url, benchmark_configs)
+
+
+def extract_benchmark_result_from_artifact(
+    artifact: Dict[str, Any],
+    benchmark_config: Dict[str, str],
+) -> List[Any]:
+    job_name = artifact.get("job_name", "")
+    artifact_type = artifact.get("type", "")
+    artifact_s3_url = artifact.get("s3_url", "")
+    app_type = artifact.get("app_type", "")
+
+    info(
+        f"Processing {app_type} artifact: {job_name} {artifact_type} {artifact_s3_url}"
+    )
+    benchmark_results = []
+    if app_type == "ANDROID_APP":
+        benchmark_results = extract_android_benchmark_results(
+            artifact_type, artifact_s3_url
+        )
+    if app_type == "IOS_APP":
+        benchmark_results = extract_ios_benchmark_results(
+            artifact_type, artifact_s3_url
+        )
+    if not benchmark_results:
+        return []
+    return transform(app_type, benchmark_results, benchmark_config, job_name)
+
+
+def get_app_type(device_type: str):
+    match device_type:
+        case "ios":
+            return "IOS_APP"
+        case "android":
+            return "ANDROID_APP"
+        case _:
+            raise ValueError(
+                f"unknown device type detected: {device_type}, currently we only support `ios` and `android`"
+            )
+
+
+def get_device_os_type(device_type: str):
+    match device_type:
+        case "ios":
+            return "iOS"
+        case "android":
+            return "Android"
+        case _:
+            raise ValueError(
+                f"unknown device type detected: {device_type}, currently we only support `ios` and `android`"
+            )
+
+
+def generate_git_job_level_failure_record(git_job_name: str, app: str) -> Any:
+    """
+    Generates a benchmark record for a GIT_JOB-level failure; this is mainly used
+    as a placeholder in the UI to indicate job failures.
+    """
+    level = "GIT_JOB"
+
+    app_type = get_app_type(app)
+    device_prefix = get_device_os_type(app)
+
+    model_infos = extract_model_info(git_job_name)
+
+    model_name = model_infos["model_name"]
+    model_backend = model_infos["model_backend"]
+    device_pool_name = model_infos["device_pool_name"]
+
+    return transform_failure_record(
+        app_type,
+        level,
+        model_name,
+        model_backend,
+        device_pool_name,
+        device_prefix,
+        "FAILURE",
+    )
+
+
+def generate_device_level_failure_record(
+    git_job_name: str, job_report: Any, app: str
+) -> Any:
+    """
+    Generates a benchmark record for a DEVICE_JOB-level failure; this is mainly used
+    as a placeholder in the UI to indicate job failures.
+    """
+    level = "DEVICE_JOB"
+
+    model_infos = extract_model_info(git_job_name)
+
+    model_name = model_infos["model_name"]
+    model_backend = model_infos["model_backend"]
+
+    os_prefix = get_device_os_type(app)
+    job_report_os = job_report["os"]
+
+    # Make sure the device OS name carries the iOS or Android prefix
+    device_os = job_report_os
+    if not job_report_os.startswith(os_prefix):
+        device_os = f"{os_prefix} {job_report_os}"
+
+    return transform_failure_record(
+        job_report["app_type"],
+        level,
+        model_name,
+        model_backend,
+        job_report["name"],
+        device_os,
+        job_report["result"],
+        job_report,
+    )
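To make the placeholder format concrete, this is roughly the record the generators above emit for a failed Android device job; all field values below are hypothetical, for illustration only:

```python
# Approximate shape of a DEVICE_JOB failure record produced by
# transform_failure_record(); the values are made up for illustration.
failure_record = {
    "benchmark": {
        "name": "ExecuTorch",
        "mode": "inference",
        "extra_info": {
            "app_type": "ANDROID_APP",
            "job_conclusion": "FAILED",
            "failure_type": "DEVICE_JOB",
            "job_report": "{...}",  # JSON dump of the raw Device Farm job report
        },
    },
    "model": {"name": "llama", "type": "OSS model", "backend": "xnnpack_q8"},
    "metric": {
        "name": "FAILURE_REPORT",  # sentinel metric the dashboard can key off
        "benchmark_values": [0],
        "target_value": 0,
        "extra_info": {"method": ""},
    },
    "runners": [{"name": "samsung_galaxy_s22", "type": "Android 14"}],
}
```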
+ """ + artifacts = content.get("artifacts") + git_job_name = content["git_job_name"] + + # this indicated that the git job fails, generate a failure record + if not artifacts: + info(f"job failed at GIT_JOB level with git job name {git_job_name}") + try: + failure_record = generate_git_job_level_failure_record(git_job_name, app) + except Exception as e: + raise ValueError( + f"Fail to generate record for GIT_JOB level failure for {git_job_name}: {e}" + ) + return [failure_record] + + arn_to_artifacts = group_by_arn(artifacts) + job_reports = content["job_reports"] + arn_to_job_report = to_job_report_map(job_reports) + + all_benchmark_results = [] + + # process mobile job's benchmark results. Each job represent one device+os in device pool + for job_arn, job_artifacts in arn_to_artifacts.items(): + job_report = arn_to_job_report.get(job_arn) + + if not job_report: + info( + f"job arn {job_arn} is not recognized in job_reports list {json.dumps(job_reports)}, skip the process" + ) continue - output_dir = os.path.join(args.output_dir, schema) - os.makedirs(output_dir, exist_ok=True) + result = job_report.get("result", "") + if result != "PASSED": + arn = job_report["arn"] + info(f"job {arn} failed at DEVICE_JOB level with result {result}") + # device test failed, generate a failure record instead + try: + failure_record = generate_device_level_failure_record( + git_job_name, job_report, app + ) + except Exception as e: + raise ValueError( + f"Fail to generate record for DEVICE_JOB level failure for job {job_arn}: {e}" + ) + all_benchmark_results.append(failure_record) + else: + benchmark_config = get_benchmark_config(job_artifacts, benchmark_configs) + for job_artifact in job_artifacts: + # generate result for each schema + results = extract_benchmark_result_from_artifact( + job_artifact, benchmark_config + ) + all_benchmark_results.extend(results) + return all_benchmark_results + - output_file = os.path.basename(args.artifacts) - with open(f"{output_dir}/{output_file}", "w") as f: - json.dump(all_benchmark_results[schema], f) +def main() -> None: + args = parse_args() + with open(args.artifacts) as f: + content = json.load(f) + all_benchmark_results = process_benchmark_results( + content, args.app, args.benchmark_configs + ) + # add v3 in case we have higher version of schema + output_dir = os.path.join(args.output_dir, "v3") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.basename(args.artifacts) + with open(f"{output_dir}/{output_file}", "w") as f: + json.dump(all_benchmark_results, f) if __name__ == "__main__": diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py new file mode 100644 index 00000000000..c10000c9499 --- /dev/null +++ b/.github/scripts/test_extract_benchmark_results.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/.github/scripts/test_extract_benchmark_results.py b/.github/scripts/test_extract_benchmark_results.py
new file mode 100644
index 00000000000..c10000c9499
--- /dev/null
+++ b/.github/scripts/test_extract_benchmark_results.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from unittest import mock
+
+from extract_benchmark_results import (
+    extract_android_benchmark_results,
+    extract_ios_benchmark_results,
+    process_benchmark_results,
+)
+
+
+def get_mock_happy_flow_content(app_type: str = "IOS_APP"):
+    return {
+        "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... / mobile-job (ios)",
+        "artifacts": [
+            {
+                "arn": "1",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_1_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "2",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_1_test_spec_output",
+                "s3_url": "job_arn_1_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "3",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_1_device_name",
+                "os": "14",
+                "job_arn": "job_arn_1",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "5",
+                "name": "Syslog",
+                "type": "DEVICE_LOG",
+                "extension": "syslog",
+                "url": "https://job_arn_1_device_log",
+                "s3_url": "https://job_arn_1/test-workflow1/1/syslog.syslog",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "6",
+                "name": "Test spec output",
+                "type": "TESTSPEC_OUTPUT",
+                "extension": "txt",
+                "url": "job_arn_2_test_spec_output",
+                "s3_url": "job_arn_2_test_spec_output",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+            {
+                "arn": "7",
+                "name": "Customer Artifacts",
+                "type": "CUSTOMER_ARTIFACT",
+                "extension": "zip",
+                "url": "https://job_arn_1_customer_artifact",
+                "s3_url": "https://job_arn_1_customer_artifact1",
+                "app_type": app_type,
+                "job_name": "job_arn_2_name",
+                "os": "14",
+                "job_arn": "job_arn_2",
+                "job_conclusion": "PASSED",
+            },
+        ],
+        "run_report": {
+            "name": "mobile-job-ios-1",
+            "arn": "run_arn_1",
+            "report_type": "run",
+            "status": "COMPLETED",
+            "result": "PASSED",
+            "app_type": app_type,
+            "infos": {},
+            "parent_arn": "",
+        },
+        "job_reports": [
+            {
+                "name": "job_arn_1_report_device_name",
+                "arn": "job_arn_1",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+            {
+                "name": "job_arn_2_name_report",
+                "arn": "job_arn_2",
+                "report_type": "job",
+                "status": "COMPLETED",
+                "result": "PASSED",
+                "app_type": app_type,
+                "infos": {},
+                "parent_arn": "run_arn_1",
+                "os": "14",
+            },
+        ],
+    }
+
+
+def mockExtractBenchmarkResults(artifact_type, artifact_s3_url):
+    if artifact_type != "TESTSPEC_OUTPUT":
+        return []
+    if artifact_s3_url == "job_arn_1_test_spec_output":
+        return [get_mock_extract_result()[0]]
+    return [get_mock_extract_result()[1]]
+
+
+class Test(unittest.TestCase):
+    @mock.patch("extract_benchmark_results.extract_ios_benchmark_results")
@mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_ios_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertNotEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_android_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_android_succuess_then_returnBenchmarkResults( + self, read_benchmark_config_mock, extract_android_mock + ): + # setup mocks + content = get_mock_happy_flow_content("ANDROID_APP") + extract_android_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + self.assertGreaterEqual(len(result), 2) + + def test_process_benchmark_results_when_ANDROID_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... / mobile-job (android)" + } + + # execute + result = process_benchmark_results(content, "android", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "qnn_q8", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "ANDROID_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + + self.assertEqual(result[0]["runners"][0]["name"], "samsung_galaxy_s22") + self.assertEqual(result[0]["runners"][0]["type"], "Android") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + def test_process_benchmark_results_when_IOS_git_job_fails_then_returnBenchmarkRecordWithFailure( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on-device (ic4, mps, apple_iphone_15, arn:aws:devicefarm:us-west-2:308535385114:devicep... 
/ mobile-job (ios)" + } + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 1) + + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual( + result[0]["benchmark"], + { + "name": "ExecuTorch", + "mode": "inference", + "extra_info": { + "app_type": "IOS_APP", + "job_conclusion": "FAILURE", + "failure_type": "GIT_JOB", + "job_report": "{}", + }, + }, + ) + self.assertEqual(result[0]["runners"][0]["name"], "apple_iphone_15") + self.assertEqual(result[0]["runners"][0]["type"], "iOS") + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_IOS_mobile_job_fails_then_returnBenchmarkRecordWithFailure( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + result = process_benchmark_results(content, "ios", "benchmark_configs") + + # assert + self.assertGreaterEqual(len(result), 2) + self.assertEqual( + result[0]["model"], + { + "name": "ic4", + "type": "OSS model", + "backend": "mps", + }, + ) + self.assertEqual(result[0]["metric"]["name"], "FAILURE_REPORT") + + self.assertNotEqual(result[1]["metric"]["name"], "FAILURE_REPORT") + + @mock.patch("extract_benchmark_results.extract_ios_benchmark_results") + @mock.patch("extract_benchmark_results.read_benchmark_config") + def test_process_benchmark_results_when_one_mobile_job_fails_with_invalid_app_type_then_throw_errors( + self, read_benchmark_config_mock, extract_ios_mock + ): + # setup mocks + content = get_mock_happy_flow_content() + content["job_reports"][0]["result"] = "FAILED" + + extract_ios_mock.side_effect = ( + lambda artifact_type, artifact_s3_url: mockExtractBenchmarkResults( + artifact_type, artifact_s3_url + ) + ) + read_benchmark_config_mock.return_value = {} + + # execute + with self.assertRaises(ValueError) as context: + _ = process_benchmark_results(content, "random", "benchmark_configs") + + # assert + self.assertTrue( + "unknown device type detected: random" in str(context.exception) + ) + read_benchmark_config_mock.assert_not_called() + extract_ios_mock.assert_not_called() + + def test_process_benchmark_results_when_git_job_fails_with_invalid_git_job_name_then_throw_errors( + self, + ): + # setup mocks + # mimic artifact when job is failed. + content = { + "git_job_name": "benchmark-on (ic4, qnn_q8, samsung_galaxy_s22, arn:aws:devicefarm:us-west-2:308535385114:d... 
/ mobile-job (android)"
+        }
+
+        # execute
+        with self.assertRaises(ValueError) as context:
+            _ = process_benchmark_results(content, "ios", "benchmark_configs")
+
+        # assert
+        self.assertTrue(
+            "regex pattern not found in git_job_name" in str(context.exception)
+        )
+
+
+def get_mock_extract_result():
+    return [
+        {
+            "benchmarkModel": {
+                "backend": "q1",
+                "quantization": 0,
+                "name": "ic4",
+            },
+            "deviceInfo": {
+                "arch": "extract arch",
+                "device": "extract device",
+                "os": "extract os",
+                "availMem": 0,
+                "totalMem": 0,
+            },
+            "method": "",
+            "metric": "metric1",
+            "actualValue": 100,
+            "targetValue": 100,
+        },
+        {
+            "benchmarkModel": {
+                "backend": "q2",
+                "quantization": 0,
+                "name": "ic4",
+            },
+            "deviceInfo": {
+                "arch": "extract arch",
+                "device": "extract device",
+                "os": "extract os",
+                "availMem": 0,
+                "totalMem": 0,
+            },
+            "method": "",
+            "metric": "metric2",
+            "actualValue": 200,
+            "targetValue": 200,
+        },
+    ]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml
index 36b679eda44..e29833015d3 100644
--- a/.github/workflows/_android.yml
+++ b/.github/workflows/_android.yml
@@ -25,11 +25,36 @@ jobs:
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
-      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
+      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2

       export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
+      mkdir -p ${ARTIFACTS_DIR_NAME}/

       # Build LLM Demo for Android
-      bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+      export BUILD_AAR_DIR=aar-out
+      mkdir -p $BUILD_AAR_DIR
+      bash scripts/build_android_library.sh
+      cp ${BUILD_AAR_DIR}/executorch.aar $ARTIFACTS_DIR_NAME
+
+      mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir
+      bash .ci/scripts/build_android_instrumentation.sh
+      cp ${BUILD_AAR_DIR}/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir"
+
+      mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+      bash examples/models/llama/install_requirements.sh
+      bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+
+      mkdir -p examples/demo-apps/android/LlamaDemo/app/libs
+      cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs
+      pushd examples/demo-apps/android/LlamaDemo
+      ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest
+      popd
+
+      DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo"
+      # The app directory is named using its build flavor as a suffix.
+ mkdir -p "${DEMO_APP_DIR}" + # Collect the app and its test suite + cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk "${DEMO_APP_DIR}" + cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk "${DEMO_APP_DIR}" # Running Android emulator directly on the runner and not using Docker run-emulator: @@ -76,8 +101,7 @@ jobs: curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip - curl -o android-test-debug.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch-debug.apk - curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch-debug-androidTest.apk + curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch_android-debug-androidTest.apk unzip model.zip mv *.pte model.pte @@ -101,7 +125,7 @@ jobs: with: api-level: ${{ env.API_LEVEL }} arch: x86_64 - script: ./build/run_android_emulator.sh + script: ./scripts/run_android_emulator.sh # NB: This is to boot the emulator faster following the instructions on # https://github.com/ReactiveCircus/android-emulator-runner. The max number # of cores we can set is 6, any higher number will be reduced to 6. diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 414f86494b0..eb6c9c24257 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -7,6 +7,18 @@ on: required: true type: string description: Name of the docker image to use. + build-mode: + required: true + type: string + description: Build mode to use, Debug or Release. + build-tool: + required: true + type: string + description: Build tool to use, cmake or buck2. + editable: + required: false + type: string + description: Install ExecuTorch in editable mode or not. python-version: required: false type: string @@ -26,28 +38,7 @@ jobs: timeout: 90 script: | set -eux - - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate - source .ci/scripts/setup-vulkan-linux-deps.sh - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_PYBIND=ON \ - CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ - .ci/scripts/setup-linux.sh cmake - - # Install llama3_2_vision dependencies. 
- PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh - - # Run pytest with coverage - pytest -n auto --cov=./ --cov-report=xml - # Run gtest - LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 \ - test/run_oss_cpp_tests.sh + .ci/scripts/unittest-linux.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}" macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main @@ -58,27 +49,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | set -eux - - bash .ci/scripts/setup-conda.sh - - # Create temp directory for sccache shims - export TMP_DIR=$(mktemp -d) - export PATH="${TMP_DIR}:$PATH" - trap 'rm -rfv ${TMP_DIR}' EXIT - - # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python \ - EXECUTORCH_BUILD_PYBIND=ON \ - CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \ - ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh cmake - - # Install llama3_2_vision dependencies. - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - ./examples/models/llama3_2_vision/install_requirements.sh - - # Run pytest with coverage - ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml - # Run gtest - LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ - ${CONDA_RUN} test/run_oss_cpp_tests.sh + # This is needed to get the prebuilt PyTorch wheel from S3 + ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21 + .ci/scripts/unittest-macos.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}" diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 8bebc7be1bc..09a6453094f 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -2,7 +2,7 @@ name: android-perf on: schedule: - - cron: 0 0 * * * + - cron: 0 0,8,16 * * * pull_request: paths: - .github/workflows/android-perf.yml @@ -20,7 +20,7 @@ on: description: Models to be benchmarked required: false type: string - default: stories110M + default: llama devices: description: Target devices to run benchmark required: false @@ -36,7 +36,7 @@ on: description: Models to be benchmarked required: false type: string - default: stories110M + default: llama devices: description: Target devices to run benchmark required: false @@ -181,7 +181,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh @@ -356,16 +356,26 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - export ANDROID_ABIS="arm64-v8a" - PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON 
QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+          mkdir -p aar-out
+          PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash scripts/build_android_library.sh
+          mkdir -p extension/benchmark/android/benchmark/app/libs
+          cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs
+          pushd extension/benchmark/android/benchmark
+          ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest
+          popd
+          MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench"
+          mkdir -p "${MINIBENCH_APP_DIR}"
+          cp extension/benchmark/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}"
+          cp extension/benchmark/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}"

   # Let's see how expensive this job is, we might want to tone it down by running it periodically
+  # CHANGE IF this job name 'benchmark-on-device' changes: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     permissions:
@@ -392,6 +402,7 @@ jobs:
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml
+      new-output-format-flag: true

   upload-benchmark-results:
     needs:
@@ -451,6 +462,8 @@ jobs:
     - name: Extract the benchmark results JSON
       shell: bash
+      env:
+        DEVICE_TYPE: android
       run: |
         set -eux
@@ -462,29 +475,15 @@ jobs:
           ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
             --artifacts "${ARTIFACTS_BY_JOB}" \
             --output-dir benchmark-results \
-            --repo ${{ github.repository }} \
-            --head-branch ${{ github.head_ref || github.ref_name }} \
-            --workflow-name "${{ github.workflow }}" \
-            --workflow-run-id ${{ github.run_id }} \
-            --workflow-run-attempt ${{ github.run_attempt }} \
+            --app "${DEVICE_TYPE}" \
            --benchmark-configs benchmark-configs
         done

-        for SCHEMA in v2 v3; do
-          for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
-            cat "${BENCHMARK_RESULTS}"
-            echo
-          done
+        for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do
+          cat "${BENCHMARK_RESULTS}"
+          echo
         done

-    # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
-    - name: Upload the benchmark results (v2)
-      uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
-      with:
-        benchmark-results-dir: benchmark-results/v2
-        dry-run: false
-        schema-version: v2
-
     - name: Upload the benchmark results (v3)
       uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
       with:
diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml
index d204e121ffa..24aa6c1ad27 100644
--- a/.github/workflows/android-release-artifacts.yml
+++ b/.github/workflows/android-release-artifacts.yml
@@ -49,13 +49,17 @@ jobs:
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
-      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
+      PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool
buck2 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} + # Build AAR Package + mkdir aar-out + export BUILD_AAR_DIR=aar-out + bash scripts/build_android_library.sh + mkdir -p "${ARTIFACTS_DIR_NAME}" + cp aar-out/executorch.aar "${ARTIFACTS_DIR_NAME}/executorch.aar" - shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar" + shasum -a 256 "${ARTIFACTS_DIR_NAME}/executorch.aar" upload-release-aar: name: upload-release-aar @@ -74,7 +78,7 @@ jobs: - name: Upload AAR RC to AWS S3 shell: bash run: | - wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/executorch.aar + wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/executorch.aar shasum -a 256 executorch.aar > executorch.aar.sha256sums pip install awscli==1.32.18 diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index ea88be441cb..6e75aaf4f85 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -20,7 +20,7 @@ on: description: Models to be benchmarked required: false type: string - default: stories110M + default: llama devices: description: Target devices to run benchmark required: false @@ -36,7 +36,7 @@ on: description: Models to be benchmarked required: false type: string - default: stories110M + default: llama devices: description: Target devices to run benchmark required: false @@ -181,7 +181,7 @@ jobs: BUILD_TOOL=cmake # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" if [[ ${{ matrix.config }} == *"coreml"* ]]; then PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ @@ -353,7 +353,7 @@ jobs: with: runner: macos-latest-xlarge python-version: '3.11' - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} upload-artifact: ios-apps secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD @@ -367,7 +367,7 @@ jobs: BUILD_TOOL=cmake # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Setup Apple certificate for iOS development @@ -386,25 +386,8 @@ jobs: echo "::endgroup::" echo "::group::Build ExecuTorch iOS frameworks" - FRAMEWORKS=( - "executorch" - "backend_coreml" - "backend_mps" - "backend_xnnpack" - "kernels_custom" - "kernels_optimized" - "kernels_portable" - "kernels_quantized" - ) - - # Build Release iOS Frameworks PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack - - mkdir -p extension/benchmark/apple/Benchmark/Frameworks - for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( - cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/benchmark/apple/Benchmark/Frameworks/ - ) done + scripts/build_apple_frameworks.sh --Release --Debug --coreml --custom --mps --optimized --portable --quantized --xnnpack echo "::endgroup::" # NB: Although exported models can be copied to this directory 
and bundled together with the
@@ -413,7 +396,7 @@
       # create the directory here to pass the build
       mkdir -p extension/benchmark/apple/Benchmark/Models
       ${CONDA_RUN} --no-capture-output \
-        build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+        scripts/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME}

   upload-benchmark-app:
     needs: build-benchmark-app
@@ -442,6 +425,7 @@
       if-no-files-found: ignore
       path: ${{ runner.temp }}/artifacts/

+  # CHANGE IF this job name 'benchmark-on-device' changes: extract_model_info() in executorch/.github/scripts/extract_benchmark_results.py
   benchmark-on-device:
     if: always()
     needs:
@@ -470,6 +454,7 @@
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
       test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/default-ios-device-farm-appium-test-spec.yml
+      new-output-format-flag: true

   upload-benchmark-results:
     needs:
@@ -527,6 +512,8 @@
     - name: Extract the benchmark results JSON
       shell: bash
+      env:
+        DEVICE_TYPE: ios
       run: |
         set -eux
@@ -538,29 +525,15 @@
          ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
            --artifacts "${ARTIFACTS_BY_JOB}" \
            --output-dir benchmark-results \
-           --repo ${{ github.repository }} \
-           --head-branch ${{ github.head_ref || github.ref_name }} \
-           --workflow-name "${{ github.workflow }}" \
-           --workflow-run-id ${{ github.run_id }} \
-           --workflow-run-attempt ${{ github.run_attempt }} \
+           --app "${DEVICE_TYPE}" \
           --benchmark-configs benchmark-configs
        done

-        for SCHEMA in v2 v3; do
-          for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
-            cat "${BENCHMARK_RESULTS}"
-            echo
-          done
+        for BENCHMARK_RESULTS in benchmark-results/v3/*.json; do
+          cat "${BENCHMARK_RESULTS}"
+          echo
         done

-    # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
-    - name: Upload the benchmark results (v2)
-      uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
-      with:
-        benchmark-results-dir: benchmark-results/v2
-        dry-run: false
-        schema-version: v2
-
     - name: Upload the benchmark results (v3)
       uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
       with:
diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml
index 8349ddb4192..214d4f13fc8 100644
--- a/.github/workflows/apple.yml
+++ b/.github/workflows/apple.yml
@@ -11,10 +11,10 @@ on:
       - .github/workflows/apple.yml
       - install_executorch.sh
       - backends/apple/**
-      - build/build_apple_frameworks.sh
-      - build/build_apple_llm_demo.sh
-      - build/create_frameworks.sh
-      - build/test_ios_ci.sh
+      - scripts/build_apple_frameworks.sh
+      - scripts/build_apple_llm_demo.sh
+      - scripts/create_frameworks.sh
+      - .ci/scripts/test_ios_ci.sh
       - examples/demo-apps/apple_ios/**
       - extension/apple/**
       - extension/benchmark/apple/**
@@ -49,7 +49,7 @@ jobs:
     with:
       runner: macos-latest-xlarge
       python-version: '3.11'
-      submodules: 'true'
+      submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
@@ -69,13 +69,13 @@ jobs:
       # Setup MacOS dependencies as there is no Docker support on MacOS atm
       GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
-
.ci/scripts/setup-macos.sh "${BUILD_TOOL}" + .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Build and test iOS Demo App PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/test_ios_ci.sh "${ARTIFACTS_DIR_NAME}" + .ci/scripts/test_ios_ci.sh "${ARTIFACTS_DIR_NAME}" # Upload the test demo app to S3 upload-demo-ios: @@ -136,7 +136,7 @@ jobs: with: runner: macos-latest-xlarge python-version: '3.11' - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} upload-artifact: executorch-frameworks-ios timeout: 90 @@ -160,7 +160,7 @@ jobs: # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Install CoreML Backend Requirements PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ @@ -170,24 +170,14 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/mps/install_requirements.sh - # Build Release iOS Frameworks + # Build iOS Frameworks PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + scripts/build_apple_frameworks.sh --Release --Debug --coreml --custom --mps --optimized --portable --quantized --xnnpack - # Bundle Release iOS Frameworks + # Bundle iOS Frameworks for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( cd cmake-out && \ zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}-${VERSION}.zip" "${FRAMEWORK}.xcframework" - ) done - - # Build Debug iOS Frameworks - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack --Debug - - # Bundle Debug iOS Frameworks - for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( - cd cmake-out && \ - mv "${FRAMEWORK}.xcframework" "${FRAMEWORK}_debug.xcframework" && \ zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}_debug-${VERSION}.zip" "${FRAMEWORK}_debug.xcframework" ) done @@ -289,7 +279,7 @@ jobs: with: runner: macos-latest-xlarge python-version: '3.11' - submodules: 'true' + submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} upload-artifact: ios-benchmark-app secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD @@ -303,7 +293,7 @@ jobs: BUILD_TOOL=cmake # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Setup Apple certificate for iOS development @@ -322,29 +312,12 @@ jobs: echo "::endgroup::" echo "::group::Build ExecuTorch iOS frameworks" - FRAMEWORKS=( - "executorch" - "backend_coreml" - "backend_mps" - "backend_xnnpack" - "kernels_custom" - "kernels_optimized" - "kernels_portable" - "kernels_quantized" - ) - - # Build Release iOS Frameworks PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack - - mkdir -p extension/benchmark/apple/Benchmark/Frameworks - for FRAMEWORK in "${FRAMEWORKS[@]}"; 
do ( - cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/benchmark/apple/Benchmark/Frameworks/ - ) done + scripts/build_apple_frameworks.sh --Release --Debug --coreml --custom --mps --optimized --portable --quantized --xnnpack echo "::endgroup::" echo "::group::Build ExecuTorch benchmark app" mkdir -p extension/benchmark/apple/Benchmark/Models ${CONDA_RUN} --no-capture-output \ - build/build_apple_llm_demo.sh "${ARTIFACTS_DIR_NAME}" + scripts/build_apple_llm_demo.sh "${ARTIFACTS_DIR_NAME}" echo "::endgroup::" diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index 75f2c13fa83..62d8728b8a5 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -4,7 +4,7 @@ name: Build Linux Wheels on: pull_request: paths: - - build/packaging/** + - .ci/**/* - .github/workflows/build-wheels-linux.yml push: branches: @@ -39,9 +39,9 @@ jobs: matrix: include: - repository: pytorch/executorch - pre-script: build/packaging/pre_build_script.sh - post-script: build/packaging/post_build_script.sh - smoke-test-script: build/packaging/smoke_test.py + pre-script: .ci/scripts/wheel/pre_build_script.sh + post-script: .ci/scripts/wheel/post_build_script.sh + smoke-test-script: .ci/scripts/wheel/test_linux.py package-name: executorch name: ${{ matrix.repository }} uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main @@ -51,11 +51,8 @@ jobs: test-infra-repository: pytorch/test-infra test-infra-ref: main build-matrix: ${{ needs.generate-matrix.outputs.matrix }} - # ExecuTorch only needs the first layer of submodules; override the - # "recursive" default to do less work, and to give the buck daemon fewer - # files to look at. - submodules: true - env-var-script: build/packaging/env_var_script_linux.sh + submodules: recursive + env-var-script: .ci/scripts/wheel/envvar_linux.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-macos.yml similarity index 75% rename from .github/workflows/build-wheels-m1.yml rename to .github/workflows/build-wheels-macos.yml index a160f5ab9b5..490f01a46ca 100644 --- a/.github/workflows/build-wheels-m1.yml +++ b/.github/workflows/build-wheels-macos.yml @@ -1,11 +1,11 @@ # From https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows -name: Build M1 Wheels +name: Build macOS Wheels on: pull_request: paths: - - build/packaging/** - - .github/workflows/build-wheels-m1.yml + - .ci/**/* + - .github/workflows/build-wheels-macos.yml push: branches: - nightly @@ -39,9 +39,9 @@ jobs: matrix: include: - repository: pytorch/executorch - pre-script: build/packaging/pre_build_script.sh - post-script: build/packaging/post_build_script.sh - smoke-test-script: build/packaging/smoke_test.py + pre-script: .ci/scripts/wheel/pre_build_script.sh + post-script: .ci/scripts/wheel/post_build_script.sh + smoke-test-script: .ci/scripts/wheel/test_macos.py package-name: executorch name: ${{ matrix.repository }} uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main @@ -51,12 +51,9 @@ jobs: test-infra-repository: pytorch/test-infra test-infra-ref: main build-matrix: ${{ needs.generate-matrix.outputs.matrix }} - # ExecuTorch only needs the first layer of submodules; override the - # "recursive" default to do less work, and to give the buck daemon fewer - # files to look at. 
- submodules: true + submodules: recursive delocate-wheel: false - env-var-script: build/packaging/env_var_script_m1.sh + env-var-script: .ci/scripts/wheel/envvar_macos.sh pre-script: ${{ matrix.pre-script }} post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 8d9081615be..b8b63078643 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -26,7 +26,7 @@ jobs: with: job-name: Build doc runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 + docker-image: executorch-ubuntu-22.04-clang12-android submodules: 'true' repository: pytorch/executorch upload-artifact: docs @@ -38,7 +38,7 @@ jobs: BUILD_TOOL=${{ matrix.build-tool }} # Setup dependencies as there is no Docker support - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" if [[(${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then export CHANNEL=test @@ -68,6 +68,12 @@ jobs: make html cd .. + # Build javadoc: + cd extension/android + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:javaDocReleaseGeneration + cp -rf executorch_android/build/intermediates/java_doc_dir/release/javaDocReleaseGeneration "${RUNNER_DOCS_DIR}/javadoc" + cd ../.. + # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing. echo "GitHub Ref: ${GITHUB_REF}" if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then @@ -77,6 +83,7 @@ jobs: cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}" mv docs/_build/html "${RUNNER_ARTIFACT_DIR}" + cp -rf "${RUNNER_DOCS_DIR}"/javadoc "${RUNNER_ARTIFACT_DIR}"/html ls -R "${RUNNER_ARTIFACT_DIR}"/*/*.html diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 73af9842a20..791a52b96c1 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -27,19 +27,25 @@ env: jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge] timeout-minutes: 240 strategy: fail-fast: false matrix: + runner: [linux.2xlarge] + docker-image-name: [ + executorch-ubuntu-22.04-gcc9, + executorch-ubuntu-22.04-clang12, + executorch-ubuntu-22.04-linter, + executorch-ubuntu-22.04-arm-sdk, + executorch-ubuntu-22.04-qnn-sdk, + executorch-ubuntu-22.04-mediatek-sdk, + executorch-ubuntu-22.04-clang12-android + ] include: - - docker-image-name: executorch-ubuntu-22.04-gcc9 - - docker-image-name: executorch-ubuntu-22.04-clang12 - - docker-image-name: executorch-ubuntu-22.04-linter - - docker-image-name: executorch-ubuntu-22.04-arm-sdk - - docker-image-name: executorch-ubuntu-22.04-qnn-sdk - - docker-image-name: executorch-ubuntu-22.04-mediatek-sdk - - docker-image-name: executorch-ubuntu-22.04-clang12-android + - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64 + runner: linux.arm64.2xlarge + + runs-on: [self-hosted, "${{ matrix.runner }}"] env: DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/executorch/${{ matrix.docker-image-name }} steps: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 53d2bd7910b..5a3c8595c72 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -35,7 +35,7 @@ jobs: # For mypy linting, we need to first install executorch first so that # it builds the python package information. 
BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" CACHE_DIRECTORY="/tmp/.lintbin" # Try to recover the cached binaries @@ -76,8 +76,8 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 script: | - FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ - examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ + FILES_NEEDS_FORMAT=$(/opt/google-java-format -n \ + extension/android/executorch_android/src/main/java/org/pytorch/executorch/*.java \ examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) if [ -n "$FILES_NEEDS_FORMAT" ]; then diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 6b4644bb522..fa3fa6e1cd2 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -63,6 +63,6 @@ jobs: BACKEND=${{ matrix.backend }} DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Build and test ExecuTorch PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index fac23197891..81948e4e827 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -13,24 +13,6 @@ concurrency: cancel-in-progress: true jobs: - gather-models: - runs-on: ubuntu-22.04 - outputs: - models: ${{ steps.gather-models.outputs.models }} - steps: - - uses: actions/checkout@v3 - with: - submodules: 'false' - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Extract the list of models to test - id: gather-models - run: | - set -eux - - PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "${GITHUB_EVENT_NAME}" - test-setup-linux-gcc: name: test-setup-linux-gcc uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -52,41 +34,109 @@ jobs: BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Build and test ExecuTorch with the add model on portable backend. 
PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable"

+  test-models-linux-basic:
+    name: test-models-linux-basic
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        model: [mv3, vit]
+        backend: [portable, xnnpack-quantization-delegation]
+        build-tool: [cmake, buck2]
+        runner: [linux.2xlarge, linux.arm64.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
+        # Excluding specific runner + docker image combinations that don't make sense:
+        #   - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge)
+        #   - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge)
+        exclude:
+          - runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+          - runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
+          # TODO: Need to figure out why buck2 doesn't work on Graviton instances.
+          - runner: linux.arm64.2xlarge
+            build-tool: buck2
+      fail-fast: false
+    with:
+      runner: ${{ matrix.runner }}
+      docker-image: ${{ matrix.docker-image }}
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        MODEL_NAME=${{ matrix.model }}
+        BUILD_TOOL=${{ matrix.build-tool }}
+        BACKEND=${{ matrix.backend }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
+        # Build and test ExecuTorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
+
   test-models-linux:
     name: test-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
-    needs: gather-models
     strategy:
-      matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
+      matrix:
+        model: [linear, add, add_mul, ic3, mv2, resnet18, resnet50, mobilebert, emformer_transcribe]
+        backend: [portable, xnnpack-quantization-delegation]
+        runner: [linux.2xlarge]
+        include:
+          - model: ic4
+            backend: portable
+            runner: linux.4xlarge.memory
+          - model: ic4
+            backend: xnnpack-quantization-delegation
+            runner: linux.4xlarge.memory
+          - model: emformer_join
+            backend: portable
+            runner: linux.4xlarge.memory
+          - model: emformer_join
+            backend: xnnpack-quantization-delegation
+            runner: linux.4xlarge.memory
+          - model: phi-4-mini
+            backend: portable
+            runner: linux.4xlarge.memory
+          - model: llama3_2_vision_encoder
+            backend: portable
+            runner: linux.4xlarge.memory
+          - model: w2l
+            backend: portable
+            runner: linux.4xlarge.memory
       fail-fast: false
     with:
       runner: ${{ matrix.runner }}
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: ${{ matrix.timeout }}
+      timeout: 90
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"

         MODEL_NAME=${{ matrix.model }}
-        BUILD_TOOL=${{ matrix.build-tool }}
+        BUILD_TOOL=cmake
         BACKEND=${{ matrix.backend }}
-        DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}

-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        PYTHON_EXECUTABLE=python bash
.ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" test-llama-runner-linux: + # Test Both linux x86 and linux aarch64 name: test-llama-runner-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: @@ -95,21 +145,29 @@ jobs: strategy: matrix: dtype: [fp32] - mode: [portable, xnnpack+custom, xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv] + mode: [xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] include: - - dtype: bf16 - mode: portable - dtype: bf16 mode: custom + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 fail-fast: false with: - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 + runner: ${{ matrix.runner }} + docker-image: ${{ matrix.docker-image }} submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 - upload-artifact: android-models - upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -122,7 +180,7 @@ jobs: ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 @@ -148,7 +206,7 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_executorch "use-pt-pinned-commit" + install_executorch "--use-pt-pinned-commit" BUILD_TOOL="cmake" PYTHON_EXECUTABLE=python \ bash .ci/scripts/build_llama_android.sh "${BUILD_TOOL}" @@ -173,7 +231,7 @@ jobs: conda activate "${CONDA_ENV}" BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Test custom ops PYTHON_EXECUTABLE=python bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" @@ -197,7 +255,7 @@ jobs: conda activate "${CONDA_ENV}" BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" @@ -220,7 +278,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # 
install Llava requirements bash examples/models/llama/install_requirements.sh @@ -232,6 +290,36 @@ jobs: # run e2e (export, tokenizer and runner) PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + test-moshi-linux: + name: test-moshi-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + + # install Mimi requirements + bash examples/models/moshi/mimi/install_requirements.sh + + # reinstall executorch + bash ./install_executorch.sh + + # run python unittest + python -m unittest examples.models.moshi.mimi.test_mimi + test-quantized-aot-lib-linux: name: test-quantized-aot-lib-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -252,7 +340,7 @@ jobs: conda activate "${CONDA_ENV}" BUILD_TOOL="cmake" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" PYTHON_EXECUTABLE=python bash examples/xnnpack/quantization/test_quantize.sh "${BUILD_TOOL}" mv2 test-pybind-build-linux: @@ -279,7 +367,7 @@ jobs: PYTHON_EXECUTABLE=python \ EXECUTORCH_BUILD_XNNPACK=ON \ EXECUTORCH_BUILD_PYBIND=ON \ - bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # see if we can import the module successfully python -c "from executorch.extension.pybindings import portable_lib; print('success!')" @@ -303,6 +391,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + ./install_requirements.sh --use-pt-pinned-commit # build module for executorch.extension.pybindings.portable_lib bash test/build_size_test.sh strip cmake-out/test/size_test @@ -338,6 +427,8 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + ./install_requirements.sh --use-pt-pinned-commit + # build module for executorch.extension.pybindings.portable_lib bash test/build_size_test.sh strip cmake-out/test/size_test @@ -359,7 +450,6 @@ jobs: permissions: id-token: write contents: read - needs: test-llama-runner-linux unittest: uses: ./.github/workflows/_unittest.yml @@ -367,6 +457,29 @@ jobs: id-token: write contents: read with: + build-mode: Debug + build-tool: cmake + docker-image: executorch-ubuntu-22.04-clang12 + + unittest-editable: + uses: ./.github/workflows/_unittest.yml + permissions: + id-token: write + contents: read + with: + build-mode: Debug + build-tool: cmake + editable: true + docker-image: executorch-ubuntu-22.04-clang12 + + unittest-buck: + uses: ./.github/workflows/_unittest.yml + permissions: + id-token: write + contents: read + with: + build-mode: Debug + build-tool: buck2 docker-image: executorch-ubuntu-22.04-clang12 unittest-arm: @@ -393,7 +506,7 @@ jobs: PYTHON_EXECUTABLE=python \ EXECUTORCH_BUILD_PYBIND=ON \ EXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Install Arm dependencies 
.ci/scripts/setup-arm-baremetal-tools.sh @@ -429,11 +542,12 @@ jobs: MODE=${{ matrix.mode }} PT2E_QUANTIZE=${{ matrix.pt2e_quantize }} + ./install_requirements.sh --use-pt-pinned-commit PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 @@ -460,15 +574,16 @@ jobs: BUILD_TOOL="cmake" + ./install_requirements.sh --use-pt-pinned-commit PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Setup install_requirements for llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - + # Test static llama weight sharing and accuracy PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh @@ -513,7 +628,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # install phi-3-mini requirements bash examples/models/phi-3-mini/install_requirements.sh @@ -540,7 +655,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # install llama requirements bash examples/models/llama/install_requirements.sh @@ -567,7 +682,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # install llama requirements bash examples/models/llama/install_requirements.sh @@ -594,7 +709,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # install llama requirements bash examples/models/llama/install_requirements.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 18e34bff72a..ecae932b74f 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -18,48 +18,85 @@ concurrency: cancel-in-progress: true jobs: - gather-models: - runs-on: ubuntu-22.04 - outputs: - models: ${{ steps.gather-models.outputs.models }} - steps: - - uses: actions/checkout@v3 - with: - submodules: 'false' - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Extract the list of models to test - id: gather-models - run: | - set -eux - - PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --target-os macos --event "${GITHUB_EVENT_NAME}" - test-models-macos: name: test-models-macos uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - needs: gather-models strategy: - matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + matrix: + # Mac runners are expensive and 
limited, and not always reliable. + # Do some basic testing in the macOS jobs, and rely mostly on + # the test-models-linux-aarch64 job instead. + model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l] + backend: [xnnpack-quantization-delegation] + include: + - model: efficient_sam + backend: portable + - model: llama + backend: portable + - model: llama3_2_vision_encoder + backend: portable + - model: mv3 + backend: portable fail-fast: false with: - runner: ${{ matrix.runner }} + runner: macos-m1-stable python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: ${{ matrix.timeout }} + timeout: 90 script: | MODEL_NAME=${{ matrix.model }} - BUILD_TOOL=${{ matrix.build-tool }} + BUILD_TOOL=cmake BACKEND=${{ matrix.backend }} - DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }} bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Build and test executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + + test-models-linux-aarch64: + name: test-models-linux-aarch64 + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + matrix: + model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe] + backend: [portable, xnnpack-quantization-delegation] + include: + - model: lstm + backend: portable + - model: mul + backend: portable + - model: softmax + backend: portable + - model: phi-4-mini + backend: portable + - model: qwen2_5 + backend: portable + - model: llama3_2_vision_encoder + backend: portable + fail-fast: false + with: + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + MODEL_NAME=${{ matrix.model }} + BUILD_TOOL="cmake" + BACKEND=${{ matrix.backend }} + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Build and test ExecuTorch + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" test-custom-ops-macos: name: test-custom-ops-macos @@ -79,7 +116,7 @@ jobs: bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Build and test custom ops PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}" @@ -101,7 +138,7 @@ jobs: bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash 
.ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Build and test selective build PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" @@ -128,7 +165,7 @@ jobs: conda activate "${CONDA_ENV}" BUILD_TOOL=${{ matrix.build-tool }} - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Test selective build PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}" @@ -139,7 +176,7 @@ jobs: id-token: write contents: read with: - runner: linux.2xlarge + runner: linux.2xlarge.memory docker-image: executorch-ubuntu-22.04-arm-sdk submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -150,7 +187,7 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_executorch "use-pt-pinned-commit" + install_executorch "--use-pt-pinned-commit" .ci/scripts/setup-arm-baremetal-tools.sh @@ -159,7 +196,7 @@ jobs: sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024 # Test ethos-u delegate examples with run.sh - backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp + backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp test-arm-reference-delegation: @@ -169,7 +206,7 @@ jobs: id-token: write contents: read with: - runner: linux.2xlarge + runner: linux.2xlarge.memory docker-image: executorch-ubuntu-22.04-arm-sdk submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -180,7 +217,7 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_executorch "use-pt-pinned-commit" + install_executorch "--use-pt-pinned-commit" .ci/scripts/setup-arm-baremetal-tools.sh @@ -191,7 +228,7 @@ jobs: name: test-coreml-delegate uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-13-xlarge + runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -201,7 +238,7 @@ jobs: bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm - GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # Build and test coreml delegate PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh @@ -224,23 +261,119 @@ jobs: # build module for executorch.extension.pybindings.portable_lib BUILD_TOOL=${{ matrix.build-tool }} - EXECUTORCH_BUILD_PYBIND=ON PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + EXECUTORCH_BUILD_PYBIND=ON PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # see if we can import the module successfully ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')" - test-llama-runner-macos: - name: test-llama-runner-mac + test-static-llama-ane: + name: test-static-llama-ane uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} sh install_requirements.sh + ${CONDA_RUN} sh backends/apple/coreml/scripts/install_requirements.sh + ${CONDA_RUN} python install_executorch.py --pybind coreml + ${CONDA_RUN} sh examples/models/llama/install_requirements.sh + + # Test ANE llama + ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh + + test-llama-torchao-lowbit: + name: test-llama-torchao-lowbit + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} sh examples/models/llama/install_requirements.sh + + # Run test + ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh + + test-llama-runner-linux: + # Test both Linux x86 and Linux aarch64 + name: test-llama-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read strategy: matrix: dtype: [fp32] - mode: [portable, xnnpack+kv+custom, mps, coreml, xnnpack+custom+quantize_kv] + mode: [portable, xnnpack+custom] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] include: - dtype: bf16 mode: portable + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + - dtype: bf16 + mode: portable + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - dtype: bf16 mode: custom + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + # Excluding specific runner + docker image combinations that don't make sense: + # - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge) + # - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge) + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + fail-fast: false + with: + runner: ${{ matrix.runner }} + docker-image: ${{ matrix.docker-image }} + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 900 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + DTYPE=${{ matrix.dtype }} + BUILD_TOOL="cmake" + MODE=${{ matrix.mode }} + ARTIFACTS_DIR_NAME="artifacts-to-be-uploaded/${DTYPE}-${MODE}" + ARTIFACTS_DIR_NAME="${ARTIFACTS_DIR_NAME/+/-}" + + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + # Test llama2 + PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}" + + test-llama-runner-macos: + name: test-llama-runner-mac + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + matrix: + dtype: [fp32] + 
mode: [mps, coreml, xnnpack+custom+quantize_kv] fail-fast: false with: runner: macos-m1-stable @@ -256,7 +389,7 @@ jobs: bash .ci/scripts/setup-conda.sh # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool cmake if [[ "${MODE}" == "mps" ]]; then # Install mps delegate @@ -290,7 +423,7 @@ jobs: # bash .ci/scripts/setup-conda.sh # # Setup MacOS dependencies as there is no Docker support on MacOS atm - # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" # # install Llava requirements # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh @@ -311,7 +444,7 @@ jobs: strategy: matrix: dtype: [fp32] - model: [dl3, mv3, mv2, ic4, ic3, vit] + model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l] fail-fast: false with: runner: linux.2xlarge @@ -323,7 +456,7 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" @@ -345,7 +478,7 @@ jobs: bash .ci/scripts/setup-conda.sh # Setup MacOS dependencies as there is no Docker support on MacOS atm - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh echo "Finishing installing coreml." PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh @@ -374,7 +507,13 @@ jobs: secrets: inherit strategy: matrix: - hf_model_repo: [google/gemma-2-2b] + hf_model_id: [ + google/gemma-2-2b, + Qwen/Qwen2.5-0.5B, + HuggingFaceTB/SmolLM2-135M, + meta-llama/Llama-3.2-1B, + allenai/OLMo-1B-hf + ] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN @@ -388,69 +527,40 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake - - echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" - rm -rf cmake-out - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out . 
- cmake --build cmake-out -j9 --target install --config Release - - echo "Build llama runner" - dir="examples/models/llama" - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out/${dir} \ - ${dir} - cmake --build cmake-out/${dir} -j9 --config Release + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake echo "::endgroup::" - echo "::group::Set up HuggingFace Dependencies" - if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then - echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." - exit 1 - fi + echo "::group::Set up Hugging Face" pip install -U "huggingface_hub[cli]" huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + git clone https://github.com/huggingface/optimum-executorch + cd optimum-executorch + # There is no release yet, for CI stability, always test from the same commit on main + git checkout 6a7e83f3eee2976fa809335bfb78a45b1ea1cb25 + pip install . pip install accelerate sentencepiece pip list echo "::endgroup::" - echo "::group::Export to ExecuTorch" - TOKENIZER_FILE=tokenizer.model - TOKENIZER_BIN_FILE=tokenizer.bin - ET_MODEL_NAME=et_model - DOWNLOADED_TOKENIZER_FILE_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${{ matrix.hf_model_repo }}" --files "${TOKENIZER_FILE}") - if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" ]; then - echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" - python -m extension.llm.tokenizer.tokenizer -t "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" -o ./${TOKENIZER_BIN_FILE} - ls ./tokenizer.bin - else - echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." 
- exit 1 - fi - - python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} - - cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + echo "::group::Export and Run ${{ matrix.hf_model_id }}" + # Pass matrix variable as environment variable + export MODEL_ID="${{ matrix.hf_model_id }}" + python -c " + import os + from optimum.executorch import ExecuTorchModelForCausalLM + from transformers import AutoTokenizer + + model_id = os.getenv('MODEL_ID') + print(f'Loading model: {model_id}') + model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack') + tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt='Simply put, the theory of relativity states that', + max_seq_len=64 + ) + print(generated_text) + " echo "::endgroup::" @@ -482,12 +592,23 @@ jobs: MODE=${{ matrix.mode }} PT2E_QUANTIZE=${{ matrix.pt2e_quantize }} + ./install_requirements.sh --use-pt-pinned-commit PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh # Setup executorch - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" + + unittest-release: + uses: ./.github/workflows/_unittest.yml + permissions: + id-token: write + contents: read + with: + build-mode: Release + build-tool: cmake + docker-image: executorch-ubuntu-22.04-clang12 diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index ab865fc1adf..e639c497549 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -12,7 +12,7 @@ concurrency: jobs: do_update_viablestrict: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 environment: ${{ (github.event_name == 'schedule') && 'update-viable-strict' || '' }} steps: - name: Update viable/strict diff --git a/.gitignore b/.gitignore index 7327aa2d1cb..3999e3b5dca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,41 +1,42 @@ +# System files +.DS_Store + +# Python environment and cache .hypothesis -buck-out/ .mypy_cache/ +.venv/ +__pycache__/ + +# Build and tool-generated files +arm_test/ +buck-out/ buck2-bin/ -cmake-out* -.DS_Store +build/ cmake-android-out/ -cmake-out-android/ cmake-ios-out/ +cmake-out* +cmake-out-android/ +dist/ ethos-u-scratch/ executorch.egg-info pip-out/ -__pycache__/ # Any exported models and profiling outputs -*.pte -*.model -!test_tiktoken_tokenizer.model *.bin +*.model +*.pte !test_bpe_tokenizer.bin +!test_tiktoken_tokenizer.model # Editor temporaries -*.swa -*.swb -*.swc -*.swd -*.swe -*.swf -*.swg -*.swh -*.swi -*.swj -*.swk -*.swl -*.swm -*.swn -*.swo -*.swp +*.idea +*.sw[a-z] *~ .~lock.* -*.idea + +# Xcode +xcuserdata/ +.build/ +.swiftpm/ +*.xcworkspace/ +*.xcframework/ diff --git a/.gitmodules b/.gitmodules index f7da7e771fb..f0b9dc6c107 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,15 +28,9 @@ [submodule "backends/xnnpack/third-party/pthreadpool"] path = 
backends/xnnpack/third-party/pthreadpool url = https://github.com/Maratyszcza/pthreadpool.git -[submodule "extension/llm/third-party/abseil-cpp"] - path = extension/llm/third-party/abseil-cpp - url = https://github.com/abseil/abseil-cpp.git -[submodule "extension/llm/third-party/re2"] - path = extension/llm/third-party/re2 - url = https://github.com/google/re2.git -[submodule "extension/llm/third-party/sentencepiece"] - path = extension/llm/third-party/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "extension/llm/tokenizers"] + path = extension/llm/tokenizers + url = https://github.com/pytorch-labs/tokenizers.git [submodule "kernels/optimized/third-party/eigen"] path = kernels/optimized/third-party/eigen url = https://gitlab.com/libeigen/eigen.git @@ -70,3 +64,6 @@ [submodule "third-party/pocketfft"] path = third-party/pocketfft url = https://github.com/mreineck/pocketfft +[submodule "shim"] + path = shim + url = https://github.com/facebook/buck2-shims-meta diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..842b4b1c6cb 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -75,6 +75,7 @@ exclude_patterns = [ 'examples/apple/**', 'examples/demo-apps/apple_ios/**', 'examples/demo-apps/react-native/rnllama/ios/**', + 'extension/apple/**', # File contains @generated 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', @@ -218,6 +219,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', @@ -340,3 +343,29 @@ init_command = [ '--dry-run={{DRYRUN}}', '--requirement=requirements-lintrunner.txt', ] + +[[linter]] +code = 'LICENSELINT' +include_patterns = [ + '**/*', +] +exclude_patterns = [ + '**/fb/**', + '.lintrunner.toml', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'grep_linter', + '--pattern=Confidential and proprietary', + '--linter-name=LICENSELINT', + '--error-name=Wrong license', + """--error-description=\ + Code contributed to ExecuTorch open source repo should have \ + BSD-license header \ + """, + '--', + '@{{PATHSFILE}}', +] diff --git a/CMakeLists.txt b/CMakeLists.txt index 7951adc5cda..c88e1743b83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,9 +42,9 @@ # It should also be cmake-lint clean. 
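The LICENSELINT rule added to .lintrunner.toml above is, at heart, a plain grep: any tracked file containing the internal-license marker is flagged, because code contributed to the open-source repo should carry the BSD-style header instead. As a rough standalone sketch of what that grep amounts to (the real check runs through lintrunner_adapters' grep_linter; the file-list CLI below is a hypothetical stand-in):

```python
#!/usr/bin/env python3
# Minimal sketch of the LICENSELINT rule: scan the given files and flag any
# line containing the internal-license marker. Illustrative only; the real
# check is driven by lintrunner via lintrunner_adapters' grep_linter.
import sys

PATTERN = "Confidential and proprietary"

def lint(paths):
    status = 0
    for path in paths:
        try:
            with open(path, encoding="utf-8", errors="ignore") as f:
                for lineno, line in enumerate(f, start=1):
                    if PATTERN in line:
                        # Mirrors the linter's error name and description above.
                        print(f"{path}:{lineno}: Wrong license (LICENSELINT)")
                        status = 1
        except OSError as err:
            print(f"skipping {path}: {err}", file=sys.stderr)
    return status

if __name__ == "__main__":
    sys.exit(lint(sys.argv[1:]))
```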
# -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) project(executorch) -include(build/Utils.cmake) +include(tools/cmake/Utils.cmake) include(CMakeDependentOption) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -250,14 +250,20 @@ cmake_dependent_option( "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF ) +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) + set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON) + set(EXECUTORCH_BUILD_EXTENSION_MODULE ON) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) +endif() + if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) endif() -if(EXECUTORCH_BUILD_EXTENSION_TRAINING) - set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) +if(EXECUTORCH_BUILD_EXTENSION_MODULE) set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON) - set(EXECUTORCH_BUILD_EXTENSION_MODULE ON) + set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON) endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) @@ -456,7 +462,7 @@ endif() # tools like `flatc`, along with example executables like `executor_runner` and # libraries that it uses, like `gflags`. Disabling this can be helpful when # cross-compiling, but some required tools that would have been built need to be -# provided directly (via, for example, FLATC_EXECUTABLE). +# provided directly. cmake_dependent_option( EXECUTORCH_BUILD_HOST_TARGETS "Build host-only targets." ON "NOT CMAKE_TOOLCHAIN_IOS" OFF @@ -467,9 +473,19 @@ cmake_dependent_option( # cmake_dependent_option( EXECUTORCH_BUILD_FLATC "Build the flatc executable." ON - "NOT FLATC_EXECUTABLE;EXECUTORCH_BUILD_HOST_TARGETS" OFF + "NOT FLATC_EXECUTABLE" OFF ) +set(FLATBUFFERS_BUILD_FLATC OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_FLATLIB OFF CACHE BOOL "") +set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "") +set(FLATBUFFERS_INSTALL OFF CACHE BOOL "") +# exir lets users set the alignment of tensor data embedded in the flatbuffer, +# and some users need an alignment larger than the default, which is typically +# 32. +set(FLATBUFFERS_MAX_ALIGNMENT 1024) + if(EXECUTORCH_BUILD_FLATC) if(FLATC_EXECUTABLE) # We could ignore this, but it could lead to confusion about which `flatc` @@ -478,41 +494,54 @@ if(EXECUTORCH_BUILD_FLATC) FATAL_ERROR "May not set both EXECUTORCH_BUILD_FLATC and FLATC_EXECUTABLE" ) endif() - set(FLATC_EXECUTABLE flatc) - set(FLATBUFFERS_BUILD_FLATC - ON - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_FLATHASH - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_FLATLIB - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_BUILD_TESTS - OFF - CACHE BOOL "" - ) - set(FLATBUFFERS_INSTALL - OFF - CACHE BOOL "" - ) - add_subdirectory(third-party/flatbuffers) - # exir lets users set the alignment of tensor data embedded in the flatbuffer, - # and some users need an alignment larger than the default, which is typically - # 32. - target_compile_definitions(flatc PRIVATE FLATBUFFERS_MAX_ALIGNMENT=1024) + # Build flatc for the *host* to generate files as part of the build step. 
+ include(ExternalProject) + ExternalProject_Add( + flatbuffers + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/third-party/flatbuffers + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/flatbuffers + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third-party/flatbuffers + CMAKE_ARGS -DFLATBUFFERS_BUILD_FLATC=ON + -DFLATBUFFERS_BUILD_FLATHASH=${FLATBUFFERS_BUILD_FLATHASH} + -DFLATBUFFERS_BUILD_FLATLIB=${FLATBUFFERS_BUILD_FLATLIB} + -DFLATBUFFERS_BUILD_TESTS=${FLATBUFFERS_BUILD_TESTS} + -DFLATBUFFERS_INSTALL=${FLATBUFFERS_INSTALL} + -DCMAKE_CXX_FLAGS="-DFLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT}" + # If building for iOS, "unset" these variables to rely on the host (macOS) defaults. + $<$,$>>:-DCMAKE_OSX_SYSROOT=> + INSTALL_COMMAND "" + BUILD_BYPRODUCTS <BINARY_DIR>/flatc + ) + ExternalProject_Get_Property(flatbuffers BINARY_DIR) + if(WIN32) + # flatbuffers does not use CMAKE_BUILD_TYPE. Internally, the build forces Release + # config, but from CMake's perspective the build type is always Debug. + set(FLATC_EXECUTABLE ${BINARY_DIR}/$<CONFIG>/flatc.exe) + elseif(CMAKE_GENERATOR STREQUAL "Xcode") + set(FLATC_EXECUTABLE ${BINARY_DIR}/$<CONFIG>/flatc) + else() + set(FLATC_EXECUTABLE ${BINARY_DIR}/flatc) + endif() + set(FLATC_EXECUTABLE_BUILT_FROM_SOURCE YES) endif() + if(NOT FLATC_EXECUTABLE) message( - FATAL_ERROR - "FLATC_EXECUTABLE must be set when EXECUTORCH_BUILD_FLATC is disabled. " - "Note that EXECUTORCH_BUILD_FLATC may be disabled implicitly when " - "cross-compiling or when EXECUTORCH_BUILD_HOST_TARGETS is disabled." + WARNING "FLATC_EXECUTABLE not specified, looking for flatc" ) + find_program(FLATC_EXECUTABLE flatc) + + if(NOT FLATC_EXECUTABLE) + message(FATAL_ERROR "FLATC_EXECUTABLE must be set when EXECUTORCH_BUILD_FLATC is disabled.") + endif() +endif() + +add_executable(flatc IMPORTED GLOBAL) +set_target_properties(flatc PROPERTIES IMPORTED_LOCATION ${FLATC_EXECUTABLE}) + +if(FLATC_EXECUTABLE_BUILT_FROM_SOURCE) + add_dependencies(flatc flatbuffers) endif() # @@ -653,7 +682,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) -install(FILES build/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch) +install(FILES tools/cmake/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch) # # executor_runner: Host tool that demonstrates program execution. 
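Because flatc is now built as an ExternalProject for the host, the location of the resulting binary depends on the host platform and CMake generator, which is what the branching in the hunk above encodes. A rough Python illustration of that path selection (a hypothetical helper, not part of the build; the authoritative logic is the CMake above):

```python
# Where the ExternalProject drops the host flatc binary: multi-config
# generators (Visual Studio, Xcode) nest it under a per-configuration
# directory (the $<CONFIG> generator expression), while single-config
# generators (Makefiles, Ninja) do not. Hypothetical helper, for
# illustration only.
from pathlib import Path

def flatc_path(binary_dir, host_os, generator, config="Release"):
    base = Path(binary_dir)
    if host_os == "Windows":
        # Per the comment in the hunk above, flatbuffers forces a Release
        # build internally regardless of the CMake build type.
        return base / config / "flatc.exe"
    if generator == "Xcode":
        return base / config / "flatc"
    return base / "flatc"

print(flatc_path("cmake-out/third-party/flatbuffers", "Linux", "Ninja"))
# -> cmake-out/third-party/flatbuffers/flatc
```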
@@ -725,7 +754,6 @@ endif() if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize) endif() if(EXECUTORCH_BUILD_EXTENSION_LLM) @@ -750,7 +778,6 @@ endif() if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO - AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) endif() @@ -767,9 +794,7 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output - if(NOT TARGET torch) - find_package(Torch CONFIG REQUIRED) - endif() + find_package_torch() find_library( TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib" ) @@ -902,6 +927,14 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) list(APPEND _executor_runner_libs quantized_ops_lib) endif() + if(EXECUTORCH_BUILD_KERNELS_CUSTOM) + list(APPEND _executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>) + endif() + + if(EXECUTORCH_BUILD_XNNPACK) + list(APPEND _executor_runner_libs xnnpack_backend) + endif() + if(EXECUTORCH_ENABLE_EVENT_TRACER) if(EXECUTORCH_BUILD_DEVTOOLS) list(APPEND _executor_runner_libs etdump flatccrt) @@ -910,6 +943,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) endif() endif() + if(EXECUTORCH_BUILD_COREML) + list(APPEND _executor_runner_libs coremldelegate) + endif() + add_executable(executor_runner ${_executor_runner__srcs}) if(CMAKE_BUILD_TYPE STREQUAL "Release") if(APPLE) diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000000..014b8ed0fce --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,85 @@ +# IMPORTANT: +# This file is ONLY used to subscribe to notifications for PRs +# related to a specific file path. Approvals from people in this +# file are not required for merges. 
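To make the semantics concrete before the entries themselves: each CODEOWNERS line maps a path prefix to the handles subscribed to review notifications for PRs touching that path, and when several entries match, the last matching one wins. A simplified sketch of that lookup, using a few entries copied from the file below (GitHub's real matcher uses gitignore-style globs; plain prefix matching is enough to illustrate):

```python
# Simplified CODEOWNERS-style lookup. A match only subscribes people to
# review notifications; per the header comment, approvals from matched
# owners are not required for merges. GitHub's actual matching uses
# gitignore-style globs; this prefix-based sketch is illustrative only.
OWNERS = [
    ("/backends/apple", ["@shoumikhin", "@cccclai"]),
    ("/backends/apple/mps", ["@cccclai", "@DenisVieriu97"]),
    ("/kernels", ["@manuelcandales", "@swolchok"]),
]

def owners_for(path):
    matched = []
    for prefix, owners in OWNERS:
        if path == prefix or path.startswith(prefix + "/"):
            matched = owners  # the last matching entry takes precedence
    return matched

# The MPS-specific entry overrides the broader Apple-backend entry.
assert owners_for("/backends/apple/mps/runtime/MPSDevice.mm") == ["@cccclai", "@DenisVieriu97"]
```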
+ +/backends/apple @shoumikhin @cccclai +/backends/apple/mps @cccclai @DenisVieriu97 +/backends/arm @digantdesai +/backends/cadence @tarun292 +/backends/example @iseeyuan @JacobSzwejbka @larryliu0820 +/backends/mediatek @cccclai @neuropilot-captain +/backends/qualcomm @cccclai @chunit-quic @haowhsu-quic @shewu-quic @winskuo-quic +/backends/test @cccclai +/backends/transforms @kimishpatel +/backends/vulkan @SS-JIA +/backends/xnnpack @digantdesai @mcr229 + +/build @GregoryComer @kirklandsign + +/codegen @larryliu0820 @lucylq + +/devtools @tarun292 @Gasoonjia + +/docs @mergennachin + +/examples/apple @shoumikhin +/examples/apple/coreml @cccclai @metascroy @cymbalrush @YifanShenSZ +/examples/arm @digantdesai +/examples/cadence @tarun292 +/examples/demo-apps @shoumikhin @kirklandsign +/examples/devtools @tarun292 +/examples/llm_manual @larryliu0820 +/examples/llm_pte_finetuning @JacobSzwejbka +/examples/mediatek @cccclai +/examples/models @lucylq @jackzhxng +/examples/portable @larryliu0820 @manuelcandales +/examples/qualcomm @cccclai +/examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka +/examples/xnnpack @digantdesai @mcr229 + +/exir/backend @cccclai @kimishpatel @JacobSzwejbka @tarun292 +/exir @JacobSzwejbka @tarun292 @larryliu0820 + + +/extension/android @kirklandsign +/extension/android_test @kirklandsign +/extension/apple @shoumikhin +/extension/aten_util @JacobSzwejbka +/extension/benchmark @tarun292 +/extension/data_loader @JacobSzwejbka @lucylq +/extension/evalue_util @GregoryComer +/extension/export_util @kimishpatel +/extension/flat_tensor @lucylq +/extension/gguf_util @larryliu0820 +/extension/kernel_util @kimishpatel @manuelcandales @swolchok +/extension/llm @jackzhxng @iseeyuan @larryliu0820 @swolchok +/extension/memory_allocator @JacobSzwejbka @swolchok +/extension/module @shoumikhin +/extension/parallel @kimishpatel @swolchok +/extension/pybindings @JacobSzwejbka @larryliu0820 +/extension/pytree @JacobSzwejbka @swolchok +/extension/runner_util @swolchok +/extension/tensor @shoumikhin +/extension/testing_util @swolchok +/extension/threadpool @kimishpatel @swolchok +/extension/training @JacobSzwejbka + +/kernels @manuelcandales @swolchok + +/profiler @tarun292 @Gasoonjia + +/runtime @JacobSzwejbka @lucylq @swolchok +/runtime/backend @cccclai + +/schema @JacobSzwejbka @lucylq + +/scripts @GregoryComer @swolchok + +/shim @larryliu0820 @GregoryComer @swolchok + +/third-party @GregoryComer + +/test @larryliu0820 @kirklandsign + +/util @tarun292 diff --git a/Package.swift b/Package.swift new file mode 100644 index 00000000000..94acfc4cd7b --- /dev/null +++ b/Package.swift @@ -0,0 +1,86 @@ +// swift-tools-version:5.9 +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// NOTE: This package manifest is for frameworks built locally with CMake. +// It defines dependencies and linker settings for Executorch components. +// +// To use prebuilt binaries instead, switch to one of the "swiftpm" branches, +// which fetch the precompiled `.xcframeworks`. 
+// +// For details on building frameworks locally or using prebuilt binaries, +// see the documentation: +// https://pytorch.org/executorch/main/using-executorch-ios.html + +import PackageDescription + +let debug = "_debug" +let deliverables = [ + "backend_coreml": [ + "frameworks": [ + "Accelerate", + "CoreML", + ], + "libraries": [ + "sqlite3", + ], + ], + "backend_mps": [ + "frameworks": [ + "Metal", + "MetalPerformanceShaders", + "MetalPerformanceShadersGraph", + ], + ], + "backend_xnnpack": [:], + "executorch": [:], + "kernels_custom": [:], + "kernels_optimized": [:], + "kernels_portable": [:], + "kernels_quantized": [:], +].reduce(into: [String: [String: Any]]()) { + $0[$1.key] = $1.value + $0[$1.key + debug] = $1.value +}.reduce(into: [String: [String: Any]]()) { + var newValue = $1.value + if $1.key.hasSuffix(debug) { + $1.value.forEach { key, value in + if key.hasSuffix(debug) { + newValue[String(key.dropLast(debug.count))] = value + } + } + } + $0[$1.key] = newValue.filter { key, _ in !key.hasSuffix(debug) } +} + +let package = Package( + name: "executorch", + platforms: [ + .iOS(.v17), + .macOS(.v10_15), + ], + products: deliverables.keys.map { key in + .library(name: key, targets: ["\(key)_dependencies"]) + }.sorted { $0.name < $1.name }, + targets: deliverables.flatMap { key, value -> [Target] in + [ + .binaryTarget( + name: key, + path: "cmake-out/\(key).xcframework" + ), + .target( + name: "\(key)_dependencies", + dependencies: [.target(name: key)], + path: ".Package.swift/\(key)", + linkerSettings: + (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + + (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) } + ), + ] + } +) diff --git a/README.md b/README.md index 3dda2a1a452..dd1fafe715b 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,6 @@ We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md ``` executorch ├── backends # Backend delegate implementations. -├── build # Utilities for managing the build system. ├── codegen # Tooling to autogenerate bindings between kernels and the runtime. ├── configurations ├── docs # Static docs tooling. @@ -82,8 +81,7 @@ executorch | ├── operator # Operator node manipulation utilities. | ├── passes # Built-in compiler passes. | ├── program # Export artifacts. -| ├── serde # Graph module -serialization/deserialization. +| ├── serde # Graph module serialization/deserialization. | ├── verification # IR verification. ├── extension # Extensions built on top of the runtime. | ├── android # ExecuTorch wrappers for Android apps. @@ -98,8 +96,7 @@ serialization/deserialization. | ├── parallel # C++ threadpool integration. | ├── pybindings # Python API for executorch runtime. | ├── pytree # C++ and Python flattening and unflattening lib for pytrees. -| ├── runner_util # Helpers for writing C++ PTE-execution -tools. +| ├── runner_util # Helpers for writing C++ PTE-execution tools. | ├── testing_util # Helpers for writing C++ tests. | ├── training # Experimental libraries for on-device training ├── kernels # 1st party kernel implementations. @@ -115,9 +112,9 @@ tools. | ├── executor # Model loading, initialization, and execution. | ├── kernel # Kernel registration and management. | ├── platform # Layer between architecture specific code and portable C++. -├── schema # ExecuTorch PTE file format flatbuffer -schemas. -├── scripts # Utility scripts for size management, dependency management, etc. +├── schema # ExecuTorch PTE file format flatbuffer schemas. 
+├── scripts # Utility scripts for building libs, size management, dependency management, etc. +├── tools # Development tool management. ├── devtools # Model profiling, debugging, and introspection. ├── shim # Compatibility layer between OSS and Internal builds ├── test # Broad scoped end-to-end tests. diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/backends/apple/coreml/TARGETS b/backends/apple/coreml/TARGETS index 9d722457e34..a8802e99b56 100644 --- a/backends/apple/coreml/TARGETS +++ b/backends/apple/coreml/TARGETS @@ -5,6 +5,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") +# TODO: this is a placeholder to support internal fbcode build. We should add the coreml backend target properly. +runtime.python_library( + name = "coreml", + visibility = [ + "@EXECUTORCH_CLIENTS", + ], +) + runtime.python_library( name = "backend", srcs = glob([ @@ -14,10 +22,10 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", ":executorchcoreml", "//executorch/exir/backend:backend_details", "//executorch/exir/backend:compile_spec_schema", - "fbsource//third-party/pypi/coremltools:coremltools", ], ) @@ -30,13 +38,13 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", ":backend", "//caffe2:torch", "//executorch/exir:lib", "//executorch/exir/backend:compile_spec_schema", "//executorch/exir/backend:partitioner", "//executorch/exir/backend:utils", - "fbsource//third-party/pypi/coremltools:coremltools", ], ) @@ -64,24 +72,23 @@ runtime.cxx_python_extension( headers = glob([ "runtime/inmemoryfs/**/*.hpp", ]), + base_module = "", + compiler_flags = [ + "-std=c++17", + ], preprocessor_flags = [ "-Iexecutorch/backends/apple/coreml/runtime/util", ], types = [ "executorchcoreml.pyi", ], - compiler_flags = [ - "-std=c++17", - ], - base_module = "", visibility = [ "//executorch/examples/apple/coreml/...", - ], - external_deps = [ - "pybind11", + "@EXECUTORCH_CLIENTS", ], deps = [ "fbsource//third-party/nlohmann-json:nlohmann-json", + "fbsource//third-party/pybind11:pybind11", ], ) @@ -91,10 +98,10 @@ runtime.python_test( "test/*.py", ]), deps = [ + "fbsource//third-party/pypi/pytest:pytest", ":partitioner", ":quantizer", "//caffe2:torch", "//pytorch/vision:torchvision", - "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index 99aa2a0a60e..210ef307477 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -111,10 +111,16 @@ def ops_to_not_decompose( do_not_decompose = [] op_support = OperatorsSupportedForCoreMLBackend() for node in ep.graph.nodes: - if ( - node.op == "call_function" - and isinstance(node.target, torch._ops.OpOverload) - and op_support.is_node_supported(None, node) + if node.op == "call_function" and isinstance( + node.target, torch._ops.OpOverload ): - do_not_decompose.append(node.target) + try: + if 
op_support.is_node_supported(None, node): + do_not_decompose.append(node.target) + except Exception as e: + # CoreML's op_support.is_node_supported will sometimes throw + # for unsupported ops, rather than returning False + logger.warning( + f"Encountered exception when checking node support: {e}" + ) return do_not_decompose, None diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h index abc5ef517b4..9a9d45a037a 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.h @@ -71,7 +71,7 @@ __attribute__((objc_subclassing_restricted)) /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the execution succeeded otherwise `NO`. - (BOOL)executeModelWithHandle:(ModelHandle*)handle - argsVec:(const std::vector<executorchcoreml::MultiArray>&)argsVec + argsVec:(std::vector<executorchcoreml::MultiArray>&)argsVec loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError* __autoreleasing*)error; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index cd0fbc86f99..3e11999e939 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -734,7 +734,7 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle } - (BOOL)executeModelWithHandle:(ModelHandle *)handle - argsVec:(const std::vector<executorchcoreml::MultiArray>&)argsVec + argsVec:(std::vector<executorchcoreml::MultiArray>&)argsVec loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError * __autoreleasing *)error { @@ -785,6 +785,12 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle return NO; } + // Resize for dynamic shapes + for (int i = 0; i < outputArgs.size(); i++) { + auto new_size = to_vector(modelOutputs[i].shape); + outputArgs[i].resize(new_size); + argsVec[model.orderedInputNames.count + i].resize(new_size); + } ::set_outputs(outputArgs, modelOutputs); return YES; } diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.h b/backends/apple/coreml/runtime/delegate/backend_delegate.h index a6e012a4480..9af3df01af2 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.h +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.h @@ -89,7 +89,7 @@ class BackendDelegate { /// @param error On failure, error is filled with the failure information. /// @retval `true` if the execution succeeded otherwise `false`. 
virtual bool execute(Handle* handle, - const std::vector<MultiArray>& args, + std::vector<MultiArray>& args, const ModelLoggingOptions& logging_options, ModelEventLogger* event_logger, std::error_code& error) const noexcept = 0; diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm index efa3dd2472f..d8096e16781 100644 --- a/backends/apple/coreml/runtime/delegate/backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/backend_delegate.mm @@ -104,7 +104,7 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data error:(NSError* __autoreleasing*)error; - (BOOL)executeModelWithHandle:(ModelHandle*)handle - argsVec:(const std::vector<executorchcoreml::MultiArray>&)argsVec + argsVec:(std::vector<executorchcoreml::MultiArray>&)argsVec loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError* __autoreleasing*)error; @@ -199,7 +199,7 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data } - (BOOL)executeModelWithHandle:(ModelHandle*)handle - argsVec:(const std::vector<executorchcoreml::MultiArray>&)argsVec + argsVec:(std::vector<executorchcoreml::MultiArray>&)argsVec loggingOptions:(const executorchcoreml::ModelLoggingOptions&)loggingOptions eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable)eventLogger error:(NSError* __autoreleasing*)error { @@ -286,7 +286,7 @@ explicit BackendDelegateImpl(const Config& config) noexcept } bool execute(Handle* handle, - const std::vector<MultiArray>& args, + std::vector<MultiArray>& args, const ModelLoggingOptions& logging_options, ModelEventLogger *event_logger, std::error_code& ec) const noexcept override { diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm index 2d94873ce68..380ec52b7d7 100644 --- a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm +++ b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm @@ -12,6 +12,7 @@ #import #import #import +#import #import #import #import @@ -19,6 +20,7 @@ #import #import #import +#include #ifdef ET_EVENT_TRACER_ENABLED #import @@ -40,6 +42,9 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::get_backend_class; using executorch::runtime::Result; +using executorch::aten::SizesType; +using executorch::aten::Tensor; +using executorch::runtime::kTensorDimensionLimit; std::optional<MultiArray::DataType> get_data_type(ScalarType scalar_type) { switch (scalar_type) { @@ -221,6 +226,21 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) { ETCoreMLStrings.delegateIdentifier.UTF8String); #endif + // Resize for dynamic shape + std::array<SizesType, kTensorDimensionLimit> new_shape; + for (size_t i = nInputs; i < nInputs + nOutputs; i++) { + Tensor& t = args[i]->toTensor(); + int rank = delegate_args[i].layout().rank(); + assert (rank <= new_shape.size()); + for (int d = 0; d < rank; d++) { + new_shape[d] = delegate_args[i].layout().shape()[d]; + } + ET_CHECK_OR_RETURN_ERROR( + resize_tensor(t, ArrayRef(new_shape.data(), rank)) == Error::Ok, + DelegateInvalidHandle, + "%s: Failed to resize delegate output %zu", ETCoreMLStrings.delegateIdentifier.UTF8String, i); + } + return Error::Ok; } diff --git a/backends/apple/coreml/runtime/delegate/multiarray.h b/backends/apple/coreml/runtime/delegate/multiarray.h index 70a2a08a2f7..ecde904409b 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.h +++ b/backends/apple/coreml/runtime/delegate/multiarray.h @@ -84,6 +84,11 @@ class MultiArray final { /// Returns `true` if the memory layout is packed otherwise `false`. 
bool is_packed() const noexcept; + // Resizes the memory layout. + // The new shape must have the same rank as the current shape and be no larger than it in any dimension. + // The resulting layout is contiguous. + void resize(const std::vector<size_t>& shape); + private: DataType dataType_; std::vector<size_t> shape_; @@ -126,6 +131,8 @@ class MultiArray final { *ptr = value; } + void resize(const std::vector<size_t>& shape) { layout_.resize(shape); } + private: void* data(const std::vector<size_t>& indices) const noexcept; diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm index 74996fb8d5a..de705991780 100644 --- a/backends/apple/coreml/runtime/delegate/multiarray.mm +++ b/backends/apple/coreml/runtime/delegate/multiarray.mm @@ -512,6 +512,24 @@ ssize_t get_data_offset(size_t index, const std::vector<size_t>& shape, const st namespace executorchcoreml { +void MultiArray::MemoryLayout::resize(const std::vector<size_t>& shape) { + assert(shape.size() == shape_.size()); + for (int i = 0; i < shape.size(); ++i) { + assert (shape[i] >= 1); + assert(shape[i] <= shape_[i]); + } + int stride = 1; + for (int i = shape.size() - 1; i >= 0; --i) { + shape_[i] = shape[i]; + strides_[i] = stride; + if (shape[i] > 1) { + stride *= shape[i]; + } + } +} + + + size_t MultiArray::MemoryLayout::num_elements() const noexcept { if (shape_.size() == 0) { return 0; diff --git a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm index 78ee33429a8..74af483226a 100644 --- a/backends/apple/coreml/runtime/test/BackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/BackendDelegateTests.mm @@ -162,8 +162,9 @@ - (void)testAddModelExecution { MLMultiArray *output = [ETCoreMLTestUtils filledMultiArrayWithShape:inputs[0].shape dataType:inputs[0].dataType repeatedValue:@(0) error:&localError]; NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; + auto argsVec = to_multiarrays(args); XCTAssertTrue(_delegate->execute(handle, - to_multiarrays(args), + argsVec, ModelLoggingOptions(), nullptr, errorCode)); @@ -187,8 +188,9 @@ - (void)testMulModelExecution { MLMultiArray *output = [ETCoreMLTestUtils filledMultiArrayWithShape:inputs[0].shape dataType:inputs[0].dataType repeatedValue:@(0) error:&localError]; NSArray *args = [inputs arrayByAddingObject:output]; std::error_code errorCode; - XCTAssertTrue(_delegate->execute(handle, - to_multiarrays(args), + auto argsVec = to_multiarrays(args); + XCTAssertTrue(_delegate->execute(handle, + argsVec, ModelLoggingOptions(), nullptr, errorCode)); diff --git a/backends/apple/coreml/runtime/test/MultiArrayTests.mm b/backends/apple/coreml/runtime/test/MultiArrayTests.mm index 895702ae154..b55148cae0d 100644 --- a/backends/apple/coreml/runtime/test/MultiArrayTests.mm +++ b/backends/apple/coreml/runtime/test/MultiArrayTests.mm @@ -130,4 +130,20 @@ - (void)testNonAdjacentDataCopy { [self verifyDataCopyWithShape:shape srcStrides:srcStrides dstStrides:dstStrides]; } +- (void)testResize { + std::vector<size_t> shape = {3, 1, 2, 5}; + std::vector<ssize_t> strides = {1*2*5, 2*5, 5, 1}; + std::vector storage; + std::vector<size_t> newShape = {3, 1, 1, 1}; + + auto array = make_multi_array_and_fill(shape, strides, storage); + for (size_t i = 0; i < array.layout().rank(); ++i) { + XCTAssertEqual(array.layout().shape()[i], shape[i]); + } + array.resize(newShape); + for (size_t i = 0; i < array.layout().rank(); ++i) { + XCTAssertEqual(array.layout().shape()[i], newShape[i]); + } +} + @end diff --git 
a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/xcshareddata/xcschemes/executorchcoreml_tests.xcscheme b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/xcshareddata/xcschemes/executorchcoreml_tests.xcscheme index 29b3d3bdfc7..afd89d5c6d1 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/xcshareddata/xcschemes/executorchcoreml_tests.xcscheme +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/xcshareddata/xcschemes/executorchcoreml_tests.xcscheme @@ -23,6 +23,11 @@ BlueprintName = "executorchcoreml_tests" ReferencedContainer = "container:executorchcoreml.xcodeproj"> + + + + diff --git a/backends/apple/coreml/scripts/build_tests.sh b/backends/apple/coreml/scripts/build_tests.sh index 5fbde6ac66f..890385d5e24 100755 --- a/backends/apple/coreml/scripts/build_tests.sh +++ b/backends/apple/coreml/scripts/build_tests.sh @@ -32,7 +32,6 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_EXECUTORCH_BUILD_DIR_PATH" \ -DCMAKE_TOOLCHAIN_FILE="$IOS_TOOLCHAIN_PATH" \ -DPLATFORM=MAC_UNIVERSAL \ -DDEPLOYMENT_TARGET=13.0 \ --DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_GFLAGS=OFF diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md index 0efd9bbcc26..6b7ffa4ded8 100644 --- a/backends/apple/coreml/setup.md +++ b/backends/apple/coreml/setup.md @@ -50,7 +50,7 @@ xcode-select --install ```bash cd executorch -./build/build_apple_frameworks.sh --coreml +./scripts/build_apple_frameworks.sh --coreml ``` 5. Open the project in Xcode, and drag `executorch.xcframework` and `coreml_backend.xcframework` frameworks generated from Step 2 to Frameworks. diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 03aac6a8611..b8223d84f51 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -82,11 +82,28 @@ def test_vit_skip_conv(self): def test_ops_to_not_decompose(self): class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + def forward(self, q, k, v, mask): - return torch.ops.aten.scaled_dot_product_attention.default( + out = torch.ops.aten.scaled_dot_product_attention.default( q, k, v, attn_mask=mask ) + # Add non-functional and alias ops + # These will be removed by ExecuTorch in non-decomposition + # table because they cannot be functionalized + out = out.transpose(1, 2) + out = out.view(1, -1) + out = out.permute(0, 1) + out = out.add_(1.0) + out = out.mul_(2.0) + out = out.div_(3.0) + out = out.sub_(4.0) + out = torch.ops.aten.view_copy.default(out, (-1,)) + out = out.select(0, 0) + return out + model = Model() model.eval() @@ -100,7 +117,7 @@ def forward(self, q, k, v, mask): v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim) mask = torch.randn(seq_len, max_seq_length) example_inputs = (q, k, v, mask) - ep = torch.export.export(model, example_inputs) + ep = torch.export.export(model, example_inputs, strict=True) coreml_partitioner = CoreMLPartitioner() # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index 96aa007563b..132307c5c78 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -16,16 +16,12 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT 
${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - set(_common_compile_options -Wno-deprecated-declarations) set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -50,6 +46,7 @@ add_custom_command( "${_mps_schema__include_dir}/executorch/backends/apple/mps" ${_mps_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating mps_schema headers" VERBATIM ) diff --git a/backends/apple/mps/runtime/MPSBackend.mm b/backends/apple/mps/runtime/MPSBackend.mm index ddf642a9fb0..261332436d4 100644 --- a/backends/apple/mps/runtime/MPSBackend.mm +++ b/backends/apple/mps/runtime/MPSBackend.mm @@ -43,8 +43,11 @@ bool is_available() const override { BackendInitContext& context, FreeableBuffer* processed, ArrayRef<CompileSpec> compile_specs) const override { - auto executor = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( - context.get_runtime_allocator(), mps::delegate::MPSExecutor); + auto executor = context.get_runtime_allocator()->allocateInstance<mps::delegate::MPSExecutor>(); + if (executor == nullptr) { + return Error::MemoryAllocationFailed; + } + // NOTE: Since we use placement new and since this type is not trivially // destructible, we must call the destructor manually in destroy(). new (executor) mps::delegate::MPSExecutor; diff --git a/backends/apple/mps/runtime/MPSDevice.mm b/backends/apple/mps/runtime/MPSDevice.mm index c34b571c3a9..7f4c0bde9e5 100644 --- a/backends/apple/mps/runtime/MPSDevice.mm +++ b/backends/apple/mps/runtime/MPSDevice.mm @@ -22,11 +22,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants) // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+) MTLLanguageVersion languageVersion = MTLLanguageVersion2_3; -#if defined(__MAC_13_0) - if (macOS13Plus) { - languageVersion = MTLLanguageVersion3_0; + if (@available(iOS 16, macOS 13, *)) { + if (macOS13Plus) { + languageVersion = MTLLanguageVersion3_0; + } } -#endif ET_CHECK_MSG([device supportsFamily:MTLGPUFamilyMac2], "Missing Metal support for MTLGPUFamilyMac2"); return languageVersion; diff --git a/backends/apple/mps/runtime/operations/IndexingOps.mm b/backends/apple/mps/runtime/operations/IndexingOps.mm index d4015d10253..34a03851655 100644 --- a/backends/apple/mps/runtime/operations/IndexingOps.mm +++ b/backends/apple/mps/runtime/operations/IndexingOps.mm @@ -206,25 +206,32 @@ Error MPSGraphBuilder::mpsScatterOp(NodePtr nodePtr) { - auto graphNode = nodePtr->mpsnode_union_as_MPSScatter(); - ET_LOG( - Debug, "%s %d: %d", - __FUNCTION__, graphNode->input1_id(), graphNode->output_id() - ); + if (@available(iOS 15.4, macOS 12.3, *)) { + auto graphNode = nodePtr->mpsnode_union_as_MPSScatter(); + ET_LOG( + Debug, "%s %d: %d", + __FUNCTION__, graphNode->input1_id(), graphNode->output_id() + ); - int64_t dim = graphNode->dim(); - MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); - MPSGraphTensor* indicesTensor = getMPSGraphTensor(graphNode->idx_id()); - MPSGraphTensor* updatesTensor = getMPSGraphTensor(graphNode->src_id()); + int64_t dim = graphNode->dim(); + MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); + MPSGraphTensor* indicesTensor = getMPSGraphTensor(graphNode->idx_id()); + MPSGraphTensor* updatesTensor =
getMPSGraphTensor(graphNode->src_id()); - _idToMPSGraphTensor[graphNode->output_id()] = - [_mpsGraph scatterAlongAxis:dim - withDataTensor:inputTensor - updatesTensor:updatesTensor - indicesTensor:indicesTensor - mode:MPSGraphScatterModeSet - name:nil]; - return Error::Ok; + _idToMPSGraphTensor[graphNode->output_id()] = + [_mpsGraph scatterAlongAxis:dim + withDataTensor:inputTensor + updatesTensor:updatesTensor + indicesTensor:indicesTensor + mode:MPSGraphScatterModeSet + name:nil]; + + return Error::Ok; + } else { + ET_LOG(Error, "MPS: scatter op is not supported on iOS < 15.4 and macOS < 12.3"); + + return Error::NotSupported; + } } diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index 82344876531..5c14ad673df 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -130,7 +130,7 @@ python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path ***Step 1***. Create the ExecuTorch core and MPS delegate frameworks to link on iOS ```bash cd executorch -./build/build_apple_frameworks.sh --mps +./scripts/build_apple_frameworks.sh --mps ``` `mps_delegate.xcframework` will be in `cmake-out` folder, along with `executorch.xcframework` and `portable_delegate.xcframework`: diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index d91d3051a55..39a51c56b14 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) diff --git a/backends/arm/README.md b/backends/arm/README.md index 9a5a6f94085..04815bf23d2 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -55,10 +55,10 @@ To run the unit test suite with Corstone3x0 FVP simulator support use backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp ``` -You can test to run some models with the run.sh flow +You can test to run some models with the full fvp test flow ``` -backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp +backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp ``` ## Unit tests diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index f4ab883362e..63140dc7b9f 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -1,6 +1,5 @@ # @noautodeps load("@fbcode_macros//build_defs:python_library.bzl", "python_library") - python_library( name = "arm_partitioner", srcs = [ @@ -9,7 +8,6 @@ python_library( "tosa_backend.py", "tosa_partitioner.py", ], - typing = True, deps = [ ":arm_backend", "//executorch/backends/arm/operator_support:operator_support", @@ -17,13 +15,11 @@ python_library( "//executorch/exir:lib", ], ) - python_library( name = "arm_backend", srcs = [ "arm_backend.py", ], - typing = True, deps = [ "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", @@ -36,11 +32,9 @@ python_library( "//executorch/backends/arm/_passes:passes", ], ) - python_library( name = "process_node", srcs = ["process_node.py"], - typing = True, deps = [ "fbsource//third-party/serialization_lib/python/tosa:tosa", "//executorch/backends/arm/operators:node_visitor", @@ -50,36 +44,30 @@ python_library( "//executorch/exir:lib", ], ) - python_library( name = "arm_vela", srcs = [ "arm_vela.py", ], - typing = 
True, deps = [ "fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela", ], ) - python_library( name = "tosa_mapping", srcs = [ "tosa_mapping.py", ], - typing = True, deps = [ "fbsource//third-party/serialization_lib/python/serializer:serializer", "//caffe2:torch", ], ) - python_library( name = "tosa_quant_utils", srcs = [ "tosa_quant_utils.py", ], - typing = True, deps = [ "fbsource//third-party/pypi/numpy:numpy", "fbsource//third-party/serialization_lib/python/serializer:serializer", @@ -88,38 +76,32 @@ python_library( "//executorch/exir/dialects:lib", ], ) - python_library( name = "tosa_specification", srcs = [ "tosa_specification.py", ], - typing = True, deps = [ "fbsource//third-party/pypi/packaging:packaging", "//executorch/exir/backend:compile_spec_schema", ], ) - python_library( name = "tosa_utils", srcs = [ "tosa_utils.py", ], - typing = True, deps = [ "fbsource//third-party/serialization_lib/python/serializer:serializer", ":tosa_quant_utils", "//executorch/backends/arm/operators:node_visitor", ], ) - python_library( name = "arm_model_evaluator", srcs = [ "util/arm_model_evaluator.py", ], - typing = True, deps = [ "//caffe2:torch", ] diff --git a/backends/arm/_passes/TARGETS b/backends/arm/_passes/TARGETS index 6ca59cfee27..ddf4aea5adc 100644 --- a/backends/arm/_passes/TARGETS +++ b/backends/arm/_passes/TARGETS @@ -3,11 +3,13 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") python_library( name = "passes", srcs = glob(["*.py"]), - typing = True, deps = [ "//executorch/backends/arm:tosa_quant_utils", "//executorch/backends/arm:tosa_utils", + "//executorch/backends/transforms:fuse_view_copy", + "//executorch/backends/transforms:replace_scalar_with_tensor", "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/exir:lib", + "//executorch/backends/transforms:utils", ], ) diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 28d70591e5e..7ec04ea0844 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -15,18 +15,23 @@ ) from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass +from executorch.backends.arm._passes.convert_any_default_dim_dims_pass import ( + ConvertAnyDefaultDimDimsPass, +) from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( ConvertExpandCopyToRepeatPass, ) from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( ConvertFullLikeToFullPass, ) +from executorch.backends.arm._passes.convert_minmax_pass import ConvertMinMaxPass from executorch.backends.arm._passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) from executorch.backends.arm._passes.convert_squeezes_to_view import ( # type: ignore[import-not-found] ConvertSqueezesToViewPass, ) +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass from executorch.backends.arm._passes.decompose_batchnorm_pass import ( DecomposeBatchNormPass, ) @@ -39,8 +44,9 @@ from executorch.backends.arm._passes.decompose_select import ( # type: ignore[import-not-found] DecomposeSelectPass, ) -from executorch.backends.arm._passes.decompose_softmaxes_pass import ( - DecomposeSoftmaxesPass, +from executorch.backends.arm._passes.decompose_softmax_pass import DecomposeSoftmaxPass +from executorch.backends.arm._passes.decompose_softmax_unstable_pass import ( + DecomposeSoftmaxUnstablePass, ) from 
executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( @@ -49,6 +55,7 @@ RetraceFoldedDtypesPass, ) from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import FuseConstantOpsPass from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( # type: ignore[import-not-found] FuseQuantizedActivationPass, ) @@ -75,7 +82,8 @@ from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import ( UnsqueezeScalarPlaceholdersPass, ) -from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform from executorch.backends.transforms.replace_scalar_with_tensor import ( ReplaceScalarWithTensorArgPass, @@ -104,13 +112,15 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeLinearPass()) self.add_pass(ConvertMeanDimToAveragePoolPass()) self.add_pass(ConvertFullLikeToFullPass()) + self.add_pass(ConvertToClampPass()) + self.add_pass(ConvertMinMaxPass()) + self.add_pass(ConvertAnyDefaultDimDimsPass()) self.add_pass(ReplaceScalarWithTensorArgPass()) self.add_pass(AnnotateDecomposedMatmulPass()) self.add_pass(QuantizeOperatorArguments()) self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg] self.add_pass(RetraceFoldedDtypesPass()) - self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(RemoveClonePass()) self.add_pass(SizeAdjustConv2DPass()) @@ -124,8 +134,12 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeSelectPass()) self.add_pass(ConvertSqueezesToViewPass()) + self.add_pass(FuseViewCopyTransform()) + self.add_pass(FuseConstantOpsPass(exported_program)) + self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(AnnotateChannelsLastDimOrder()) self.add_pass(InsertRescalePass()) + return self._transform(exported_program.graph_module) def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule: @@ -142,13 +156,16 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeMeanDimPass()) self.add_pass(ConvertMeanDimToAveragePoolPass()) self.add_pass(DecomposeDivPass()) - self.add_pass(DecomposeSoftmaxesPass()) + self.add_pass(DecomposeSoftmaxPass()) self.add_pass(ConvertFullLikeToFullPass()) + self.add_pass(ConvertToClampPass()) + self.add_pass(ConvertMinMaxPass()) + self.add_pass(ConvertAnyDefaultDimDimsPass()) + self.add_pass(AnnotateDecomposedMatmulPass()) self.add_pass(QuantizeOperatorArguments()) self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg] self.add_pass(RetraceFoldedDtypesPass()) - self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(RemoveClonePass()) self.add_pass(SizeAdjustConv2DPass()) @@ -162,6 +179,9 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(DecomposeSelectPass()) self.add_pass(ConvertSqueezesToViewPass()) + self.add_pass(FuseViewCopyTransform()) + self.add_pass(FuseConstantOpsPass(exported_program)) + self.add_pass(InsertTableOpsPass(exported_program)) self.add_pass(AnnotateChannelsLastDimOrder()) self.add_pass(InsertRescalePass()) @@ -179,11 +199,18 @@ def transform_to_backend_pipeline(self, 
exported_program: ExportedProgram): ) def transform_for_annotation_pipeline(self, graph_module: GraphModule): - self.add_pass(ScalarsToAttributePass()) self.add_pass(ReplaceScalarWithTensorArgPass()) + self.add_pass(ScalarsToAttributePass()) self.add_pass(DecomposeLayerNormPass()) self.add_pass(DecomposeVarPass()) self.add_pass(DecomposeMeanDimPass()) self.add_pass(DecomposeDivPass()) - self.add_pass(DecomposeSoftmaxesPass()) + + if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset: + # Numerically stable softmax uses amax which is not supported on Ethos-U55 + self.add_pass(DecomposeSoftmaxUnstablePass()) + else: + self.add_pass(DecomposeSoftmaxPass()) + + self.add_pass(ConvertMinMaxPass()) return self._transform(graph_module) diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py index cb43acc7fdb..dba8f557085 100644 --- a/backends/arm/_passes/arm_pass_utils.py +++ b/backends/arm/_passes/arm_pass_utils.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,7 +8,7 @@ # pyre-unsafe from inspect import isclass -from typing import Optional +from typing import Optional, Sequence import torch import torch.fx @@ -26,6 +26,7 @@ ) from torch._ops import OpOverload from torch._subclasses.fake_tensor import FakeTensor +from torch.export.graph_signature import InputKind def is_get_attr_node(node: torch.fx.Node) -> bool: @@ -44,6 +45,30 @@ def is_param_node(exp_prog: ExportedProgram, node: torch.fx.Node) -> bool: ) +def get_constant_placeholder_kind( + exp_prog: ExportedProgram, node: torch.fx.Node +) -> InputKind: + if is_param(exp_prog, node): + return InputKind.PARAMETER + if is_buffer(exp_prog, node): + return InputKind.BUFFER + if is_lifted_tensor_constant(exp_prog, node): + return InputKind.CONSTANT_TENSOR + + raise RuntimeError("Node is neither PARAMETER, BUFFER nor CONSTANT_TENSOR") + + +def is_persistent_buffer(exp_prog: ExportedProgram, node: torch.fx.Node) -> bool | None: + if is_buffer(exp_prog, node): + buffer_name = exp_prog.graph_signature.inputs_to_buffers[node.name] + if buffer_name in exp_prog.graph_signature.non_persistent_buffers: + return False + else: + return True + + return None + + def get_param_tensor( exp_prog: ExportedProgram, node: torch.fx.Node ) -> Optional[torch.Tensor]: @@ -124,7 +149,7 @@ def get_first_fake_tensor(node: torch.fx.Node) -> FakeTensor: If the node contains many fake tensors, return the first one. """ if isinstance( - node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + node.meta["val"], (Sequence, torch.fx.immutable_collections.immutable_list) ): fake_tensor = node.meta["val"][0] else: diff --git a/backends/arm/_passes/convert_any_default_dim_dims_pass.py b/backends/arm/_passes/convert_any_default_dim_dims_pass.py new file mode 100644 index 00000000000..7085f17add0 --- /dev/null +++ b/backends/arm/_passes/convert_any_default_dim_dims_pass.py @@ -0,0 +1,106 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
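The new pass that follows unrolls a multi-dimensional keepdim=False reduction into a chain of per-dim keepdim=True reductions plus a single squeeze. As a rough torch-level illustration of why that rewrite is sound (illustrative only, not the pass code itself; assumes a PyTorch recent enough, roughly 2.2+, to accept tuple dims for any/amax), see the sketch below; the same scheme is what ConvertMinMaxPass, also added in this diff, applies to amin/amax:

```python
import torch

# Illustration only (assumes PyTorch 2.2+ tuple-dim any/amax); not the pass code.
x = torch.randn(4, 3, 2) > 0

# any.dims([0, 1], keepdim=False) ...
ref = torch.any(x, dim=(0, 1))
# ... rewritten as per-dim keepdim=True reductions plus one squeeze,
# the shape-preserving form the TOSA reduction ops expect:
out = torch.any(torch.any(x, dim=0, keepdim=True), dim=1, keepdim=True)
out = out.squeeze(dim=(0, 1))
assert torch.equal(ref, out)

# The same unrolling applied to amax:
y = torch.randn(4, 3, 2)
assert torch.equal(
    torch.amax(y, dim=(0, 2)),
    torch.amax(torch.amax(y, dim=0, keepdim=True), dim=2, keepdim=True).squeeze(dim=(0, 2)),
)
```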
+ +import torch +from executorch.exir.dialects._ops import ( # type: ignore[import-not-found] + ops as exir_ops, +) +from executorch.exir.pass_base import ( # type: ignore[import-not-found] + ExportPass, + PassResult, +) + + +class ConvertAnyDefaultDimDimsPass(ExportPass): + """ + Converts any.default, any.dim and any.dims to a sequence of any.dim by unrolling multi-dimensional reduction. + Please refer to KeepDimsFalseToSqueezePass for an explanation of this conversion. + + Example 1 + Original: + any() # x.shape: [dim1, dim2, ..., dimn] + After pass: + any.dim(dim1, keepdim = True) + any.dim(dim2, keepdim = True) + ... + any.dim(dimn, keepdim = True) + squeeze(dim = [dim1, dim2, ...., dimn]) + + Example 2 + Original: + any.dim(dim1, keepdim = False) + After pass: + any.dim(dim1, keepdim = True) + squeeze(dim = [dim1]) + + Example 3 + Original: + any.dims([dim1, dim2], keepdim = False) + After pass: + any.dim(dim1, keepdim = True) + any.dim(dim2, keepdim = True) + squeeze(dim = [dim1, dim2]) + """ + + def call(self, graph_module: torch.fx.GraphModule): + modified = False + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + if node.target not in [ + exir_ops.edge.aten.any.default, + exir_ops.edge.aten.any.dim, + exir_ops.edge.aten.any.dims, + ]: + continue + + if len(node.args) == 1: + # any.default(input) + input_node = node.args[0] + dims = range(len(input_node.meta["val"].shape)) + keepdim = False + elif len(node.args) == 2: + # any.dim/dims(input, dims=dims) + input_node, dims = node.args + keepdim = False + elif len(node.args) == 3: + # any.dim/dims(input, dims=dims, keepdim=keepdim) + input_node, dims, keepdim = node.args + else: + raise RuntimeError( + f"Unexpected arg size {len(node.args)} in {node.name}" + ) + try: + iter(dims) + except TypeError: + dims = [dims] # type: ignore[assignment] + else: + dims = list(dims) # type: ignore[assignment] + + # Unroll multi-dimensional reduction and keep-dims arg + with graph_module.graph.inserting_before(node): + for dim in dims: + args = (input_node, dim, True) + input_node = graph_module.graph.create_node( + "call_function", exir_ops.edge.aten.any.dim, args, node.kwargs + ) + + if not keepdim: + args = (input_node, dims) # type: ignore[assignment] + input_node = graph_module.graph.create_node( + "call_function", + exir_ops.edge.aten.squeeze_copy.dims, + args, + ) + + node.replace_all_uses_with(input_node) + modified = True + + if modified: + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/convert_minmax_pass.py b/backends/arm/_passes/convert_minmax_pass.py new file mode 100644 index 00000000000..9f409632c20 --- /dev/null +++ b/backends/arm/_passes/convert_minmax_pass.py @@ -0,0 +1,136 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ConvertMinMaxPass(ExportPass): + """ + Converts min/max to amin/amax and unrolls multi-dimensional reduction and keep-dims arg to be
+ + The difference between max/min and amax/amin is (from pytorch docs): + - amax/amin supports reducing on multiple dimensions, + - amax/amin does not return indices, + - amax/amin evenly distributes gradient between equal values, while max(dim)/min(dim) + propagates gradient only to a single index in the source tensor. + Since we do not care about gradients post training, convert min/max ops to amin/amax as long as + the indices are not used. + + Original: + amax([dim1, dim2], keepdim = False) + After pass: + amax(dim1, keepdim = True) + amax(dim2, keepdim = True) + squeeze(dim = [dim1, dim2]) + """ + + def check_argmax(self, node): + """ + Raises a RuntimeError if the argmax value returned by the min/max op is used in the graph. + """ + if node.target in [torch.ops.aten.max.dim, torch.ops.aten.min.dim]: + no_argmax = len(node.users) == 1 + no_argmax_users = (len(node.users) == 2) and ( + len(list(node.users)[1].users) == 0 + ) + if not (no_argmax or no_argmax_users): + raise RuntimeError("Argmax is not supported by the arm_quantizer") + + def get_variables(self, node): + """Returns variables specific for each op handled by the pass.""" + if node.target in [ + exir_ops.edge.aten.amax.default, + exir_ops.edge.aten.amin.default, + ]: + replace_node = node + op = node.target + squeeze_op = exir_ops.edge.aten.squeeze_copy.dims + elif node.target == exir_ops.edge.aten.max.dim: + replace_node = list(node.users)[0] + op = exir_ops.edge.aten.amax.default + squeeze_op = exir_ops.edge.aten.squeeze_copy.dims + elif node.target == exir_ops.edge.aten.min.dim: + replace_node = list(node.users)[0] + op = exir_ops.edge.aten.amin.default + squeeze_op = exir_ops.edge.aten.squeeze_copy.dims + elif node.target == torch.ops.aten.max.dim: + replace_node = list(node.users)[0] + op = torch.ops.aten.amax.default + squeeze_op = torch.ops.aten.squeeze.dims + elif node.target == torch.ops.aten.min.dim: + replace_node = list(node.users)[0] + op = torch.ops.aten.amin.default + squeeze_op = torch.ops.aten.squeeze.dims + else: + raise RuntimeError( + f"{node.name} is not an accepted target for ConvertMinMaxPass()" + ) + + return (replace_node, op, squeeze_op) + + def call(self, graph_module: torch.fx.GraphModule): + modified = False + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + if node.target not in [ + exir_ops.edge.aten.amax.default, + exir_ops.edge.aten.amin.default, + exir_ops.edge.aten.max.dim, + exir_ops.edge.aten.min.dim, + torch.ops.aten.max.dim, + torch.ops.aten.min.dim, + ]: + continue + + self.check_argmax( + node + ) # TODO: MLETORCH-718 : Quantization of indices in arm_quantizer + replace_node, op, squeeze_op = self.get_variables(node) + + # Unwrap args + if len(node.args) == 2: + input_node, dims = node.args + keepdims = False + elif len(node.args) == 3: + input_node, dims, keepdims = node.args + else: + raise RuntimeError(f"Unexpected arg size in {node.name}") + + try: + iter(dims) + except: + dims = [dims] + else: + dims = list(dims) + + # Unroll multi-dimensional reduction and keep-dims arg + with graph_module.graph.inserting_before(node): + + for dim in dims: + args = (input_node, dim, True) + input_node = graph_module.graph.create_node( + "call_function", op, args, node.kwargs + ) + + if not keepdims: + input_node = graph_module.graph.create_node( + "call_function", + squeeze_op, + (input_node, dims), + ) + + replace_node.replace_all_uses_with(input_node) + modified = True + + if modified: + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + 
graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/arm/_passes/convert_to_clamp.py b/backends/arm/_passes/convert_to_clamp.py new file mode 100644 index 00000000000..8f2c9b16f9a --- /dev/null +++ b/backends/arm/_passes/convert_to_clamp.py @@ -0,0 +1,36 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +edge_operators = { + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.relu.default, +} + + +def get_clamp_params(op, args) -> Tuple[float | None, float | None]: + if op == exir_ops.edge.aten.hardtanh.default: + return args[1], args[2] + elif op == exir_ops.edge.aten.relu.default: + return 0.0, None + else: + raise ValueError(f"Getting clamp parameters for op {op} is not implemented.") + + +class ConvertToClampPass(ExportPass): + def call_operator(self, op, args, kwargs, meta): + if op not in edge_operators: + return super().call_operator(op, args, kwargs, meta) + + return super().call_operator( + exir_ops.edge.aten.clamp.default, + (args[0], *get_clamp_params(op, args)), + {}, + meta, + ) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 5e04668df9a..9a25b7c28ae 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -35,8 +35,9 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, dim, index = node.args rank = len(input_node.meta["val"].size()) + shape = input_node.meta["val"].shape dim = dim % rank if dim < 0 else dim - index = index % rank if index < 0 else index + index = index % shape[dim] if index < 0 else index with graph_module.graph.inserting_before(node): slice_node = create_node( diff --git a/backends/arm/_passes/decompose_softmax_pass.py b/backends/arm/_passes/decompose_softmax_pass.py new file mode 100644 index 00000000000..7e8591eb386 --- /dev/null +++ b/backends/arm/_passes/decompose_softmax_pass.py @@ -0,0 +1,77 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + +# For BI case +torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int) +# For MI case +edge_softmax = ( + exir_ops.edge.aten._softmax.default, + exir_ops.edge.aten._log_softmax.default, +) +log_softmax = (torch.ops.aten.log_softmax.int, exir_ops.edge.aten._log_softmax.default) + + +def _get_logsoftmax_ops(op) -> tuple: + """ + Returns the (log_op, sub_op, amax_op, expo_op, sum_op, reciprocal_op), where the ops depends on if + the softmax op is an aten or edge op. 
+ """ + if op in edge_softmax: + return ( + exir_ops.edge.aten.log.default, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.amax.default, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.sum.dim_IntList, + exir_ops.edge.aten.reciprocal.default, + exir_ops.edge.aten.mul.Tensor, + ) + if op in torch_softmax: + return ( + torch.ops.aten.log.default, + torch.ops.aten.sub.Tensor, + torch.ops.aten.amax.default, + torch.ops.aten.exp.default, + torch.ops.aten.sum.dim_IntList, + torch.ops.aten.reciprocal.default, + torch.ops.aten.mul.Tensor, + ) + raise RuntimeError(f"Can't get logsoftmax decomposition ops for op {op}") + + +class DecomposeSoftmaxPass(ExportPass): + """ + This pass decomposes log_softmax or softmax into more primitive ops. + Example: + %op1 = amax(x) + %op2 = sub(x, %op1) + %op3 = exp(%op2) + %op4 = sum(%op3, dim) + %op5 = reciprocal(%op4) + %op6 = mul(%op3, %op5) + (in logsoftmax case: %op7 = log(%op6)) + """ + + def call_operator(self, op, args, kwargs, meta): + if op not in torch_softmax + edge_softmax: + return super().call_operator(op, args, kwargs, meta) + log_op, sub_op, max_op, exp_op, sum_op, reciprocal_op, mul_op = ( + _get_logsoftmax_ops(op) + ) + _input = args[0] + dim = [args[1]] + op1 = super().call_operator(max_op, (_input, dim, True), {}, meta) + op2 = super().call_operator(sub_op, (_input, op1), {}, meta) + op3 = super().call_operator(exp_op, (op2,), {}, meta) + op4 = super().call_operator(sum_op, (op3, dim, True), {}, meta) + op5 = super().call_operator(reciprocal_op, (op4,), {}, meta) + op6 = super().call_operator(mul_op, (op3, op5), {}, meta) + if op in log_softmax: + op6 = super().call_operator(log_op, (op6,), {}, meta) + return op6 diff --git a/backends/arm/_passes/decompose_softmaxes_pass.py b/backends/arm/_passes/decompose_softmax_unstable_pass.py similarity index 95% rename from backends/arm/_passes/decompose_softmaxes_pass.py rename to backends/arm/_passes/decompose_softmax_unstable_pass.py index b4804d8bfc6..4a2ce712ab7 100644 --- a/backends/arm/_passes/decompose_softmaxes_pass.py +++ b/backends/arm/_passes/decompose_softmax_unstable_pass.py @@ -1,5 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -46,7 +45,7 @@ def get_logsoftmax_ops(op) -> tuple: raise RuntimeError(f"Can't get softmax decomposition ops for op {op}") -class DecomposeSoftmaxesPass(ExportPass): +class DecomposeSoftmaxUnstablePass(ExportPass): """ This pass decomposes log softmax or softmax into more primitive ops. 
diff --git a/backends/arm/_passes/fuse_batchnorm2d_pass.py b/backends/arm/_passes/fuse_batchnorm2d_pass.py index 6cb7548a70c..9eb74aca145 100644 --- a/backends/arm/_passes/fuse_batchnorm2d_pass.py +++ b/backends/arm/_passes/fuse_batchnorm2d_pass.py @@ -6,10 +6,15 @@ # pyre-unsafe import torch +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch._export.utils import get_buffer, get_param +from torch.export.graph_signature import InputKind from torch.fx import Node from torch.nn.utils.fusion import fuse_conv_bn_weights @@ -23,7 +28,7 @@ def __init__(self, exported_program: ExportedProgram): self.exported_program = exported_program super().__init__() - def is_fuseable_conv_bn(self, node: Node): + def is_fuseable_conv_bn(self, node: Node) -> bool: """Returns True if node is a batchnorm that can be fused into a parent convolution.""" if node.op != "call_function": @@ -44,15 +49,19 @@ def is_fuseable_conv_bn(self, node: Node): # Since we change the output of the conv, fuse only if it has single user. if len(conv.users) > 1: return False - # For similar reasons, only fuse if conv parameters have single user. - if len(conv.all_input_nodes[1].users) > 1: - return False - if len(conv.all_input_nodes) > 2 and len(conv.all_input_nodes[2].users) > 1: - return False return True + def get_bias_name(self, conv_weight_node: Node, conv_bias_node: Node) -> str: + if conv_bias_node: + return conv_bias_node.name + "_fused_bn" + elif "weight" in conv_weight_node.name: + return conv_weight_node.name.replace("weight", "bias") + "_fused_bn" + else: + return conv_weight_node.name + "_bias_fused_bn" + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 modified = False + constant_placeholders_to_delete = set() for node in graph_module.graph.nodes: if not self.is_fuseable_conv_bn(node): continue @@ -64,68 +73,93 @@ def get_param_or_none(arg) -> torch.nn.Parameter | None: ) # Get weight, bias, mean, var and epsilon from the batchnorm - bn = node - conv, bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = bn.args[0:5] - bn_weight = get_param_or_none(bn_weight_node) - bn_bias = get_param_or_none(bn_bias_node) - - running_mean = get_buffer(self.exported_program, bn_mean_node) - running_var = get_buffer(self.exported_program, bn_var_node) - if running_mean is None or running_var is None: + bn_node = node + conv, bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = ( + bn_node.args[0:5] + ) + bn_weight_tensor = get_param_or_none(bn_weight_node) + bn_bias_tensor = get_param_or_none(bn_bias_node) + bn_mean_tensor = get_buffer(self.exported_program, bn_mean_node) + bn_var_tensor = get_buffer(self.exported_program, bn_var_node) + if bn_mean_tensor is None or bn_var_tensor is None: raise ValueError( "Parameters running_mean and running_var of batchnorm can't be None." 
) - epsilon = bn.args[-1] + epsilon = bn_node.args[-1] # Get weight and bias from conv conv_weight_node, conv_bias_node = conv.args[1:3] - conv_weight = get_param(self.exported_program, conv_weight_node) - conv_bias = get_param_or_none(conv_bias_node) - if conv_weight is None: + conv_weight_tensor = get_param(self.exported_program, conv_weight_node) + conv_bias_tensor = get_param_or_none(conv_bias_node) + if conv_weight_tensor is None: raise ValueError("Parameter weight of convolution can't be None.") # Compute conv parameters folded with batchnorm fused_conv_weight, fused_conv_bias = fuse_conv_bn_weights( - conv_weight, - conv_bias, - running_mean, - running_var, + conv_weight_tensor, + conv_bias_tensor, + bn_mean_tensor, + bn_var_tensor, epsilon, - bn_weight, - bn_bias, + bn_weight_tensor, + bn_bias_tensor, ) - # Set the conv parameters to fused value - def try_set_param( - param_node: Node | None, param_value: torch.nn.Parameter - ) -> bool: - """set_param but check if param_node is None first. Return True if param was set successfully, otherwise False.""" - if param_node is not None: - param_name = ( - self.exported_program.graph_signature.inputs_to_parameters[ - param_node.name - ] + # Create fused weights and bias to conv and replace conv args + with graph_module.graph.inserting_before(conv_weight_node): + fused_conv_weight_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=conv_weight_node.name + "_fused_bn", + data=fused_conv_weight, + ) + + if fused_conv_bias is not None: + fused_conv_bias_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=self.get_bias_name(conv_weight_node, conv_bias_node), + data=fused_conv_bias, ) - self.exported_program.state_dict[param_name] = param_value - return True - return False + else: + fused_conv_bias_node = None + + conv.args = ( + conv.args[0], + fused_conv_weight_node, + fused_conv_bias_node, + *conv.args[3:], + ) - try_set_param(conv_weight_node, fused_conv_weight) - if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param( - bn_bias_node, fused_conv_bias - ): - # pyre-ignore[60] - # Conv didn't have bias but batchnorm did, steal bias from batchnorm. - conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:]) - conv.args = conv_args - - # Erasing nodes is handled by dead-code elimination. - for user in bn.users: + # Erasing batch-norm nodes is handled by dead-code elimination. 
After that we may remove their constant placeholder inputs. + for user in bn_node.users: user.replace_all_uses_with(conv) + + constant_placeholders_to_delete.update( + [ + bn_weight_node, + bn_bias_node, + bn_mean_node, + bn_var_node, + conv_weight_node, + conv_bias_node, + ] + ) modified = True if modified: graph_module.graph.eliminate_dead_code() + for constant_placeholder in constant_placeholders_to_delete: + if (constant_placeholder is not None) and ( + len(constant_placeholder.users) == 0 + ): + delete_constant_placeholder( + self.exported_program, constant_placeholder + ) + graph_module.recompile() graph_module = super().call(graph_module).graph_module + return PassResult(graph_module=graph_module, modified=modified) diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py new file mode 100644 index 00000000000..1fff7d76dfc --- /dev/null +++ b/backends/arm/_passes/fuse_constant_ops_pass.py @@ -0,0 +1,170 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch._export.utils +from executorch.backends.arm._passes.arm_pass_utils import ( + get_constant_placeholder_kind, + get_param_tensor, + is_persistent_buffer, +) +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + +logger = logging.getLogger(__name__) + + +class FuseConstantOpsPass(ExportPass): + """ + Fuses ops with only placeholder parameters into one placeholder parameter node with the op + pre-calculated on its data. + + Original: + state_dict = {x_tensor_name : data} + def f(): + return x.view(...) + + After pass: + state_dict = {x_tensor_name_fused_const : data.view(...)} + def f(): + return x + """ + + def __init__(self, exported_program: ExportedProgram) -> None: + super().__init__() + self.exported_program = exported_program + + def fuse_nodes(self, node) -> bool: + """ + Takes a node with only parameter inputs and replaces it with one constant tensor node with + the operations already carried out on the data.
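A toy eager-mode sketch of the effect described above (illustrative only; the real pass rewrites the FX graph and state dict rather than eager tensors, and the names here are ours): an op whose inputs are all constants is evaluated once at export time, so only a placeholder lookup survives in the graph.

```python
import torch

# Toy eager-mode sketch; the real pass edits the FX graph/state dict.
w = torch.randn(2, 3)                # constant placeholder data

def graph_before(x):
    return x @ w.transpose(0, 1)     # transpose recomputed on every call

w_fused_const = w.transpose(0, 1)    # pre-computed once, at export time

def graph_after(x):
    return x @ w_fused_const         # only the folded constant remains

x = torch.randn(5, 2)
assert torch.equal(graph_before(x), graph_after(x))
```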
+ """ + + if node.target == exir_ops.edge.aten.full.default: + # Create data from args + size, fill_value = node.args + dtype = node.kwargs["dtype"] + data = torch.full(size, float(fill_value), dtype=dtype) + + insert_pos = list(node.graph.nodes)[0] + else: + # Extract tensors and args from the node + + if len(node.all_input_nodes) == 0: + raise RuntimeError("No inputs found") + + data_list = [ + get_param_tensor(self.exported_program, input_node) + for input_node in node.all_input_nodes + ] + + args = node.args[len(node.all_input_nodes) :] + kwargs = node.kwargs + + if "input_qparams" in node.meta and len(node.meta["input_qparams"]) > 0: + dequantize_op = ( + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default + ) + + for i in range(len(node.all_input_nodes)): + q_params = node.meta["input_qparams"][i] + data_list[i] = dequantize_op( + data_list[i], + q_params.scale, + q_params.zp, + q_params.qmin, + q_params.qmax, + q_params.dtype, + ) + + # Run the op on the extracted tensor + data = node.target(*data_list, *args, **kwargs) + + if "output_qparams" in node.meta and len(node.meta["output_qparams"]) > 0: + quantize_op = ( + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default + ) + q_params = node.meta["output_qparams"][0] + data = quantize_op( + data, + q_params.scale, + q_params.zp, + q_params.qmin, + q_params.qmax, + q_params.dtype, + ) + + insert_pos = list(node.all_input_nodes)[0] + + # Make new node the same kind as the first constant input + input_kind = get_constant_placeholder_kind(self.exported_program, insert_pos) + persistent_buffer = is_persistent_buffer(self.exported_program, insert_pos) + + # Create new node + with node.graph.inserting_before(insert_pos): + const_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=node.graph, + kind=input_kind, + name=node.name + "_fused_const", + data=data, + persistent_buffer=persistent_buffer, + ) + + node.replace_all_uses_with(const_node) + + return True + + def call(self, graph_module): + modified = True + input_nodes_to_delete = [] + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + if node.target == torch.ops.tosa._table.default: + continue + if node.target == exir_ops.edge.aten.repeat.default: + _, multiples = node.args + # Do not fuse if the repeat creates a larger output, i.e. 
any multiple > 1 + if any((multiple > 1 for multiple in multiples)): + continue + + input_nodes = node.all_input_nodes + input_nodes_constant = ( + torch._export.utils.is_param(self.exported_program, input_node) + or torch._export.utils.is_lifted_tensor_constant( + self.exported_program, input_node + ) + or torch._export.utils.is_buffer(self.exported_program, input_node) + for input_node in input_nodes + ) + input_nodes_single_users = ( + len(input_node.users) == 1 for input_node in input_nodes + ) + + if all(input_nodes_constant) and all(input_nodes_single_users): + try: + self.fuse_nodes(node) + graph_module.recompile() # Recompile needed to catch chains of constant ops + input_nodes_to_delete.extend(input_nodes) + except Exception as e: + logger.warning( + f"\nFailed to fuse constant op {node.name} due to exception:\n{str(e)}" + ) + + if modified: + graph_module.graph.eliminate_dead_code() + for input_node in input_nodes_to_delete: + delete_constant_placeholder(self.exported_program, input_node) + + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py index 3ac9f5cbb98..13c69bf92f1 100644 --- a/backends/arm/_passes/fuse_quantized_activation_pass.py +++ b/backends/arm/_passes/fuse_quantized_activation_pass.py @@ -13,7 +13,8 @@ class FuseQuantizedActivationPass(ExportPass): - def _is_fuseable_quantized_activation(self, node: Node): + @staticmethod + def _is_fuseable_quantized_activation(node: Node): """Fuse activations that have a 0 lower bound and quantized with a qmin zero-point""" is_fuseable = node.target == exir_ops.edge.aten.relu.default if node.target == exir_ops.edge.aten.hardtanh.default: @@ -29,7 +30,8 @@ def _is_fuseable_quantized_activation(self, node: Node): else: return False - def _is_fuseable_input(self, node: Node): + @staticmethod + def _is_fuseable_input(node: Node): return ( node.target in ( @@ -45,11 +47,11 @@ def call(self, graph_module: torch.fx.GraphModule): if node.op != "call_function": continue - if not self._is_fuseable_quantized_activation(node): + if not FuseQuantizedActivationPass._is_fuseable_quantized_activation(node): continue input_node = node.args[0] - if not self._is_fuseable_input(input_node): + if not FuseQuantizedActivationPass._is_fuseable_input(input_node): continue node.replace_all_uses_with(input_node) diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py index e9f6eec63a3..541638b830e 100644 --- a/backends/arm/_passes/insert_rescales_pass.py +++ b/backends/arm/_passes/insert_rescales_pass.py @@ -38,17 +38,17 @@ def rescale_fake( """Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op. Additionally validates TOSA constraints of a RESCALE op. """ - if not (dtype == torch.int32 or dtype == torch.int8): + if dtype not in (torch.int32, torch.int8, torch.int16): raise NotImplementedError( - "tosa::rescale currently only supports int32 and int8." + f"tosa::rescale currently only supports int32, int16 and int8, not {dtype}" ) - if dtype == torch.int32 and out_zp != 0: + if dtype in (torch.int32, torch.int16) and out_zp != 0: raise ValueError( - "TOSA requires output_zp to be zero when the output dtype is int32." + f"TOSA requires output_zp to be zero when the output dtype is {dtype}." 
) - if x.dtype == torch.int32 and in_zp != 0: + if x.dtype in (torch.int32, torch.int16) and in_zp != 0: raise ValueError( - "TOSA requires input_zp to be zero when the input dtype is int32." + f"TOSA requires input_zp to be zero when the input dtype is {x.dtype}." ) if x.dtype == torch.int8 and not -128 <= in_zp <= 127: raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.") diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index ada4d646c06..05d37e1e8e9 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,6 +17,7 @@ from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule + from torch.library import impl, Library lib = Library("tosa", "DEF") @@ -26,7 +26,10 @@ @impl(lib, "_table") def _table_impl(*args, **kwargs): # pyre-ignore - return args[0] + in_dtype = args[0].dtype + if in_dtype == torch.int8: + return args[0] + return args[0].to(dtype=torch.int32) class InsertTableOpsPass(ExportPass): @@ -38,7 +41,9 @@ class InsertTableOpsPass(ExportPass): """ table_ops: Dict[EdgeOpOverload, Callable[[torch.Tensor], torch.Tensor]] = { + exir_ops.edge.aten.ceil.default: torch.ceil, exir_ops.edge.aten.exp.default: torch.exp, + exir_ops.edge.aten.floor.default: torch.floor, exir_ops.edge.aten.log.default: torch.log, exir_ops.edge.aten.reciprocal.default: torch.reciprocal, exir_ops.edge.aten.rsqrt.default: torch.rsqrt, @@ -58,29 +63,105 @@ def register_buffer(self, buffer_name: str, buffer: torch.Tensor) -> None: """ self.exported_program.state_dict[buffer_name] = buffer - def generate_table_values( + def generate_8bit_table_values( self, torch_op: Callable[[torch.Tensor], torch.Tensor], in_quantargs: QuantArgs, out_quantargs: QuantArgs, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, int]: + """Compute LUT values for an INT8 TOSA.TABLE. Also returns 0 since no shifting is required after the 8-bit table. + The INT8 table is a simple 256-value one-to-one LUT. + """ + def f(x: torch.Tensor) -> torch.Tensor: x = in_quantargs.dequantize_value(x) x = torch_op(x) return out_quantargs.quantize_value(x) - input_dtype = in_quantargs.dtype - steps = in_quantargs.qmax - in_quantargs.qmin + 1 - return f( + return ( + f( + torch.linspace( + start=in_quantargs.qmin, + end=in_quantargs.qmax, + steps=256, + # use torch.int64 to avoid overflow when dequantizing (subtracting zp). + # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8) + dtype=torch.int64, + ) + ).to(dtype=torch.int8), + 0, + ) + + def generate_16_bit_table_values( + self, + torch_op: Callable[[torch.Tensor], torch.Tensor], + in_quantargs: QuantArgs, + out_quantargs: QuantArgs, + ) -> tuple[torch.Tensor, int]: + """Compute LUT values for an INT16 TOSA.TABLE with 32-bit output. + In practice the output is 23 bits that should be interpreted as 16 'whole' bits and 7 fractional bits, see + the specification: https://www.mlplatform.org/tosa/tosa_spec.html#_table. This means that the output + will be interpreted as 2**7=128 times too large unless accounted for by rescaling down the table output. + + Quantization can be either int16 or int32 which means that the op output could be larger than the 23 bits from
the TOSA.TABLE output. + In that case, we need to rescale up the output. + + To handle this we need to: + 1) Make sure that our table values fit within 16 bits. + 2) Insert a rescale after the table to handle the x128 from the fractional bits and match the quantization. + + The function returns rescale_lshift which says how much to rescale after the table. This value can be negative. + """ + + def f(x: torch.Tensor) -> torch.Tensor: + # Don't use the 7 LSBs. + x = in_quantargs.dequantize_value((x & ~0x7F)) + x = torch_op(x) + return out_quantargs.quantize_value(x) + + lut_values = f( torch.linspace( start=in_quantargs.qmin, - end=in_quantargs.qmax, - steps=steps, + end=in_quantargs.qmax + 1, + steps=513, # use torch.int64 to avoid overflow when dequantizing (subtracting zp). # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8) dtype=torch.int64, ) - ).to(dtype=input_dtype) + ) + # Calculate how much we need to shift table values to fit in 16 signed bits + # ceil(log2(max absolute table value)) + 1 bit for signedness - 16 + # Example: + # Max value in the table is 70 000. We want to fit it in 16 signed bits. + # 70 000=0b10001000101110000 (17 digits) has ceil(log2(70 000)) = ceil(16.095) = 17 bits. + # If we shift it 17-16=1 bit, we do get 16 bits (0b1000100010111000), + # but due to signedness this is a negative number! So we need to shift it one more bit. + # Note: for out_quantargs.dtype=torch.int16, rshift == 0 and rescale_lshift = -7. + rshift = int(torch.ceil(torch.log2(lut_values.abs().max()))) + 1 - 16 + # The 7 fractional bits are equivalent to a lshift of 7, so subtract 7 from the lshift we do. + rescale_lshift = rshift - 7 + lut_values = lut_values >> rshift + return lut_values.to(dtype=torch.int16), rescale_lshift + + def generate_table_values( + self, + torch_op: Callable[[torch.Tensor], torch.Tensor], + in_quantargs: QuantArgs, + out_quantargs: QuantArgs, + ) -> tuple[torch.Tensor, int]: + match out_quantargs.dtype: + case torch.int8: + return self.generate_8bit_table_values( + torch_op, in_quantargs, out_quantargs + ) + case torch.int16 | torch.int32: + return self.generate_16_bit_table_values( + torch_op, in_quantargs, out_quantargs + ) + case _: + raise ValueError( + f"Unsupported output dtype for table: {out_quantargs.dtype}" + ) def call(self, graph_module: GraphModule) -> PassResult: modified = False @@ -99,10 +180,12 @@ def call(self, graph_module: GraphModule) -> PassResult: op_target=torch.ops.tosa._table.default, args=(node.args[0],), ) + output_node = table_node assert len(input_qparams) == 1 assert len(output_qparams) == 1 - # Generate table buffer - buffer = self.generate_table_values( + + # Generate the table buffer and the amount to left-shift (lshift) the table output.
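The shift arithmetic in generate_16_bit_table_values is easy to sanity-check in isolation. Below is a pure-Python mirror of the torch.ceil/torch.log2 computation above (illustrative sketch; the helper name table_shift is ours):

```python
import math

def table_shift(max_abs_table_value: int) -> tuple[int, int]:
    # Pure-Python mirror of the shift computation above; illustration only.
    # Bits needed for the magnitude, plus one sign bit, minus the
    # 16 bits available in a signed int16 table entry.
    rshift = math.ceil(math.log2(max_abs_table_value)) + 1 - 16
    # The TOSA INT16 TABLE output carries 7 fractional bits (an implicit
    # x128), so the post-table rescale shifts left by rshift - 7.
    return rshift, rshift - 7

assert table_shift(70_000) == (2, -5)     # the worked example from the comment
assert table_shift(2**15 - 1) == (0, -7)  # int16 output: no narrowing needed
```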
+ buffer, lshift = self.generate_table_values( torch_op=self.table_ops[node.target], in_quantargs=input_qparams[0], out_quantargs=output_qparams[0], @@ -113,10 +196,20 @@ def call(self, graph_module: GraphModule) -> PassResult: self.register_buffer( buffer_name=table_node.name.replace("_default", ""), buffer=buffer ) - node.replace_all_uses_with(table_node) + + if lshift != 0: + scale = 2.0**lshift + rescale_node = create_node( + graph=graph_module.graph, + op_target=torch.ops.tosa._rescale.default, + args=(table_node, output_qparams[0].dtype, scale, 0, 0), + ) + output_node = rescale_node + + node.replace_all_uses_with(output_node) graph_module.graph.erase_node(node) - table_node.meta["input_qparams"] = input_qparams - table_node.meta["output_qparams"] = output_qparams + output_node.meta["input_qparams"] = input_qparams + output_node.meta["output_qparams"] = output_qparams modified = True if modified: diff --git a/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py index ad95379cc87..744436cba9e 100644 --- a/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py +++ b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -36,18 +35,19 @@ class KeepDimsFalseToSqueezePass(ExportPass): """ # CURRENTLY NOT HANDLED OPS - # exir_ops.edge.aten.amax, - # exir_ops.edge.aten.amin, - # exir_ops.edge.aten.any.dim, - # exir_ops.edge.aten.any.dims, # exir_ops.edge.aten.argmax, # exir_ops.edge.aten.argmin, - # exir_ops.edge.aten.max.dim, - # exir_ops.edge.aten.min.dim, # exir_ops.edge.aten.prod.dim_int, # HANDLED OPS # exir_ops.edge.aten.sum.dim_IntList + # exir_ops.edge.aten.any.default (decomposed in convert_any_default_dim_dims_pass) + # exir_ops.edge.aten.any.dim (decomposed in convert_any_default_dim_dims_pass) + # exir_ops.edge.aten.any.dims (decomposed in convert_any_default_dim_dims_pass) + # exir_ops.edge.aten.max.dim (decomposed in convert_minmax_pass) + # exir_ops.edge.aten.min.dim (decomposed in convert_minmax_pass) + # exir_ops.edge.aten.amin (decomposed in convert_minmax_pass) + # exir_ops.edge.aten.amax (decomposed in convert_minmax_pass) # exir_ops.edge.aten.var.correction (decomposed in decompose_var_pass) # exir_ops.edge.aten.var.dim (decomposed in decompose_var_pass) # exir_ops.edge.aten.mean.dim (decomposed in decompose_meandim_pass) diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index 941d20c95a1..3fcfe6edd35 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -23,7 +23,17 @@ class MatchArgRanksPass(ExportPass): """ For ops in 'targeted_ops', make sure that the inputs share the same rank. - New dimensions are inserted at from the beginning of the + New dimensions are inserted from the beginning of the inputs that have a + lower rank to match the input with the highest rank. 
+ + Example: + input0 = shape(4, 3, 2) + input1 = shape(2) + input2 = shape(3, 1) + Becomes: + input0 = shape(4, 3, 2) + input1 = shape(1, 1, 2) + input2 = shape(1, 3, 1) """ def __init__(self, exported_program): @@ -54,34 +64,6 @@ def _match_op_rank(self, graph_module, node, arg, max_rank): ) node.replace_input_with(arg, view) - def _match_buffer_rank(self, arg, max_rank): - """ - Change arg's fake tensor meta to match max_rank if: - - arg is found in inputs_to_buffers or inputs_to_parameters. - """ - fake_tensor = get_first_fake_tensor(arg) - shape = fake_tensor.shape - rank = len(shape) - new_shape = list([1] * (max_rank - rank) + list(shape)) - - buffer_name = None - if arg.name in self.exported_program.graph_signature.inputs_to_buffers: - buffer_name = self.exported_program.graph_signature.inputs_to_buffers[ - arg.name - ] - elif arg.name in self.exported_program.graph_signature.inputs_to_parameters: - buffer_name = self.exported_program.graph_signature.inputs_to_parameters[ - arg.name - ] - if buffer_name: - new_tensor = self.exported_program.state_dict[buffer_name].reshape( - new_shape - ) - self.exported_program.state_dict[buffer_name] = new_tensor - arg.meta["val"] = fake_tensor.fake_mode.from_tensor( - new_tensor, static_shapes=True - ) - def call(self, graph_module: GraphModule) -> PassResult: for node in graph_module.graph.nodes: node = cast(Node, node) @@ -105,12 +87,7 @@ def call(self, graph_module: GraphModule) -> PassResult: if rank == max_rank: continue - # If the argument is call_function, match shape by inserting view node. - if arg.op == "call_function": - self._match_op_rank(graph_module, node, arg, max_rank) - else: - # If the argument is a buffer or parameter, adjust shape by changing the fake tensor meta. - self._match_buffer_rank(arg, max_rank) + self._match_op_rank(graph_module, node, arg, max_rank) graph_module.recompile() graph_module = super().call(graph_module).graph_module diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index ef7a4b01cda..2d448afead5 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -12,7 +12,13 @@ from typing import List import numpy as np -from ethosu.vela import vela # type: ignore + +try: + from ethosu.vela import vela # type: ignore + + has_vela = True +except ImportError: + has_vela = False # Pack either input or output tensor block, compose the related arrays into @@ -39,7 +45,17 @@ def vela_bin_pack_io(prefix, data, shape_order=None): # Output via Vela to binary stream for ArmBackendEthosU # WARNING: Do not change this without changing VelaBinStream.cpp as that # function consumes this format and the two need to align. -def vela_compile(tosa_flatbuffer: bytes, args: List[str], shape_order=None): +def vela_compile( + tosa_flatbuffer: bytes, args: List[str], shape_order=None, verbose: bool = False +): + """ + Compile a TOSA graph to a binary stream for ArmBackendEthosU using Vela. + """ + if not has_vela: + raise RuntimeError( + "ethos-u-vela pip package couldn't be imported. Make sure it's installed!" 
+ ) + with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" tosa_path = os.path.join(tmpdir, tosaname) @@ -50,6 +66,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], shape_order=None): output_dir = os.path.join(tmpdir, "output") args.append(f"--output-dir={output_dir}") args.append(tosa_path) + if verbose: + args.append("--verbose-all") vela.main(" ".join(args).split(" ")) if any("ethos-u85" in arg for arg in args) or any( diff --git a/backends/arm/ethosu_backend.py b/backends/arm/ethosu_backend.py index 768389548e9..9b14a7a72b8 100644 --- a/backends/arm/ethosu_backend.py +++ b/backends/arm/ethosu_backend.py @@ -58,7 +58,12 @@ def _compile_tosa_flatbuffer( ) # Pass on the TOSA flatbuffer to the vela compiler. - binary = vela_compile(tosa_flatbuffer, compile_flags, input_order) + binary = vela_compile( + tosa_flatbuffer, + compile_flags, + input_order, + verbose=logger.getEffectiveLevel() == logging.INFO, + ) return binary @staticmethod diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS index eb8c78bcf29..c0c5af7487b 100644 --- a/backends/arm/operator_support/TARGETS +++ b/backends/arm/operator_support/TARGETS @@ -3,10 +3,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") python_library( name = "operator_support", srcs = glob(["*.py"]), - typing = True, deps = [ + "//executorch/backends/arm/_passes:passes", + "//executorch/backends/arm:tosa_specification", "//executorch/backends/xnnpack/_passes:xnnpack_passes", "//executorch/exir:lib", - "//executorch/backends/arm:tosa_specification" ], ) diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 2ac23b0e91b..c1189b2ae59 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -6,11 +6,12 @@ # pyre-unsafe from . import ( # noqa - bitwise_support, convolution_support, + minmax_support, pool_2d_support, reduce_sum_support, right_shift_support, + slice_copy_support, to_copy_support, tosa_supported_operators, ) diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py index 0d0a32200e8..b07ae82f98f 100644 --- a/backends/arm/operator_support/convolution_support.py +++ b/backends/arm/operator_support/convolution_support.py @@ -34,6 +34,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): for pad in output_padding: if pad != 0: + self.reporter.report_reject( + node, "Convolutions with non-zero output padding not implemented." + ) return False # Hardware specific constraints @@ -56,19 +59,33 @@ def _is_node_supported_u55(self, node: fx.Node): # Depthwise convolution for dim in shape_in[1:]: if not 1 <= dim <= 65536: + self.reporter.report_reject( + node, + f"Depthwise convolution must have CWH <= 65536, got {dim}", + ) return False else: # Convolution if not 1 <= C_in <= 65536: + self.reporter.report_reject( + node, f"Convolution must have C <= 65536, got {C_in}" + ) return False kernel_w = kernel[2] kernel_h = kernel[3] if len(kernel) > 3 else 1 # Kernel condition misses constraint on sum of absolute weights if not 1 <= kernel_h <= 64 or not 1 <= kernel_w * kernel_h <= 4096: + self.reporter.report_reject( + node, + f"Convolution needs to have kernel_y<=64, kernel_x*kernel_y<=4096, got kernel ({kernel_w}, {kernel_h})", + ) return False if not self._stride_condition(node): + self.reporter.report_reject( + node, "Failed condition on stride, pad and dilation combination."
+ ) return True diff --git a/backends/arm/operator_support/minmax_support.py b/backends/arm/operator_support/minmax_support.py new file mode 100644 index 00000000000..bdff368a5ce --- /dev/null +++ b/backends/arm/operator_support/minmax_support.py @@ -0,0 +1,37 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch.fx as fx +from executorch.backends.arm.operator_support.tosa_supported_operators import ( + register_tosa_support_check, + SupportedTOSAOperatorCheck, +) +from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.exir.dialects._ops import ops as exir_ops + + +@register_tosa_support_check +class MinMaxSupported(SupportedTOSAOperatorCheck): + targets = [ + exir_ops.edge.aten.max.dim, + exir_ops.edge.aten.min.dim, + ] + + # TODO : "MLETORCH-718 : Quantization of indices in arm_quantizer" + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80+MI"), + ] + + def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + if node.target in [exir_ops.edge.aten.max.dim, exir_ops.edge.aten.min.dim]: + no_argmax = len(node.users) == 1 + no_argmax_users = (len(node.users) == 2) and ( + len(list(node.users)[1].users) == 0 + ) + + if not (no_argmax or no_argmax_users): + return False + + return True diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index 7aa35a721b6..8291ede8ad9 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -26,8 +26,8 @@ def stride_check(strides: tuple[int, int]) -> bool: def dim_check(shape=torch.Size) -> bool: - check = shape[0] == 1 - for dim in shape: + check = True + for dim in shape[1:]: check &= 1 <= dim <= 65536 return check @@ -54,12 +54,35 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): if len(node.args) > 3: # Padding case if not all(1 <= k <= 8 for k in kernel): + self.reporter.report_reject( + node, f"Avgpool2d with padding needs kernel dims <= 8, got {kernel}" + ) return False else: if not kernel_check(kernel): + self.reporter.report_reject( + node, + f"Avgpool2d needs kernel_y < 256, kernel_x*kernel_y<=65536, got {kernel}", + ) return False - return dim_check(shape) and stride_check(stride) + if not dim_check(shape): + self.reporter.report_reject( + node, + f"Avgpool2d needs N == 1, rest dims <= 65536, got shape {list(shape)}", + ) + return False + if not stride_check(stride): + self.reporter.report_reject( + node, f"Avgpool2d needs stride <= 3, got {stride}" + ) + return False + if not shape[0] == 1: + self.reporter.report_reject( + node, f"Avgpool2d needs N==1, got N=={shape[0]}" + ) + return False + return True @register_tosa_support_check @@ -82,4 +105,21 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): kernel = cast(tuple[int, int], node.args[1]) stride = cast(tuple[int, int], node.args[2]) - return kernel_check(kernel) and dim_check(shape) and stride_check(stride) + if not kernel_check(kernel): + self.reporter.report_reject( + node, + f"Maxpool2d needs kernel_y < 256, kernel_x*kernel_y<=65536, got {kernel}", + ) + return False + if not dim_check(shape): + self.reporter.report_reject( + node, + f"Maxpool2d needs N == 1, rest dims <= 65536, got shape {list(shape)}", + ) + return False + if not stride_check(stride): +
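# Hedged stand-alone sketch of the user-count predicate in MinMaxSupported
# above: aten.max.dim returns (values, indices), and the node is only kept when
# the indices output is absent or itself unused. FakeNode is an illustrative
# stand-in for torch.fx.Node, not a real class from the backend.
from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeNode:
    users: List["FakeNode"] = field(default_factory=list)

def argmax_result_unused(node: FakeNode) -> bool:
    no_argmax = len(node.users) == 1  # only the values getitem is consumed
    no_argmax_users = len(node.users) == 2 and len(node.users[1].users) == 0
    return no_argmax or no_argmax_users

values_getitem = FakeNode()
indices_getitem = FakeNode()  # present but itself unused
print(argmax_result_unused(FakeNode(users=[values_getitem])))                      # True
print(argmax_result_unused(FakeNode(users=[values_getitem, indices_getitem])))     # True
print(argmax_result_unused(FakeNode(users=[values_getitem,
                                           FakeNode(users=[FakeNode()])])))        # False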
self.reporter.report_reject( + node, f"Maxpool2d needs stride <= 3, got {stride}" + ) + return False + return True diff --git a/backends/arm/operator_support/reduce_sum_support.py b/backends/arm/operator_support/reduce_sum_support.py index 8345d69caaa..37a71d7264c 100644 --- a/backends/arm/operator_support/reduce_sum_support.py +++ b/backends/arm/operator_support/reduce_sum_support.py @@ -34,6 +34,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): for dim in dim_list: if not 1 <= input_shape[dim] <= 65536: + self.reporter.report_reject( + node, f"sum needs dims <= 65536, got shape {input_shape}" + ) return False # We can't be certain of which dim is the last in memory yet, @@ -45,7 +48,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): for length in input_shape[dim + 1 :]: post_R_product *= length if not 1 <= pre_R_product <= 65536: + self.reporter.report_reject(node, "Failed dim check: product of dims before the reduced dim must be <= 65536") return False if not 1 <= post_R_product <= 65536: + self.reporter.report_reject(node, "Failed dim check: product of dims after the reduced dim must be <= 65536") return False return True diff --git a/backends/arm/operator_support/bitwise_support.py b/backends/arm/operator_support/slice_copy_support.py similarity index 52% rename from backends/arm/operator_support/bitwise_support.py rename to backends/arm/operator_support/slice_copy_support.py index e0604622064..1f5ace91cde 100644 --- a/backends/arm/operator_support/bitwise_support.py +++ b/backends/arm/operator_support/slice_copy_support.py @@ -3,31 +3,37 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +import logging + import torch.fx as fx from executorch.backends.arm.operator_support.tosa_supported_operators import ( register_tosa_support_check, SupportedTOSAOperatorCheck, ) -from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification +from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_utils import getNodeArgs from executorch.exir.dialects._ops import ops as exir_ops +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + @register_tosa_support_check -class BitwiseSupported(SupportedTOSAOperatorCheck): - targets = [ - exir_ops.edge.aten.bitwise_and.Tensor, - exir_ops.edge.aten.bitwise_or.Tensor, - exir_ops.edge.aten.bitwise_xor.Tensor, - ] +class SliceCopySupported(SupportedTOSAOperatorCheck): - targets = [exir_ops.edge.aten.slice_copy.Tensor] hmm + targets = [exir_ops.edge.aten.slice_copy.Tensor] tosa_specs = [ TosaSpecification.create_from_string("TOSA-0.80+BI"), TosaSpecification.create_from_string("TOSA-0.80+MI"), ] - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): - # U55 case, Vela 4.2.0 (25.02 release) - if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset: + def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: # type: ignore[override, misc] + if tosa_spec not in self.tosa_specs: return False + inputs = getNodeArgs(node) + if len(inputs) == 5 and (step := inputs[4].number) != 1: + logger.warning(f"{node.target} with step size of {step} not supported.") + return False return True diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py index c81c8e58a29..7926b3dc053 100644 --- a/backends/arm/operator_support/to_copy_support.py +++ b/backends/arm/operator_support/to_copy_support.py @@ -75,9 +75,6 @@ def is_node_tosa_supported( ) -> bool: assert node.target in self.targets - if
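# Minimal sketch of the step-size guard in SliceCopySupported above: the
# slice_copy args are (input, dim, start, end[, step]) and only step == 1 is
# lowered. The plain tuple here stands in for getNodeArgs(node); everything
# else mirrors the check in the diff.
import logging
from typing import Any, Sequence

logger = logging.getLogger(__name__)

def slice_step_supported(args: Sequence[Any]) -> bool:
    if len(args) == 5 and (step := args[4]) != 1:
        logger.warning(f"slice_copy with step size of {step} not supported.")
        return False
    return True

print(slice_step_supported(("x", 0, 0, 4)))     # True: no explicit step
print(slice_step_supported(("x", 0, 0, 4, 2)))  # False: step == 2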
tosa_spec not in self.tosa_specs: - return False - assert tosa_spec.support_integer() supported_dtypes = ( self.ALL_SUPPORTED_TYPES @@ -97,9 +94,9 @@ def is_node_tosa_supported( assert isinstance(input_val, torch._subclasses.FakeTensor) input_dtype = input_val.dtype if input_dtype not in supported_dtypes: - logger.info( - f"Input dtype {input_val.dtype} is not supported in " - f"{node.target.name()}." # type: ignore[union-attr] # pyre-ignore[16] + self.reporter.report_reject( + node, + f"Input dtype {input_val.dtype} is not supported in {node.target}.", ) return False @@ -107,20 +104,22 @@ def is_node_tosa_supported( output_val = node.meta["val"] assert isinstance(output_val, torch._subclasses.FakeTensor) if output_val.dtype not in supported_dtypes[input_dtype]: - logger.info( + self.reporter.report_reject( + node, f"Output dtype {output_val.dtype} is not supported in " - f"{node.target.name()} for input dtype {input_dtype}. " # type: ignore[union-attr] # pyre-ignore[16] + f"{node.target} for input dtype {input_dtype}. " f"Supported output types: " - f"{''.join(str(t) for t in supported_dtypes[input_dtype])}" + f"{''.join(str(t) for t in supported_dtypes[input_dtype])}", ) return False # Check memory format (to_copy) if "memory_format" in node.kwargs: if node.kwargs["memory_format"] in (torch.preserve_format,): - logger.info( + self.reporter.report_reject( + node, f"Argument 'memory_format' is not supported for " - f"{node.target.name()} right now." # type: ignore[union-attr] # pyre-ignore[16] + f"{node.target} right now.", ) return False @@ -129,9 +128,10 @@ def is_node_tosa_supported( dim_order = node.kwargs["dim_order"] # pyre-ignore[6] if dim_order != list(range(len(dim_order))): # type: ignore[arg-type] - logger.info( + self.reporter.report_reject( + node, f"Argument {dim_order=} is not supported for " - f"{node.target.name()} right now." 
# type: ignore[union-attr] # pyre-ignore[16] + f"{node.target} right now.", ) return False diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 1fa626efce1..982a5746247 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -5,13 +5,25 @@ # pyre-unsafe +import itertools import operator +import typing from typing import final, Optional, Sequence, Type +import torch import torch.fx as fx -from executorch.backends.arm.tosa_specification import TosaSpecification + +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( + FuseQuantizedActivationPass, +) +from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification +from executorch.exir import ExportedProgram +from executorch.exir.backend.utils import WhyNoPartitionReporter from executorch.exir.dialects._ops import ops as exir_ops +from torch.export.graph_signature import InputKind from torch.fx.passes.operator_support import any_chain, chain, OperatorSupportBase +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class SupportedTOSAOperatorCheck(OperatorSupportBase): @@ -19,15 +31,18 @@ class SupportedTOSAOperatorCheck(OperatorSupportBase): Supported OP for TOSA lowering """ - def __init__(self, tosa_spec: TosaSpecification): + def __init__(self, tosa_spec: TosaSpecification, reporter: WhyNoPartitionReporter): self.tosa_spec = tosa_spec + self.reporter = reporter # Should be populated by subclass implementation tosa_specs: list[TosaSpecification] = [] targets: list[str] = [] @final - def is_node_supported(self, submodules, node: fx.Node) -> bool: + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: if node.target not in self.targets: return False return self.is_node_tosa_supported(node, self.tosa_spec) @@ -62,7 +77,6 @@ def register_tosa_support_check(checker: Type[SupportedTOSAOperatorCheck]): def get_registered_tosa_support_checks( tosa_spec: TosaSpecification, ) -> list[Type[SupportedTOSAOperatorCheck]]: - if tosa_spec not in _tosa_spec_support: raise RuntimeError( f"TOSA specification not valid: {tosa_spec} not in {list(_tosa_spec_support.keys())}" @@ -73,27 +87,67 @@ def get_registered_tosa_support_checks( def tosa_support_factory( tosa_spec: TosaSpecification, + exported_program: ExportedProgram, + reporter: WhyNoPartitionReporter, additional_checks: Optional[Sequence[OperatorSupportBase]] = None, ) -> OperatorSupportBase: + """Generates an OperatorSupport class depending on the given `tosa_spec`. + Additional checks can be supplied to avoid partitioning additional nodes. 
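A hedged, self-contained sketch of how these positive and negative checks compose (tiny callables stand in for the OperatorSupportBase instances from torch.fx.passes.operator_support): a node is partitioned only if at least one positive check accepts it and every negative check passes.

from typing import Callable

Check = Callable[[str], bool]

def any_chain(*checks: Check) -> Check:
    # supported if ANY sub-check accepts the node
    return lambda node: any(c(node) for c in checks)

def chain(*checks: Check) -> Check:
    # supported only if ALL sub-checks accept the node
    return lambda node: all(c(node) for c in checks)

supported = chain(
    any_chain(lambda n: n == "add", lambda n: n == "mul"),  # positive checks
    lambda n: n != "mul",                                   # a negative check
)
print(supported("add"))  # True
print(supported("mul"))  # False: accepted positively but rejected negatively
print(supported("sub"))  # False: no positive check accepts it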
+ """ + # Postive checks: Add nodes to partitioning + positive_checks: list[OperatorSupportBase] = [ + BaseTOSASupportList(), + *[ + check(tosa_spec, reporter) + for check in get_registered_tosa_support_checks(tosa_spec) + ], + ] + + # Negative checks: Remove nodes from partitioning + negative_checks: list[OperatorSupportBase] = [ + CheckInt64Inputs(exported_program, reporter), + *[ + reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") + for check in (additional_checks if additional_checks else []) + ], + ] + + if not tosa_spec.support_float(): + negative_checks.append(NeedsDecompositionCheck(reporter)) + negative_checks.append(CheckProperQuantization(reporter)) + if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset: + negative_checks.append(EthosU55NotSupported(reporter)) + return chain( - any_chain( - BaseTOSASupportList(), - *( - check(tosa_spec) - for check in get_registered_tosa_support_checks(tosa_spec) - ), + reporter.wrap_check( + any_chain(*positive_checks), + "Not included in BaseTOSASupportList or a registered tosa_support_check", ), - *additional_checks if additional_checks else [], + *negative_checks, ) class BaseTOSASupportList(OperatorSupportBase): - def is_node_supported(self, submodules, node: fx.Node) -> bool: + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: supported = node.op == "call_function" and node.target in [ + exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.any.default, + exir_ops.edge.aten.any.dim, + exir_ops.edge.aten.any.dims, + exir_ops.edge.aten.logical_and.default, + exir_ops.edge.aten.logical_or.default, + exir_ops.edge.aten.logical_xor.default, + exir_ops.edge.aten.logical_not.default, + exir_ops.edge.aten.bitwise_and.Tensor, + exir_ops.edge.aten.bitwise_or.Tensor, + exir_ops.edge.aten.bitwise_xor.Tensor, exir_ops.edge.aten.expand_copy.default, exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, @@ -106,6 +160,7 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.log.default, exir_ops.edge.aten.linear.default, exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.floor.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, exir_ops.edge.aten.ge.Tensor, @@ -131,7 +186,6 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.select_copy.int, exir_ops.edge.aten._log_softmax.default, - exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_nearest2d.vec, @@ -145,6 +199,274 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, exir_ops.edge.aten.constant_pad_nd.default, + exir_ops.edge.aten.amax.default, + exir_ops.edge.aten.amin.default, ] return supported + + +class EthosU55NotSupported(OperatorSupportBase): + """ + Certain operators are not supported on U55. These are listed in `unsupported_ops`. + The comment mentions the unsupported TOSA operator that the aten operator maps to where it is not obvious. + For unimplemented operators, this is the anticipated mapping, and it might be incorrect. 
+ """ + + unsupported_ops = [ + exir_ops.edge.aten.any.default, # REDUCE_ANY + exir_ops.edge.aten.any.dim, # REDUCE_ANY + exir_ops.edge.aten.any.dims, # REDUCE_ANY + exir_ops.edge.aten.bitwise_and.Tensor, + exir_ops.edge.aten.bitwise_or.Tensor, + exir_ops.edge.aten.bitwise_xor.Tensor, + exir_ops.edge.aten.bitwise_not, + exir_ops.edge.aten.logical_and.default, + exir_ops.edge.aten.logical_or.default, + exir_ops.edge.aten.logical_xor.default, + exir_ops.edge.aten.logical_not.default, + exir_ops.edge.aten.amax.default, # REDUCE_MAX + exir_ops.edge.aten.amin.default, # REDUCE_MIN + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.flip.default, # REVERSE + exir_ops.edge.aten.grid_sampler_2d, # GATHER + exir_ops.edge.aten.scatter.src, + exir_ops.edge.aten.scatter.value, + exir_ops.edge.aten.select_scatter.default, + exir_ops.edge.aten.scatter_reduce.two, + exir_ops.edge.aten.scatter_add.default, + exir_ops.edge.aten.upsample_nearest2d.vec, # RESIZE + exir_ops.edge.aten.upsample_bilinear2d.vec, # RESIZE + exir_ops.edge.aten.reflection_pad1d.default, # REVERSE + exir_ops.edge.aten.reflection_pad2d.default, # REVERSE + exir_ops.edge.aten.reflection_pad3d.default, # REVERSE + ] + + def __init__(self, reporter: WhyNoPartitionReporter): + self.reporter = reporter + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + if node.target in self.unsupported_ops: + self.reporter.report_reject(node, "Op is not supported on U55.") + return False + + return True + + +class NeedsDecompositionCheck(OperatorSupportBase): + """ + Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding + the operator, and to get optimal quantization parameters for each operator. This check will reject operators + that need to be decomposed. + """ + + def __init__(self, reporter: WhyNoPartitionReporter): + self.reporter = reporter + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + if node.op != "call_function": + return True + if node.target == exir_ops.edge.aten.mean.dim: + dim = node.args[1] + needs_decomp = dim != [-1, -2] + else: + needs_decomp = node.target in [ + exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten._native_batch_norm_legit_no_training.default, + exir_ops.edge.aten.native_layer_norm.default, + exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten._softmax.default, + exir_ops.edge.aten._log_softmax.default, + exir_ops.edge.aten.var.correction, + exir_ops.edge.aten.var.dim, + exir_ops.edge.aten.add.Scalar, + exir_ops.edge.aten.sub.Scalar, + exir_ops.edge.aten.mul.Scalar, + exir_ops.edge.aten.div.Scalar, + ] + if needs_decomp: + self.reporter.report_reject(node, "Needs to be decomposed.") + return False + else: + return True + + +class CheckProperQuantization(OperatorSupportBase): + """ + For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize + and dequantize nodes surrounds the node. This is neccessary for table operators and operators that need to rescale + activations. 
+ """ + + dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default + q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default + + def __init__(self, reporter: WhyNoPartitionReporter): + self.reporter = reporter + + def _is_matmul_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ): + """ + Find the matmul source partition containing this node and check that all its inputs and outputs are quantized. + """ + for graph_module in submodules.values(): + graph_module = typing.cast(fx.GraphModule, graph_module) + matmul_partitions = get_source_partitions( + graph_module.graph, + [ + torch.matmul, + ], + None, + ) + matmul_partitions = list( + itertools.chain.from_iterable(matmul_partitions.values()) + ) + matched_partition = None + for partition in matmul_partitions: + if node in partition.nodes: + matched_partition = partition + if matched_partition is not None: + input_quantized = all( + input_node.target == self.dq_op + for input_node in matched_partition.input_nodes + ) + if not input_quantized: + self.reporter.report_reject( + node, "One or more matmul inputs were not quantized." + ) + return False + output_quantized = all( + output_node_user.target == self.q_op + for output_node_user in matched_partition.output_nodes[0].users + ) + if not output_quantized: + self.reporter.report_reject( + node, "One or more matmul outputs were not quantized." + ) + return False + else: + self.reporter.report_reject( + node, "Node did not match any matmul source partition." + ) + return False + + return True + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + output_quantized = False + input_quantized = False + if node.target not in ( + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.avg_pool2d.default, + exir_ops.edge.aten.bmm.default, + exir_ops.edge.aten.convolution.default, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.full_like.default, + exir_ops.edge.aten.hardtanh.default, + exir_ops.edge.aten.linear.default, + exir_ops.edge.aten.log.default, + exir_ops.edge.aten.max_pool2d_with_indices.default, + exir_ops.edge.aten.mm.default, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.reciprocal.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.rsqrt.default, + exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.tanh.default, + exir_ops.edge.aten.upsample_nearest2d.vec, + ): + return True + elif node.target in ( + exir_ops.edge.aten.bmm.default, + exir_ops.edge.aten.mm.default, + ): + source_fn_stack: tuple[typing.Any] = node.meta.get("source_fn_stack", []) + if len(source_fn_stack) > 0: + if source_fn_stack[-1][1] in (torch.matmul,): + return self._is_matmul_node_supported(submodules, node) + + elif node.target in (exir_ops.edge.aten.max_pool2d_with_indices.default,): + users = node.users + output_quantized = all( + user.target == operator.getitem + and all(user_user.target == self.q_op for user_user in user.users) + for user in users + ) + elif FuseQuantizedActivationPass._is_fuseable_input(node): + users = node.users + output_quantized = all( + FuseQuantizedActivationPass._is_fuseable_quantized_activation(user) + for user in users + ) + elif FuseQuantizedActivationPass._is_fuseable_quantized_activation(node): + input_node = node.all_input_nodes[0] + input_quantized = FuseQuantizedActivationPass._is_fuseable_input(input_node) + + input_quantized = input_quantized or all( + 
(input_node.target == self.dq_op) + or (not get_first_fake_tensor(input_node).dtype.is_floating_point) + for input_node in node.all_input_nodes + ) + + if not input_quantized: + self.reporter.report_reject(node, "One or more inputs were not quantized.") + return False + + all_q_users = all( + (output_node.target == self.q_op) for output_node in node.users + ) + is_floating_point = get_first_fake_tensor(node).dtype.is_floating_point + output_quantized = output_quantized or all_q_users or not is_floating_point + + if not output_quantized: + self.reporter.report_reject(node, "One or more outputs were not quantized.") + return False + return True + + +class CheckInt64Inputs(OperatorSupportBase): + + def __init__( + self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter + ): + self.input_names = [ + spec.arg.name + for spec in exported_program.graph_signature.input_specs + if spec.kind == InputKind.USER_INPUT + ] + self.reporter = reporter + super().__init__() + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + + for input_node in node.all_input_nodes: + # We can cast constant placeholders AOT, not call_functions. + if ( + input_node.name in self.input_names + or not input_node.op == "placeholder" + ): + tensor = get_first_fake_tensor(input_node) + if tensor.dtype == torch.int64: + self.reporter.report_reject( + node, + f"Had int64 input {input_node.name} that couldn't be handled.", + ) + return False + return True diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS index d12cc7e4dfd..b37823b60c2 100644 --- a/backends/arm/operators/TARGETS +++ b/backends/arm/operators/TARGETS @@ -4,7 +4,6 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") python_library( name = "node_visitor", srcs = ["node_visitor.py"], - typing = True, deps = [ "//executorch/backends/arm:tosa_mapping", "//executorch/backends/arm:tosa_specification", @@ -13,14 +12,14 @@ python_library( python_library( name = "ops", - srcs = glob(["op_*.py"]), - typing = True, + srcs = glob(["op_*.py", "ops_*.py"]), deps = [ "fbsource//third-party/serialization_lib/python/tosa:tosa", ":node_visitor", "//executorch/backends/arm:tosa_mapping", "//executorch/backends/arm:tosa_quant_utils", "//executorch/backends/arm:tosa_utils", + "//executorch/backends/arm/_passes:passes", "//executorch/exir:lib", ], ) @@ -28,7 +27,6 @@ python_library( python_library( name = "lib", srcs = ["__init__.py"], - typing = True, deps = [ ":node_visitor", ":ops", diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 735debe367f..81743f37b15 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -7,7 +7,11 @@ from . 
import ( # noqa node_visitor, + op_abs, op_add, + op_amax, + op_amin, + op_any, op_avg_pool2d, op_bmm, op_cat, @@ -20,17 +24,15 @@ op_ge, op_get_item, op_gt, - op_hardtanh, op_le, op_log, op_lt, - op_max, op_max_pool2d, - op_min, + op_maximum, + op_minimum, op_mul, op_permute, op_reciprocal, - op_relu, op_repeat, op_rescale, op_rshift, @@ -47,4 +49,5 @@ op_upsample_nearest2d, op_view, ops_binary, + ops_unary, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index afb5f93baa7..f2c7ce9f9ce 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -30,7 +30,7 @@ class NodeVisitor: ] def __init__(self, exported_program: ExportedProgram, tosa_spec: TosaSpecification): - self._exported_program = exported_program or None + self._exported_program = exported_program self.tosa_spec = tosa_spec def define_node( diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py new file mode 100644 index 00000000000..886a96fd520 --- /dev/null +++ b/backends/arm/operators/op_abs.py @@ -0,0 +1,133 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import executorch.backends.arm.tosa_quant_utils as tqutils +import executorch.backends.arm.tosa_utils as tutils + +import serializer.tosa_serializer as ts # type: ignore +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_specification import TosaSpecification + +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class AbsVisitor_080_BI(NodeVisitor): + target = "aten.abs.default" + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80+BI"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Specification (0.80) states that input and output types + # should all be the same + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and outputs need same dtype. " + f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + # Handle int8 (quantized) and int32 + if not (inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]): + raise ValueError( + "All inputs need to be INT8 or INT32. "
f"Got {inputs[0].dtype=}" + ) + + if inputs[0].dtype == ts.DType.INT8: + rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( + tosa_graph, inputs, node + ) + else: + # input[0].dtype == ts.DType.INT32 + # Non quantized input, natively support by TOSA.abs + rescaled_inputs = inputs + + if output.dtype == ts.DType.INT8: + broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) + abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) + else: + # output.dtype == ts.DType.INT32 + abs_output = output + + # Do the INT32 Abs + tosa_graph.addOperator( + TosaOp.Op().ABS, + [ + rescaled_inputs[0].name, + ], + [abs_output.name], + None, + ) + + if output.dtype == ts.DType.INT8: + # Scale output back to 8 bit + # pyre-ignore + tqutils.insert_rescale_op_to_int8(tosa_graph, abs_output, scale_back, node) # type: ignore[possibly-undefined] + + +@register_node_visitor +class AbsVisitor_080_MI(AbsVisitor_080_BI): + # inheriting 'target' from BI class + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80+MI"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + # Specification (0.80) states that input and output types + # should all be the same + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and output need same dtype." + f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + + if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: + # Call the inherited define_node for handling integers + super().define_node(node, tosa_graph, inputs, output) + else: + # FP32 Abs lowering + + if not (inputs[0].dtype == ts.DType.FP32): + raise ValueError( + "All inputs need to be FP32." f"Got {inputs[0].dtype=}" + ) + + if not (output.dtype == ts.DType.FP32): + raise ValueError("All outputs need to be FP32." 
f"Got {output.dtype=}") + + # MI lowering + tosa_graph.addOperator( + TosaOp.Op().ABS, + [inputs[0].name], + [output.name], + None, + ) diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index ccdeb2c1bcf..cb14dcb43d8 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -45,6 +45,12 @@ def define_node( # Handle int8 (quantized) and int32 assert inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32] + dim_order = ( + inputs[0].dim_order + if len(inputs[0].shape) > len(inputs[1].shape) + else inputs[1].dim_order + ) + if inputs[0].dtype == ts.DType.INT8: rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( tosa_graph, inputs, node @@ -61,13 +67,14 @@ def define_node( # output.dtype == ts.DType.INT32 add_output = output + input1, input2 = tutils.reshape_for_broadcast( + tosa_graph, rescaled_inputs, dim_order + ) + # Do the INT32 Add tosa_graph.addOperator( TosaOp.Op().ADD, - [ - rescaled_inputs[0].name, - rescaled_inputs[1].name, - ], + [input1.name, input2.name], [add_output.name], None, ) @@ -108,10 +115,12 @@ def define_node( assert inputs[0].dtype == ts.DType.FP32 assert output.dtype == ts.DType.FP32 + input1, input2 = tutils.reshape_for_broadcast(tosa_graph, inputs) + # MI lowering tosa_graph.addOperator( TosaOp.Op().ADD, - [inputs[0].name, inputs[1].name], + [input1.name, input2.name], [output.name], None, ) diff --git a/backends/arm/operators/op_amax.py b/backends/arm/operators/op_amax.py new file mode 100644 index 00000000000..7347648c454 --- /dev/null +++ b/backends/arm/operators/op_amax.py @@ -0,0 +1,52 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class MaxVisitor(NodeVisitor): + target = "aten.amax.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + input = inputs[0] + dim = inputs[1].number + + if dim < 0: + tensor = get_first_fake_tensor(node) + rank = len(tensor.size()) + dim = rank + dim + + keep_dims = inputs[2].number + if not keep_dims: + raise RuntimeError( + "TOSA only supports keepdims == True; Did you run the convert_minmax pass?" + ) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(input.dim_order.index(dim)) + + tosa_graph.addOperator( + TosaOp.Op().REDUCE_MAX, [input.name], [output.name], attr + ) diff --git a/backends/arm/operators/op_amin.py b/backends/arm/operators/op_amin.py new file mode 100644 index 00000000000..37625cfcc52 --- /dev/null +++ b/backends/arm/operators/op_amin.py @@ -0,0 +1,52 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class MinVisitor(NodeVisitor): + target = "aten.amin.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + input = inputs[0] + dim = inputs[1].number + + if dim < 0: + tensor = get_first_fake_tensor(node) + rank = len(tensor.size()) + dim = rank + dim + + keep_dims = inputs[2].number + if not keep_dims: + raise RuntimeError( + "TOSA only supports keepdims == True; Did you run the convert_minmax pass?" + ) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(input.dim_order.index(dim)) + + tosa_graph.addOperator( + TosaOp.Op().REDUCE_MIN, [input.name], [output.name], attr + ) diff --git a/backends/arm/operators/op_any.py b/backends/arm/operators/op_any.py new file mode 100644 index 00000000000..ffb2e8a3c5d --- /dev/null +++ b/backends/arm/operators/op_any.py @@ -0,0 +1,53 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import cast, List + +import serializer.tosa_serializer as ts # type: ignore +from executorch.backends.arm.operators.node_visitor import ( # type: ignore + NodeVisitor, + register_node_visitor, +) + +from executorch.backends.arm.tosa_mapping import TosaArg # type: ignore +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class AnyVisitor(NodeVisitor): + target = "aten.any.dim" + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and outputs need same dtype. " + f"Got {ts.DTypeNames[inputs[0].dtype]=}, {ts.DTypeNames[output.dtype]=}." + ) + if not (inputs[0].dtype == ts.DType.BOOL): + raise ValueError("All inputs need to be BOOL. "
f"Got {inputs[0].dtype=}") + + input_shape = list(inputs[0].shape) + dim = cast(int, inputs[1].number) % len( + input_shape + ) # process the negative index + keep_dim = cast(bool, inputs[2].number if len(inputs) > 2 else False) + if not keep_dim: + raise ValueError("This case should be handled by ConvertAnyDimDimsPass") + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(inputs[0].dim_order.index(dim)) + + tosa_graph.addOperator( + TosaOp.Op().REDUCE_ANY, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index e300b3ed016..3e38a7383ed 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -41,14 +41,21 @@ def _build_generic_avgpool2d( output: TosaArg, input_zp: int, output_zp: int, - accumulator_type, + accumulator_type: ts.DType, ) -> None: - input_tensor = inputs[0] + input_tensor = inputs[0] kernel_size_list = inputs[1].special stride_size_list = inputs[2].special + try: pad_size_list = inputs[3].special + pad_size_list = [ + pad_size_list[0], + pad_size_list[0], + pad_size_list[1], + pad_size_list[1], + ] except IndexError: pad_size_list = [0, 0, 0, 0] diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index d3261ebde0c..43006ecf4fd 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -80,7 +80,7 @@ def define_node( build_rescale( tosa_fb=tosa_graph, - scale=final_output_scale, + scale=[final_output_scale], # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. input_node=bmm_result, # type: ignore[possibly-undefined] output_name=output.name, diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index f97e408a02a..b0864d0abe4 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -22,8 +22,6 @@ from executorch.backends.arm.tosa_quant_utils import build_rescale_conv_output from executorch.backends.arm.tosa_utils import build_reshape, tosa_shape -from serializer.tosa_serializer import TosaOp - @register_node_visitor class Conv2dVisitor(NodeVisitor): @@ -36,8 +34,12 @@ def __init__(self, *args): # `(input + 2 * pad - dilation * (weight - 1) - 1) / stride` # must be an integer, but tosa currently strictly require this property. # This function adjusts the pad value to meet the requirement. 
- def adjust_pad_if_needed(self, input, weight, stride, pad, dilation): - mod_remainder = (input + 2 * pad - dilation * (weight - 1) - 1) % stride + def adjust_pad_if_needed( + self, input_size: int, input_weight: int, stride: int, pad: int, dilation: int + ) -> int: + mod_remainder = ( + input_size + 2 * pad - dilation * (input_weight - 1) - 1 + ) % stride # No need to adjust if mod_remainder == 0: @@ -143,11 +145,11 @@ def define_node( build_reshape( tosa_graph, weight.name, weight_post_shape, weight_reshaped.name ) - tosa_op = TosaOp.Op().DEPTHWISE_CONV2D + tosa_op = ts.TosaOp.Op().DEPTHWISE_CONV2D weight_name = weight_reshaped.name else: """Regular convolution case""" - tosa_op = TosaOp.Op().CONV2D + tosa_op = ts.TosaOp.Op().CONV2D weight_name = weight.name tosa_graph.addOperator( @@ -174,8 +176,8 @@ def define_node( conv2d_res, # type: ignore[possibly-undefined] output.name, output.dtype, - input_scale, - weight_scale, - output_qargs[0].scale, + [input_scale], + [weight_scale], + [output_qargs[0].scale], output_qargs[0].zp, ) diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py deleted file mode 100644 index fc0ee552a9f..00000000000 --- a/backends/arm/operators/op_hardtanh.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts # type: ignore -import torch - -# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' -from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg - -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class HardTanhVisitor(NodeVisitor): - target = "aten.hardtanh.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - attr = ts.TosaSerializerAttribute() - - if inputs[0].dtype == ts.DType.INT8: - # Get quant parameters - input_qparams = get_input_qparams(node) # pyre-ignore[16] - qargs = input_qparams[0] - # Convert to quantized representation - clamp_min_qs = qargs.quantize_value(inputs[1].number).item() - clamp_max_qs = qargs.quantize_value(inputs[2].number).item() - # Set fp values to 0.0 since they are not used - clamp_min_fp = 0.0 - clamp_max_fp = 0.0 - else: - clamp_min_fp = inputs[1].number - clamp_max_fp = inputs[2].number - # Set qs values to 0 since they are not used - clamp_min_qs = 0 - clamp_max_qs = 0 - - attr.ClampAttribute( - tosa_graph.builder, - clamp_min_qs, - clamp_max_qs, - clamp_min_fp, - clamp_max_fp, - ) - - tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_max_pool2d.py b/backends/arm/operators/op_max_pool2d.py index f32300f561d..5305f95880c 100644 --- a/backends/arm/operators/op_max_pool2d.py +++ b/backends/arm/operators/op_max_pool2d.py @@ -42,9 +42,15 @@ def define_node( stride = inputs[2].special try: - padding = [*inputs[3].special, *inputs[3].special] + pad_size_list = inputs[3].special + pad_size_list = [ + 
pad_size_list[0], + pad_size_list[0], + pad_size_list[1], + pad_size_list[1], + ] except IndexError: - padding = [0, 0, 0, 0] + pad_size_list = [0, 0, 0, 0] accumulator_type = output.dtype @@ -63,7 +69,7 @@ def define_node( attr.PoolAttribute( kernel=kernel_size, stride=stride, - pad=padding, + pad=pad_size_list, input_zp=input_zp, output_zp=output_zp, accum_dtype=accumulator_type, diff --git a/backends/arm/operators/op_max.py b/backends/arm/operators/op_maximum.py similarity index 100% rename from backends/arm/operators/op_max.py rename to backends/arm/operators/op_maximum.py diff --git a/backends/arm/operators/op_min.py b/backends/arm/operators/op_minimum.py similarity index 100% rename from backends/arm/operators/op_min.py rename to backends/arm/operators/op_minimum.py diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index ef886de11e8..4bd36b9dc2e 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -24,6 +24,7 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.backends.arm.tosa_utils import reshape_for_broadcast from serializer.tosa_serializer import TosaOp @@ -43,6 +44,12 @@ def define_node( output: TosaArg, ) -> None: assert inputs[0].dtype == inputs[1].dtype == output.dtype == ts.DType.INT8 + + dim_order = ( + inputs[0].dim_order + if len(inputs[0].shape) > len(inputs[1].shape) + else inputs[1].dim_order + ) input_A = inputs[0] input_B = inputs[1] input_qparams = get_input_qparams(node) # pyre-ignore[16] @@ -56,27 +63,33 @@ def define_node( tosa_graph, input_A, input_A_qargs.zp, - rescale_scale=1.0, + [1.0], ) input_B_rescaled = tqutils.build_rescale_to_int32( tosa_graph, input_B, input_B_qargs.zp, - rescale_scale=1.0, + [1.0], ) output_shape = tutils.tosa_shape(output.shape, output.dim_order) mul_output = tosa_graph.addIntermediate(output_shape, ts.DType.INT32) + input1, input2 = tutils.reshape_for_broadcast( + tosa_graph, + [ + input_A_rescaled, + input_B_rescaled, + ], + dim_order, + ) + # Do the INT32 Mul attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( TosaOp.Op().MUL, - [ - input_A_rescaled.name, - input_B_rescaled.name, - ], + [input1.name, input2.name], [mul_output.name], attr, ) @@ -101,8 +114,11 @@ def define_node( ) -> None: if inputs[0].dtype == ts.DType.INT8: return super().define_node(node, tosa_graph, inputs, output) + + input1, input2 = reshape_for_broadcast(tosa_graph, inputs) + attr = ts.TosaSerializerAttribute() attr.MulAttribute(shift=0) tosa_graph.addOperator( - TosaOp.Op().MUL, [inputs[0].name, inputs[1].name], [output.name], attr + TosaOp.Op().MUL, [input1.name, input2.name], [output.name], attr ) diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py deleted file mode 100644 index c37e4b3e75d..00000000000 --- a/backends/arm/operators/op_relu.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import serializer.tosa_serializer as ts # type: ignore -import torch.fx - -# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' 
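# Illustrative NumPy sketch of what reshape_for_broadcast achieves in the MUL
# (and ADD) lowerings above: the lower-rank operand is padded with leading
# singleton dims so both inputs share a rank before the elementwise TOSA op.
# The real helper also accounts for dim_order; that part is omitted here.
import numpy as np

def reshape_for_broadcast(a: np.ndarray, b: np.ndarray):
    rank = max(a.ndim, b.ndim)
    a = a.reshape((1,) * (rank - a.ndim) + a.shape)
    b = b.reshape((1,) * (rank - b.ndim) + b.shape)
    return a, b

x = np.arange(6, dtype=np.int32).reshape(2, 3)
y = np.array([10, 20, 30], dtype=np.int32)
x2, y2 = reshape_for_broadcast(x, y)
print(x2.shape, y2.shape)   # (2, 3) (1, 3)
print((x2 * y2).tolist())   # [[0, 20, 60], [30, 80, 150]]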
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_output_qparams, -) -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class ReluVisitor(NodeVisitor): - target = "aten.relu.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: list[TosaArg], - output: TosaArg, - ) -> None: - attr = ts.TosaSerializerAttribute() - - clamp_min_fp = 0.0 - clamp_max_fp = 0.0 - clamp_min_qs = 0 - clamp_max_qs = 0 - if inputs[0].dtype == ts.DType.INT8: - out_qargs = get_output_qparams(node) # pyre-ignore[16] - clamp_min_qs = out_qargs[0].quantize_value(0).item() - clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item() - else: - clamp_min_fp = 0 - clamp_max_fp = float("inf") - - attr.ClampAttribute( - tosa_graph.builder, - clamp_min_qs, - clamp_max_qs, - clamp_min_fp, - clamp_max_fp, - ) - - tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_rescale.py index a7fe1e8bbcb..098fbeccce1 100644 --- a/backends/arm/operators/op_rescale.py +++ b/backends/arm/operators/op_rescale.py @@ -38,7 +38,6 @@ def define_node( input_zp = cast(int, node.args[3]) output_zp = cast(int, node.args[4]) - # Skip int16 cases for now. if input_dtype != map_dtype(torch.int8) and input_zp != 0: raise ValueError( f"If input dtype is not int8, input_zp must be 0. Got input_dtype{ts.DTypeNames[input_dtype]}, {input_zp=}" ) @@ -48,17 +47,20 @@ def define_node( f"If output dtype is not int8, output_zp must be 0. Got {output_dtype=}, {output_zp=}" ) - scale_width = 32 if output_dtype == torch.int32 else 16 + # scale32 gives higher accuracy but at a higher HW cost. + # For now, always go for scale32. + scale_32 = True + scale_width = 32 if scale_32 else 16 multiplier, shift = tosa_quant_utils.compute_multiplier_and_shift( - scale, scale_width + [scale], scale_width ) attr_rescale = ts.TosaSerializerAttribute() attr_rescale.RescaleAttribute( input_zp=input_zp, output_zp=output_zp, - multiplier=[multiplier], - shift=[shift], - scale32=output_dtype == torch.int32, + multiplier=multiplier, + shift=shift, + scale32=scale_32, double_round=False, per_channel=False, input_unsigned=False, diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index fe4f850b01f..a3ce80c5b24 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -32,9 +32,12 @@ def define_node( output: TosaArg, ) -> None: + # See slice_copy_support.py + if not (len(inputs) == 4 or (len(inputs) == 5 and inputs[4].number == 1)): + raise ValueError("Unsupported combination of inputs") + # aten.slice_copy supports slicing in 1d at a time. - # The arguments are dimension of slicing, start index and end index. - assert len(inputs) == 4 + # The arguments are the actual input, dimension of slicing, start index, end index and an optional step (stride). input_node, dim, start, end = inputs # Translate and check parameters in Pytorch dim order.
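Both pooling visitors earlier in this patch now expand the PyTorch (pad_y, pad_x) pair into the four-element [top, bottom, left, right] list that the TOSA PoolAttribute expects, falling back to zeros when no padding argument is present. A small self-contained sketch of that expansion:

from typing import List, Sequence

def expand_pool_padding(pad: Sequence[int] = ()) -> List[int]:
    try:
        return [pad[0], pad[0], pad[1], pad[1]]  # symmetric top/bottom, left/right
    except IndexError:
        return [0, 0, 0, 0]  # no padding argument given

print(expand_pool_padding((1, 2)))  # [1, 1, 2, 2]
print(expand_pool_padding())        # [0, 0, 0, 0]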
diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_table.py index b411d8b91ba..da7e2e8be95 100644 --- a/backends/arm/operators/op_table.py +++ b/backends/arm/operators/op_table.py @@ -30,11 +30,24 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - assert node.name in self._exported_program.state_dict.keys() # type: ignore[union-attr] - assert inputs[0].dtype == output.dtype == ts.DType.INT8 + if node.name not in self._exported_program.state_dict.keys(): # type: ignore[union-attr] + raise RuntimeError( + f"Did not find key {node.name} in state_dict {self._exported_program.state_dict.keys()}." + ) + if inputs[0].dtype == ts.DType.INT8 and output.dtype != ts.DType.INT8: + raise ValueError(f"Int8 tables need int8 output, got {output.dtype=}.") + if inputs[0].dtype == ts.DType.INT16 and output.dtype != ts.DType.INT32: + raise ValueError(f"Int16 tables need int32 output, got {output.dtype=}.") + + if inputs[0].dtype not in (ts.DType.INT8, ts.DType.INT16): + raise ValueError( + f"TOSA.TABLE only supports int8 or int16 inputs, got {ts.DTypeNames[inputs[0].dtype]}" + ) + table = self._exported_program.state_dict[node.name] # type: ignore[union-attr] table_attr = ts.TosaSerializerAttribute() table_attr.TableAttribute(np.array(table)) + tosa_graph.addOperator( TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr ) diff --git a/backends/arm/operators/ops_binary.py b/backends/arm/operators/ops_binary.py index 9ce561d0b6d..478901be2e3 100644 --- a/backends/arm/operators/ops_binary.py +++ b/backends/arm/operators/ops_binary.py @@ -49,3 +49,6 @@ def define_node( binary_operator_factory("aten.bitwise_and.Tensor", TosaOp.Op().BITWISE_AND) binary_operator_factory("aten.bitwise_xor.Tensor", TosaOp.Op().BITWISE_XOR) binary_operator_factory("aten.bitwise_or.Tensor", TosaOp.Op().BITWISE_OR) +binary_operator_factory("aten.logical_and.default", TosaOp.Op().LOGICAL_AND) +binary_operator_factory("aten.logical_xor.default", TosaOp.Op().LOGICAL_XOR) +binary_operator_factory("aten.logical_or.default", TosaOp.Op().LOGICAL_OR) diff --git a/backends/arm/operators/ops_unary.py b/backends/arm/operators/ops_unary.py new file mode 100644 index 00000000000..0a7d45ffe98 --- /dev/null +++ b/backends/arm/operators/ops_unary.py @@ -0,0 +1,58 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts # type: ignore +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) + +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +def unary_operator_factory(unary_target: str, tosa_op): + "Creates and registers NodeVisitors for operations that have one input and map directly into a TOSA op." + + # Some TOSA unary operators only support float + fp_only_ops = ["aten.floor.default"] + + class UnaryOperator(NodeVisitor): + target = unary_target + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + if not (inputs[0].dtype == output.dtype): + raise ValueError( + "All inputs and output need same dtype. "
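# Stand-alone sketch of the factory pattern used by ops_unary (and ops_binary)
# above: one closure stamps out and registers a visitor class per
# (aten target, TOSA op) pair. A plain dict stands in for the real
# register_node_visitor registry; the string op names are illustrative.
from typing import Dict

REGISTRY: Dict[str, type] = {}

def unary_operator_factory(unary_target: str, tosa_op: str) -> None:
    class UnaryOperator:
        target = unary_target  # class body reads the enclosing argument

        def define_node(self) -> str:
            return f"{self.target} -> {tosa_op}"

    REGISTRY[unary_target] = UnaryOperator

unary_operator_factory("aten.ceil.default", "CEIL")
unary_operator_factory("aten.floor.default", "FLOOR")
print(REGISTRY["aten.floor.default"]().define_node())  # aten.floor.default -> FLOOR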
+ f"Got {inputs[0].dtype=}, {output.dtype=}" + ) + + if self.target in fp_only_ops and not (inputs[0].dtype == ts.DType.FP32): + raise ValueError( + "All inputs need to be FP32." f"Got {inputs[0].dtype=}" + ) + + tosa_graph.addOperator(tosa_op, [inputs[0].name], [output.name]) + + register_node_visitor(UnaryOperator) + + +unary_operator_factory("aten.ceil.default", TosaOp.Op().CEIL) +unary_operator_factory("aten.floor.default", TosaOp.Op().FLOOR) +unary_operator_factory("aten.logical_not.default", TosaOp.Op().LOGICAL_NOT) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 009c93432c1..cc119f5bdb4 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -191,6 +191,8 @@ def _get_module_type_filter(tp: Callable) -> NodeFilterType: True # the node is from the submodule `Sub` (same for `Block` and `Linear` as well) """ + tp_str = tp.__module__ + "." + tp.__qualname__ + def module_type_filter(n: Node) -> bool: # node_stack example: { # 'L__self___sub': ("L['self'].sub", ), @@ -198,7 +200,7 @@ def module_type_filter(n: Node) -> bool: # } nn_module_stack = n.meta.get("nn_module_stack", {}) types = [t for _, t in nn_module_stack.values()] - return tp in types + return tp_str in types return module_type_filter diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index f1cef971782..271210172c4 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -125,7 +125,10 @@ def _match_pattern( _one_to_one = [ + torch.ops.aten.abs.default, + torch.ops.aten.ceil.default, torch.ops.aten.exp.default, + torch.ops.aten.floor.default, torch.ops.aten.log.default, torch.ops.aten.reciprocal.default, torch.ops.aten.rsqrt.default, @@ -173,6 +176,8 @@ def _match_pattern( torch.ops.aten.contiguous.default, torch.ops.aten.upsample_nearest2d.vec, torch.ops.aten.pad.default, + torch.ops.aten.amax.default, + torch.ops.aten.amin.default, ] # Operators that can inherit the quantization specs from its parent node @@ -181,6 +186,7 @@ def _match_pattern( torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index b0fa5bd9723..c51ea9cc18c 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -120,8 +120,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { } MemoryAllocator* allocator = context.get_runtime_allocator(); - ExecutionHandle* handle = - ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle); + ExecutionHandle* handle = allocator->allocateInstance(); + if (handle == nullptr) { + return Error::MemoryAllocationFailed; + } + handle->processed = processed; // Return the same buffer we were passed - this data will be @@ -193,6 +196,10 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { supported |= (tensor_in.scalar_type() == ScalarType::Char and handles.inputs->io[i].elem_size == 1); + // 16 bit int (IOQDQ pass prepared networks) + supported |= + (tensor_in.scalar_type() == ScalarType::Short and + handles.inputs->io[i].elem_size == 2); if (!supported) { ET_LOG( Error, @@ -220,6 +227,8 @@ class EthosUBackend final : public 
::executorch::runtime::BackendInterface { handles.inputs->io[i].elem_size == 1; bool both_int = tensor_in.scalar_type() == ScalarType::Int and handles.inputs->io[i].elem_size == 4; + bool both_short = tensor_in.scalar_type() == ScalarType::Short and + handles.inputs->io[i].elem_size == 2; // Select a compatible copy routine if (both_char and permuted_input_shape) { @@ -233,7 +242,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { tensor_in.size(1), tensor_in.size(2), tensor_in.size(3)); - } else if (both_char or both_int) { + } else if (both_char or both_int or both_short) { EXECUTORCH_PROF_SCOPE( event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); // Sizes match and elt size matches so memcpy diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh new file mode 100755 index 00000000000..f3359e10787 --- /dev/null +++ b/backends/arm/scripts/build_executorch.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) +toolchain_cmake=${script_dir}/../../../examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +toolchain_cmake=$(realpath ${toolchain_cmake}) +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." + +et_build_root="${et_root_dir}/arm_test" +build_type="Release" +build_devtools=false +build_with_etdump=false + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --devtools Build Devtools libs" + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --devtools) build_devtools=true ;; + --etdump) build_with_etdump=true ;; + *) + ;; + esac +done + +# Source the tools +# This should be prepared by the setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; } + +source ${setup_path_script} + +et_build_dir="${et_build_root}/cmake-out" + +# Used for the flatcc host executable if Devtools is used +et_build_host_dir=${et_build_root}/cmake-out-host-tools + +set -x +cd "${et_root_dir}" + +if [ "$build_with_etdump" = true ] ; then + ( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir}/bin/flatcc" ; + echo "--------------------------------------------------------------------------------" ) + + # Build host flatcc bin + # This is a way to work around that the flatcc executable gets built for the target (e.g. Arm) later + # and then replaced. flatcc is a tool used on the host for etdump and BundleIO handling.
+ # The way to solve this is to generate it once for the host, then copy it to ${et_build_host_dir}/bin + # and later point that out with -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc later. + + cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_host_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \ + -DFLATCC_ALLOW_WERROR=OFF \ + -B"${et_build_host_dir}" \ + "${et_root_dir}" + + # third-party/flatcc/bin/flatcc gets build already in the in the cmake config step above + # so there is no cmake building step done + + # Copy host flatcc excutable so it's saved when we build for target (Arm) later + et_build_host_dir=$(realpath ${et_build_host_dir}) + mkdir -p ${et_build_host_dir}/bin + cp third-party/flatcc/bin/flatcc ${et_build_host_dir}/bin +fi + +( set +x ; + echo "--------------------------------------------------------------------------------" ; + echo "Build ExecuTorch target libs ${build_type} into '${et_build_dir}'" ; + echo "--------------------------------------------------------------------------------" ) + +build_devtools_flags=" -DEXECUTORCH_BUILD_DEVTOOLS=OFF " +if [ "$build_devtools" = true ] ; then + build_devtools_flags=" -DEXECUTORCH_BUILD_DEVTOOLS=ON " +fi + +build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF " +if [ "$build_with_etdump" = true ] ; then + # Add DevTools flags use in the Target build below + build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ + -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ + -DFLATCC_ALLOW_WERROR=OFF \ + -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc " +fi + +echo "Building with Devtools: ${build_devtools_flags} ${build_with_etdump_flags}" + + +# Build +cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + ${build_devtools_flags} \ + ${build_with_etdump_flags} \ + -B"${et_build_dir}" \ + "${et_root_dir}" + +echo "[$(basename $0)] Configured CMAKE" + +cmake --build ${et_build_dir} --parallel --target install --config ${build_type} -- + +set +x + +echo "[$(basename $0)] Generated static libraries for ExecuTorch:" +find ${et_build_dir} -name "*.a" -exec ls -al {} \; diff --git a/backends/arm/scripts/build_executorch_runner.sh b/backends/arm/scripts/build_executorch_runner.sh new file mode 100755 index 00000000000..807821d427f --- /dev/null +++ b/backends/arm/scripts/build_executorch_runner.sh @@ -0,0 +1,159 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. 
&& pwd) +et_root_dir=$(realpath ${et_root_dir}) +toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." + +pte_file="" +target="ethos-u55-128" +build_type="Release" +bundleio=false +system_config="" +memory_mode="" +build_with_etdump=false +extra_build_flags="" +output_folder_set=false +output_folder="." +et_build_root="${et_root_dir}/arm_test" +ethosu_tools_dir=${et_root_dir}/examples/arm/ethos-u-scratch + +build_bundleio_flags=" -DET_BUNDLE_IO=OFF " +build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF " + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --pte= pte file (genrated by the aot_arm_compier from the model to include in the elf" + echo " --target= Target to build and run for Default: ${target}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --bundleio Support both pte and Bundle IO bpte using Devtools BundelIO with Input/RefOutput included" + echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." + echo " NOTE: If given, this option must match the given target. This option along with the memory_mode sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt." + echo " --memory_mode= Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms." + echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)." + echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85" + echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " + echo " --output= Output folder Default: /_.pte" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --ethosu_tools_dir= Path to your Ethos-U tools dir if you not using default: ${ethosu_tools_dir}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --pte=*) pte_file="${arg#*=}";; + --target=*) target="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --bundleio) bundleio=true ;; + --system_config=*) system_config="${arg#*=}";; + --memory_mode=*) memory_mode="${arg#*=}";; + --etdump) build_with_etdump=true ;; + --extra_build_flags=*) extra_build_flags="${arg#*=}";; + --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --ethosu_tools_dir=*) ethosu_tools_dir="${arg#*=}";; + *) + ;; + esac +done + +# Source the tools +# This should be prepared by the setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}. 
${_setup_msg}"; exit 1; } + +source ${setup_path_script} + +pte_file=$(realpath ${pte_file}) +ethosu_tools_dir=$(realpath ${ethosu_tools_dir}) +ethos_u_root_dir="$ethosu_tools_dir/ethos-u" +mkdir -p "${ethos_u_root_dir}" +ethosu_tools_dir=$(realpath ${ethos_u_root_dir}) + +et_build_dir=${et_build_root}/cmake-out +et_build_dir=$(realpath ${et_build_dir}) + +if [ "$output_folder_set" = false ] ; then + # remove file ending + output_folder=${pte_file%.*} +fi + +if [[ ${system_config} == "" ]] +then + system_config="Ethos_U55_High_End_Embedded" + if [[ ${target} =~ "ethos-u85" ]] + then + system_config="Ethos_U85_SYS_DRAM_Mid" + fi +fi + +if [[ ${memory_mode} == "" ]] +then + memory_mode="Shared_Sram" + if [[ ${target} =~ "ethos-u85" ]] + then + memory_mode="Sram_Only" + fi +fi + +mkdir -p "${output_folder}" +output_folder=$(realpath ${output_folder}) + +if [[ ${target} == *"ethos-u55"* ]]; then + target_cpu=cortex-m55 +else + target_cpu=cortex-m85 +fi +echo "--------------------------------------------------------------------------------" +echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${memory_mode} ${extra_build_flags} to '${output_folder}/cmake-out'" +echo "--------------------------------------------------------------------------------" + +cd ${et_root_dir}/examples/arm/executor_runner + +if [ "$bundleio" = true ] ; then + build_bundleio_flags=" -DET_BUNDLE_IO=ON " +fi + +if [ "$build_with_etdump" = true ] ; then + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " +fi + +echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}" + +cmake \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ + -DTARGET_CPU=${target_cpu} \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte_file}" \ + -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ + -DETHOSU_TARGET_NPU_CONFIG=${target} \ + ${build_bundleio_flags} \ + ${build_with_etdump_flags} \ + -DPYTHON_EXECUTABLE=$(which python3) \ + -DSYSTEM_CONFIG=${system_config} \ + -DMEMORY_MODE=${memory_mode} \ + ${extra_build_flags} \ + -B ${output_folder}/cmake-out + +echo "[${BASH_SOURCE[0]}] Configured CMAKE" + +cmake --build ${output_folder}/cmake-out -j$(nproc) -- arm_executor_runner + +echo "[${BASH_SOURCE[0]}] Generated baremetal elf file:" +find ${output_folder}/cmake-out -name "arm_executor_runner" +echo "executable_text: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes" +echo "executable_data: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes" +echo "executable_bss: $(find ${output_folder}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes" diff --git a/backends/arm/scripts/build_portable_kernels.sh b/backends/arm/scripts/build_portable_kernels.sh new file mode 100755 index 00000000000..596407851c8 --- /dev/null +++ b/backends/arm/scripts/build_portable_kernels.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
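+
+# A usage sketch (values are illustrative; the kernel names mirror the format
+# check further down in this script):
+#
+#   backends/arm/scripts/build_portable_kernels.sh \
+#       --build_type=Release \
+#       --portable_kernels="aten::_softmax.out,aten::add.out"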
+ +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) +toolchain_cmake=${script_dir}/../../../examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake +toolchain_cmake=$(realpath ${toolchain_cmake}) +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." + + +et_build_root="${et_root_dir}/arm_test" +build_type="Release" +portable_kernels="aten::_softmax.out" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --portable_kernels= Comma-separated list of portable (non-delegated) kernels to include. Default: ${portable_kernels}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + --portable_kernels=*) portable_kernels="${arg#*=}";; + *) + ;; + esac +done + +# Source the tools +# This should be prepared by the setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; } + +source ${setup_path_script} + +et_build_dir=${et_build_root}/cmake-out + +cd "${et_root_dir}" + +echo "--------------------------------------------------------------------------------" ; +echo "Build ExecuTorch Libraries ${build_type} portable kernels: ${portable_kernels} into '${et_build_dir}'" ; +echo "--------------------------------------------------------------------------------" + +if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then + echo " ERROR: specified argument --portable_kernels=${portable_kernels}" + echo " is in the wrong format, please use \"aten::.out,aten::.out,...\"" + echo " e.g. \"aten::_softmax.out,aten::add.out\"" + exit 1 +fi + +set -x + +cmake \ + -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ + -DCMAKE_BUILD_TYPE=${build_type} \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ + -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ + -B"${et_build_dir}/examples/arm" \ + "${et_root_dir}/examples/arm" + +cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- + +set +x + +echo "[$(basename $0)] Generated static libraries for ExecuTorch:" +find "${et_build_dir}/examples/arm" -name "*.a" -exec ls -al {} \; diff --git a/backends/arm/scripts/build_quantized_ops_aot_lib.sh b/backends/arm/scripts/build_quantized_ops_aot_lib.sh index d3d4b669f3b..ad6fad9c122 100755 --- a/backends/arm/scripts/build_quantized_ops_aot_lib.sh +++ b/backends/arm/scripts/build_quantized_ops_aot_lib.sh @@ -4,30 +4,51 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Needs to be run from exeuctorch root. # Optional parameter: 1: build_type= "Release" | "Debug" | "RelWithDebInfo" -build_type="Release" - -build_type=${1:-$build_type} +set -eu +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../..
&& pwd) +et_root_dir=$(realpath ${et_root_dir}) -SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" -CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" +build_type="Release" +et_build_root="${et_root_dir}" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --et_build_root=*) et_build_root="${arg#*=}";; + --build_type=*) build_type="${arg#*=}";; + *) + ;; + esac +done + +et_build_dir=${et_build_root}/cmake-out-aot-lib + +cd "${et_root_dir}" echo "--------------------------------------------------------------------------------" -echo "Build .so library to register quant ops with AoT flow ${build_type} into '$(echo $(pwd))/cmake-out-aot-lib'" +echo "Build quantized_ops_aot_lib library to register quant ops with AoT flow ${build_type} into '${et_build_dir}'" echo "--------------------------------------------------------------------------------" # Since we only want to build the quantized_aot lib in the specified folder, # we want exactly the configuration set below and deleting the cache is OK. -rm -f cmake-out-aot-lib/CMakeCache.txt +rm -f ${et_build_dir}/CMakeCache.txt CXXFLAGS="-fno-exceptions -fno-rtti" cmake \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -DCMAKE_BUILD_TYPE=${build_type} \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -Bcmake-out-aot-lib \ + -B${et_build_dir} \ . -cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib +cmake --build ${et_build_dir} --parallel -- quantized_ops_aot_lib diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh new file mode 100755 index 00000000000..52247e08cab --- /dev/null +++ b/backends/arm/scripts/run_fvp.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Optional parameter: +# --build_type= "Release" | "Debug" | "RelWithDebInfo" +# --etdump build with devtools-etdump support + +set -eu + +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(cd ${script_dir}/../../.. && pwd) +et_root_dir=$(realpath ${et_root_dir}) +setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh +_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools." 
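+
+# Usage sketch (the elf path is hypothetical; --elf, --target and --timeout are
+# the options parsed below):
+#
+#   backends/arm/scripts/run_fvp.sh \
+#       --elf=arm_test/cmake-out/arm_executor_runner \
+#       --target=ethos-u85-128 --timeout=1200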
+ + +elf_file="" +target="ethos-u55-128" +timeout="600" + +help() { + echo "Usage: $(basename $0) [options]" + echo "Options:" + echo " --elf= elf file to run" + echo " --target= Target to build and run for Default: ${target}" + echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higher on large models Default: ${timeout}" + exit 0 +} + +for arg in "$@"; do + case $arg in + -h|--help) help ;; + --elf=*) elf_file="${arg#*=}";; + --target=*) target="${arg#*=}";; + --timeout=*) timeout="${arg#*=}";; + *) + ;; + esac +done + +elf_file=$(realpath ${elf_file}) + +if [[ ${target} == *"ethos-u55"* ]]; then + fvp_model=FVP_Corstone_SSE-300_Ethos-U55 +else + fvp_model=FVP_Corstone_SSE-320 +fi + +# Source the tools +# This should be prepared by the setup.sh +[[ -f ${setup_path_script} ]] \ + || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; } + +source ${setup_path_script} + +# basic checks before we get started +hash ${fvp_model} \ + || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; } + + +[[ ! -f $elf_file ]] && { echo "[${BASH_SOURCE[0]}]: Unable to find executor_runner elf: ${elf_file}"; exit 1; } +num_macs=$(echo ${target} | cut -d - -f 3) + +echo "--------------------------------------------------------------------------------" +echo "Running ${elf_file} for ${target} with FVP:${fvp_model} num_macs:${num_macs}" +echo "WARNING: Corstone FVP is not cycle-accurate and should NOT be used to determine valid runtime" +echo "--------------------------------------------------------------------------------" + +# Check if stdbuf is installed and use stdbuf -oL together with tee below to make the output +# go all the way to the console directly instead of being buffered + +if hash stdbuf 2>/dev/null; then + nobuf="stdbuf -oL" +else + nobuf="" +fi + +log_file=$(mktemp) + + + +if [[ ${target} == *"ethos-u55"* ]]; then + ${nobuf} ${fvp_model} \ + -C ethosu.num_macs=${num_macs} \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -C mps3_board.uart0.shutdown_on_eot=1 \ + -a "${elf_file}" \ + --timelimit ${timeout} 2>&1 | tee ${log_file} || true # seconds + echo "[${BASH_SOURCE[0]}] Simulation complete, $?" +elif [[ ${target} == *"ethos-u85"* ]]; then + ${nobuf} ${fvp_model} \ + -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -C mps4_board.telnetterminal0.start_telnet=0 \ + -C mps4_board.uart0.out_file='-' \ + -C mps4_board.uart0.shutdown_on_eot=1 \ + -a "${elf_file}" \ + --timelimit ${timeout} 2>&1 | tee ${log_file} || true # seconds + echo "[${BASH_SOURCE[0]}] Simulation complete, $?" +else + echo "Running ${elf_file} for ${target} is not supported" + exit 1 +fi + +echo "Checking for problems in log:" +! grep -E "^(F|E|\\[critical\\]|Hard fault.|Info: Simulation is stopping. Reason: CPU time has been exceeded.).*$" ${log_file} +if [ $? != 0 ]; then + echo "Found ERROR" + rm "${log_file}" + exit 1 +fi +echo "No problems found!"
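+
+# Note on the log check above: because the grep pipeline is prefixed with `!`,
+# it is exempt from `set -e`, so a match (grep exit 0, negated to 1) reaches
+# the `$?` test instead of aborting the script. The idiom in isolation:
+#
+#   set -e
+#   ! grep -E "ERROR" "${log_file}"    # a match does not trip errexit here
+#   if [ $? != 0 ]; then echo "Found ERROR"; fi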
+rm "${log_file}" diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS index ef092c55035..38a4ec18787 100644 --- a/backends/arm/test/TARGETS +++ b/backends/arm/test/TARGETS @@ -1,13 +1,16 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load(":targets.bzl", "define_arm_tests") + + +oncall("executorch") python_library( - name = "common", - srcs = ["common.py"], + name = "conftest", + srcs = ["conftest.py"], deps = [ - "//executorch/backends/xnnpack/test/tester:tester", - "//executorch/backends/arm:arm_backend", "//executorch/exir:lib", "//executorch/exir/backend:compile_spec_schema", + "fbsource//third-party/pypi/pytest:pytest", ] ) @@ -15,9 +18,35 @@ python_library( name = "runner_utils", srcs = ["runner_utils.py"], deps = [ - "//executorch/backends/xnnpack/test/tester:tester", + ":conftest", "//executorch/backends/arm:arm_backend", "//executorch/exir:lib", "//executorch/exir/backend:compile_spec_schema", ] ) + +python_library( + name = "common", + srcs = ["common.py"], + deps = [ + ":runner_utils", + "//executorch/backends/arm:tosa_specification", + "fbsource//third-party/pypi/pytest:pytest", + ] +) + +python_library( + name = "arm_tester", + srcs = glob(["tester/*.py"]), + deps = [ + ":common", + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/backends/arm:arm_partitioner", + "//executorch/backends/arm/quantizer:arm_quantizer", + "//executorch/backends/arm:tosa_mapping", + "//executorch/devtools/backend_debug:delegation_info", + "fbsource//third-party/pypi/tabulate:tabulate", + ] +) + +define_arm_tests() diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 091b2d5f26b..57606e51f47 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -11,7 +10,7 @@ from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, Optional import pytest from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder @@ -92,29 +91,49 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - custom_path=None, + macs: int = 128, + system_config: str = "Ethos_U55_High_End_Embedded", + memory_mode: str = "Shared_Sram", + extra_flags: str = "--debug-force-regor --output-format=raw", + custom_path: Optional[str] = None, ) -> list[CompileSpec]: """ - Default compile spec for Ethos-U55 tests. + Compile spec for Ethos-U55. """ return get_u55_compile_spec_unbuilt( + macs=macs, + system_config=system_config, + memory_mode=memory_mode, + extra_flags=extra_flags, custom_path=custom_path, ).build() def get_u85_compile_spec( + macs: int = 128, + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", custom_path=None, ) -> list[CompileSpec]: """ - Default compile spec for Ethos-U85 tests. + Compile spec for Ethos-U85. 
""" return get_u85_compile_spec_unbuilt( # type: ignore[attr-defined] + macs=macs, + system_config=system_config, + memory_mode=memory_mode, + extra_flags=extra_flags, custom_path=custom_path, ).build() def get_u55_compile_spec_unbuilt( - custom_path=None, + macs: int, + system_config: str, + memory_mode: str, + extra_flags: str, + custom_path: Optional[str], ) -> ArmCompileSpecBuilder: """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. @@ -122,13 +141,17 @@ def get_u55_compile_spec_unbuilt( artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") if not os.path.exists(artifact_path): os.makedirs(artifact_path, exist_ok=True) + + # https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/OPTIONS.md + assert macs in [32, 64, 128, 256], "Unsupported MACs value" + compile_spec = ( ArmCompileSpecBuilder() .ethosu_compile_spec( - "ethos-u55-128", - system_config="Ethos_U55_High_End_Embedded", - memory_mode="Shared_Sram", - extra_flags="--debug-force-regor --output-format=raw", + f"ethos-u55-{macs}", + system_config=system_config, + memory_mode=memory_mode, + extra_flags=extra_flags, ) .dump_intermediate_artifacts_to(artifact_path) ) @@ -136,19 +159,28 @@ def get_u55_compile_spec_unbuilt( def get_u85_compile_spec_unbuilt( - custom_path=None, + macs: int, + system_config: str, + memory_mode: str, + extra_flags: str, + custom_path: Optional[str], ) -> list[CompileSpec]: """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify the compile spec before calling .build() to finalize it. """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u85_") + if not os.path.exists(artifact_path): + os.makedirs(artifact_path, exist_ok=True) + + assert macs in [128, 256, 512, 1024, 2048], "Unsupported MACs value" + compile_spec = ( ArmCompileSpecBuilder() .ethosu_compile_spec( - "ethos-u85-128", - system_config="Ethos_U85_SYS_DRAM_Mid", - memory_mode="Shared_Sram", - extra_flags="--output-format=raw", + f"ethos-u85-{macs}", + system_config=system_config, + memory_mode=memory_mode, + extra_flags=extra_flags, ) .dump_intermediate_artifacts_to(artifact_path) ) @@ -159,22 +191,54 @@ def get_u85_compile_spec_unbuilt( not corstone300_installed() or not arm_executor_runner_exists("corstone-300"), reason="Did not find Corstone-300 FVP or executor_runner on path", ) -"""Skips a test if Corsone300 FVP is not installed, or if the executor runner is not built""" +""" +TO BE DEPRECATED - Use XfailIfNoCorstone300 instead +Skips a test if Corsone300 FVP is not installed, or if the executor runner is not built +""" SkipIfNoCorstone320 = pytest.mark.skipif( not corstone320_installed() or not arm_executor_runner_exists("corstone-320"), reason="Did not find Corstone-320 FVP or executor_runner on path", ) -"""Skips a test if Corsone320 FVP is not installed, or if the executor runner is not built.""" +""" +TO BE DEPRECATED - Use XfailIfNoCorstone320 instead +Skips a test if Corsone320 FVP is not installed, or if the executor runner is not built +""" + + +XfailIfNoCorstone300 = pytest.mark.xfail( + condition=not ( + corstone300_installed() and arm_executor_runner_exists("corstone-300") + ), + raises=FileNotFoundError, + reason="Did not find Corstone-300 FVP or executor_runner on path", +) +"""Xfails a test if Corsone300 FVP is not installed, or if the executor runner is not built""" + +XfailIfNoCorstone320 = pytest.mark.xfail( + condition=not ( + corstone320_installed() and 
arm_executor_runner_exists("corstone-320") + ), + raises=FileNotFoundError, + reason="Did not find Corstone-320 FVP or executor_runner on path", +) +"""Xfails a test if Corsone320 FVP is not installed, or if the executor runner is not built""" + +xfail_type = str | tuple[str, type[Exception]] def parametrize( - arg_name: str, test_data: dict[str, Any], xfails: dict[str, str] = None + arg_name: str, + test_data: dict[str, Any], + xfails: dict[str, xfail_type] | None = None, + strict: bool = True, ): """ Custom version of pytest.mark.parametrize with some syntatic sugar and added xfail functionality - test_data is expected as a dict of (id, test_data) pairs - - alllows to specifiy a dict of (id, failure_reason) pairs to mark specific tests as xfail + - alllows to specifiy a dict of (id, failure_reason) pairs to mark specific tests as xfail. + Failure_reason can be str, type[Exception], or tuple[str, type[Exception]]. + Strings set the reason for failure, the exception type sets expected error. """ if xfails is None: xfails = {} @@ -184,8 +248,23 @@ def decorator_func(func): pytest_testsuite = [] for id, test_parameters in test_data.items(): if id in xfails: + xfail_info = xfails[id] + reason = "" + raises = None + if isinstance(xfail_info, str): + reason = xfail_info + elif isinstance(xfail_info, tuple): + reason, raises = xfail_info + else: + raise RuntimeError( + "xfail info needs to be str, or tuple[str, type[Exception]]" + ) pytest_param = pytest.param( - test_parameters, id=id, marks=pytest.mark.xfail(reason=xfails[id]) + test_parameters, + id=id, + marks=pytest.mark.xfail( + reason=reason, raises=raises, strict=strict + ), ) else: pytest_param = pytest.param(test_parameters, id=id) diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index 081d499d4d5..ca6aa4f4dd8 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -13,7 +13,12 @@ from typing import Any import pytest -import torch + +try: + import tosa_reference_model +except ImportError: + logging.warning("tosa_reference_model not found, can't run reference model tests") + tosa_reference_model = None """ This file contains the pytest hooks, fixtures etc. for the Arm test suite. @@ -24,18 +29,30 @@ def pytest_configure(config): - pytest._test_options = {} # type: ignore[attr-defined] + pytest._test_options["corstone_fvp"] = False # type: ignore[attr-defined] - if config.option.arm_run_corstoneFVP: + if ( + getattr(config.option, "arm_run_corstoneFVP", False) + and config.option.arm_run_corstoneFVP + ): corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") corstone320_exists = shutil.which("FVP_Corstone_SSE-320") if not (corstone300_exists and corstone320_exists): raise RuntimeError( "Tests are run with --arm_run_corstoneFVP but corstone FVP is not installed." ) + # Only enable if we also have the TOSA reference model available. 
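+            # Downstream tests gate FVP execution on this flag, e.g. (pattern
+            # used by the FVP tests later in this suite):
+            #
+            #   if conftest.is_option_enabled("corstone_fvp"):
+            #       tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)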
pytest._test_options["corstone_fvp"] = True # type: ignore[attr-defined] - pytest._test_options["fast_fvp"] = config.option.fast_fvp # type: ignore[attr-defined] + pytest._test_options["llama_inputs"] = config.option.llama_inputs # type: ignore[attr-defined] + pytest._test_options["fast_fvp"] = False # type: ignore[attr-defined] + if getattr(config.option, "fast_fvp", False): + pytest._test_options["fast_fvp"] = config.option.fast_fvp # type: ignore[attr-defined] + + # TODO: remove this flag once we have a way to run the reference model tests with Buck + pytest._test_options["tosa_ref_model"] = False # type: ignore[attr-defined] + if tosa_reference_model is not None: + pytest._test_options["tosa_ref_model"] = True # type: ignore[attr-defined] logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -44,9 +61,20 @@ def pytest_collection_modifyitems(config, items): def pytest_addoption(parser): - parser.addoption("--arm_quantize_io", action="store_true", help="Deprecated.") - parser.addoption("--arm_run_corstoneFVP", action="store_true") - parser.addoption("--fast_fvp", action="store_true") + def try_addoption(*args, **kwargs): + try: + parser.addoption(*args, **kwargs) + except Exception: + pass + + try_addoption("--arm_quantize_io", action="store_true", help="Deprecated.") + try_addoption("--arm_run_corstoneFVP", action="store_true", help="Deprecated.") + try_addoption("--fast_fvp", action="store_true") + try_addoption( + "--llama_inputs", + nargs="+", + help="List of two files: first a .pt checkpoint, then a params .json", + ) def pytest_sessionstart(session): @@ -78,6 +106,8 @@ def set_random_seed(): Rerun with a specific seed found under a random seed test ARM_TEST_SEED=3478246 pytest --config-file=/dev/null --verbose -s --color=yes backends/arm/test/ops/test_avg_pool.py -k """ + import torch + if os.environ.get("ARM_TEST_SEED", "RANDOM") == "RANDOM": random.seed() # reset seed, in case any other test has fiddled with it seed = random.randint(0, 2**32 - 1) @@ -161,6 +191,8 @@ def _load_libquantized_ops_aot_lib(): res = subprocess.run(find_lib_cmd, capture_output=True) if res.returncode == 0: library_path = res.stdout.decode().strip() + import torch + torch.ops.load_library(library_path) else: raise RuntimeError( diff --git a/backends/arm/test/misc/test_custom_partition.py b/backends/arm/test/misc/test_custom_partition.py index 8d73e1c7836..00bc4d306ae 100644 --- a/backends/arm/test/misc/test_custom_partition.py +++ b/backends/arm/test/misc/test_custom_partition.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.
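+
+# The tests below capture the partitioner's log output through pytest's caplog
+# fixture and assert on the rejection messages. Reduced to a minimal sketch
+# (the module and compile spec stand in for what each test constructs):
+#
+#   def test_reject_sketch(caplog):
+#       caplog.set_level(logging.INFO)
+#       ...  # run the ArmTester pipeline with a DontPartition check
+#       assert "Rejected by DontPartition" in caplog.text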
+import logging + import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester @@ -37,7 +39,9 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return self.nested(a, b) -def test_single_reject(): +def test_single_reject(caplog): + caplog.set_level(logging.INFO) + module = CustomPartitioning() inputs = module.inputs compile_spec = common.get_tosa_compile_spec("TOSA-0.80+MI") @@ -57,6 +61,7 @@ def test_single_reject(): .run_method_and_compare_outputs(inputs=inputs) ) assert check.has_rejected_node() + assert "Rejected by DontPartition" in caplog.text def test_multiple_reject(): @@ -83,7 +88,9 @@ def test_multiple_reject(): assert check.has_rejected_node() -def test_torch_op_reject(): +def test_torch_op_reject(caplog): + caplog.set_level(logging.INFO) + module = CustomPartitioning() inputs = module.inputs compile_spec = common.get_tosa_compile_spec("TOSA-0.80+MI") @@ -103,6 +110,7 @@ def test_torch_op_reject(): .run_method_and_compare_outputs(inputs=inputs) ) assert check.has_rejected_node() + assert "Rejected by DontPartition" in caplog.text def test_string_op_reject(): @@ -128,7 +136,9 @@ def test_string_op_reject(): assert check.has_rejected_node() -def test_name_reject(): +def test_name_reject(caplog): + caplog.set_level(logging.INFO) + module = CustomPartitioning() inputs = module.inputs compile_spec = common.get_tosa_compile_spec("TOSA-0.80+MI") @@ -148,6 +158,7 @@ def test_name_reject(): .run_method_and_compare_outputs(inputs=inputs) ) assert check.has_rejected_node() + assert "Rejected by DontPartitionName" in caplog.text def test_module_reject(): @@ -172,7 +183,9 @@ def test_module_reject(): assert check.has_rejected_node() -def test_inexact_module_reject(): +def test_inexact_module_reject(caplog): + caplog.set_level(logging.INFO) + module = NestedModule() inputs = module.inputs compile_spec = common.get_tosa_compile_spec("TOSA-0.80+MI") @@ -192,6 +205,7 @@ def test_inexact_module_reject(): .run_method_and_compare_outputs(inputs=inputs) ) assert check.has_rejected_node() + assert "Rejected by DontPartitionModule" in caplog.text def test_module_instance_reject(): diff --git a/backends/arm/test/misc/test_multiple_outputs.py b/backends/arm/test/misc/test_multiple_outputs.py index ddddc94d277..d3bea9a4005 100644 --- a/backends/arm/test/misc/test_multiple_outputs.py +++ b/backends/arm/test/misc/test_multiple_outputs.py @@ -76,23 +76,21 @@ def _test_ethosu_BI_pipeline( tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @pytest.mark.corstone_fvp - def test_u85_BI(self): + def test_u55_BI(self): module = self.MultipleOutputsModule() test_data = module.get_inputs() self._test_ethosu_BI_pipeline( module, test_data, - common.get_u85_compile_spec(), + common.get_u55_compile_spec(), ) @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - # TODO MLETORCH-598 - def test_u55_BI(self): + def test_u85_BI(self): module = self.MultipleOutputsModule() test_data = module.get_inputs() self._test_ethosu_BI_pipeline( module, test_data, - common.get_u55_compile_spec(), + common.get_u85_compile_spec(), ) diff --git a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py new file mode 100644 index 00000000000..5bb692ebcaf --- /dev/null +++ b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py @@ -0,0 +1,173 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Test that tosa_supported_operators rejects operators that are not +# quantized properly. This is typically a consequence of a torch op +# such as Softplus that is decomposed into many other ops without +# surrounding q/dq nodes. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + +input_t1 = Tuple[torch.Tensor] +softplus_aten_op: list[str] = [ + "torch.ops.aten.add.Tensor", + "torch.ops.aten.softplus.default", +] +softplus_exir_op: list[str] = [ + "executorch_exir_dialects_edge__ops_aten_add_Tensor", + "executorch_exir_dialects_edge__ops_aten_mul_Tensor", + "executorch_exir_dialects_edge__ops_aten_exp_default", + "executorch_exir_dialects_edge__ops_aten_div_Tensor", +] + +linear_residual_aten_op: list[str] = [ + "torch.ops.aten.linear.default", + "torch.ops.aten.gelu.default", + "torch.ops.aten.dropout.default", + "torch.ops.aten.add.Tensor", +] +linear_residual_exir_op: list[str] = [ + "executorch_exir_dialects_edge__ops_aten_gelu_default", + "executorch_exir_dialects_edge__ops_aten_clone_default", + "executorch_exir_dialects_edge__ops_aten_linear_default", + "executorch_exir_dialects_edge__ops_aten_add_Tensor", +] + + +test_data: dict[str, input_t1] = { + "3d_rand": (torch.rand(1, 5, 5),), +} + + +class SoftplusModule(torch.nn.Module): + """Module containing an addition followed by a Softplus. Softplus is currently not supported by TosaBackend.""" + + def __init__(self): + super().__init__() + self.softplus = torch.nn.Softplus() + + def forward(self, x: torch.Tensor): + return self.softplus(x + x) + + +class LinearResidualModule(torch.nn.Module): + """Module containing a residual and a linear layer followed by GELU and a Dropout. + GELU is currently not supported by TosaBackend nor TosaQuantizer. + """ + + def __init__( + self, + ): + super().__init__() + self.linear = torch.nn.Linear(in_features=5, out_features=3) + self.gelu = torch.nn.GELU() + self.dropout = torch.nn.Dropout(0.5) + + def forward(self, x: torch.Tensor): + x1 = self.linear(x) + x2 = self.gelu(x1) + x3 = self.dropout(x2) + return x1 + x3 + + +# Softplus is decomposed, which messes up the quantization. This test checks that CheckProperQuantization does not +# partition nodes where quantization is not as expected. +@common.parametrize("test_data", test_data) +def test_softplus_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1]( + SoftplusModule(), + test_data=test_data, + aten_op=softplus_aten_op, + exir_op=softplus_exir_op, + ) + # remove check_count.exir as there will be more than one delegate + pipeline.pop_stage("check_count.exir") + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_softplus_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1]( + SoftplusModule(), + test_data=test_data, + aten_op=softplus_aten_op, + exir_op=softplus_exir_op, + ) + pipeline.pop_stage("check_not.exir") + # check that all ops in softplus_exir_op except add are rejected + pipeline.add_stage_after( + "to_edge_transform_and_lower", + pipeline.tester.check, + softplus_exir_op[1:], + suffix="exir_post_partition", + ) + pipeline.run() + + +# Since GELU will not be quantized by TosaQuantizer, the Dropout's input will not be quantized either.
+# If so, the Dropout should not be partitioned by TosaPartitioner for the TOSA BI profile. This test checks that the +# partitioner indeed does not partition the Dropout (clone) for TOSA BI. +@common.parametrize("test_data", test_data) +def test_linear_residual_tosa_MI(test_data: input_t1): + pipeline = TosaPipelineMI[input_t1]( + LinearResidualModule(), + test_data=test_data, + aten_op=linear_residual_aten_op, + exir_op=linear_residual_exir_op, + use_to_edge_transform_and_lower=True, + ) + # remove check_count.exir as there will be more than one delegate + pipeline.pop_stage("check_count.exir") + pipeline.pop_stage("check_not.exir") + # check that all ops in linear_residual_exir_op except GELU are partitioned + pipeline.add_stage_after( + "to_edge_transform_and_lower", + pipeline.tester.check_not, + linear_residual_exir_op[1:], + suffix="exir_post_partition", + ) + pipeline.add_stage_after( + "to_edge_transform_and_lower", + pipeline.tester.check, + linear_residual_exir_op[:1], + suffix="exir_post_partition", + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_linear_residual_tosa_BI(test_data: input_t1): + pipeline = TosaPipelineBI[input_t1]( + LinearResidualModule(), + test_data=test_data, + aten_op=linear_residual_aten_op, + exir_op=linear_residual_exir_op, + use_to_edge_transform_and_lower=True, + ) + # remove check_count.exir as there will be more than one delegate + pipeline.pop_stage("check_count.exir") + pipeline.pop_stage("check_not.exir") + # check that all ops in linear_residual_exir_op except GELU and Dropout are partitioned + pipeline.add_stage_after( + "to_edge_transform_and_lower", + pipeline.tester.check_not, + linear_residual_exir_op[2:], + suffix="exir_post_partition", + ) + pipeline.add_stage_after( + "to_edge_transform_and_lower", + pipeline.tester.check, + linear_residual_exir_op[:2], + suffix="exir_post_partition", + ) + pipeline.run() diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index d9bc4e363c1..4ed203a964e 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -34,12 +34,10 @@ class TestConformer(unittest.TestCase): "executorch_exir_dialects_edge__ops_aten_max_default": 1, "executorch_exir_dialects_edge__ops_aten_eq_Scalar": 2, "executorch_exir_dialects_edge__ops_aten_where_self": 4, - "executorch_exir_dialects_edge__ops_aten_logical_not_default": 4, - "executorch_exir_dialects_edge__ops_aten_any_dim": 2, "torch.ops.aten._assert_scalar.default": 10, "torch.ops.aten._local_scalar_dense.default": 1, "torch.ops.aten.scalar_tensor.default": 2, - "torch.ops.higher_order.executorch_call_delegate": 4, + "torch.ops.higher_order.executorch_call_delegate": 6, } dim = 16 @@ -95,7 +93,7 @@ def test_conformer_tosa_BI(self): ) ) - @unittest.expectedFailure # TODO(MLETORCH-635) + @conftest.expectedFailureOnFVP # TODO(MLETORCH-635) def test_conformer_u55_BI(self): tester = ( ArmTester( @@ -117,7 +115,7 @@ def test_conformer_u55_BI(self): inputs=get_test_inputs(self.dim, self.lengths, self.num_examples), ) - @unittest.expectedFailure # TODO(MLETORCH-635) + @conftest.expectedFailureOnFVP # TODO(MLETORCH-635) def test_conformer_u85_BI(self): tester = ( ArmTester( diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py new file mode 100644 index 00000000000..973f62d2724 --- /dev/null +++ b/backends/arm/test/models/test_llama.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import os +import sys +import unittest + +import torch + +from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.examples.models.llama.export_llama_lib import ( + build_args_parser, + get_llama_model, +) + + +# Add project dir to sys path to workaround importlib.import_module() conditions in model_factory.py +this_files_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.abspath(os.path.join(this_files_dir, "../../../..")) +sys.path.append(project_dir) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class TestLlama(unittest.TestCase): + """ + Test class of Llama models. Type of Llama model depends on command line parameters: + --llama_inputs + Example: --llama_inputs stories110M/stories110M.pt stories110M/params.json + """ + + def prepare_model(self): + + checkpoint = None + params_file = None + if conftest.is_option_enabled("llama_inputs"): + param_list = conftest.get_option("llama_inputs") + assert ( + isinstance(param_list, list) and len(param_list) == 2 + ), "invalid number of inputs for --llama_inputs" + checkpoint = param_list[0] + params_file = param_list[1] + assert isinstance(checkpoint, str) and isinstance( + params_file, str + ), "invalid input for --llama_inputs" + else: + logging.warning( + "Skipping Llama test because of lack of input. To run use --llama_inputs <.pt> <.json>" + ) + return None, None, None + + assert os.path.isfile(checkpoint) and os.path.isfile( + params_file + ), "Invalid file paths" + + # TODO: Enable key value cache + args = [ + "--disable_dynamic_shape", + "-c", + checkpoint, + "-p", + params_file, + "--model", + "stories110m", + ] + parser = build_args_parser() + args = parser.parse_args(args) + + llama_model, llama_inputs, llama_meta = get_llama_model(args) + + # TODO: Remove workaround since attention mask should not be persistent, + # it only works if input shape is always the same + freqs_c = "freqs_cos" + freqs_s = "freqs_sin" + for i in range(llama_model.n_layers): + val = llama_model.layers[i].attention.get_buffer("mask") + llama_model.layers[i].attention.register_buffer( + "mask", val, persistent=True + ) + val = llama_model.layers[i].attention.rope.get_buffer(freqs_c) + llama_model.layers[i].attention.rope.register_buffer( + freqs_c, val, persistent=True + ) + val = llama_model.layers[i].attention.rope.get_buffer(freqs_s) + llama_model.layers[i].attention.rope.register_buffer( + freqs_s, val, persistent=True + ) + + return llama_model, llama_inputs, llama_meta + + def test_llama_tosa_MI(self): + llama_model, llama_inputs, llama_meta = self.prepare_model() + + if llama_model is None and llama_inputs is None and llama_meta is None: + return + + with torch.no_grad(): + ( + ArmTester( + llama_model, + example_inputs=llama_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + constant_methods=llama_meta, + ) + .export() + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 14}) + .to_executorch() + .run_method_and_compare_outputs( + inputs=llama_inputs, atol=1.8, rtol=0.01 # TODO: decrease tolerance + ) + ) diff --git a/backends/arm/test/models/test_nn_functional.py b/backends/arm/test/models/test_nn_functional.py new file 
mode 100644 index 00000000000..b0a1e543ed3 --- /dev/null +++ b/backends/arm/test/models/test_nn_functional.py @@ -0,0 +1,118 @@ +# Copyright 2025 Arm Limited and/or its affiliates. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Tests 11 popular torch.nn.functional ops not tested in other ways or training related +- normalize +- grid_sample +- one_hot +- softplus +- cosine_similarity +- unfold +- elu +- fold +- affine_grid +- max_pool1d +- threshold +""" +from typing import Callable + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + + +def module_factory(function: Callable) -> torch.nn.Module: + class ModuleWrapper(torch.nn.Module): + def forward(self, *args): + return function(*args) + + return ModuleWrapper() + + +example_input = torch.rand(1, 6, 16, 16) + +module_tests = { + "normalize": (module_factory(torch.nn.functional.normalize), (example_input,)), + "grid_sample": ( + module_factory(torch.nn.functional.grid_sample), + (torch.rand(1, 1, 4, 4), torch.rand(1, 5, 5, 2)), + ), + "one_hot": ( + module_factory(torch.nn.functional.one_hot), + (torch.randint(0, 5, (2, 2, 5, 5)), 5), + ), + "softplus": (module_factory(torch.nn.functional.softplus), (example_input,)), + "cosine_similarity": ( + module_factory(torch.nn.functional.cosine_similarity), + (example_input, example_input), + ), + "unfold": ( + module_factory(torch.nn.functional.unfold), + (torch.randn(1, 3, 10, 12), (4, 5)), + ), + "elu": (module_factory(torch.nn.functional.elu), (example_input,)), + "fold": ( + module_factory(torch.nn.functional.fold), + (torch.randn(1, 12, 12), (4, 5), (2, 2)), + ), + "affine_grid": ( + module_factory(torch.nn.functional.affine_grid), + (torch.rand(1, 2, 3), (1, 2, 10, 10)), + ), + "max_pool1d": ( + module_factory(torch.nn.functional.max_pool1d), + (torch.randn(20, 16, 50), 4), + ), + "threshold": ( + module_factory(torch.nn.functional.threshold), + (example_input, 0.5, 0.1), + ), +} + +input_t = tuple[torch.Tensor] + + +@parametrize( + "test_data", module_tests, xfails={"max_pool1d": "ValueError: Invalid TOSA graph"} +) +def test_nn_functional_MI(test_data): + module, inputs = test_data + pipeline = TosaPipelineMI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." + not in str(e) + ): + raise e + + +@parametrize("test_data", module_tests) +def test_nn_functional_BI(test_data): + module, inputs = test_data + pipeline = TosaPipelineBI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + pipeline.pop_stage("check.quant_nodes") + pipeline.pop_stage("check_not.quant_nodes") + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." + not in str(e) + ): + raise e diff --git a/backends/arm/test/models/test_nn_modules.py b/backends/arm/test/models/test_nn_modules.py new file mode 100644 index 00000000000..2793515db70 --- /dev/null +++ b/backends/arm/test/models/test_nn_modules.py @@ -0,0 +1,103 @@ +# Copyright 2025 Arm Limited and/or its affiliates.
+ +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Tests 10 popular nn modules not tested in other ways or training related. +- Embedding +- LeakyReLU +- BatchNorm1d +- AdaptiveAvgPool2d +- ConvTranspose2d +- GRU +- GroupNorm +- InstanceNorm2d +- PReLU +- Transformer +""" + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + +example_input = torch.rand(1, 6, 16, 16) + +module_tests = [ + (torch.nn.Embedding(10, 10), (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)), + (torch.nn.LeakyReLU(), (example_input,)), + (torch.nn.BatchNorm1d(16), (torch.rand(6, 16, 16),)), + (torch.nn.AdaptiveAvgPool2d((12, 12)), (example_input,)), + (torch.nn.ConvTranspose2d(6, 3, 2), (example_input,)), + (torch.nn.GRU(10, 20, 2), (torch.randn(5, 3, 10), torch.randn(2, 3, 20))), + (torch.nn.GroupNorm(2, 6), (example_input,)), + (torch.nn.InstanceNorm2d(16), (example_input,)), + (torch.nn.PReLU(), (example_input,)), + ( + torch.nn.Transformer( + d_model=64, + nhead=1, + num_encoder_layers=1, + num_decoder_layers=1, + dtype=torch.float32, + ), + (torch.rand((10, 32, 64)), torch.rand((20, 32, 64))), + ), +] + +input_t = tuple[torch.Tensor] + +test_parameters = {str(test[0].__class__.__name__): test for test in module_tests} + + +@parametrize( + "test_data", + test_parameters, + xfails={"Transformer": "Output 0 does not match reference output."}, +) +def test_nn_Modules_MI(test_data): + module, inputs = test_data + pipeline = TosaPipelineMI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." + not in str(e) + ): + raise e + + +@parametrize( + "test_data", + test_parameters, + xfails={ + "GRU": "RuntimeError: Node aten_linear_default with op was not decomposed or delegated.", + "PReLU": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.", + "Transformer": "RuntimeError: Expected out tensor to have dtype signed char, but got float", + }, +) +def test_nn_Modules_BI(test_data): + module, inputs = test_data + pipeline = TosaPipelineBI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + pipeline.pop_stage("check.quant_nodes") + pipeline.pop_stage("check_not.quant_nodes") + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." + not in str(e) + ): + raise e diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py new file mode 100644 index 00000000000..6363de90478 --- /dev/null +++ b/backends/arm/test/models/test_torch_functions.py @@ -0,0 +1,150 @@ +# Copyright 2025 Arm Limited and/or its affiliates. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +""" +Tests 10 popular torch ops not tested in other ways, training related, or requiring randomness.
+- t +- zeros +- ones +- stack +- arange +- norm +- nonzero +- eye +- topk +- sort +""" + +from typing import Callable + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, +) + + +def module_add_factory(function: Callable) -> torch.nn.Module: + class ModuleWrapper(torch.nn.Module): + def forward(self, x, *args): + return x + function(*args).to(torch.float32) + + return ModuleWrapper() + + +def module_factory(function: Callable) -> torch.nn.Module: + class ModuleWrapper(torch.nn.Module): + def forward(self, *args): + return function(*args) + + return ModuleWrapper() + + +example_input = torch.rand(1, 6, 16, 16) + +module_tests = [ + ( + "t", + module_add_factory(torch.t), + ( + torch.rand(10, 6), + torch.rand(6, 10), + ), + ), + ( + "zeros", + module_add_factory(torch.zeros), + ( + torch.rand(4, 3, 2), + (4, 1, 2), + ), + ), + ( + "ones", + module_add_factory(torch.ones), + ( + torch.rand(4, 3, 2), + (4, 1, 2), + ), + ), + ( + "stack", + module_add_factory(torch.stack), + ( + torch.rand(1, 1, 1, 1), + (torch.rand(2, 3, 3), torch.rand(2, 3, 3)), + -2, + ), + ), + ("arange", module_add_factory(torch.arange), (torch.rand(1), 0, 10, 2)), + ("norm", module_factory(torch.norm), (torch.randn(5, 5),)), + ("nonzero", module_factory(torch.nonzero), (example_input,)), + ("eye", module_add_factory(torch.eye), (torch.rand(4, 4), 4)), + ("topk", module_factory(torch.topk), (torch.rand(10), 5)), + ("sort", module_factory(torch.sort), (torch.rand(5),)), +] + +input_t = tuple[torch.Tensor] + +test_parameters = {test[0]: test[1:] for test in module_tests} + + +@parametrize( + "test_data", + test_parameters, + xfails={ + "nonzero": "torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u4, 0). " + "Requires dynamic output shape.", + "topk": "NotImplementedError: No registered serialization name for found", + "sort": "NotImplementedError: No registered serialization name for found", + }, +) +def test_torch_fns_MI(test_data): + module, inputs = test_data + pipeline = TosaPipelineMI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." + not in str(e) + ): + raise e + + +@parametrize( + "test_data", + test_parameters, + xfails={ + "nonzero": "torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u4, 0). " + "Requires dynamic output shape.", + "topk": "NotImplementedError: No registered serialization name for found", + "sort": "NotImplementedError: No registered serialization name for found", + }, +) +def test_torch_fns_BI(test_data): + module, inputs = test_data + pipeline = TosaPipelineBI[input_t]( + module, inputs, "", use_to_edge_transform_and_lower=True + ) + pipeline.pop_stage("check.aten") + pipeline.pop_stage("check_count.exir") + pipeline.pop_stage("check.quant_nodes") + pipeline.pop_stage("check_not.quant_nodes") + + try: + pipeline.run() + except RuntimeError as e: + if ( + "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." 
+ not in str(e) + ): + raise e diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py new file mode 100644 index 00000000000..6da1a46700d --- /dev/null +++ b/backends/arm/test/models/test_w2l_arm.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import unittest +from typing import Tuple + +import pytest + +import torch +from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torchaudio import models + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def get_test_inputs(batch_size, num_features, input_frames): + return (torch.randn(batch_size, num_features, input_frames),) + + +class TestW2L(unittest.TestCase): + """Tests Wav2Letter.""" + + batch_size = 10 + input_frames = 400 + num_features = 1 + + w2l = models.Wav2Letter(num_features=num_features).eval() + model_example_inputs = get_test_inputs(batch_size, num_features, input_frames) + + all_operators = { + "executorch_exir_dialects_edge__ops_aten_convolution_default", + "executorch_exir_dialects_edge__ops_aten__log_softmax_default", + "executorch_exir_dialects_edge__ops_aten_relu_default", + } + + operators_after_quantization = all_operators - { + "executorch_exir_dialects_edge__ops_aten__log_softmax_default", + } + + @pytest.mark.slow # about 3min on std laptop + def test_w2l_tosa_MI(self): + ( + ArmTester( + self.w2l, + example_inputs=self.model_example_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .dump_operator_distribution() + .to_edge_transform_and_lower() + .dump_operator_distribution() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs( + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ) + ) + ) + + @pytest.mark.slow # about 1min on std laptop + def test_w2l_tosa_BI(self): + ( + ArmTester( + self.w2l, + example_inputs=self.model_example_inputs, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + ) + .quantize() + .export() + .dump_operator_distribution() + .to_edge_transform_and_lower() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs( + atol=0.1, + qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) + ) + + def _test_w2l_ethos_BI_pipeline( + self, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + compile_spec: CompileSpec, + ): + tester = ( + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) + .quantize() + .export() + .to_edge() + .check(list(self.operators_after_quantization)) + .partition() + .to_executorch() + .serialize() + ) + return tester + + # TODO: expected fail as TOSA.Transpose is not supported by Ethos-U55 + @pytest.mark.slow + @pytest.mark.corstone_fvp + @conftest.expectedFailureOnFVP + def test_w2l_u55_BI(self): + tester = self._test_w2l_ethos_BI_pipeline( + self.w2l, + self.model_example_inputs, + common.get_u55_compile_spec(), + ) + + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs( + atol=1.0, 
+ qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) + + @pytest.mark.slow + @pytest.mark.corstone_fvp + @conftest.expectedFailureOnFVP # TODO: MLETORCH-761 + def test_w2l_u85_BI(self): + tester = self._test_w2l_ethos_BI_pipeline( + self.w2l, + self.model_example_inputs, + common.get_u85_compile_spec(), + ) + + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs( + atol=1.0, + qtol=1, + inputs=get_test_inputs( + self.batch_size, self.num_features, self.input_frames + ), + ) diff --git a/backends/arm/test/ops/test_abs.py b/backends/arm/test/ops/test_abs.py new file mode 100644 index 00000000000..481c7d5ed0d --- /dev/null +++ b/backends/arm/test/ops/test_abs.py @@ -0,0 +1,125 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import pytest + +import torch +from executorch.backends.arm.test import common, conftest +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestAbs(unittest.TestCase): + class Abs(torch.nn.Module): + test_parameters = [ + (torch.zeros(5),), + (torch.full((5,), -1, dtype=torch.float32),), + (torch.ones(5) * -1,), + (torch.randn(8),), + (torch.randn(2, 3, 4),), + (torch.randn(1, 2, 3, 4),), + (torch.normal(mean=0, std=10, size=(2, 3, 4)),), + ] + + def forward(self, x): + return torch.abs(x) + + def _test_abs_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["torch.ops.aten.abs.default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_abs_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_abs_ethosu_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], + ): + tester = ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.abs.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .serialize() + ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Abs.test_parameters) + def test_abs_tosa_MI(self, test_data: torch.Tensor): + test_data = (test_data,) + 
self._test_abs_tosa_MI_pipeline(self.Abs(), test_data)
+
+    @parameterized.expand(Abs.test_parameters)
+    def test_abs_tosa_BI(self, test_data: torch.Tensor):
+        test_data = (test_data,)
+        self._test_abs_tosa_BI_pipeline(self.Abs(), test_data)
+
+    @parameterized.expand(Abs.test_parameters)
+    @pytest.mark.corstone_fvp
+    def test_abs_u55_BI(self, test_data: torch.Tensor):
+        test_data = (test_data,)
+        self._test_abs_ethosu_BI_pipeline(
+            common.get_u55_compile_spec(), self.Abs(), test_data
+        )
+
+    @parameterized.expand(Abs.test_parameters)
+    @pytest.mark.corstone_fvp
+    def test_abs_u85_BI(self, test_data: torch.Tensor):
+        test_data = (test_data,)
+        self._test_abs_ethosu_BI_pipeline(
+            common.get_u85_compile_spec(), self.Abs(), test_data
+        )
diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py
index 6db87caa7ce..486e53c5f03 100644
--- a/backends/arm/test/ops/test_add.py
+++ b/backends/arm/test/ops/test_add.py
@@ -1,11 +1,10 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
 
 from typing import Tuple
 
 import torch
@@ -61,6 +60,17 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     }
 
 
+class Add3(torch.nn.Module):
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return x + y
+
+    test_data: dict[str, input_t2] = {
+        "3d_randn_diff_rank": (torch.randn(1, 4, 5), torch.randn(4, 1)),
+        "4d_randn_diff_rank": (torch.randn(1, 1, 4, 4), torch.randn(4, 1)),
+        "4d_randn_diff_rank_2": (torch.randn(4, 1), torch.randn(1, 1, 4, 5)),
+    }
+
+
 @common.parametrize("test_data", Add.test_data)
 def test_add_tosa_MI(test_data: input_t1):
     pipeline = TosaPipelineMI[input_t1](Add(), test_data, aten_op, exir_op)
     pipeline.run()
@@ -106,24 +116,8 @@ def test_add_i32_tosa_BI(test_data: input_t1):
 
 @common.parametrize("test_data", Add.test_data)
+@common.XfailIfNoCorstone300
 def test_add_u55_BI(test_data: input_t1):
-    pipeline = EthosU55PipelineBI[input_t1](
-        Add(), test_data, aten_op, exir_op, run_on_fvp=False
-    )
-    pipeline.run()
-
-
-@common.parametrize("test_data", Add.test_data)
-def test_add_u85_BI(test_data: input_t1):
-    pipeline = EthosU85PipelineBI[input_t1](
-        Add(), test_data, aten_op, exir_op, run_on_fvp=False
-    )
-    pipeline.run()
-
-
-@common.parametrize("test_data", Add.test_data)
-@common.SkipIfNoCorstone300
-def test_add_u55_BI_on_fvp(test_data: input_t1):
     pipeline = EthosU55PipelineBI[input_t1](
         Add(), test_data, aten_op, exir_op, run_on_fvp=True
     )
@@ -131,8 +125,8 @@
 
 @common.parametrize("test_data", Add.test_data)
-@common.SkipIfNoCorstone320
-def test_add_u85_BI_on_fvp(test_data: input_t1):
+@common.XfailIfNoCorstone320
+def test_add_u85_BI(test_data: input_t1):
     pipeline = EthosU85PipelineBI[input_t1](
         Add(), test_data, aten_op, exir_op, run_on_fvp=True
     )
@@ -140,45 +134,41 @@ def test_add_u85_BI_on_fvp(test_data: input_t1):
 
 @common.parametrize("test_data", Add2.test_data)
-def test_add2_tosa_MI(test_data: input_t2):
+def test_add_2_tosa_MI(test_data: input_t2):
     pipeline = TosaPipelineMI[input_t2](Add2(), test_data, aten_op, exir_op)
     pipeline.run()
 
 
-@common.parametrize("test_data", Add2.test_data)
-def test_add2_tosa_BI(test_data: input_t2):
-    pipeline = TosaPipelineBI[input_t2](Add2(), test_data, aten_op, exir_op)
+@common.parametrize("test_data", Add3.test_data)
+def 
test_add3_tosa_MI(test_data: input_t2): + pipeline = TosaPipelineMI[input_t2](Add3(), test_data, aten_op, exir_op) pipeline.run() -@common.parametrize("test_data", Add2.test_data) -def test_add2_u55_BI(test_data: input_t2): - pipeline = EthosU55PipelineBI[input_t2]( - Add2(), test_data, aten_op, exir_op, run_on_fvp=False - ) +@common.parametrize("test_data", Add3.test_data) +def test_add3_tosa_BI(test_data: input_t2): + pipeline = TosaPipelineBI[input_t2](Add3(), test_data, aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add2.test_data) -@common.SkipIfNoCorstone300 -def test_add2_u55_BI_on_fvp(test_data: input_t2): - pipeline = EthosU55PipelineBI[input_t2]( - Add2(), test_data, aten_op, exir_op, run_on_fvp=True - ) +def test_add_2_tosa_BI(test_data: input_t2): + pipeline = TosaPipelineBI[input_t2](Add2(), test_data, aten_op, exir_op) pipeline.run() @common.parametrize("test_data", Add2.test_data) -def test_add2_u85_BI(test_data: input_t2): - pipeline = EthosU85PipelineBI[input_t2]( - Add2(), test_data, aten_op, exir_op, run_on_fvp=False +@common.XfailIfNoCorstone300 +def test_add_2_u55_BI(test_data: input_t2): + pipeline = EthosU55PipelineBI[input_t2]( + Add2(), test_data, aten_op, exir_op, run_on_fvp=True ) pipeline.run() @common.parametrize("test_data", Add2.test_data) -@common.SkipIfNoCorstone320 -def test_add2_u85_BI_on_fvp(test_data: input_t2): +@common.XfailIfNoCorstone320 +def test_add_2_u85_BI(test_data: input_t2): pipeline = EthosU85PipelineBI[input_t2]( Add2(), test_data, aten_op, exir_op, run_on_fvp=True ) diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py new file mode 100644 index 00000000000..b2639a5f108 --- /dev/null +++ b/backends/arm/test/ops/test_amax.py @@ -0,0 +1,165 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
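+
+# A minimal illustrative sketch (not used by the pipelines below) of the
+# property the max-to-amax tests in this file rely on: torch.max(x, dim)
+# returns a (values, indices) pair, while torch.amax(x, dim) returns only the
+# values, so max can be lowered to amax whenever the indices are unused:
+#
+#     >>> import torch
+#     >>> x = torch.rand(2, 3)
+#     >>> torch.equal(torch.max(x, dim=1).values, torch.amax(x, dim=1))
+#     True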
+
+
+from typing import Dict, Tuple
+
+import pytest
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU85PipelineBI,
+    OpNotSupportedPipeline,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+class Amax(torch.nn.Module):
+    input_t = Tuple[Tuple[torch.Tensor], int | Tuple[int, ...], bool]
+    aten_op = ["torch.ops.aten.amax"]
+
+    def __init__(self, dim, keep_dims):
+        self.dim = dim
+        self.keep_dims = keep_dims
+        super().__init__()
+
+    def forward(self, x):
+        return torch.amax(x, self.dim, self.keep_dims)
+
+    test_data: Dict[str, input_t] = {
+        "rank_1_dim_0": ((torch.rand([10]),), 0, False),
+        "rank_2_dim_1_keep_dims": ((torch.rand([2, 2]),), (1,), True),
+        "rank_4_all_dim": ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False),
+        "rank_4_0,3_keep_dims": ((torch.rand([1, 2, 2, 2]),), (0, 3), True),
+        "rank_4_mult_batches": ((torch.rand([2, 2, 2, 2]),), (0,), True),
+    }
+
+
+class Max(torch.nn.Module):
+    input_t = Tuple[Tuple[torch.Tensor], int]
+    aten_op = ["torch.ops.aten.amax"]
+
+    def __init__(self, dim):
+        self.dim = dim
+        super().__init__()
+
+    def forward(self, x):
+        x = torch.max(x, self.dim, False)
+        return x[0]
+
+    test_data: Dict[str, input_t] = {
+        "rank_1_dim_0": ((torch.rand([10]),), 0),
+        "rank_2_dim_1": ((torch.rand([2, 2]),), 1),
+        "rank_4_dim_2": ((torch.rand([2, 2, 2, 2]),), 2),
+        "rank_4_dim_3": ((torch.rand([2, 2, 2, 2]),), 3),
+    }
+
+
+class MaxWithIndex(torch.nn.Module):
+    def __init__(self, dim):
+        self.dim = dim
+        super().__init__()
+
+    def forward(self, x):
+        x, i = torch.max(x, self.dim)
+        return x, i
+
+
+@common.parametrize("test_data", Amax.test_data)
+def test_amax_tosa_MI(test_data: Amax.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = TosaPipelineMI[Amax.input_t](
+        Amax(dim, keep_dims),
+        data,
+        Amax.aten_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Amax.test_data)
+def test_amax_tosa_BI(test_data: Amax.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = TosaPipelineBI[Amax.input_t](
+        Amax(dim, keep_dims),
+        data,
+        Amax.aten_op,
+    )
+    pipeline.run()
+
+
+def test_amax_u55_BI_not_delegated():
+    data, dim, keep_dims = Amax.test_data["rank_4_all_dim"]
+    pipeline = OpNotSupportedPipeline[Amax.input_t](
+        Amax(dim, keep_dims),
+        data,
+        "TOSA-0.80+BI+u55",
+        {"executorch_exir_dialects_edge__ops_aten_amax_default": 1},
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Amax.test_data)
+def test_amax_u85_BI(test_data: Amax.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = EthosU85PipelineBI[Amax.input_t](
+        Amax(dim, keep_dims),
+        data,
+        Amax.aten_op,
+    )
+    pipeline.run()
+
+
+fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517: Multiple batches not supported"}
+
+
+@common.parametrize("test_data", Amax.test_data, fvp_xfails)
+@common.SkipIfNoCorstone320
+def test_amax_u85_BI_on_fvp(test_data: Amax.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = EthosU85PipelineBI[Amax.input_t](
+        Amax(dim, keep_dims), data, Amax.aten_op, run_on_fvp=True
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Max.test_data)
+def test_max_to_amax_MI(test_data: Max.input_t):
+    data, dim = test_data
+    pipeline = TosaPipelineMI[Max.input_t](
+        Max(dim),
+        data,
+        "torch.ops.aten.max",
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Max.test_data)
+def test_max_to_amax_BI(test_data: Max.input_t):
+    data, dim = test_data
+    module = Max(dim)
+    pipeline = TosaPipelineBI[Max.input_t](
+        module,
+        data,
+        "torch.ops.aten.amax",
+    )
+    pipeline.run()
+
+
+@pytest.mark.xfail(reason="MLETORCH-718: Quantization of indices in arm_quantizer")
+def test_max_index_not_delegated_BI():
+    data, dim = Max.test_data["rank_4_dim_3"]
+    pipeline = OpNotSupportedPipeline[Max.input_t](
+        MaxWithIndex(dim), data, "TOSA-0.80+BI", {}
+    )
+    pipeline.run()
+
+
+def test_max_index_not_delegated_MI():
+    data, dim = Max.test_data["rank_4_dim_3"]
+    pipeline = OpNotSupportedPipeline[Max.input_t](
+        MaxWithIndex(dim), data, "TOSA-0.80+MI", {}
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py
new file mode 100644
index 00000000000..092ed472bce
--- /dev/null
+++ b/backends/arm/test/ops/test_amin.py
@@ -0,0 +1,166 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Dict, Tuple
+
+import pytest
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU85PipelineBI,
+    OpNotSupportedPipeline,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+class Amin(torch.nn.Module):
+    input_t = Tuple[Tuple[torch.Tensor], int | Tuple[int, ...], bool]
+    aten_op = ["torch.ops.aten.amin"]
+
+    def __init__(self, dim, keep_dims):
+        self.dim = dim
+        self.keep_dims = keep_dims
+        super().__init__()
+
+    def forward(self, x):
+        return torch.amin(x, self.dim, self.keep_dims)
+
+    test_data: Dict[str, input_t] = {
+        "rank_1_dim_0": ((torch.rand([10]),), 0, False),
+        "rank_2_dim_1_keep_dims": ((torch.rand([2, 2]),), (1,), True),
+        "rank_4_all_dim": ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False),
+        "rank_4_0,3_keep_dims": ((torch.rand([1, 2, 2, 2]),), (0, 3), True),
+        "rank_4_mult_batches": ((torch.rand([2, 2, 2, 2]),), (0,), True),
+    }
+
+
+class Min(torch.nn.Module):
+    input_t = Tuple[Tuple[torch.Tensor], int]
+    aten_op = ["torch.ops.aten.amin"]
+
+    def __init__(self, dim):
+        self.dim = dim
+        super().__init__()
+
+    def forward(self, x):
+        x = torch.min(x, self.dim)
+        return x[0]
+
+    test_data: Dict[str, input_t] = {
+        "rank_1_dim_0": ((torch.rand([10]),), 0),
+        "rank_2_dim_1": ((torch.rand([2, 2]),), 1),
+        "rank_4_dim_2": ((torch.rand([2, 2, 2, 2]),), 2),
+        "rank_4_dim_3": ((torch.rand([2, 2, 2, 2]),), 3),
+    }
+
+
+class MinWithIndex(torch.nn.Module):
+    def __init__(self, dim):
+        self.dim = dim
+        super().__init__()
+
+    def forward(self, x):
+        x, i = torch.min(x, self.dim)
+        return x, i
+
+
+@common.parametrize("test_data", Amin.test_data)
+def test_amin_tosa_MI(test_data: Amin.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = TosaPipelineMI[Amin.input_t](
+        Amin(dim, keep_dims),
+        data,
+        Amin.aten_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Amin.test_data)
+def test_amin_tosa_BI(test_data: Amin.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = TosaPipelineBI[Amin.input_t](
+        Amin(dim, keep_dims),
+        data,
+        Amin.aten_op,
+    )
+    pipeline.run()
+
+
+def test_amin_u55_BI_not_delegated():
+    data, dim, keep_dims = Amin.test_data["rank_4_all_dim"]
+    pipeline = OpNotSupportedPipeline[Amin.input_t](
+        Amin(dim, keep_dims),
+        data,
+        "TOSA-0.80+BI+u55",
+        {"executorch_exir_dialects_edge__ops_aten_amin_default": 1},
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Amin.test_data)
+def test_amin_u85_BI(test_data: Amin.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = EthosU85PipelineBI[Amin.input_t](
+        Amin(dim, keep_dims),
+        data,
+        Amin.aten_op,
+    )
+    pipeline.run()
+
+
+fvp_xfails = {"rank_4_mult_batches": "MLETORCH-517: Multiple batches not supported"}
+
+
+@common.parametrize("test_data", Amin.test_data, fvp_xfails)
+@common.SkipIfNoCorstone320
+def test_amin_u85_BI_on_fvp(test_data: Amin.input_t):
+    data, dim, keep_dims = test_data
+    pipeline = EthosU85PipelineBI[Amin.input_t](
+        Amin(dim, keep_dims), data, Amin.aten_op, run_on_fvp=True
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Min.test_data)
+def test_min_to_amin_MI(test_data: Min.input_t):
+    data, dim = test_data
+    pipeline = TosaPipelineMI[Min.input_t](
+        Min(dim),
+        data,
+        "torch.ops.aten.min",
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", Min.test_data)
+def test_min_to_amin_BI(test_data: Min.input_t):
+    data, dim = test_data
+    module = Min(dim)
+    pipeline = TosaPipelineBI[Min.input_t](
+        module,
+        data,
+        "torch.ops.aten.amin",
+    )
+    pipeline.run()
+
+
+@pytest.mark.xfail(reason="MLETORCH-718: Quantization of indices in arm_quantizer")
+def test_min_index_not_delegated_BI():
+    data, dim = Min.test_data["rank_4_dim_3"]
+    pipeline = OpNotSupportedPipeline[Min.input_t](
+        MinWithIndex(dim), data, "TOSA-0.80+BI", {}
+    )
+    pipeline.run()
+
+
+def test_min_index_not_delegated_MI():
+    data, dim = Min.test_data["rank_4_dim_3"]
+    pipeline = OpNotSupportedPipeline[Min.input_t](
+        MinWithIndex(dim), data, "TOSA-0.80+MI", {}
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_any.py b/backends/arm/test/ops/test_any.py
new file mode 100644
index 00000000000..d73ee1fda66
--- /dev/null
+++ b/backends/arm/test/ops/test_any.py
@@ -0,0 +1,185 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
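+
+# An illustrative sketch of the three torch.any overloads exercised below
+# (any.default, any.dim and any.dims); the example input and outputs are
+# assumptions for illustration, not part of the test data:
+#
+#     >>> import torch
+#     >>> x = torch.tensor([[True, False], [False, False]])
+#     >>> torch.any(x)  # any.default: reduce over all dims
+#     tensor(True)
+#     >>> torch.any(x, dim=1)  # any.dim: reduce a single dim
+#     tensor([ True, False])
+#     >>> torch.any(x, dim=[0, 1], keepdim=True)  # any.dims: reduce several dims
+#     tensor([[True]])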
+
+
+from typing import List, Tuple
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU85PipelineBI,
+    OpNotSupportedPipeline,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+class AnyDim(torch.nn.Module):
+    aten_op = "torch.ops.aten.any.dim"
+    exir_op = "executorch_exir_dialects_edge__ops_aten_any_dim"
+
+    def forward(self, x: torch.Tensor, dim: int, keepdim: bool):
+        return torch.any(x, dim=dim, keepdim=keepdim)
+
+
+class AnyDims(torch.nn.Module):
+    aten_op = "torch.ops.aten.any.dims"
+    exir_op = "executorch_exir_dialects_edge__ops_aten_any_dims"
+
+    def forward(self, x: torch.Tensor, dim: List[int], keepdim: bool):
+        return torch.any(x, dim=dim, keepdim=keepdim)
+
+
+class AnyReduceAll(torch.nn.Module):
+    aten_op = "torch.ops.aten.any.default"
+    exir_op = "executorch_exir_dialects_edge__ops_aten_any_default"
+
+    def forward(self, x: torch.Tensor):
+        return torch.any(x)
+
+
+input_t1 = Tuple[torch.Tensor]  # Input x
+
+
+test_input: dict[str, input_t1] = {
+    "rank1": (torch.tensor([True, False, False], dtype=torch.bool), 0, True),
+    "rank1_squeeze": (torch.tensor([True, False, False], dtype=torch.bool), -1, False),
+    "rank2": (
+        torch.randint(0, 2, (2, 3), dtype=torch.bool),
+        0,
+        True,
+    ),
+    "rank2_squeeze": (
+        torch.randint(0, 2, (2, 3), dtype=torch.bool),
+        0,
+        False,
+    ),
+    "rank2_dims": (
+        torch.randint(0, 2, (2, 3), dtype=torch.bool),
+        [0, 1],
+        True,
+    ),
+    "rank2_dims_squeeze": (
+        torch.randint(0, 2, (2, 3), dtype=torch.bool),
+        [-2, 1],
+        False,
+    ),
+    "rank3_dims_squeeze": (
+        torch.randint(0, 2, (6, 8, 10), dtype=torch.bool),
+        [1, 2],
+        False,
+    ),
+    "rank4": (
+        torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),
+        1,
+        True,
+    ),
+    "rank4_squeeze": (
+        torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),
+        1,
+        False,
+    ),
+    "rank4_dims": (
+        torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),
+        [0, 2],
+        True,
+    ),
+    "rank4_dims_squeeze": (
+        torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),
+        [1, -1],
+        False,
+    ),
+    "rank1_reduce_all": (torch.tensor([True, False, False], dtype=torch.bool),),
+    "rank2_reduce_all": (torch.randint(0, 2, (2, 3), dtype=torch.bool),),
+    "rank3_reduce_all": (torch.randint(0, 2, (6, 8, 10), dtype=torch.bool),),
+    "rank4_reduce_all": (torch.randint(0, 2, (1, 6, 8, 10), dtype=torch.bool),),
+}
+
+
+test_data = {
+    "any_rank1": (AnyDim(), test_input["rank1"]),
+    "any_rank1_squeeze": (AnyDim(), test_input["rank1_squeeze"]),
+    "any_rank2": (AnyDim(), test_input["rank2"]),
+    "any_rank2_squeeze": (AnyDim(), test_input["rank2_squeeze"]),
+    "any_rank2_dims": (AnyDims(), test_input["rank2_dims"]),
+    "any_rank2_dims_squeeze": (AnyDims(), test_input["rank2_dims_squeeze"]),
+    "any_rank3_dims_squeeze": (AnyDims(), test_input["rank3_dims_squeeze"]),
+    "any_rank4": (AnyDim(), test_input["rank4"]),
+    "any_rank4_squeeze": (AnyDim(), test_input["rank4_squeeze"]),
+    "any_rank4_dims": (AnyDims(), test_input["rank4_dims"]),
+    "any_rank4_dims_squeeze": (AnyDims(), test_input["rank4_dims_squeeze"]),
+    "any_rank1_reduce_all": (AnyReduceAll(), test_input["rank1_reduce_all"]),
+    "any_rank2_reduce_all": (AnyReduceAll(), test_input["rank2_reduce_all"]),
+    "any_rank3_reduce_all": (AnyReduceAll(), test_input["rank3_reduce_all"]),
+    "any_rank4_reduce_all": (AnyReduceAll(), test_input["rank4_reduce_all"]),
+}
+
+
+fvp_xfails = {
+    "any_rank1": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.",
+    "any_rank1_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.",
"any_rank2": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank2_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank2_dims": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank2_dims_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank3_dims_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank4": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank4_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank4_dims": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank4_dims_squeeze": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank1_reduce_all": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank2_reduce_all": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank3_reduce_all": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "any_rank4_reduce_all": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", +} + + +@common.parametrize("test_data", test_data) +def test_any_tosa_MI(test_data: input_t1): + op, test_input = test_data + pipeline = TosaPipelineMI[input_t1](op, test_input, op.aten_op, op.exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_any_tosa_BI(test_data: input_t1): + op, test_input = test_data + pipeline = TosaPipelineBI[input_t1](op, test_input, op.aten_op, op.exir_op) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_logical_u55_BI(test_data: input_t1): + # Tests that we don't delegate these ops since they are not supported on U55. + op, test_input = test_data + pipeline = OpNotSupportedPipeline[input_t1]( + op, test_input, "TOSA-0.80+BI+u55", {op.exir_op: 1} + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_floor_u85_BI(test_data: input_t1): + op, test_input = test_data + pipeline = EthosU85PipelineBI[input_t1]( + op, test_input, op.aten_op, op.exir_op, run_on_fvp=False + ) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() + + +@common.parametrize("test_data", test_data, fvp_xfails) +@common.SkipIfNoCorstone320 +def test_floor_u85_BI_on_fvp(test_data: input_t1): + op, test_input = test_data + pipeline = EthosU85PipelineBI[input_t1]( + op, test_input, op.aten_op, op.exir_op, run_on_fvp=True + ) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py deleted file mode 100644 index fa4662e54f0..00000000000 --- a/backends/arm/test/ops/test_avg_pool.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -from typing import Tuple - -import pytest - -import torch -from executorch.backends.arm.quantizer.arm_quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, -) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - - -test_data_suite = [ - # (test_name, test_data, [kernel_size, stride, padding]) - ("zeros", torch.zeros(1, 16, 50, 32), [4, 2, 0]), - ("ones", torch.zeros(1, 16, 50, 32), [4, 2, 0]), - ("rand", torch.rand(1, 16, 50, 32), [4, 2, 0]), - ("randn", torch.randn(1, 16, 50, 32), [4, 2, 0]), -] - - -class TestAvgPool2d(unittest.TestCase): - """Tests AvgPool2d.""" - - class AvgPool2d(torch.nn.Module): - def __init__( - self, - kernel_size: int | Tuple[int, int], - stride: int | Tuple[int, int], - padding: int | Tuple[int, int], - ): - super().__init__() - self.avg_pool_2d = torch.nn.AvgPool2d( - kernel_size=kernel_size, stride=stride, padding=padding - ) - - def forward(self, x): - return self.avg_pool_2d(x) - - def _test_avgpool2d_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.avg_pool2d.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_avgpool2d_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_avgpool2d_tosa_ethos_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.tensor], - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - 
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_avgpool2d_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_avgpool2d_tosa_MI_pipeline( - self.AvgPool2d(*model_params), (test_data,) - ) - - @parameterized.expand(test_data_suite) - def test_avgpool2d_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_avgpool2d_tosa_BI_pipeline( - self.AvgPool2d(*model_params), (test_data,) - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_avgpool2d_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_avgpool2d_tosa_ethos_BI_pipeline( - self.AvgPool2d(*model_params), - common.get_u55_compile_spec(), - (test_data,), - ) - - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_avgpool2d_tosa_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params: int | Tuple[int, int], - ): - self._test_avgpool2d_tosa_ethos_BI_pipeline( - self.AvgPool2d(*model_params), - common.get_u85_compile_spec(), - (test_data,), - ) - - reject_data_suite = [ - (AvgPool2d(1, 1, 0), torch.rand(2, 5, 5, 5)), - (AvgPool2d((2, 9), 1, 1), torch.rand(1, 16, 5, 32)), - (AvgPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), - (AvgPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)), - (AvgPool2d((800, 90), 1, 0), torch.rand(1, 16, 850, 100)), - ] - - @parameterized.expand(reject_data_suite) - def test_reject_avgpool2d_u55_BI( - self, - module: torch.nn.Module, - test_data: torch.tensor, - ): - compile_spec = common.get_u55_compile_spec() - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - - ( - ArmTester( - module, - example_inputs=(test_data,), - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.avg_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 0}) - ) diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py new file mode 100644 index 00000000000..2a50ef38834 --- /dev/null +++ b/backends/arm/test/ops/test_avg_pool2d.py @@ -0,0 +1,184 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Copyright 2024-2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
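+
+# A small worked example (illustrative only) of the output-size arithmetic
+# behind the pooling configurations below, using
+# out = floor((in + 2 * pad - kernel) / stride) + 1 per spatial dim:
+#
+#     >>> import torch
+#     >>> m = torch.nn.AvgPool2d(kernel_size=4, stride=2, padding=0)
+#     >>> m(torch.rand(1, 16, 50, 32)).shape  # (50-4)//2+1 = 24, (32-4)//2+1 = 15
+#     torch.Size([1, 16, 24, 15])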
+ + +from typing import Tuple + +import torch + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + + +aten_op = "torch.ops.aten.avg_pool2d.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default" +input_t = Tuple[torch.Tensor] + + +class AvgPool2d(torch.nn.Module): + def __init__( + self, + kernel_size: int | Tuple[int, int], + stride: int | Tuple[int, int], + padding: int | Tuple[int, int], + ): + super().__init__() + self.avg_pool_2d = torch.nn.AvgPool2d( + kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x): + return self.avg_pool_2d(x) + + +test_modules = { + "zeros": (AvgPool2d(4, 2, 0), (torch.zeros(1, 16, 50, 32),)), + "ones": (AvgPool2d(4, 2, 0), (torch.ones(1, 16, 50, 32),)), + "rand": (AvgPool2d(4, 2, 0), (torch.rand(1, 16, 50, 32),)), + "randn": (AvgPool2d(4, 2, 0), (torch.randn(1, 16, 50, 32),)), + "kernel_3x3_stride_1_pad_1": ( + AvgPool2d((3, 3), (1, 1), 1), + (torch.rand(1, 16, 50, 32),), + ), + "kernel_3x2_stride_1x2_pad_1x0": ( + AvgPool2d((3, 2), (1, 2), (1, 0)), + (torch.rand(1, 16, 50, 32),), + ), + "kernel_4x6_stride_1x2_pad_2x3": ( + AvgPool2d((4, 6), (1, 2), (2, 3)), + (torch.rand(1, 16, 50, 32),), + ), +} + + +@common.parametrize("test_module", test_modules) +def test_avgpool2d_tosa_MI(test_module): + model, input_tensor = test_module + + pipeline = TosaPipelineMI[input_t](model, input_tensor, aten_op, exir_op) + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_avgpool2d_tosa_BI(test_module): + model, input_tensor = test_module + + pipeline = TosaPipelineBI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_avgpool2d_u55_BI(test_module): + model, input_tensor = test_module + + pipeline = EthosU55PipelineBI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_avgpool2d_u85_BI(test_module): + model, input_tensor = test_module + + pipeline = EthosU85PipelineBI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_module", test_modules) +@common.SkipIfNoCorstone300 +def test_avgpool2d_u55_BI_on_fvp(test_module): + model, input_tensor = test_module + + pipeline = EthosU55PipelineBI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +@common.SkipIfNoCorstone320 +def test_avgpool2d_u85_BI_on_fvp(test_module): + model, input_tensor = test_module + + pipeline = EthosU85PipelineBI[input_t]( + model, + input_tensor, + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) + + pipeline.run() + + +reject_modules = { + "kernel_1x1_stride_1_pad_0": (AvgPool2d(1, 1, 0), 
torch.rand(2, 5, 5, 5)),
+    "kernel_2x9_stride_1_pad_1": (AvgPool2d((2, 9), 1, 1), torch.rand(1, 16, 5, 32)),
+    "kernel_1x1_stride_4_pad_0": (AvgPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)),
+    "kernel_1x257_stride_1_pad_0_large": (
+        AvgPool2d((1, 257), 1, 0),
+        torch.rand(1, 16, 5, 300),
+    ),
+    "kernel_800x90_stride_1_pad_0_extreme": (
+        AvgPool2d((800, 90), 1, 0),
+        torch.rand(1, 16, 850, 100),
+    ),
+}
+
+
+@common.parametrize("reject_module", reject_modules)
+def test_reject_avgpool2d(reject_module):
+
+    model, test_data = reject_module
+
+    pipeline = OpNotSupportedPipeline[input_t](
+        module=model,
+        test_data=(test_data,),
+        tosa_version="TOSA-0.80+BI",
+        non_delegated_ops={},
+        n_expected_delegates=0,
+    )
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py
index 502f1698612..360429d3d6c 100644
--- a/backends/arm/test/ops/test_batch_norm.py
+++ b/backends/arm/test/ops/test_batch_norm.py
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -647,7 +647,7 @@ def _test_batchnorm2d_u55_BI_pipeline(
         )
 
     @parameterized.expand(test_data_suite)
-    def test_batchnorm2d_tosa_MI(
+    def test_native_batch_norm_legit_no_training_tosa_MI(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -665,7 +665,7 @@ def test_batchnorm2d_tosa_MI(
     # Expected to fail since not implemented
     @parameterized.expand(test_no_stats_data_suite)
     @unittest.expectedFailure
-    def test_batchnorm2d_no_stats_tosa_MI(
+    def test_native_batch_norm_legit_tosa_MI(
         self,
         test_name: str,
         test_data: torch.Tensor,
@@ -686,7 +686,7 @@ def test_batchnorm2d_no_stats_tosa_MI(
     @unittest.skip(
         reason="Expected to fail since TOSAQuantizer (for BI) cannot quantize a BatchNorm layer"
     )
-    def test_batchnorm2d_tosa_BI(
+    def test_native_batch_norm_legit_no_training_tosa_BI(
        self,
        test_name: str,
        test_data: torch.Tensor,
@@ -708,7 +708,7 @@ def test_batchnorm2d_tosa_BI(
         reason="Expected to fail since EthosUQuantizer cannot quantize a BatchNorm layer"
     )
     @unittest.expectedFailure
-    def test_batchnorm2d_u55_BI(
+    def test_native_batch_norm_legit_no_training_u55_BI(
         self,
         test_name: str,
         test_data: torch.Tensor,
diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index 46b6eb6d011..247f5a166b8 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -152,8 +152,7 @@ def test_bmm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]
 
     @parameterized.expand(BMM.test_data_generators)
     @pytest.mark.corstone_fvp
-    @unittest.expectedFailure
-    def test_bmm_u55_BI_xfails(self, test_data_generator: Callable[[], Tuple]):
+    def test_bmm_u55_BI(self, test_data_generator: Callable[[], Tuple]):
         test_data = test_data_generator()
         self._test_bmm_ethosu_BI_pipeline(
             self.BMM(), common.get_u55_compile_spec(), test_data
@@ -167,13 +166,10 @@ def test_bmm_u85_BI(self, test_data_generator: Callable[[], Tuple]):
             self.BMM(), common.get_u85_compile_spec(), test_data
         )
 
-    # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
+    # Expected to fail on FVP as TOSA.MATMUL is not supported on U55
     @parameterized.expand(BMMSingleInput.test_data_generators)
     @pytest.mark.corstone_fvp
-    @unittest.expectedFailure
-    def 
test_bmm_single_input_u55_BI_xfails( - self, test_data_generator: Callable[[], Tuple] - ): + def test_bmm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]): test_data = test_data_generator() self._test_bmm_ethosu_BI_pipeline( self.BMMSingleInput(), common.get_u55_compile_spec(), test_data diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index a1613d1d04b..63423b9e993 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -111,7 +111,6 @@ def _test_cat_ethosu_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .dump_artifact() .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py index f379732343e..368f7967433 100644 --- a/backends/arm/test/ops/test_clamp.py +++ b/backends/arm/test/ops/test_clamp.py @@ -1,167 +1,159 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from numbers import Number from typing import Tuple, Union -import pytest import torch -from executorch.backends.arm.quantizer.arm_quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, ) -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.xnnpack.test.tester.tester import Quantize -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -test_data_suite = [ - # (test_name, test_data, min, max) - ("rank_1", torch.rand(10) * 2, -1.0, 1.0), - ("rank_2", torch.rand(1, 35), 0.5, 0.8), - ("rank_3", torch.ones(1, 10, 10), -1, -1), - ("rank_4", torch.rand(1, 10, 10, 1) * 2, -0.1, 2.0), - ("rank_4_mixed_min_max_dtype", torch.rand(1, 10, 10, 5) + 10, 8.0, 10), - ("rank_4_no_min", torch.rand(1, 10, 10, 1) * 10, None, 5), - ("rank_4_no_max", torch.rand(1, 10, 10, 1) - 3, -3.3, None), -] - - -class TestClamp(unittest.TestCase): - """Tests Clamp Operator.""" - - class Clamp(torch.nn.Module): - def __init__( - self, - min: Union[torch.Tensor, Number, None], - max: Union[torch.Tensor, Number, None], - ): - super().__init__() - - self.clamp_min = min - self.clamp_max = max - - def forward(self, x): - return torch.clamp(x, self.clamp_min, self.clamp_max) - - def _test_clamp_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.clamp.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_clamp_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - tosa_spec = 
TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.clamp.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_clamp_ethos_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - quantizer = EthosUQuantizer(compile_spec).set_io( - get_symmetric_quantization_config() - ) - tester = ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.clamp.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(test_data_suite) - def test_clamp_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - min: Union[torch.Tensor, Number, None], - max: Union[torch.Tensor, Number, None], - ): - self._test_clamp_tosa_MI_pipeline(self.Clamp(min, max), (test_data,)) - @parameterized.expand(test_data_suite) - def test_clamp_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - min: Union[torch.Tensor, Number, None], - max: Union[torch.Tensor, Number, None], - ): - self._test_clamp_tosa_BI_pipeline(self.Clamp(min, max), (test_data,)) - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_clamp_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - min: Union[torch.Tensor, Number, None], - max: Union[torch.Tensor, Number, None], - ): - self._test_clamp_ethos_pipeline( - common.get_u55_compile_spec(), self.Clamp(min, max), (test_data,) - ) +aten_op = "torch.ops.aten.clamp.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_clamp_default" +input_t = Tuple[torch.Tensor] + +test_data_suite = { + # test_name: (test_data, min, max) + "rank_1": (torch.rand(10) * 2, -1.0, 1.0), + "rank_2": (torch.rand(1, 35), 0.5, 0.8), + "rank_3": (torch.ones(1, 10, 10), -1, -1), + "rank_4": (torch.rand(1, 10, 10, 1) * 2, -0.1, 2.0), + "rank_4_mixed_min_max_dtype": (torch.rand(1, 10, 10, 5) + 10, 8.0, 10), + "rank_4_no_min": (torch.rand(1, 10, 10, 1) * 10, None, 5), + "rank_4_no_max": (torch.rand(1, 10, 10, 1) - 3, -3.3, None), +} - @parameterized.expand(test_data_suite) - @pytest.mark.corstone_fvp - def test_clamp_tosa_u85_BI( + +class Clamp(torch.nn.Module): + def __init__( self, - test_name: str, - test_data: torch.Tensor, - min: Union[torch.Tensor, Number, None], - max: Union[torch.Tensor, Number, None], + clamp_min: Union[torch.Tensor, Number, None], + clamp_max: Union[torch.Tensor, Number, None], ): - self._test_clamp_ethos_pipeline( - common.get_u85_compile_spec(), self.Clamp(min, max), (test_data,) - ) + super().__init__() + + self.clamp_min = clamp_min + self.clamp_max = clamp_max + + def forward(self, x): + return torch.clamp(x, self.clamp_min, 
self.clamp_max) + + +@common.parametrize("test_data", test_data_suite) +def test_clamp_tosa_MI(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = TosaPipelineMI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_clamp_tosa_BI(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = TosaPipelineBI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + symmetric_io_quantization=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_clamp_u55_BI(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = EthosU55PipelineBI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_clamp_u85_BI(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = EthosU85PipelineBI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoCorstone300 +def test_clamp_u55_BI_on_fvp(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = EthosU55PipelineBI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoCorstone320 +def test_clamp_u85_BI_on_fvp(test_data): + + input_tensor, min_val, max_val = test_data + model = Clamp(min_val, max_val) + + pipeline = EthosU85PipelineBI[input_t]( + model, + (input_tensor,), + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + + pipeline.run() diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 543ae9ac40f..2aad62ece24 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
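# A minimal illustrative sketch of the clone semantics the rewritten tests
# below exercise (assumed example, not taken from the test suite itself):
# clone copies the tensor's data, so mutating the copy leaves the original
# untouched.
#
#     >>> import torch
#     >>> a = torch.ones(3)
#     >>> b = a.clone()
#     >>> b[0] = 0.0
#     >>> a
#     tensor([1., 1., 1.])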
@@ -8,78 +7,135 @@ # Tests the clone op which copies the data of the input tensor (possibly with new data format) # -import unittest from typing import Tuple +import pytest import torch -from executorch.backends.arm.quantizer.arm_quantizer import ( - get_symmetric_quantization_config, - TOSAQuantizer, -) from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.backends.arm.tosa_specification import TosaSpecification - -from executorch.backends.xnnpack.test.tester.tester import Quantize - -from parameterized import parameterized - - -class TestSimpleClone(unittest.TestCase): - """Tests clone.""" - - class Clone(torch.nn.Module): - sizes = [10, 15, 50, 100] - test_parameters = [(torch.ones(n),) for n in sizes] - - def __init__(self): - super().__init__() - - def forward(self, x: torch.Tensor): - x = x.clone() - return x - - def _test_clone_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: torch.Tensor - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.clone.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_clone_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") - compile_spec = common.get_tosa_compile_spec(tosa_spec) - quantizer = TOSAQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) - ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize(Quantize(quantizer, get_symmetric_quantization_config())) - .export() - .check_count({"torch.ops.aten.clone.default": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - @parameterized.expand(Clone.test_parameters) - def test_clone_tosa_MI(self, test_tensor: torch.Tensor): - self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) - - @parameterized.expand(Clone.test_parameters) - def test_clone_tosa_BI(self, test_tensor: torch.Tensor): - self._test_clone_tosa_BI_pipeline(self.Clone(), (test_tensor,)) + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +aten_op = "torch.ops.aten.clone.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_clone_default" + +input_t = Tuple[torch.Tensor] + + +class Clone(torch.nn.Module): + """A simple module that clones an input tensor.""" + + def forward(self, x: torch.Tensor): + return x.clone() + + +test_data_suite = { + "ones_1D_10": (torch.ones(10),), + "ones_1D_50": (torch.ones(50),), + "rand_1D_20": (torch.rand(20),), + "rand_2D_10x10": (torch.rand(10, 10),), + "rand_3D_5x5x5": (torch.rand(5, 5, 5),), + "rand_4D_2x3x4x5": (torch.rand(2, 3, 4, 5),), + "large_tensor": (torch.rand(1000),), +} + + +@common.parametrize("test_data", test_data_suite) +def test_clone_tosa_MI(test_data: Tuple[torch.Tensor]): + + pipeline = TosaPipelineMI[input_t]( + Clone(), + test_data, + aten_op, + exir_op, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_clone_tosa_BI(test_data): + pipeline = TosaPipelineBI[input_t]( + Clone(), + test_data, + aten_op, 
+ exir_op, + symmetric_io_quantization=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) +def test_clone_u55_BI(test_data): + pipeline = EthosU55PipelineBI[input_t]( + Clone(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) +def test_clone_u85_BI(test_data): + pipeline = EthosU85PipelineBI[input_t]( + Clone(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) +@common.SkipIfNoCorstone300 +def test_clone_u55_BI_on_fvp(test_data): + pipeline = EthosU55PipelineBI[input_t]( + Clone(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.xfail( + reason="Empty subgraph leads to Vela compilation failure. See: https://jira.arm.com/browse/MLBEDSW-10477" +) +@common.SkipIfNoCorstone320 +def test_clone_u85_BI_on_fvp(test_data): + pipeline = EthosU85PipelineBI[input_t]( + Clone(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + symmetric_io_quantization=True, + ) + + pipeline.run() diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 92da09a5ef3..a1ba23ac73a 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -1,20 +1,24 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import List, Tuple, Union -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.conv1d.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_convolution_default" + +input_t = Tuple[torch.Tensor] class Conv1d(torch.nn.Module): @@ -245,107 +249,93 @@ def forward(self, x): batches=1, ) -# Shenanigan to get a nicer output when test fails. 
With unittest it looks like: -# FAIL: test_conv1d_tosa_BI_2_3x3_1x3x12x12_st2_pd1 -testsuite = [ - ("2_3x2x40_nobias", conv1d_2_3x2x40_nobias), - ("3_1x3x256_st1", conv1d_3_1x3x256_st1), - ("3_1x3x12_st2_pd1", conv1d_3_1x3x12_st2_pd1), - ("1_1x2x128_st1", conv1d_1_1x2x128_st1), - ("2_1x2x14_st2", conv1d_2_1x2x14_st2), - ("5_3x2x128_st1", conv1d_5_3x2x128_st1), - ("3_1x3x224_st2_pd1", conv1d_3_1x3x224_st2_pd1), - ("7_1x3x16_st2_pd1_dl2_needs_adjust_pass", conv1d_7_1x3x16_st2_pd1_dl2), - ("7_1x3x15_st1_pd0_dl1_needs_adjust_pass", conv1d_7_1x3x15_st1_pd0_dl1), - ("5_1x3x14_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x14_st5_pd0_dl1), - ("5_1x3x9_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x9_st5_pd0_dl1), - ("two_conv1d_nobias", two_conv1d_nobias), - ("two_conv1d", two_conv1d), -] - - -class TestConv1D(unittest.TestCase): - def _test_conv1d_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_conv1d_tosa_BI_pipeline( - self, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_conv1d_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) - .quantize() - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(testsuite) - def test_conv1d_tosa_MI(self, test_name, model): - self._test_conv1d_tosa_MI_pipeline(model, model.get_inputs()) - - @parameterized.expand(testsuite) - def test_conv1d_tosa_BI(self, test_name, model): - self._test_conv1d_tosa_BI_pipeline(model, model.get_inputs()) - - @parameterized.expand(testsuite) - @pytest.mark.corstone_fvp - def test_conv1d_u55_BI(self, test_name, model): - self._test_conv1d_ethosu_BI_pipeline( - model, common.get_u55_compile_spec(), model.get_inputs() - ) - - @parameterized.expand(testsuite) - @pytest.mark.corstone_fvp - def test_conv1d_u85_BI(self, test_name, model): - self._test_conv1d_ethosu_BI_pipeline( - model, common.get_u85_compile_spec(), model.get_inputs() - ) +test_modules = { + "2_3x2x40_nobias": conv1d_2_3x2x40_nobias, + "3_1x3x256_st1": conv1d_3_1x3x256_st1, + "3_1x3x12_st2_pd1": conv1d_3_1x3x12_st2_pd1, + "1_1x2x128_st1": conv1d_1_1x2x128_st1, + "2_1x2x14_st2": conv1d_2_1x2x14_st2, + "5_3x2x128_st1": conv1d_5_3x2x128_st1, + "3_1x3x224_st2_pd1": conv1d_3_1x3x224_st2_pd1, + "7_1x3x16_st2_pd1_dl2_needs_adjust_pass": 
conv1d_7_1x3x16_st2_pd1_dl2, + "7_1x3x15_st1_pd0_dl1_needs_adjust_pass": conv1d_7_1x3x15_st1_pd0_dl1, + "5_1x3x14_st5_pd0_dl1_needs_adjust_pass": conv1d_5_1x3x14_st5_pd0_dl1, + "5_1x3x9_st5_pd0_dl1_needs_adjust_pass": conv1d_5_1x3x9_st5_pd0_dl1, + "two_conv1d_nobias": two_conv1d_nobias, + "two_conv1d": two_conv1d, +} + + +@common.parametrize("test_module", test_modules) +def test_convolution_1d_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_convolution_1d_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_convolution_1d_u55_BI(test_module): + pipeline = EthosU55PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +def test_convolution_1d_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +@common.SkipIfNoCorstone300 +def test_convolution_1d_u55_BI_on_fvp(test_module): + pipeline = EthosU55PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_module", test_modules) +@common.SkipIfNoCorstone320 +def test_convolution_1d_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 827e6dfffa3..8083b2ecf71 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -328,7 +328,7 @@ def forward(self, x): ) # Shenanigan to get a nicer output when test fails. 
With unittest it looks like: -# FAIL: test_conv2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1 +# FAIL: test_convolution_2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1 test_modules = { "2x2_3x2x40x40_nobias": conv2d_2x2_3x2x40x40_nobias, "3x3_1x3x256x256_st1": conv2d_3x3_1x3x256x256_st1, @@ -358,7 +358,7 @@ def forward(self, x): @common.parametrize("test_module", test_modules) -def test_conv2d_tosa_MI(test_module): +def test_convolution_2d_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op ) @@ -366,16 +366,16 @@ def test_conv2d_tosa_MI(test_module): @common.parametrize("test_module", test_modules) -def test_conv2d_tosa_BI(test_module): +def test_convolution_2d_tosa_BI(test_module): pipeline = TosaPipelineBI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op ) - pipeline.change_args("run_method_and_compare_outputs.0", qtol=1) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() @common.parametrize("test_module", test_modules) -def test_conv2d_u55_BI(test_module): +def test_convolution_2d_u55_BI(test_module): pipeline = EthosU55PipelineBI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False ) @@ -383,7 +383,7 @@ def test_conv2d_u55_BI(test_module): @common.parametrize("test_module", test_modules) -def test_conv2d_u85_BI(test_module): +def test_convolution_2d_u85_BI(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=False ) @@ -392,7 +392,7 @@ def test_conv2d_u85_BI(test_module): @common.parametrize("test_module", test_modules, fvp_xfails) @common.SkipIfNoCorstone300 -def test_conv2d_u55_BI_on_fvp(test_module): +def test_convolution_2d_u55_BI_on_fvp(test_module): pipeline = EthosU55PipelineBI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True ) @@ -401,7 +401,7 @@ def test_conv2d_u55_BI_on_fvp(test_module): @common.parametrize("test_module", test_modules, fvp_xfails) @common.SkipIfNoCorstone320 -def test_conv2d_u85_BI_on_fvp(test_module): +def test_convolution_2d_u85_BI_on_fvp(test_module): pipeline = EthosU85PipelineBI[input_t]( test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True ) @@ -443,7 +443,7 @@ def test_conv2d_u85_BI_on_fvp(test_module): @common.parametrize("module", reject_suite) -def test_reject_conv2d_u55_BI( +def test_reject_convolution_2d_u55_BI( module: Conv2d, ): ( diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py index 263a042ea1c..329f65dfead 100644 --- a/backends/arm/test/ops/test_eq.py +++ b/backends/arm/test/ops/test_eq.py @@ -1,145 +1,136 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
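A note on the test_conv2d.py changes above, before the test_eq.py body: the BI pipelines now address the comparison stage by name, pipeline.change_args("run_method_and_compare_outputs", qtol=1), instead of the old ".0" index suffix, and qtol=1 permits quantized outputs to deviate from the reference by one quantization step. A minimal self-contained sketch of what such a comparison amounts to; within_qtol is an illustrative helper, not part of the ArmTester API:

import torch

def within_qtol(reference: torch.Tensor, actual: torch.Tensor, qtol: int = 1) -> bool:
    # Widen to int32 so the difference cannot overflow int8, then allow each
    # element to be off by at most `qtol` quantization steps.
    diff = (reference.to(torch.int32) - actual.to(torch.int32)).abs()
    return int(diff.max()) <= qtol

reference = torch.tensor([10, 20, 30], dtype=torch.int8)
actual = torch.tensor([11, 20, 29], dtype=torch.int8)  # off by one step
assert within_qtol(reference, actual, qtol=1)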
-import unittest +from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - # (test_name, input, other,) See torch.eq() for info - ( - "op_eq_rank1_ones", - torch.ones(5), - torch.ones(5), - ), - ( - "op_eq_rank2_rand", - torch.rand(4, 5), - torch.rand(1, 5), - ), - ( - "op_eq_rank3_randn", - torch.randn(10, 5, 2), - torch.randn(10, 5, 2), - ), - ( - "op_eq_rank4_randn", - torch.randn(3, 2, 2, 2), - torch.randn(3, 2, 2, 2), - ), -] - - -class TestEqual(unittest.TestCase): - class Equal(torch.nn.Module): - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return input_ == other_ - - def _test_eq_tosa_MI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .export() - .check_count({"torch.ops.aten.eq.Tensor": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_eq_tosa_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.eq.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(test_data_suite) - def test_eq_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_eq_tosa_MI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+MI"), self.Equal(), test_data - ) - @parameterized.expand(test_data_suite) - def test_eq_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_eq_tosa_BI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+BI"), self.Equal(), test_data - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_eq_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_eq_tosa_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), - self.Equal(), - test_data, - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_eq_u85_BI( +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.eq.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_eq_Tensor" + +input_t = Tuple[torch.Tensor] + + +class Equal(torch.nn.Module): + def __init__(self, input, other): + super().__init__() + self.input_ = input + self.other_ = other + + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_eq_tosa_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), - self.Equal(), - test_data, - ) + return input_ == other_ + + def 
get_inputs(self): + return (self.input_, self.other_) + + +op_eq_rank1_ones = Equal( + torch.ones(5), + torch.ones(5), +) +op_eq_rank2_rand = Equal( + torch.rand(4, 5), + torch.rand(1, 5), +) +op_eq_rank3_randn = Equal( + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), +) +op_eq_rank4_randn = Equal( + torch.randn(3, 2, 2, 2), + torch.randn(3, 2, 2, 2), +) + +test_data_common = { + "eq_rank1_ones": op_eq_rank1_ones, + "eq_rank2_rand": op_eq_rank2_rand, + "eq_rank3_randn": op_eq_rank3_randn, + "eq_rank4_randn": op_eq_rank4_randn, +} + + +@common.parametrize("test_module", test_data_common) +def test_eq_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_eq_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_eq_u55_BI(test_module): + # EQUAL is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_eq_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +@pytest.mark.skip(reason="The same as test_eq_u55_BI") +def test_eq_u55_BI_on_fvp(test_module): + # EQUAL is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_common, + xfails={"eq_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, +) +@common.SkipIfNoCorstone320 +def test_eq_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index ff6cacd1f97..a6193f6ea08 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -1,140 +1,136 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
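The test_eq.py migration above introduces the pattern reused by the remaining comparison suites in this patch: each module carries its own example inputs and exposes them via get_inputs(), so every entry of the parametrized dict is a ready-to-run module. A quick standalone sanity check of that pattern (assumes the Equal module defined above is in scope):

import torch

module = Equal(torch.ones(5), torch.ones(5))
result = module(*module.get_inputs())  # elementwise torch.eq on the stored pair
assert bool(result.all())  # identical inputs compare equal everywhere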
-import unittest +from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - # (test_name, input, other,) See torch.ge() for info - ( - "op_ge_rank1_ones", - torch.ones(5), - torch.ones(5), - ), - ( - "op_ge_rank2_rand", - torch.rand(4, 5), - torch.rand(1, 5), - ), - ( - "op_ge_rank3_randn", - torch.randn(10, 5, 2), - torch.randn(10, 5, 2), - ), - ( - "op_ge_rank4_randn", - torch.randn(3, 2, 2, 2), - torch.randn(3, 2, 2, 2), - ), -] - - -class TestGreaterEqual(unittest.TestCase): - class GreaterEqual(torch.nn.Module): - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return input_ >= other_ - - def _test_ge_tosa_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.ge.Tensor": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_ge_tosa_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.ge.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(test_data_suite) - def test_ge_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_ge_tosa_pipeline(self.GreaterEqual(), test_data) - @parameterized.expand(test_data_suite) - def test_ge_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_ge_tosa_BI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+BI"), self.GreaterEqual(), test_data - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_ge_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_ge_tosa_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), - self.GreaterEqual(), - test_data, - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_ge_u85_BI( +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.ge.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_ge_Tensor" + +input_t = Tuple[torch.Tensor] + + +class GreaterEqual(torch.nn.Module): + def __init__(self, input, other): + super().__init__() + self.input_ = input + self.other_ = other + + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_ge_tosa_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), - self.GreaterEqual(), - test_data, - ) + return input_ >= other_ + + def get_inputs(self): 
+ return (self.input_, self.other_) + + +op_ge_rank1_ones = GreaterEqual( + torch.ones(5), + torch.ones(5), +) +op_ge_rank2_rand = GreaterEqual( + torch.rand(4, 5), + torch.rand(1, 5), +) +op_ge_rank3_randn = GreaterEqual( + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), +) +op_ge_rank4_randn = GreaterEqual( + torch.randn(3, 2, 2, 2), + torch.randn(3, 2, 2, 2), +) + +test_data_common = { + "ge_rank1_ones": op_ge_rank1_ones, + "ge_rank2_rand": op_ge_rank2_rand, + "ge_rank3_randn": op_ge_rank3_randn, + "ge_rank4_randn": op_ge_rank4_randn, +} + + +@common.parametrize("test_module", test_data_common) +def test_ge_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_ge_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_ge_u55_BI(test_module): + # GREATER_EQUAL is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_ge_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +@pytest.mark.skip(reason="The same as test_ge_u55_BI") +def test_ge_u55_BI_on_fvp(test_module): + # GREATER_EQUAL is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_common, + xfails={"ge_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, +) +@common.SkipIfNoCorstone320 +def test_ge_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 33899f64492..2095f781bdb 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -1,140 +1,136 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
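The ge_rank4_randn FVP xfail above (and the matching rank-4 xfails in the other comparison suites) comes from the output check ending up subtracting boolean tensors, which PyTorch forbids. A short demonstration of the underlying limitation (a sketch of the failure mode, not the tester's exact code path):

import torch

a = torch.tensor([True, False])
b = torch.tensor([True, True])
try:
    _ = a - b  # RuntimeError: subtraction with two bool tensors is not supported
except RuntimeError as err:
    print(f"as expected: {err}")
print(a ^ b)  # logical_xor is the supported way to diff boolean outputs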
-import unittest +from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - # (test_name, input, other,) See torch.gt() for info - ( - "op_gt_rank1_ones", - torch.ones(5), - torch.ones(5), - ), - ( - "op_gt_rank2_rand", - torch.rand(4, 5), - torch.rand(1, 5), - ), - ( - "op_gt_rank3_randn", - torch.randn(10, 5, 2), - torch.randn(10, 5, 2), - ), - ( - "op_gt_rank4_randn", - torch.randn(3, 2, 2, 2), - torch.randn(3, 2, 2, 2), - ), -] - - -class TestGreater(unittest.TestCase): - class Greater(torch.nn.Module): - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return input_ > other_ - - def _test_gt_tosa_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.gt.Tensor": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_gt_tosa_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.gt.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(test_data_suite) - def test_gt_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_gt_tosa_pipeline(self.Greater(), test_data) - @parameterized.expand(test_data_suite) - def test_gt_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_gt_tosa_BI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+BI"), self.Greater(), test_data - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_gt_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_gt_tosa_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), - self.Greater(), - test_data, - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_gt_u85_BI( +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.gt.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_gt_Tensor" + +input_t = Tuple[torch.Tensor] + + +class Greater(torch.nn.Module): + def __init__(self, input, other): + super().__init__() + self.input_ = input + self.other_ = other + + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_gt_tosa_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), - self.Greater(), - test_data, - ) + return input_ > other_ + + def get_inputs(self): + return (self.input_, self.other_) + 
+ +op_gt_rank1_ones = Greater( + torch.ones(5), + torch.ones(5), +) +op_gt_rank2_rand = Greater( + torch.rand(4, 5), + torch.rand(1, 5), +) +op_gt_rank3_randn = Greater( + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), +) +op_gt_rank4_randn = Greater( + torch.randn(3, 2, 2, 2), + torch.randn(3, 2, 2, 2), +) + +test_data_common = { + "gt_rank1_ones": op_gt_rank1_ones, + "gt_rank2_rand": op_gt_rank2_rand, + "gt_rank3_randn": op_gt_rank3_randn, + "gt_rank4_randn": op_gt_rank4_randn, +} + + +@common.parametrize("test_module", test_data_common) +def test_gt_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_gt_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_gt_u55_BI(test_module): + # GREATER is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_gt_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +@pytest.mark.skip(reason="The same as test_gt_u55_BI") +def test_gt_u55_BI_on_fvp(test_module): + # GREATER is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_common, + xfails={"gt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, +) +@common.SkipIfNoCorstone320 +def test_gt_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 82f0af8dcf7..4e91554e05a 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -1,212 +1,123 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
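Before the test_layer_norm.py body below: test_eq.py, test_ge.py and test_gt.py above, together with test_le.py and test_lt.py further down, all follow the same migration, and their comments pin down which TOSA operator each aten comparison lowers to. None of these TOSA operators exist on the U55, hence the OpNotSupportedPipeline checks. Summarized as a lookup (the operand swap for le/lt is inferred from the comments, not stated in them):

# Compiled from the comments in these test diffs; illustrative only.
TOSA_COMPARISON_LOWERING = {
    "aten.eq.Tensor": "EQUAL",
    "aten.ge.Tensor": "GREATER_EQUAL",
    "aten.gt.Tensor": "GREATER",
    "aten.le.Tensor": "GREATER_EQUAL",  # presumably with operands swapped
    "aten.lt.Tensor": "GREATER",        # presumably with operands swapped
}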
-import unittest - -from typing import List, Tuple, Union - -import pytest +from typing import List, Union import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - - -test_data_suite = [ - # (test_name, test_data, [normalized_shape, eps, elementwise_affine, has_bias] ) - ("randn_last_dim", torch.randn(1, 5, 5, 5), [[5]]), - ("rand_last_two_dims", torch.rand(1, 5, 5, 5), [[5, 5]]), - ( - "rand_last_two_dims_not_elementwise_affine", - torch.rand(1, 5, 5, 5), - [[5, 5], 1e-5, False], - ), - ( - "rand_last_two_dims_not_elementwise_affine_no_bias", - torch.rand(1, 5, 5, 5), - [[5, 5], 1e-5, False, False], - ), - ("randn_last_three_dims", torch.randn(1, 15, 10, 5), [[15, 10, 5]]), - ( - "randn_last_three_dims_no_bias", - torch.randn(1, 15, 10, 5), - [[15, 10, 5], 1e-2, False, False], - ), -] - - -class TestLayerNorm(unittest.TestCase): - - class LayerNorm(torch.nn.Module): - - def __init__( - self, - normalized_shape: Union[int, List[int]], - eps: float = 1e-5, - elementwise_affine: bool = True, - has_bias: bool = True, - ): - super().__init__() - self.layer_norm = torch.nn.LayerNorm( - normalized_shape, - eps=eps, - elementwise_affine=elementwise_affine, - bias=has_bias, - ) - if elementwise_affine: - self.layer_norm.weight = torch.nn.Parameter( - torch.ones(normalized_shape) - ) - if has_bias: - self.layer_norm.bias = torch.nn.Parameter( - torch.rand(normalized_shape) - ) - - def forward(self, x): - return self.layer_norm(x) - - def _test_layernorm_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - model=module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", - ), - ) - .export() - .check(["torch.ops.aten.layer_norm.default"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .check_not(["torch.ops.aten.layer_norm.default"]) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_layernorm_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - model=module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) - .quantize() - .check_not(["torch.ops.aten.layer_norm.default"]) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(qtol=1, inputs=test_data) - ) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) - def _test_layernorm_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - model=module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .check_not(["torch.ops.aten.layer_norm.default"]) - .export() - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - @parameterized.expand(test_data_suite) - def test_layer_norm_tosa_MI( - self, - test_name: str, - 
test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_tosa_MI_pipeline( - self.LayerNorm(*model_params), (test_data,) - ) +class LayerNorm(torch.nn.Module): - @parameterized.expand(test_data_suite) - def test_layer_norm_tosa_BI( + def __init__( self, - test_name: str, - test_data: torch.Tensor, - model_params, + normalized_shape: Union[int, List[int]], + eps: float = 1e-5, + elementwise_affine: bool = True, + has_bias: bool = True, ): - self._test_layernorm_tosa_BI_pipeline( - self.LayerNorm(*model_params), (test_data,) - ) - - @parameterized.expand(test_data_suite[4:]) - @pytest.mark.corstone_fvp - def test_layer_norm_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) - ) - - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - # Skip tests that require transposes. - @parameterized.expand(test_data_suite[:4]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_layer_norm_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) - ) - - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[:-2]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_layer_norm_u85_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) - ) - - @parameterized.expand(test_data_suite[-2:]) - @pytest.mark.corstone_fvp - def test_layer_norm_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - model_params, - ): - self._test_layernorm_ethosu_BI_pipeline( - self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) + super().__init__() + self.layer_norm = torch.nn.LayerNorm( + normalized_shape, + eps=eps, + elementwise_affine=elementwise_affine, + bias=has_bias, ) + if elementwise_affine: + self.layer_norm.weight = torch.nn.Parameter(torch.ones(normalized_shape)) + if has_bias: + self.layer_norm.bias = torch.nn.Parameter(torch.rand(normalized_shape)) + + def forward(self, x): + return self.layer_norm(x) + + +input_t = tuple[torch.Tensor] +test_data_suite = { + "randn_last_dim": ((torch.randn(1, 5, 5, 5),), LayerNorm([5])), + "rand_last_two_dims": ((torch.rand(1, 5, 5, 5),), LayerNorm([5, 5])), + "rand_last_two_dims_not_elementwise_affine": ( + (torch.rand(1, 5, 5, 5),), + LayerNorm([5, 5], 1e-5, False), + ), + "rand_last_two_dims_not_elementwise_affine_no_bias": ( + (torch.rand(1, 5, 5, 5),), + LayerNorm([5, 5], 1e-5, False, False), + ), + "randn_last_three_dims": ((torch.randn(1, 15, 10, 5),), LayerNorm([15, 10, 5])), + "randn_last_three_dims_no_bias": ( + (torch.randn(1, 15, 10, 5),), + LayerNorm([15, 10, 5], 1e-2, False, False), + ), +} + + +@common.parametrize("test_data", test_data_suite) +def test_native_layer_norm_tosa_MI(test_data): + pipeline = TosaPipelineMI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.layer_norm.default", + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_native_layer_norm_tosa_BI(test_data): + pipeline = TosaPipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition + ) + 
pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_native_layer_norm_u55_BI(test_data): + pipeline = EthosU55PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_native_layer_norm_u85_BI(test_data): + pipeline = EthosU85PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoCorstone300 +def test_native_layer_norm_u55_BI_on_fvp(test_data): + pipeline = EthosU55PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoCorstone320 +def test_native_layer_norm_u85_BI_on_fvp(test_data): + pipeline = EthosU85PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 0710f483a0b..7e243ead620 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -1,140 +1,136 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - # (test_name, input, other,) See torch.le() for info - ( - "op_le_rank1_ones", - torch.ones(5), - torch.ones(5), - ), - ( - "op_le_rank2_rand", - torch.rand(4, 5), - torch.rand(1, 5), - ), - ( - "op_le_rank3_randn", - torch.randn(10, 5, 2), - torch.randn(10, 5, 2), - ), - ( - "op_le_rank4_randn", - torch.randn(3, 2, 2, 2), - torch.randn(3, 2, 2, 2), - ), -] - - -class TestLessEqual(unittest.TestCase): - class LessEqual(torch.nn.Module): - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return torch.le(input_, other_) - - def _test_le_tosa_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.le.Tensor": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_le_tosa_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.le.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() -
.run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(test_data_suite) - def test_le_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_le_tosa_pipeline(self.LessEqual(), test_data) - @parameterized.expand(test_data_suite) - def test_le_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_le_tosa_BI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+BI"), self.LessEqual(), test_data - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_le_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_le_tosa_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), - self.LessEqual(), - test_data, - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_le_u85_BI( +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.le.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_le_Tensor" + +input_t = Tuple[torch.Tensor] + + +class LessEqual(torch.nn.Module): + def __init__(self, input, other): + super().__init__() + self.input_ = input + self.other_ = other + + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_le_tosa_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), - self.LessEqual(), - test_data, - ) + return input_ <= other_ + + def get_inputs(self): + return (self.input_, self.other_) + + +op_le_rank1_ones = LessEqual( + torch.ones(5), + torch.ones(5), +) +op_le_rank2_rand = LessEqual( + torch.rand(4, 5), + torch.rand(1, 5), +) +op_le_rank3_randn = LessEqual( + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), +) +op_le_rank4_randn = LessEqual( + torch.randn(3, 2, 2, 2), + torch.randn(3, 2, 2, 2), +) + +test_data_common = { + "le_rank1_ones": op_le_rank1_ones, + "le_rank2_rand": op_le_rank2_rand, + "le_rank3_randn": op_le_rank3_randn, + "le_rank4_randn": op_le_rank4_randn, +} + + +@common.parametrize("test_module", test_data_common) +def test_le_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_le_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_le_u55_BI(test_module): + # GREATER_EQUAL is not supported on U55. LE uses the GREATER_EQUAL Tosa operator. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_le_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +@pytest.mark.skip(reason="The same as test_le_u55_BI") +def test_le_u55_BI_on_fvp(test_module): + # GREATER_EQUAL is not supported on U55. LE uses the GREATER_EQUAL Tosa operator.
+ pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_common, + xfails={"le_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, +) +@common.SkipIfNoCorstone320 +def test_le_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 3bdec0c694a..33bf9932b5a 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -127,7 +127,7 @@ def forward(self, x): def _test_linear_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -141,13 +141,14 @@ def _test_linear_tosa_MI_pipeline( .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data) def _test_linear_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -162,8 +163,9 @@ def _test_linear_tosa_BI_pipeline( .to_edge_transform_and_lower() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) def _test_linear_tosa_ethosu_BI_pipeline( self, @@ -186,9 +188,11 @@ def _test_linear_tosa_ethosu_BI_pipeline( .to_executorch() .serialize() ) + # TODO: Add FVP testing support. return tester @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) + @pytest.mark.tosa_ref_model def test_linear_tosa_MI( self, test_name: str, @@ -208,6 +212,7 @@ def test_linear_tosa_MI( ) @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) + @pytest.mark.tosa_ref_model def test_linear_tosa_BI( self, test_name: str, @@ -249,6 +254,7 @@ def test_linear_tosa_u55_BI( tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) + @pytest.mark.corstone_fvp def test_linear_tosa_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py new file mode 100644 index 00000000000..fd59bbbd263 --- /dev/null +++ b/backends/arm/test/ops/test_logical.py @@ -0,0 +1,160 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
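The test_linear.py change above gates reference-model output comparison behind a tosa_ref_model run-time option and marks the affected tests with @pytest.mark.tosa_ref_model. A hypothetical conftest.py sketch of how such an option/marker pair is commonly wired with stock pytest hooks; the project's real conftest may differ:

import pytest

def pytest_addoption(parser):
    parser.addoption("--tosa_ref_model", action="store_true", default=False,
                     help="run output comparison against the TOSA reference model")

def pytest_configure(config):
    config.addinivalue_line(
        "markers", "tosa_ref_model: test compares outputs on the TOSA reference model"
    )

def pytest_collection_modifyitems(config, items):
    if config.getoption("--tosa_ref_model"):
        return  # option enabled: run everything
    skip = pytest.mark.skip(reason="needs --tosa_ref_model")
    for item in items:
        if "tosa_ref_model" in item.keywords:
            item.add_marker(skip)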
+ + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + + +class And(torch.nn.Module): + aten_op = "torch.ops.aten.logical_and.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_logical_and_default" + + def forward(self, tensor1: torch.Tensor, tensor2: torch.Tensor): + return tensor1.logical_and(tensor2) + + +class Xor(torch.nn.Module): + aten_op = "torch.ops.aten.logical_xor.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_logical_xor_default" + + def forward(self, tensor1: torch.Tensor, tensor2: torch.Tensor): + return tensor1.logical_xor(tensor2) + + +class Or(torch.nn.Module): + aten_op = "torch.ops.aten.logical_or.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_logical_or_default" + + def forward(self, tensor1: torch.Tensor, tensor2: torch.Tensor): + return tensor1.logical_or(tensor2) + + +class Not(torch.nn.Module): + aten_op = "torch.ops.aten.logical_not.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_logical_not_default" + + def forward(self, tensor: torch.Tensor): + return torch.logical_not(tensor) + + +input_t2 = Tuple[torch.Tensor, torch.Tensor] # Input x, y + + +test_input: dict[input_t2] = { + "rank1": ( + torch.tensor([True, True, False, False], dtype=torch.bool), + torch.tensor([True, False, True, False], dtype=torch.bool), + ), + "rand_rank2": ( + torch.randint(0, 2, (10, 10), dtype=torch.bool), + torch.randint(0, 2, (10, 10), dtype=torch.bool), + ), + "rand_rank3": ( + torch.randint(0, 2, (10, 10, 10), dtype=torch.bool), + torch.randint(0, 2, (10, 10, 10), dtype=torch.bool), + ), + "rand_rank4": ( + torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool), + torch.randint(0, 2, (1, 10, 10, 10), dtype=torch.bool), + ), +} + + +test_data = { + "not_rank1": (Not(), test_input["rank1"][:-1]), + "not_rand_rank2": (Not(), test_input["rand_rank2"][:-1]), + "not_rand_rank3": (Not(), test_input["rand_rank3"][:-1]), + "not_rand_rank4": (Not(), test_input["rand_rank4"][:-1]), + "and_rank1": (And(), test_input["rank1"]), + "and_rand_rank2": (And(), test_input["rand_rank2"]), + "and_rand_rank3": (And(), test_input["rand_rank3"]), + "and_rand_rank4": (And(), test_input["rand_rank4"]), + "xor_rank1": (Xor(), test_input["rank1"]), + "xor_rand_rank2": (Xor(), test_input["rand_rank2"]), + "xor_rand_rank3": (Xor(), test_input["rand_rank3"]), + "xor_rand_rank4": (Xor(), test_input["rand_rank4"]), + "or_rank1": (Or(), test_input["rank1"]), + "or_rand_rank2": (Or(), test_input["rand_rank2"]), + "or_rand_rank3": (Or(), test_input["rand_rank3"]), + "or_rand_rank4": (Or(), test_input["rand_rank4"]), +} + + +fvp_xfails = { + "not_rank1": "MLETORCH-706 Support ScalarType::Bool in EthosUBackend.", + "not_rand_rank2": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "not_rand_rank3": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "not_rand_rank4": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "and_rank1": "MLETORCH-706 Support ScalarType::Bool in EthosUBackend.", + "and_rand_rank2": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "and_rand_rank3": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "and_rand_rank4": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "xor_rank1": "MLETORCH-706 Support ScalarType::Bool in EthosUBackend.", + "xor_rand_rank2": "MLETORCH-706: Support 
ScalarType::Bool in EthosUBackend.", + "xor_rand_rank3": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "xor_rand_rank4": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "or_rank1": "MLETORCH-706 Support ScalarType::Bool in EthosUBackend.", + "or_rand_rank2": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "or_rand_rank3": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", + "or_rand_rank4": "MLETORCH-706: Support ScalarType::Bool in EthosUBackend.", +} + + +@common.parametrize("test_data", test_data) +def test_logical_tosa_MI(test_data: input_t2): + op, test_input = test_data + pipeline = TosaPipelineMI[input_t2](op, test_input, op.aten_op, op.exir_op) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_logical_tosa_BI(test_data: input_t2): + op, test_input = test_data + pipeline = TosaPipelineBI[input_t2](op, test_input, op.aten_op, op.exir_op) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_logical_u55_BI(test_data: input_t2): + # Tests that we don't delegate these ops since they are not supported on U55. + op, test_input = test_data + pipeline = OpNotSupportedPipeline[input_t2]( + op, test_input, "TOSA-0.80+BI+u55", {op.exir_op: 1} + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_logical_u85_BI(test_data: input_t2): + op, test_input = test_data + pipeline = EthosU85PipelineBI[input_t2]( + op, test_input, op.aten_op, op.exir_op, run_on_fvp=False + ) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() + + +@common.parametrize("test_data", test_data, fvp_xfails) +@common.SkipIfNoCorstone320 +def test_logical_u85_BI_on_fvp(test_data: input_t2): + op, test_input = test_data + pipeline = EthosU85PipelineBI[input_t2]( + op, test_input, op.aten_op, op.exir_op, run_on_fvp=True + ) + pipeline.pop_stage(pipeline.find_pos("quantize") + 1) + pipeline.pop_stage("quantize") + pipeline.run() diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index f34d4afbb55..3e4cc1c0faa 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -1,158 +1,104 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
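A note on the new test_logical.py above: its BI variants pop the quantize stage and the check stage right after it, presumably because these operators consume and produce torch.bool tensors, which the quantizer has nothing to do with. A quick sanity check of the four modules (assumes the And/Or/Xor/Not classes from the diff are in scope):

import torch

x = torch.randint(0, 2, (4,), dtype=torch.bool)
y = torch.randint(0, 2, (4,), dtype=torch.bool)
assert torch.equal(And()(x, y), x & y)   # logical_and
assert torch.equal(Or()(x, y), x | y)    # logical_or
assert torch.equal(Xor()(x, y), x ^ y)   # logical_xor
assert torch.equal(Not()(x), ~x)         # logical_not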
-import unittest - -from typing import Callable, Tuple - -import pytest +from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -test_data_generators = [ - # (test_name, test_data, dim) - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), - lambda: ("ones", torch.ones(10, 10), 1), - lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] -test_data_generators_u55 = [ - # (test_name, test_data, dim) - lambda: ("ones", torch.ones(10, 10), 1), - lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), - lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), -] - - -class TestLogSoftmax(unittest.TestCase): - """Tests logsoftmax.""" - - class LogSoftmax(torch.nn.Module): - def __init__(self, dim: int = -1): - super().__init__() - self.logsoftmax = torch.nn.LogSoftmax(dim=dim) - - def forward(self, x): - return self.logsoftmax(x) - - def _test_logsoftmax_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.log_softmax.int"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__logsoftmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_logsoftmax_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_not(["torch.ops.aten.log_softmax.int"]) - .check(["torch.ops.quantized_decomposed", "torch.ops.aten.mul.Tensor"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__log_softmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_logsoftmax_tosa_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_not(["torch.ops.aten.log_softmax.int"]) - .check(["torch.ops.quantized_decomposed", "torch.ops.aten.mul.Tensor"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__logsoftmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_logsoftmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - 
self._test_logsoftmax_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), module, test_data - ) - - def _test_logsoftmax_tosa_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_logsoftmax_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - @parameterized.expand(test_data_generators) - def test_logsoftmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_MI_pipeline(self.LogSoftmax(dim=dim), (test_data,)) - - @parameterized.expand(test_data_generators) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_logsoftmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,)) - - @parameterized.expand(test_data_generators_u55) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_logsoftmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_u55_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) - ) - - @parameterized.expand(test_data_generators) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_logsoftmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_logsoftmax_tosa_u85_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) - ) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.log_softmax.int" # Used for checking that we do not have log_softmax in the graph +exir_op = "executorch_exir_dialects_edge__ops_aten__log_softmax_default" + +input_t1 = Tuple[torch.Tensor] # Input x + + +class LogSoftmax(torch.nn.Module): + def __init__(self, dim: int = -1): + super().__init__() + self.log_softmax = torch.nn.LogSoftmax(dim=dim) + + def forward(self, x): + return self.log_softmax(x) + + test_data = { + "ones": ((torch.ones(10, 10),), 1), + "ones_neg_dim": ((torch.ones(1, 3, 4),), -1), + "randn_neg_dim": ((torch.randn(1, 5, 8, 7),), -3), + "zeros": ((torch.zeros(1, 8, 5, 2),), 0), + "zeros_neg_dim": ((torch.zeros(1, 7, 8, 9),), -4), + "rand": ((torch.rand(1, 2, 5, 8),), 2), + "rand_neg_dim": ((torch.rand(1, 10, 8, 10),), -2), + "randn_mult_batches": ((torch.randn(2, 10, 10, 10),), 3), + } + + +@common.parametrize("test_data", LogSoftmax.test_data) +def test_log_softmax_tosa_MI(test_data): + data, dim = test_data + pipeline = TosaPipelineMI[input_t1](LogSoftmax(dim), data, []) + pipeline.add_stage_after( + "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op] + ) + pipeline.run() + + +@common.parametrize("test_data", LogSoftmax.test_data) +def test_log_softmax_tosa_BI(test_data): + data, dim = test_data + pipeline = TosaPipelineBI[input_t1](LogSoftmax(dim), data, []) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize("test_data", LogSoftmax.test_data) +def test_log_softmax_u55_BI(test_data): + data, dim = test_data + pipeline = EthosU55PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=False) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) +
pipeline.run() + + +@common.parametrize("test_data", LogSoftmax.test_data) +def test_log_softmax_u85_BI(test_data): + data, dim = test_data + pipeline = EthosU85PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=False) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize( + "test_data", + LogSoftmax.test_data, + xfails={ + "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + }, +) +@common.SkipIfNoCorstone300() +def test_log_softmax_u55_BI_on_fvp(test_data): + data, dim = test_data + pipeline = EthosU55PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize( + "test_data", + LogSoftmax.test_data, + xfails={ + "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + }, +) +@common.SkipIfNoCorstone320 +def test_log_softmax_u85_BI_on_fvp(test_data): + data, dim = test_data + pipeline = EthosU85PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index 398df8c2036..cae119cd7a8 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -1,140 +1,136 @@ # Copyright 2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple +import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - -test_data_suite = [ - # (test_name, input, other,) See torch.lt() for info - ( - "op_lt_rank1_ones", - torch.ones(5), - torch.ones(5), - ), - ( - "op_lt_rank2_rand", - torch.rand(4, 5), - torch.rand(1, 5), - ), - ( - "op_lt_rank3_randn", - torch.randn(10, 5, 2), - torch.randn(10, 5, 2), - ), - ( - "op_lt_rank4_randn", - torch.randn(3, 2, 2, 2), - torch.randn(3, 2, 2, 2), - ), -] - - -class TestLessThan(unittest.TestCase): - class LessThan(torch.nn.Module): - def forward( - self, - input_: torch.Tensor, - other_: torch.Tensor, - ): - return torch.lt(input_, other_) - - def _test_lt_tosa_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.lt.Tensor": 1}) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_lt_tosa_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: tuple[torch.Tensor, torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.lt.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - 
@parameterized.expand(test_data_suite) - def test_lt_tosa_MI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_lt_tosa_pipeline(self.LessThan(), test_data) - @parameterized.expand(test_data_suite) - def test_lt_tosa_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_lt_tosa_BI_pipeline( - common.get_tosa_compile_spec("TOSA-0.80+BI"), self.LessThan(), test_data - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_lt_u55_BI( - self, - test_name: str, - input_: torch.Tensor, - other_: torch.Tensor, - ): - test_data = (input_, other_) - self._test_lt_tosa_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), - self.LessThan(), - test_data, - ) - - @parameterized.expand(test_data_suite) - @unittest.skip - def test_lt_u85_BI( +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineBI, + OpNotSupportedPipeline, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.lt.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_lt_Tensor" + +input_t = Tuple[torch.Tensor] + + +class LessThan(torch.nn.Module): + def __init__(self, input, other): + super().__init__() + self.input_ = input + self.other_ = other + + def forward( self, - test_name: str, input_: torch.Tensor, other_: torch.Tensor, ): - test_data = (input_, other_) - self._test_lt_tosa_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), - self.LessThan(), - test_data, - ) + return input_ < other_ + + def get_inputs(self): + return (self.input_, self.other_) + + +op_lt_rank1_ones = LessThan( + torch.ones(5), + torch.ones(5), +) +op_lt_rank2_rand = LessThan( + torch.rand(4, 5), + torch.rand(1, 5), +) +op_lt_rank3_randn = LessThan( + torch.randn(10, 5, 2), + torch.randn(10, 5, 2), +) +op_lt_rank4_randn = LessThan( + torch.randn(3, 2, 2, 2), + torch.randn(3, 2, 2, 2), +) + +test_data_common = { + "lt_rank1_ones": op_lt_rank1_ones, + "lt_rank2_rand": op_lt_rank2_rand, + "lt_rank3_randn": op_lt_rank3_randn, + "lt_rank4_randn": op_lt_rank4_randn, +} + + +@common.parametrize("test_module", test_data_common) +def test_lt_tosa_MI(test_module): + pipeline = TosaPipelineMI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_lt_tosa_BI(test_module): + pipeline = TosaPipelineBI[input_t]( + test_module, test_module.get_inputs(), aten_op, exir_op + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_lt_u55_BI(test_module): + # GREATER is not supported on U55. LT uses the GREATER Tosa operator. + pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +def test_lt_u85_BI(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=False, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_common) +@pytest.mark.skip(reason="The same as test_lt_u55_BI") +def test_lt_u55_BI_on_fvp(test_module): + # GREATER is not supported on U55. LT uses the GREATER Tosa operator. 
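+    # (torch.lt(a, b) is equivalent to torch.gt(b, a) with the operands
+    #  swapped, so the lowering emits a single TOSA GREATER node, which the
+    #  U55 partitioner then rejects.)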
+ pipeline = OpNotSupportedPipeline[input_t]( + test_module, + test_module.get_inputs(), + "TOSA-0.80+BI+u55", + {exir_op: 1}, + ) + pipeline.run() + + +@common.parametrize( + "test_module", + test_data_common, + xfails={"lt_rank4_randn": "4D fails because boolean Tensors can't be subtracted"}, +) +@common.SkipIfNoCorstone320 +def test_lt_u85_BI_on_fvp(test_module): + pipeline = EthosU85PipelineBI[input_t]( + test_module, + test_module.get_inputs(), + aten_op, + exir_op, + run_on_fvp=True, + use_to_edge_transform_and_lower=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 6d6e0b8be5c..a31c12be3a0 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -232,8 +232,24 @@ def test_maxpool2d_tosa_u85_BI_mult_batches( if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) + @parameterized.expand(test_data_suite_mult_batches) + @pytest.mark.corstone_fvp + @conftest.expectedFailureOnFVP # TODO: MLETORCH-433 + def test_maxpool2d_tosa_u55_BI_mult_batches( + self, + test_name: str, + test_data: torch.Tensor, + model_params: int | Tuple[int, int], + ): + tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( + self.MaxPool2d(*model_params), + common.get_u55_compile_spec(), + (test_data,), + ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(qtol=1, inputs=(test_data,)) + reject_data_suite = [ - (MaxPool2d(1, 1, 0), torch.rand(2, 5, 5, 5)), (MaxPool2d(1, 4, 0), torch.rand(1, 10, 10, 10)), (MaxPool2d((1, 257), 1, 0), torch.rand(1, 16, 5, 300)), (MaxPool2d((800, 90), 1, 0), torch.rand(1, 16, 850, 100)), diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 393cf1667e0..9d67030cc4f 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -1,298 +1,186 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
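+
+# Note: for the 4D inputs used here, AdaptiveAvgPool2d(output_size=(1, 1)) is
+# equivalent to x.mean(dim=(-2, -1), keepdim=True), which is why both modules
+# in this file lower to the same aten mean_dim edge op.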
-import unittest - -from typing import Tuple - import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec -from parameterized import parameterized - - -class TestMeanDim(unittest.TestCase): - """Tests MeanDim, called AdaptiveAvgPool2d in Pytorch.""" - - class AdaptiveAveragePool2d(torch.nn.Module): - test_data_suite = [ - # (test_name, test_data) - ( - "zeros", - torch.zeros(1, 1280, 7, 7), - ), - ( - "ones", - torch.ones(1, 1280, 7, 7), - ), - ( - "rand", - torch.rand(1, 1280, 7, 7), - ), - ( - "randn", - torch.randn(1, 1280, 7, 7), - ), - ] - - def __init__(self): - super().__init__() - self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(output_size=(1, 1)) - - def forward(self, x): - return self.adaptive_avg_pool2d(x) - - class MeanDim(torch.nn.Module): - test_data_suite = [ - # (test_name, test_data) - ("zeros", torch.zeros(1, 1280, 7, 7), -1, True), - ("ones", torch.ones(1, 1280, 7, 7), (-1, 2), False), - ( - "rand", - torch.rand(1, 1280, 7, 7), - (-1), - True, - ), - ( - "randn", - torch.randn(1, 1280, 7, 7), - (-1, -2, -3), - False, - ), - ] - - def __init__(self, dim: int | list[int] = -1, keepdim: bool = True): - super().__init__() - self.dim = dim - self.keepdim = keepdim - - def forward(self, x: torch.Tensor): - return x.mean(dim=self.dim, keepdim=self.keepdim) - - def _test_adaptive_avg_pool2d_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.adaptive_avg_pool2d.default"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_adaptive_avg_pool2d_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.adaptive_avg_pool2d.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_adaptive_avg_pool2d_tosa_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check(["torch.ops.aten.adaptive_avg_pool2d.default"]) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten_mean_dim", - "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", - ] - ) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_meandim_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - 
.check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_meandim_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) - ) - - def _test_meandim_tosa_ethosu_BI_pipeline( - self, - module: torch.nn.Module, - compile_spec: CompileSpec, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not( - [ - "executorch_exir_dialects_edge__ops_aten_mean_dim", - "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", - ] - ) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - @parameterized.expand(AdaptiveAveragePool2d.test_data_suite) - def test_adaptive_avg_pool2d_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_adaptive_avg_pool2d_tosa_MI_pipeline( - self.AdaptiveAveragePool2d(), (test_data,) - ) - - @parameterized.expand(AdaptiveAveragePool2d.test_data_suite) - def test_adaptive_avg_pool2d_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_adaptive_avg_pool2d_tosa_BI_pipeline( - self.AdaptiveAveragePool2d(), (test_data,) - ) - - @parameterized.expand(AdaptiveAveragePool2d.test_data_suite) - def test_adaptive_avg_pool2d_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_adaptive_avg_pool2d_tosa_ethosu_BI_pipeline( - self.AdaptiveAveragePool2d(), common.get_u55_compile_spec(), (test_data,) - ) - - @parameterized.expand(AdaptiveAveragePool2d.test_data_suite) - def test_adaptive_avg_pool2d_tosa_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - ): - self._test_adaptive_avg_pool2d_tosa_ethosu_BI_pipeline( - self.AdaptiveAveragePool2d(), common.get_u85_compile_spec(), (test_data,) - ) - - @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_MI( - self, - test_name: str, - test_data: torch.Tensor, - dim: int | list[int] = -1, - keepdim: bool = True, - ): - self._test_meandim_tosa_MI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) - - @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_BI( - self, - test_name: str, - test_data: torch.Tensor, - dim: int | list[int] = -1, - keepdim: bool = True, - ): - self._test_meandim_tosa_BI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) - - @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_u55_BI( - self, - test_name: str, - test_data: torch.Tensor, - dim: int | list[int] = -1, - keepdim: bool = True, - ): - self._test_meandim_tosa_ethosu_BI_pipeline( - self.MeanDim(dim, keepdim), - common.get_u55_compile_spec(), - (test_data,), - ) - - @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_u85_BI( - self, - test_name: str, - test_data: torch.Tensor, - 
dim: int | list[int] = -1, - keepdim: bool = True, - ): - self._test_meandim_tosa_ethosu_BI_pipeline( - self.MeanDim(dim, keepdim), - common.get_u85_compile_spec(), - (test_data,), - ) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +input_t = tuple[torch.Tensor] + + +class AdaptiveAveragePool2d(torch.nn.Module): + test_data_suite = { + # (test_name, test_data) + "zeros": (torch.zeros(1, 1280, 7, 7),), + "ones": (torch.ones(1, 1280, 7, 7),), + "rand": (torch.rand(1, 1280, 7, 7),), + "randn": (torch.randn(1, 1280, 7, 7),), + } + aten_op = "torch.ops.aten.adaptive_avg_pool2d.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_mean_dim" + + def __init__(self): + super().__init__() + self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(output_size=(1, 1)) + + def forward(self, x): + return self.adaptive_avg_pool2d(x) + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +def test_adaptive_avg_pool2d_tosa_MI(test_data): + TosaPipelineMI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + ).run() + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +def test_adaptive_avg_pool2d_tosa_BI(test_data): + TosaPipelineBI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + ).run() + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +def test_adaptive_avg_pool2d_u55(test_data): + EthosU55PipelineBI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + ).run() + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +def test_adaptive_avg_pool2d_u85(test_data): + EthosU85PipelineBI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + ).run() + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +@common.SkipIfNoCorstone300 +def test_adaptive_avg_pool2d_u55_on_fvp(test_data): + EthosU55PipelineBI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", AdaptiveAveragePool2d.test_data_suite) +@common.SkipIfNoCorstone320 +def test_adaptive_avg_pool2d_u85_on_fvp(test_data): + EthosU85PipelineBI[input_t]( + AdaptiveAveragePool2d(), + test_data, + AdaptiveAveragePool2d.aten_op, + AdaptiveAveragePool2d.exir_op, + run_on_fvp=True, + ).run() + + +class MeanDim(torch.nn.Module): + test_data_suite: dict[str, tuple] = { + "zeros": (torch.zeros(1, 1280, 7, 7), -1, True), + "ones": (torch.ones(1, 1280, 7, 7), (-1, 2), False), + "rand": ( + torch.rand(1, 1280, 7, 7), + (-1), + True, + ), + "randn": ( + torch.randn(1, 1280, 7, 7), + (-1, -2, -3), + False, + ), + } + torch_op = "torch.ops.aten.mean.dim" + exir_op = "executorch_exir_dialects_edge__ops_aten_mean_dim" + + def __init__(self, dim: int | list[int] = -1, keepdim: bool = True): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, x: torch.Tensor): + return x.mean(dim=self.dim, keepdim=self.keepdim) + + +@common.parametrize("test_data", MeanDim.test_data_suite) +def test_mean_tosa_MI(test_data): + TosaPipelineMI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + MeanDim.torch_op, + MeanDim.exir_op, + ).run() + 
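+
+# The quantized (BI/U55/U85) flows below decompose mean.dim before lowering,
+# roughly mean(x, dim) == sum(x, dim) * (1 / N) with N the number of reduced
+# elements, so the pipelines check for the sum op rather than for mean itself.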
+ +@common.parametrize("test_data", MeanDim.test_data_suite) +def test_mean_tosa_BI(test_data): + TosaPipelineBI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition + ).run() + + +@common.parametrize("test_data", MeanDim.test_data_suite) +def test_mean_u55(test_data): + EthosU55PipelineBI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition + ).run() + + +@common.parametrize("test_data", MeanDim.test_data_suite) +def test_mean_u85(test_data): + EthosU85PipelineBI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition + ).run() + + +@common.parametrize("test_data", MeanDim.test_data_suite) +@common.SkipIfNoCorstone300 +def test_mean_u55_on_fvp(test_data): + EthosU55PipelineBI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition + run_on_fvp=True, + ).run() + + +@common.parametrize("test_data", MeanDim.test_data_suite) +@common.SkipIfNoCorstone320 +def test_mean_u85_on_fvp(test_data): + EthosU85PipelineBI[input_t]( + MeanDim(test_data[1], test_data[2]), + (test_data[0],), + "torch.ops.aten.sum.dim_IntList", # Just check for sum op included in the mean decomposition + run_on_fvp=True, + ).run() diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index df75e4ed183..6b906067f7b 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -4,162 +4,71 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging -import unittest +from typing import Callable -from typing import Callable, Tuple - -import pytest import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.backend_details import CompileSpec +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class TestMM(unittest.TestCase): - """Tests MatMul""" - - class MM(torch.nn.Module): - test_data_generators = [ - lambda: (torch.rand(3, 5), torch.rand(5, 2)), - lambda: (torch.rand(1, 1), torch.rand(1, 1)), - lambda: (torch.ones(55, 3), torch.ones(3, 44)), - lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)), - lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)), - ] - - def forward(self, x, y): - return torch.mm(x, y) - - class MMSingleInput(torch.nn.Module): - test_data_generators = [ - lambda: (torch.rand(3, 3),), - lambda: (torch.ones(128, 128),), - lambda: (10000 * torch.randn(25, 25),), - lambda: (5 + 5 * torch.randn(64, 64),), - ] - - def forward(self, x): - return torch.mm(x, x) - - def _test_mm_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.mm.default": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mm_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_mm_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.mm.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mm_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_mm_ethosu_BI_pipeline( - self, - compile_spec: CompileSpec, - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.mm.default": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - @parameterized.expand(MM.test_data_generators) - def test_mm_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_tosa_MI_pipeline(self.MM(), test_data) - - @parameterized.expand(MMSingleInput.test_data_generators) - @pytest.mark.flaky # TODO: Investigate flakyness (MLETORCH-534) - def test_mm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_tosa_MI_pipeline(self.MMSingleInput(), test_data) - - @parameterized.expand(MM.test_data_generators) - @pytest.mark.flaky # TODO: Investigate flakyness 
(MLETORCH-534) - def test_mm_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_tosa_BI_pipeline(self.MM(), test_data) - - @parameterized.expand(MMSingleInput.test_data_generators) - @pytest.mark.flaky # TODO: Investigate flakyness (MLETORCH-534) - def test_mm_single_input_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_tosa_BI_pipeline(self.MMSingleInput(), test_data) - - # Expected to fail with error: CPU performance estimation for "MatMul" not implemented - @parameterized.expand(MM.test_data_generators) - @unittest.expectedFailure - def test_mm_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.MM(), test_data - ) - - # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy - @parameterized.expand(MMSingleInput.test_data_generators) - @unittest.expectedFailure - def test_mm_single_input_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.MMSingleInput(), test_data - ) - - @parameterized.expand(MM.test_data_generators) - def test_mm_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.MM(), test_data - ) - - @parameterized.expand(MMSingleInput.test_data_generators) - def test_mm_single_input_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_data = test_data_generator() - self._test_mm_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.MMSingleInput(), test_data - ) +test_t = tuple[torch.Tensor, torch.Tensor] + + +class MM(torch.nn.Module): + test_data_generators = [ + lambda: (torch.rand(3, 5), torch.rand(5, 2)), + lambda: (torch.rand(1, 1), torch.rand(1, 1)), + lambda: (torch.ones(55, 3), torch.ones(3, 44)), + lambda: (10000 * torch.randn(1, 10), torch.randn(10, 5)), + lambda: (-10 * torch.randn(32, 64), 5 + 5 * torch.randn(64, 32)), + ] + aten_op = "torch.ops.aten.mm.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_mm_default" + + def forward(self, x, y): + return torch.mm(x, y) + + +@parameterized.expand(MM.test_data_generators) +def test_mm_tosa_MI(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + TosaPipelineMI[test_t](MM(), test_data, MM.aten_op).run() + + +@parameterized.expand(MM.test_data_generators) +def test_mm_tosa_BI(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + TosaPipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() + + +@parameterized.expand(MM.test_data_generators) +def test_mm_tosa_u55(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + EthosU55PipelineBI[test_t](MM(), test_data, MM.aten_op).run() + + +@parameterized.expand(MM.test_data_generators) +def test_mm_tosa_u85(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + EthosU85PipelineBI[test_t](MM(), test_data, MM.aten_op, MM.exir_op).run() + + +@parameterized.expand(MM.test_data_generators) +@common.SkipIfNoCorstone300 +def test_mm_tosa_u55_on_fvp(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + EthosU55PipelineBI[test_t](MM(), test_data, MM.aten_op, run_on_fvp=True).run() + + 
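+
+# The *_on_fvp variants serialize the program and run it on the Corstone
+# Fixed Virtual Platform (run_on_fvp=True); the SkipIfNoCorstone* decorators
+# skip them when the corresponding simulator is not installed.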
+@parameterized.expand(MM.test_data_generators) +@common.SkipIfNoCorstone320 +def test_mm_tosa_u85_on_fvp(test_data_generator: Callable[[], tuple]): + test_data = test_data_generator() + EthosU85PipelineBI[test_t]( + MM(), test_data, MM.aten_op, MM.exir_op, run_on_fvp=True + ).run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 715673b87c8..739864a4982 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -15,7 +15,7 @@ from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized -test_data_sute = [ +test_data_suite = [ # (test_name, input, other,) See torch.mul() for info ( "op_mul_rank1_rand", @@ -55,6 +55,31 @@ ] +test_data_suite_2 = [ + # (test_name, input, other,) See torch.mul() for info + ( + "op_mul_rank2_rand", + torch.rand(4, 5), + torch.rand(5), + ), + ( + "op_mul_rank3_randn", + torch.randn(10, 5, 2), + torch.randn(5, 2), + ), + ( + "op_mul_rank4_randn", + torch.randn(1, 10, 25, 20), + torch.randn(1, 25, 20), + ), + ( + "op_mul_rank4_randn_2", + torch.randn(1, 25, 1), + torch.randn(1, 3, 25, 10), + ), +] + + class TestMul(unittest.TestCase): class Mul(torch.nn.Module): @@ -133,7 +158,7 @@ def _test_mul_ethosu_BI_pipeline( if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - @parameterized.expand(test_data_sute) + @parameterized.expand(test_data_suite) def test_mul_tosa_MI( self, test_name: str, @@ -143,7 +168,27 @@ def test_mul_tosa_MI( test_data = (input_, other_) self._test_mul_tosa_MI_pipeline(self.Mul(), test_data) - @parameterized.expand(test_data_sute) + @parameterized.expand(test_data_suite_2) + def test_mul_diff_input_ranks_tosa_MI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_tosa_MI_pipeline(self.Mul(), test_data) + + @parameterized.expand(test_data_suite_2) + def test_mul_diff_input_ranks_tosa_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) + + @parameterized.expand(test_data_suite) def test_mul_tosa_BI( self, test_name: str, @@ -154,7 +199,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - @parameterized.expand(test_data_sute) + @parameterized.expand(test_data_suite) @pytest.mark.corstone_fvp def test_mul_u55_BI( self, @@ -167,7 +212,7 @@ def test_mul_u55_BI( common.get_u55_compile_spec(), self.Mul(), test_data ) - @parameterized.expand(test_data_sute) + @parameterized.expand(test_data_suite) @pytest.mark.corstone_fvp def test_mul_u85_BI( self, diff --git a/backends/arm/test/ops/test_rshift.py b/backends/arm/test/ops/test_rshift.py index 9637afead1c..d79be67dce6 100644 --- a/backends/arm/test/ops/test_rshift.py +++ b/backends/arm/test/ops/test_rshift.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -75,16 +74,14 @@ def test_rshift_tosa_MI(self, test_data): def test_rshift_tosa_BI(self, test_data): self._test_rshift_tosa_BI(test_data) - # TODO: MLETORCH-644 - Add support for INT16 input/output - @parameterized.expand(Rshift.test_data[:-1]) + @parameterized.expand(Rshift.test_data) def test_rshift_u55_BI(self, test_data): compile_spec = common.get_u55_compile_spec() tester = self._test_rshift_ethosu_BI(test_data, compile_spec) if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(atol=1, inputs=test_data) - # TODO: MLETORCH-644 - Add support for INT16 input/output - @parameterized.expand(Rshift.test_data[:-1]) + @parameterized.expand(Rshift.test_data) def test_rshift_u85_BI(self, test_data): compile_spec = common.get_u85_compile_spec() tester = self._test_rshift_ethosu_BI(test_data, compile_spec) diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py index 2ab420bd59e..17dcd6f1d27 100644 --- a/backends/arm/test/ops/test_scalars.py +++ b/backends/arm/test/ops/test_scalars.py @@ -5,11 +5,16 @@ import unittest +from typing import Tuple + +from executorch.backends.arm.test import common import torch -from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from parameterized import parameterized +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineBI, + TosaPipelineMI, + TransformAnnotationPassPipeline, +) """ Summary of non-working cases. @@ -24,6 +29,7 @@ # MLETORCH-408 Sub or inplace-sub with an integer input. """ +input_t1 = Tuple[torch.Tensor, torch.scalar_tensor] # Input x, Input y class TestScalars(unittest.TestCase): @@ -92,112 +98,160 @@ def forward(self, x): x -= 10 return x - # Inplace ops end with '_' (from aten naming) - ops = [ - ("Add", Add()), - ("Sub", Sub()), - ("Mul", Mul()), - ("Div", Div()), - ("Add_", AddInplace()), - ("Sub_", SubInplace()), - ("Mul_", MulInplace()), - ("Div_", DivInplace()), - ("MulScalar", MulScalar()), - ("DivScalar", DivScalar()), - ("AddScalar", AddScalar()), - ("SubScalar", SubScalar()), - ] - - const_ops = [("Add", AddConst())] - - dtypes = [("int", 3), ("float", 3.0)] - sizes = [("r1", (1)), ("r4", (2, 4, 5, 3))] - - # Create combinations of tests - tensor_scalar_tests = [] - for op in ops: - for dtype in dtypes: - for size in sizes: - test_name = f"{op[0]}_{dtype[0]}_{size[0]}" - tensor = torch.rand(size[1]) - scalar = dtype[1] - tensor_scalar_tests.append((test_name + "_ts", op[1], tensor, scalar)) - - # Don't add (scalar, tensor) test case for .Scalar ops.
- if op[0][-6:] == "Scalar": - continue - - tensor_scalar_tests.append((test_name + "_st", op[1], scalar, tensor)) - - tensor_const_tests = [] - for op in const_ops: + +# Inplace ops end with '_' (from aten naming) +ops = [ + ("Add", TestScalars.Add()), + ("Sub", TestScalars.Sub()), + ("Mul", TestScalars.Mul()), + ("Div", TestScalars.Div()), + ("Add_", TestScalars.AddInplace()), + ("Sub_", TestScalars.SubInplace()), + ("Mul_", TestScalars.MulInplace()), + ("Div_", TestScalars.DivInplace()), + ("MulScalar", TestScalars.MulScalar()), + ("DivScalar", TestScalars.DivScalar()), + ("AddScalar", TestScalars.AddScalar()), + ("SubScalar", TestScalars.SubScalar()), +] + +const_ops = [("Add", TestScalars.AddConst())] + +dtypes = [("int", 3), ("float", 3.0)] +sizes = [("r1", (1)), ("r4", (2, 4, 5, 3))] + +# Create combinations of tests +tensor_scalar_tests = {} +for op in ops: + for dtype in dtypes: for size in sizes: - test_name = f"{op[0]}_{size[0]}" + test_name = f"{op[0]}_{dtype[0]}_{size[0]}" tensor = torch.rand(size[1]) - tensor_const_tests.append((test_name, op[1], tensor)) - - def _test_add_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: tuple): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .to_edge() - .partition() - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_add_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: tuple): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .to_edge() - .partition() - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - @parameterized.expand(tensor_scalar_tests) - def test_MI(self, test_name: str, op: torch.nn.Module, x, y): - expected_exception = None - if any(token in test_name for token in ("Sub_int", "Sub__int")): - expected_exception = AssertionError - if test_name.endswith("_st"): - expected_exception = AttributeError - - if expected_exception: - with self.assertRaises( - expected_exception, msg=f"Test {test_name} is expected to fail." - ): - self._test_add_tosa_MI_pipeline(op, (x, y)) - return - - self._test_add_tosa_MI_pipeline(op, (x, y)) - - # op(Scalar float, tensor) works if the scalar is constant. - @parameterized.expand(tensor_const_tests) - def test_MI_const(self, test_name: str, op: torch.nn.Module, x): - self._test_add_tosa_MI_pipeline(op, (x,)) - - @parameterized.expand(tensor_scalar_tests) - def test_BI(self, test_name: str, op: torch.nn.Module, x, y): - self._test_add_tosa_BI_pipeline(op, (x, y)) - - # op(Scalar float, tensor) works if the scalar is constant. - @parameterized.expand(tensor_const_tests) - def test_BI_const(self, test_name: str, op: torch.nn.Module, x): - self._test_add_tosa_BI_pipeline(op, (x,)) - - def test_shift_sub_inplace_tosa_MI(self): - self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),)) - - def test_shift_sub_inplace_tosa_BI(self): - self._test_add_tosa_BI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),)) + scalar = dtype[1] + tensor_scalar_tests[test_name + "_ts"] = (op[1], tensor, scalar) + # Don't add (scalar, tensor) test case for .Scalar ops. 
+ if op[0][-6:] == "Scalar": + continue + + tensor_scalar_tests[test_name + "_st"] = (op[1], scalar, tensor) + +tensor_const_tests = {} +for op in const_ops: + for size in sizes: + test_name = f"{op[0]}_{size[0]}" + tensor = torch.rand(size[1]) + tensor_const_tests[test_name] = (op[1], tensor) + + +def _test_add_tosa_MI_pipeline(module: torch.nn.Module, test_data: tuple): + pipeline = TosaPipelineMI[input_t1](module, test_data, aten_op=[], exir_op=[]) + pipeline.run() + + +def _test_add_tosa_BI_pipeline( + module: torch.nn.Module, test_data: tuple, check_quant_nodes=True +): + pipeline = TosaPipelineBI[input_t1](module, test_data, aten_op=[], exir_op=[]) + if not check_quant_nodes: + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +fail_str = "MLETORCH-408: Arithmetic ops can't handle scalars first for MI" +MI_xfails = { + "Add_int_r1_st": fail_str, + "Add_int_r4_st": fail_str, + "Add_float_r1_st": fail_str, + "Add_float_r4_st": fail_str, + "Sub_int_r1_ts": fail_str, + "Sub_int_r1_st": fail_str, + "Sub_int_r4_ts": fail_str, + "Sub_int_r4_st": fail_str, + "Sub_float_r1_st": fail_str, + "Sub_float_r4_st": fail_str, + "Mul_int_r1_st": fail_str, + "Mul_int_r4_st": fail_str, + "Mul_float_r1_st": fail_str, + "Mul_float_r4_st": fail_str, + "Div_int_r1_st": fail_str, + "Div_int_r4_st": fail_str, + "Div_float_r1_st": fail_str, + "Div_float_r4_st": fail_str, + "Add__int_r1_st": fail_str, + "Add__float_r1_st": fail_str, + "Add__float_r4_st": fail_str, + "Add__int_r4_st": fail_str, + "Sub__int_r1_ts": fail_str, + "Sub__int_r1_st": fail_str, + "Sub__int_r4_ts": fail_str, + "Sub__int_r4_st": fail_str, + "Sub__float_r1_st": fail_str, + "Sub__float_r4_st": fail_str, + "Mul__int_r1_st": fail_str, + "Mul__int_r4_st": fail_str, + "Mul__float_r1_st": fail_str, + "Mul__float_r4_st": fail_str, + "Div__int_r1_st": fail_str, + "Div__int_r4_st": fail_str, + "Div__float_r1_st": fail_str, + "Div__float_r4_st": fail_str, +} + + +@common.parametrize("tensor_scalar_tests", tensor_scalar_tests, MI_xfails) +def test_MI(tensor_scalar_tests: list): + op, x, y = tensor_scalar_tests + _test_add_tosa_MI_pipeline(op, (x, y)) + + +def _test_passes_tosa_BI_pipeline(module: torch.nn.Module, test_data: tuple): + pipeline = TransformAnnotationPassPipeline[input_t1]( + module, test_data, tosa_version="TOSA-0.80+BI" + ) + pipeline.run() + + +fail_str = "MLETORCH-770: Numerical issues on Div Scalar." +passes_xfails = { + "Div__int_r1_ts": fail_str, + "Div__int_r4_ts": fail_str, + "Div__float_r1_ts": fail_str, + "Div__float_r4_ts": fail_str, +} + + +@common.parametrize("tensor_scalar_tests", tensor_scalar_tests, passes_xfails) +def test_passes_BI(tensor_scalar_tests: list): + op, x, y = tensor_scalar_tests + _test_passes_tosa_BI_pipeline(op, (x, y)) + + +# op(Scalar float, tensor) works if the scalar is constant. +@common.parametrize("tensor_const_tests", tensor_const_tests) +def test_MI_const(tensor_const_tests: list): + op, x = tensor_const_tests + _test_add_tosa_MI_pipeline(op, (x,)) + + +@common.parametrize("tensor_scalar_tests", tensor_scalar_tests) +def test_BI(tensor_scalar_tests: list): + op, x, y = tensor_scalar_tests + _test_add_tosa_BI_pipeline(op, (x, y)) + + +# op(Scalar float, tensor) works if the scalar is constant. 
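+# (A constant scalar is traced into the graph as a literal rather than passed
+# as a runtime input, which is why these cases avoid the scalar-input issues
+# listed above.)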
+@common.parametrize("tensor_const_tests", tensor_const_tests) +def test_BI_const(tensor_const_tests: list): + op, x = tensor_const_tests + _test_add_tosa_BI_pipeline(op, (x,)) + + +def test_shift_sub_inplace_tosa_MI(): + _test_add_tosa_MI_pipeline(TestScalars.ShiftInplaceSub(), (torch.IntTensor(5),)) + + +# Do not check for quant nodes in the graph for rshift. +def test_shift_sub_inplace_tosa_BI(): + _test_add_tosa_BI_pipeline( + TestScalars.ShiftInplaceSub(), (torch.IntTensor(5),), check_quant_nodes=False + ) diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index b474da573f0..fbeb4ebf9e7 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -19,7 +19,7 @@ test_data_suite: list[tuple[test_data_t]] = [ # (test_data, dim, index) ((torch.zeros(5, 3, 20), -1, 0),), - ((torch.zeros(5, 3, 20), 0, -1),), + ((torch.rand(5, 3, 20), 0, -1),), ((torch.zeros(5, 3, 20), 0, 4),), ((torch.ones(10, 10, 10), 0, 2),), ((torch.rand(5, 3, 20, 2), 0, 2),), @@ -61,9 +61,7 @@ def _test_select_tosa_MI_pipeline( .check([export_target]) .check_not(["torch.ops.quantized_decomposed"]) .to_edge() - .dump_artifact() .partition() - .dump_artifact() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .run_method_and_compare_outputs(inputs=test_data) diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py new file mode 100644 index 00000000000..3f53141543e --- /dev/null +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -0,0 +1,190 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
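+
+# These tests quantize the graph with the default int8 symmetric config but
+# override nn.Sigmoid modules to int16 activations, exercising the 16-bit
+# table path of the TOSA lowering.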
+ +import pytest + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + get_symmetric_quantization_config, + TOSAQuantizer, +) +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, +) +from executorch.backends.xnnpack.test.tester import Quantize +from torch.ao.quantization.observer import HistogramObserver +from torch.ao.quantization.quantizer import QuantizationSpec + + +def _get_16_bit_quant_config(): + int16_spec = QuantizationSpec( + dtype=torch.int16, + observer_or_fake_quant_ctr=HistogramObserver, + qscheme=torch.per_tensor_symmetric, + ) + qconfig = QuantizationConfig( + input_activation=int16_spec, + output_activation=int16_spec, + weight=None, + bias=None, + ) + return qconfig + + +def get_16bit_sigmoid_quantizer(tosa_str: str): + tosa_spec = common.TosaSpecification.create_from_string(tosa_str) + quantizer = TOSAQuantizer(tosa_spec) + quantizer.set_global(get_symmetric_quantization_config()) + quantizer.set_module_type( + torch.nn.modules.activation.Sigmoid, _get_16_bit_quant_config() + ) + + return Quantize(quantizer, get_symmetric_quantization_config()) + + +input_t = tuple[torch.Tensor] +test_data_suite = { + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_4d": lambda: torch.rand(1, 1, 5, 10), + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.02), +} + + +class Sigmoid(torch.nn.Module): + aten_op = "torch.ops.aten.sigmoid.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_sigmoid_default" + + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x): + return self.sigmoid(x) + + +class SigmoidAddSigmoid(torch.nn.Module): + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x): + return self.sigmoid((self.sigmoid(x) + self.sigmoid(x))) + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=5) +def test_sigmoid_tosa_BI(test_data): + pipeline = TosaPipelineBI( + Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ramp": "AssertionError: Output 0 does not match reference output. MLETORCH-787" + }, +) +@pytest.mark.flaky(reruns=5) +def test_sigmoid_add_sigmoid_tosa_BI(test_data): + pipeline = TosaPipelineBI( + SigmoidAddSigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand_4d": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_pos": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_neg": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "ramp": "AssertionError: Output 0 does not match reference output. 
MLBEDSW-9770", + }, + # int16 tables are not supported, but some tests happen to pass regardless. + # Set them to xfail but strict=False -> ok if they pass. + strict=False, +) +@common.XfailIfNoCorstone300 +def test_sigmoid_tosa_u55(test_data): + pipeline = EthosU55PipelineBI( + Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand_4d": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_neg": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_pos": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "ramp": "AsssertionError: Output 0 does not match reference output. MLBEDSW-9770", + }, + # int16 tables are not supported, but some tests happen to pass regardless. + # Set them to xfail but strict=False -> ok if they pass. + strict=False, +) +@common.XfailIfNoCorstone300 +def test_sigmoid_add_sigmoid_tosa_u55(test_data): + pipeline = EthosU55PipelineBI( + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=5) +@common.XfailIfNoCorstone320 +def test_sigmoid_tosa_u85(test_data): + pipeline = EthosU85PipelineBI( + Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ramp": "AssertionError: Output 0 does not match reference output.", + }, +) +@pytest.mark.flaky(reruns=5) +@common.XfailIfNoCorstone320 +def test_sigmoid_add_sigmoid_tosa_u85(test_data): + pipeline = EthosU85PipelineBI( + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, + ) + pipeline.change_args("quantize", get_16bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py b/backends/arm/test/ops/test_sigmoid_32bit.py new file mode 100644 index 00000000000..6ba4ab2d030 --- /dev/null +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -0,0 +1,206 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from executorch.backends.arm.quantizer.arm_quantizer import TOSAQuantizer +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, +) +from executorch.backends.xnnpack.test.tester import Quantize +from torch.ao.quantization.observer import HistogramObserver +from torch.ao.quantization.quantizer import QuantizationSpec + + +def _get_16_bit_quant_config(): + int16_spec = QuantizationSpec( + dtype=torch.int16, + observer_or_fake_quant_ctr=HistogramObserver, + qscheme=torch.per_tensor_symmetric, + ) + int32_spec = QuantizationSpec( + dtype=torch.int32, + observer_or_fake_quant_ctr=HistogramObserver, + qscheme=torch.per_tensor_symmetric, + ) + qconfig = QuantizationConfig( + input_activation=int16_spec, + output_activation=int32_spec, + weight=None, + bias=None, + ) + return qconfig + + +def _get_32_bit_quant_config(): + int32_spec = QuantizationSpec( + dtype=torch.int32, + observer_or_fake_quant_ctr=HistogramObserver, + qscheme=torch.per_tensor_symmetric, + ) + qconfig = QuantizationConfig( + input_activation=int32_spec, + output_activation=int32_spec, + weight=None, + bias=None, + ) + return qconfig + + +def get_32bit_sigmoid_quantizer(tosa_str: str): + tosa_spec = common.TosaSpecification.create_from_string(tosa_str) + quantizer = TOSAQuantizer(tosa_spec) + quantizer.set_global(_get_32_bit_quant_config()) + quantizer.set_module_type( + torch.nn.modules.activation.Sigmoid, _get_16_bit_quant_config() + ) + + return Quantize(quantizer, _get_32_bit_quant_config()) + + +input_t = tuple[torch.Tensor] +test_data_suite = { + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_4d": lambda: torch.rand(1, 10, 10, 10), + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), +} + + +class Sigmoid(torch.nn.Module): + aten_op = "torch.ops.aten.sigmoid.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_sigmoid_default" + + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x): + return self.sigmoid(x) + + +class SigmoidAddSigmoid(torch.nn.Module): + def __init__(self): + super().__init__() + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x): + return self.sigmoid((self.sigmoid(x) + self.sigmoid(x))) + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=5) +def test_sigmoid_tosa_BI(test_data): + pipeline = TosaPipelineBI( + Sigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=5) +def test_sigmoid_add_sigmoid_tosa_BI(test_data): + pipeline = TosaPipelineBI( + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand_4d": "AssertionError: Output 0 does not match reference output. 
MLBEDSW-9770", + "randn_pos": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_neg": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "ramp": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + }, + # int16 tables are not supported, but some tests happen to pass regardless. + # Set them to xfail but strict=False -> ok if they pass. + strict=False, +) +@common.XfailIfNoCorstone300 +def test_sigmoid_tosa_u55(test_data): + pipeline = EthosU55PipelineBI( + Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ones": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "rand_4d": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_pos": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "randn_neg": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + "ramp": "AssertionError: Output 0 does not match reference output. MLBEDSW-9770", + }, + # int16 tables are not supported, but some tests happen to pass regardless. + # Set them to xfail but strict=False -> ok if they pass. + strict=False, +) +@common.XfailIfNoCorstone300 +def test_sigmoid_add_sigmoid_tosa_u55(test_data): + pipeline = EthosU55PipelineBI( + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI+u55")) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@pytest.mark.flaky(reruns=5) +@common.XfailIfNoCorstone320 +def test_sigmoid_tosa_u85(test_data): + pipeline = EthosU85PipelineBI( + Sigmoid(), (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, run_on_fvp=True + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "ramp": "AssertionError: Output 0 does not match reference output.", + }, +) +@pytest.mark.flaky(reruns=5) +@common.XfailIfNoCorstone320 +def test_sigmoid_add_sigmoid_tosa_u85(test_data): + pipeline = EthosU85PipelineBI( + SigmoidAddSigmoid(), + (test_data(),), + Sigmoid.aten_op, + Sigmoid.exir_op, + run_on_fvp=True, + ) + pipeline.change_args("quantize", get_32bit_sigmoid_quantizer("TOSA-0.80+BI")) + pipeline.run() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index c60da18594f..d51f20040b0 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -1,156 +1,106 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import unittest - -from typing import Callable, Tuple - -import pytest +from typing import Tuple import torch from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -test_data_generators = [ - # (test_name, test_data, dim) - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), - lambda: ("ones", torch.ones(10, 10), 1), - lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] - -test_data_generators_u55 = [ - # (test_name, test_data, dim) - lambda: ("ones", torch.ones(10, 10), 1), - lambda: ("ones_neg_dim", torch.ones(10, 3, 4), -1), - lambda: ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), - lambda: ("zeros", torch.zeros(10, 8, 5, 2), 0), - lambda: ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), - lambda: ("rand", torch.rand(1, 2, 5, 8), 2), - lambda: ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), - lambda: ("randn", torch.randn(10, 10, 10, 10), 3), -] - - -class TestSoftmax(unittest.TestCase): - """Tests softmax.""" - - class Softmax(torch.nn.Module): - def __init__(self, dim: int = -1): - super().__init__() - self.softmax = torch.nn.Softmax(dim=dim) - - def forward(self, x): - return self.softmax(x) - - def _test_softmax_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check(["torch.ops.aten.softmax.int"]) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_softmax_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_not(["torch.ops.aten.softmax.int"]) - .check(["torch.ops.quantized_decomposed", "torch.ops.aten.mul.Tensor"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_softmax_tosa_ethos_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.tensor], - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=compile_spec, - ) - .quantize() - .export() - .check_not(["torch.ops.aten.softmax.int"]) - .check(["torch.ops.quantized_decomposed", "torch.ops.aten.mul.Tensor"]) - .to_edge() - .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten__softmax_default"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - ) - - def _test_softmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_softmax_tosa_ethos_BI_pipeline( - common.get_u55_compile_spec(), 
module, test_data - ) - - def _test_softmax_tosa_u85_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] - ): - self._test_softmax_tosa_ethos_BI_pipeline( - common.get_u85_compile_spec(), module, test_data - ) - - @parameterized.expand(test_data_generators) - def test_softmax_tosa_MI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_MI_pipeline(self.Softmax(dim=dim), (test_data,)) - - @parameterized.expand(test_data_generators) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_softmax_tosa_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - - @parameterized.expand(test_data_generators_u55) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_softmax_tosa_u55_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - - @parameterized.expand(test_data_generators) - @pytest.mark.flaky # TODO: MLETORCH-460 - Numerically stabler (log)softmax implementation - def test_softmax_tosa_u85_BI(self, test_data_generator: Callable[[], Tuple]): - test_name, test_data, dim = test_data_generator() - self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,)) +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.softmax.int" # Used for checking that softmax is not in the graph after decomposition +exir_op = "executorch_exir_dialects_edge__ops_aten__softmax_default" + +input_t1 = Tuple[torch.Tensor] # Input x + + +class Softmax(torch.nn.Module): + def __init__(self, dim: int = -1): + super().__init__() + self.softmax = torch.nn.Softmax(dim=dim) + + def forward(self, x): + return self.softmax(x) + + test_data = { + "ones": ((torch.ones(10, 10),), 1), + "ones_neg_dim": ((torch.ones(1, 3, 4),), -1), + "randn_neg_dim": ((torch.randn(1, 5, 8, 7),), -3), + "zeros": ((torch.zeros(1, 8, 5, 2),), 0), + "zeros_neg_dim": ((torch.zeros(1, 7, 8, 9),), -4), + "rand": ((torch.rand(1, 2, 5, 8),), 2), + "rand_neg_dim": ((torch.rand(1, 10, 8, 10),), -2), + "randn_mult_batches": ((torch.randn(2, 10, 10, 10),), 3), + } + + +@common.parametrize("test_data", Softmax.test_data) +def test_softmax_tosa_MI(test_data): + data, dim = test_data + pipeline = TosaPipelineMI[input_t1](Softmax(dim), data, []) + pipeline.add_stage_after( + "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op] + ) + pipeline.run() + + +@common.parametrize("test_data", Softmax.test_data) +def test_softmax_tosa_BI(test_data): + data, dim = test_data + pipeline = TosaPipelineBI[input_t1](Softmax(dim), data, []) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize("test_data", Softmax.test_data) +def test_softmax_u55_BI(test_data): + data, dim = test_data + pipeline = EthosU55PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=False) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize("test_data", Softmax.test_data) +def test_softmax_u85_BI(test_data): + data, dim = test_data + pipeline =
EthosU85PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=False) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize( + "test_data", + Softmax.test_data, + xfails={ + "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + }, +) +@common.SkipIfNoCorstone300 +def test_softmax_u55_BI_on_fvp(test_data): + data, dim = test_data + pipeline = EthosU55PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() + + +@common.parametrize( + "test_data", + Softmax.test_data, + xfails={ + "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP" + }, +) +@common.SkipIfNoCorstone320 +def test_softmax_u85_BI_on_fvp(test_data): + data, dim = test_data + pipeline = EthosU85PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) + pipeline.run() diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 0812f8a47a1..d1849e830c9 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -1,155 +1,219 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import unittest from typing import Tuple -import pytest - import torch -from executorch.backends.arm.test import common, conftest -from executorch.backends.arm.test.tester.arm_tester import ArmTester -from executorch.exir.backend.compile_spec_schema import CompileSpec -from parameterized import parameterized - - -class TestSub(unittest.TestCase): - class Sub(torch.nn.Module): - test_parameters = [ - (torch.ones(5),), - (3 * torch.ones(8),), - (10 * torch.randn(8),), - ] - - def forward(self, x): - return x - x - - class Sub2(torch.nn.Module): - test_parameters = [ - (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), - ] - - def forward(self, x, y): - return x - y - - def _test_sub_tosa_MI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), - ) - .export() - .check_count({"torch.ops.aten.sub.Tensor": 1}) - .check_not(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_not(["torch.ops.aten.sub.Tensor"]) - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data) - ) - - def _test_sub_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): - ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) - .quantize() - .export() - .check_count({"torch.ops.aten.sub.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) - ) - - def _test_sub_ethosu_BI_pipeline( - self, - compile_spec: list[CompileSpec], - module: torch.nn.Module, - test_data: Tuple[torch.Tensor], - ): - tester = ( - ArmTester( - module, - example_inputs=test_data, - 
compile_spec=compile_spec, - ) - .quantize() - .export() - .check_count({"torch.ops.aten.sub.Tensor": 1}) - .check(["torch.ops.quantized_decomposed"]) - .to_edge() - .partition() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .serialize() - ) - if conftest.is_option_enabled("corstone_fvp"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) - - @parameterized.expand(Sub.test_parameters) - def test_sub_tosa_MI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_sub_tosa_MI_pipeline(self.Sub(), test_data) - - @parameterized.expand(Sub.test_parameters) - def test_sub_tosa_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_sub_tosa_BI_pipeline(self.Sub(), test_data) - - @parameterized.expand(Sub.test_parameters) - @pytest.mark.corstone_fvp - def test_sub_u55_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_sub_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Sub(), test_data - ) - - @parameterized.expand(Sub.test_parameters) - @pytest.mark.corstone_fvp - def test_sub_u85_BI(self, test_data: torch.Tensor): - test_data = (test_data,) - self._test_sub_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Sub(), test_data - ) - - @parameterized.expand(Sub2.test_parameters) - def test_sub2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_sub_tosa_MI_pipeline(self.Sub2(), test_data) - - @parameterized.expand(Sub2.test_parameters) - def test_sub2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_sub_tosa_BI_pipeline(self.Sub2(), test_data) - - @parameterized.expand(Sub2.test_parameters) - @pytest.mark.corstone_fvp - def test_sub2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_sub_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Sub2(), test_data - ) - - @parameterized.expand(Sub2.test_parameters) - @pytest.mark.corstone_fvp - def test_sub2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_sub_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Sub2(), test_data - ) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + +aten_op = "torch.ops.aten.sub.Tensor" +exir_op = "executorch_exir_dialects_edge__ops_aten_sub_Tensor" + +# Single-input subtraction (x - x) +sub_test_data = { + "ones_1D_5": (torch.ones(5),), + "ones_1D_50": (torch.ones(50),), + "rand_1D_10": (torch.rand(10),), + "rand_2D_5x5": (torch.rand(5, 5),), + "rand_3D_5x5x5": (torch.rand(5, 5, 5),), + "rand_4D_2x3x4x5": (torch.rand(2, 3, 4, 5),), + "zeros": (torch.zeros(10),), +} + +fvp_sub_xfails = {"rand_4D_2x3x4x5": "MLETORCH-517 : Multiple batches not supported"} + +# Two-input subtraction (x - y) +sub2_test_data = { + "rand_2D_4x4": (torch.rand(4, 4), torch.rand(4, 4)), + "rand_3D_4x4x4": (torch.rand(4, 2, 2), torch.rand(4, 2, 2)), + "rand_4D_2x2x4x4": (torch.rand(2, 2, 4, 4), torch.rand(2, 2, 4, 4)), + "zeros": (torch.rand(4, 4), torch.zeros(4, 4)), +} +fvp_sub2_xfails = {"rand_4D_2x2x4x4": "MLETORCH-517 : Multiple batches not supported"} + + +class Sub(torch.nn.Module): + def forward(self, x: torch.Tensor): + return x - x + + +class Sub2(torch.nn.Module): + def forward(self, x: torch.Tensor, y: 
torch.Tensor): + return x - y + + +input_t1 = Tuple[torch.Tensor] # Input x +input_t2 = Tuple[torch.Tensor, torch.Tensor] # Input x, y + + +@common.parametrize("test_data", sub_test_data) +def test_sub_tosa_MI(test_data): + """Test Subtraction (TOSA MI)""" + pipeline = TosaPipelineMI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data) +def test_sub_2_tosa_MI(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction (TOSA MI)""" + pipeline = TosaPipelineMI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +def test_sub_tosa_BI(test_data): + """Test Subtraction (TOSA BI)""" + pipeline = TosaPipelineBI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data) +def test_sub_2_tosa_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction (TOSA BI)""" + pipeline = TosaPipelineBI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +def test_sub_u55_BI(test_data): + """Test Subtraction on Ethos-U55""" + pipeline = EthosU55PipelineBI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data) +def test_sub_2_u55_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction on Ethos-U55""" + pipeline = EthosU55PipelineBI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +def test_sub_u85_BI(test_data): + """Test Subtraction on Ethos-U85 (Quantized Mode)""" + pipeline = EthosU85PipelineBI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data) +def test_sub_2_u85_BI(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction on Ethos-U85""" + pipeline = EthosU85PipelineBI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + run_on_fvp=False, + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.SkipIfNoCorstone300 +def test_sub_u55_BI_on_fvp(test_data): + """Test Subtraction on Ethos-U55 (FVP Mode)""" + pipeline = EthosU55PipelineBI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.SkipIfNoCorstone300 +def test_sub_2_u55_BI_on_fvp(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction on Ethos-U55 (FVP Mode)""" + pipeline = EthosU55PipelineBI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data, fvp_sub_xfails) +@common.SkipIfNoCorstone320 +def test_sub_u85_BI_on_fvp(test_data): + """Test Subtraction on Ethos-U85 (FVP Mode)""" + pipeline = EthosU85PipelineBI[input_t1]( + Sub(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + ) + 
pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() + + +@common.parametrize("test_data", sub2_test_data, fvp_sub2_xfails) +@common.SkipIfNoCorstone320 +def test_sub_2_u85_BI_on_fvp(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction on Ethos-U85 (FVP Mode)""" + pipeline = EthosU85PipelineBI[input_t2]( + Sub2(), + test_data, + aten_op, + exir_op, + run_on_fvp=True, + ) + pipeline.change_args("run_method_and_compare_outputs", qtol=1) + pipeline.run() diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 5627c55ad9e..bc0c50b8ee0 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -29,7 +29,7 @@ class Sum(torch.nn.Module): ((torch.rand(10), 0, True),), ((torch.rand(10, 10), 1, False),), ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] @@ -39,7 +39,7 @@ class Sum(torch.nn.Module): ((torch.rand(10, 10), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), ((torch.rand(10, 10, 10), [-3, 1], True),), - ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] @@ -82,7 +82,7 @@ def _test_sum_tosa_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() - .run_method_and_compare_outputs(inputs=test_data, qtol=1) + .run_method_and_compare_outputs(inputs=test_data) ) def _test_sum_ethosu_BI_pipeline( @@ -91,7 +91,7 @@ def _test_sum_ethosu_BI_pipeline( test_data: tuple[exampledata_t], compile_spec: CompileSpec, ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -107,6 +107,8 @@ def _test_sum_ethosu_BI_pipeline( .to_executorch() .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(Sum.test_parameters) def test_sum_tosa_MI(self, test_data: tuple[exampledata_t]): diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index 6992ac2f8e6..db3e93fbdc9 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -55,9 +55,7 @@ def _test_to_copy_tosa_MI_pipeline( compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() - .dump_artifact() .to_edge() - .dump_artifact() .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() diff --git a/backends/arm/test/ops/test_unary.py b/backends/arm/test/ops/test_unary.py new file mode 100644 index 00000000000..1f91cab56c1 --- /dev/null +++ b/backends/arm/test/ops/test_unary.py @@ -0,0 +1,153 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +input_t1 = Tuple[torch.Tensor] # Input x + + +class Ceil(torch.nn.Module): + def forward(self, x: torch.Tensor): + return torch.ceil(x) + + op_name = "ceil" + aten_op = "torch.ops.aten.ceil.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_ceil_default" + + +class Floor(torch.nn.Module): + def forward(self, x: torch.Tensor): + return torch.floor(x) + + op_name = "floor" + aten_op = "torch.ops.aten.floor.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_floor_default" + + +zeros = torch.zeros(1, 10, 10, 10) +ones = torch.ones(10, 10, 10) +rand = torch.rand(10, 10) - 0.5 +randn_pos = torch.randn(1, 4, 4, 4) + 10 +randn_neg = torch.randn(1, 4, 4, 4) - 10 +ramp = torch.arange(-16, 16, 0.2) + + +test_data = { + "ceil_zeros": ( + Ceil(), + zeros, + ), + "floor_zeros": ( + Floor(), + zeros, + ), + "ceil_ones": ( + Ceil(), + ones, + ), + "floor_ones": ( + Floor(), + ones, + ), + "ceil_rand": ( + Ceil(), + rand, + ), + "floor_rand": ( + Floor(), + rand, + ), + "ceil_randn_pos": ( + Ceil(), + randn_pos, + ), + "floor_randn_pos": ( + Floor(), + randn_pos, + ), + "ceil_randn_neg": ( + Ceil(), + randn_neg, + ), + "floor_randn_neg": ( + Floor(), + randn_neg, + ), + "ceil_ramp": ( + Ceil(), + ramp, + ), + "floor_ramp": ( + Floor(), + ramp, + ), +} + + +@common.parametrize("test_data", test_data) +def test_unary_tosa_MI(test_data: input_t1): + module = test_data[0] + pipeline = TosaPipelineMI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_unary_tosa_BI(test_data: input_t1): + module = test_data[0] + pipeline = TosaPipelineBI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_unary_u55_BI(test_data: input_t1): + module = test_data[0] + pipeline = EthosU55PipelineBI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=False + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +def test_unary_u85_BI(test_data: input_t1): + module = test_data[0] + pipeline = EthosU85PipelineBI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=False + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +@common.SkipIfNoCorstone300 +def test_unary_u55_BI_on_fvp(test_data: input_t1): + module = test_data[0] + pipeline = EthosU55PipelineBI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=True + ) + pipeline.run() + + +@common.parametrize("test_data", test_data) +@common.SkipIfNoCorstone320 +def test_unary_u85_BI_on_fvp(test_data: input_t1): + module = test_data[0] + pipeline = EthosU85PipelineBI[input_t1]( + module, (test_data[1],), module.aten_op, module.exir_op, run_on_fvp=True + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index ad095f01ded..6690c668f94 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -16,7 +16,7 @@ get_symmetric_quantization_config, TOSAQuantizer, ) -from executorch.backends.arm.test import 
common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.arm.tosa_specification import TosaSpecification @@ -36,13 +36,16 @@ class Var(torch.nn.Module): (torch.rand(1, 50, 10, 20), False, 0.5), ] + def __init__(self, keepdim: bool = True, correction: int = 0): + super().__init__() + self.keepdim = keepdim + self.correction = correction + def forward( self, x: torch.Tensor, - keepdim: bool = True, - correction: int = 0, ): - return x.var(keepdim=keepdim, correction=correction) + return x.var(keepdim=self.keepdim, correction=self.correction) class VarDim(torch.nn.Module): test_parameters = [ @@ -62,14 +65,17 @@ class VarDim(torch.nn.Module): (torch.rand(1, 50, 10, 20), -1, True, True), ] + def __init__(self, dim: int = -1, keepdim: bool = True, unbiased: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.unbiased = unbiased + def forward( self, x: torch.Tensor, - dim: int = -1, - keepdim: bool = True, - unbiased: bool = False, ): - return x.var(dim=dim, keepdim=keepdim, unbiased=unbiased) + return x.var(dim=self.dim, keepdim=self.keepdim, unbiased=self.unbiased) class VarCorrection(torch.nn.Module): test_parameters = [ @@ -79,14 +85,19 @@ class VarCorrection(torch.nn.Module): (torch.rand(1, 50, 10, 20), (-1, -2), True, 0.5), ] + def __init__( + self, dim: int = -1, keepdim: bool = True, correction: int = 0 + ): + super().__init__() + self.dim = dim + self.keepdim = keepdim + self.correction = correction + def forward( self, x: torch.Tensor, - dim: int | tuple[int] = -1, - keepdim: bool = True, - correction: int = 0, ): - return x.var(dim=dim, keepdim=keepdim, correction=correction) + return x.var(dim=self.dim, keepdim=self.keepdim, correction=self.correction) def _test_var_tosa_MI_pipeline( self, @@ -138,7 +149,7 @@ def _test_var_ethosu_BI_pipeline( quantizer = EthosUQuantizer(compile_spec).set_io( get_symmetric_quantization_config() ) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -150,58 +161,61 @@ def _test_var_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(Var.test_parameters) def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_MI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + self._test_var_tosa_MI_pipeline(self.Var(keepdim, correction), (test_tensor,)) @parameterized.expand(Var.test_parameters) def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): - self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + self._test_var_tosa_BI_pipeline(self.Var(keepdim, correction), (test_tensor,)) @parameterized.expand(Var.test_parameters) def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( - self.Var(), + self.Var(keepdim, correction), common.get_u55_compile_spec(), - (test_tensor, keepdim, correction), + (test_tensor,), ) @parameterized.expand(Var.test_parameters) def test_var_u85_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( - self.Var(), + self.Var(keepdim, correction), common.get_u85_compile_spec(), - (test_tensor, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarDim.test_parameters) - def
test_var_dim_tosa_MI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_tosa_MI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_tosa_MI_pipeline( - self.VarDim(), (test_tensor, dim, keepdim, correction) + self.VarDim(dim, keepdim, unbiased), (test_tensor,) ) @parameterized.expand(VarDim.test_parameters) - def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_tosa_BI_pipeline( - self.VarDim(), (test_tensor, dim, keepdim, correction) + self.VarDim(dim, keepdim, unbiased), (test_tensor,) ) @parameterized.expand(VarDim.test_parameters_u55) - def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_ethosu_BI_pipeline( - self.VarDim(), + self.VarDim(dim, keepdim, unbiased), common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarDim.test_parameters) - def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): + def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, unbiased): self._test_var_ethosu_BI_pipeline( - self.VarDim(), + self.VarDim(dim, keepdim, unbiased), common.get_u85_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarCorrection.test_parameters) @@ -209,7 +223,7 @@ def test_var_correction_tosa_MI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_tosa_MI_pipeline( - self.VarCorrection(), (test_tensor, dim, keepdim, correction) + self.VarCorrection(dim, keepdim, correction), (test_tensor,) ) @parameterized.expand(VarCorrection.test_parameters) @@ -217,7 +231,7 @@ def test_var_correction_tosa_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_tosa_BI_pipeline( - self.VarCorrection(), (test_tensor, dim, keepdim, correction) + self.VarCorrection(dim, keepdim, correction), (test_tensor,) ) @parameterized.expand(VarCorrection.test_parameters) @@ -225,9 +239,9 @@ def test_var_correction_u55_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( - self.VarCorrection(), + self.VarCorrection(dim, keepdim, correction), common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) @parameterized.expand(VarCorrection.test_parameters) @@ -235,7 +249,7 @@ def test_var_correction_u85_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( - self.VarCorrection(), + self.VarCorrection(dim, keepdim, correction), common.get_u85_compile_spec(), - (test_tensor, dim, keepdim, correction), + (test_tensor,), ) diff --git a/backends/arm/test/passes/test_cast_int64_pass.py b/backends/arm/test/passes/test_cast_int64_pass.py index 0465a85deb9..ae1e09f52a2 100644 --- a/backends/arm/test/passes/test_cast_int64_pass.py +++ b/backends/arm/test/passes/test_cast_int64_pass.py @@ -8,7 +8,7 @@ import torch from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[torch.Tensor] # Input x @@ -28,7 +28,7 @@ def test_int64_model_tosa_BI(): 
"executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, } - pipeline = TestPassPipeline[input_t]( + pipeline = PassPipeline[input_t]( module, module.get_inputs(), tosa_version="TOSA-0.80+BI", diff --git a/backends/arm/test/passes/test_convert_to_clamp.py b/backends/arm/test/passes/test_convert_to_clamp.py new file mode 100644 index 00000000000..0b106b7bc82 --- /dev/null +++ b/backends/arm/test/passes/test_convert_to_clamp.py @@ -0,0 +1,80 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class HardTanh(torch.nn.Module): + def __init__(self): + super().__init__() + + self.hardtanh = torch.nn.Hardtanh() + + def forward(self, x): + return self.hardtanh(x) + + def get_inputs(self): + return (torch.rand(1, 64, 64, 3),) + + +class ReLU(torch.nn.Module): + def __init__(self): + super().__init__() + + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def get_inputs(self): + return (torch.rand(1, 64, 64, 3),) + + +class TestConvertToClampPass(unittest.TestCase): + """ + Tests the ConvertToClampPass which converts hardtanh.default and relu.default to clamp.default + """ + + def test_tosa_MI_hardtahn(self): + module = HardTanh() + test_pass_stage = RunPasses([ConvertToClampPass]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) + ) + + def test_tosa_MI_relu(self): + module = ReLU() + test_pass_stage = RunPasses([ConvertToClampPass]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), + ) + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_clamp_default"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + ) diff --git a/backends/arm/test/passes/test_fold_qdq_pass.py b/backends/arm/test/passes/test_fold_qdq_pass.py index f63fa33bca1..8d0ff90755f 100644 --- a/backends/arm/test/passes/test_fold_qdq_pass.py +++ b/backends/arm/test/passes/test_fold_qdq_pass.py @@ -9,7 +9,7 @@ from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( FoldAndAnnotateQParamsPass, ) -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[torch.Tensor, torch.Tensor] # Input x, y @@ -32,7 +32,7 @@ def test_fold_qdq_pass_tosa_BI(): is removed from the representation. 
""" module = SimpleQuantizeModel() - pipeline = TestPassPipeline[input_t]( + pipeline = PassPipeline[input_t]( module, module.get_inputs(), tosa_version="TOSA-0.80+BI", diff --git a/backends/arm/test/passes/test_fuse_batchnorm_pass.py b/backends/arm/test/passes/test_fuse_batchnorm_pass.py index b18e536b155..415aa9f6132 100644 --- a/backends/arm/test/passes/test_fuse_batchnorm_pass.py +++ b/backends/arm/test/passes/test_fuse_batchnorm_pass.py @@ -8,7 +8,7 @@ import torch from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[torch.Tensor] # Input x @@ -85,13 +85,13 @@ def forward(self, x): return x -class MergeNoBN(torch.nn.Module): +class MergeMultipleUsersBN(torch.nn.Module): ops_before_pass = { "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2, "executorch_exir_dialects_edge__ops_aten_convolution_default": 3, } ops_after_pass = { - "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2, + "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1, "executorch_exir_dialects_edge__ops_aten_convolution_default": 3, } @@ -122,7 +122,7 @@ def forward(self, x): z = self.conv2d2(x) a = self.batch_norm2d( y - ) # Can't be fused since paramters of conv2d2 have multiple users. + ) # Can be fused despite paramters of conv2d2 having multiple users. return z, a @@ -131,14 +131,14 @@ def forward(self, x): "merge_one_of_two_bn_affine": MergeOneOfTwoBN(True), "merge_one_of_two_bn": MergeOneOfTwoBN(False), "merge_two_of_two_bn_affine": MergeTwosOfTwoBN(True), - "merge_no_bn_affine": MergeNoBN(True), + "merge_multiple_users_bn_affine": MergeMultipleUsersBN(True), } @common.parametrize("module", modules) def test_fuse_batchnorm_tosa_MI(module): """Test various cases where the batchnorm should and shouldn't be fused.""" - pipeline = TestPassPipeline[input_t]( + pipeline = PassPipeline[input_t]( module, module.get_inputs(), tosa_version="TOSA-0.80+MI", diff --git a/backends/arm/test/passes/test_fuse_constant_ops_pass.py b/backends/arm/test/passes/test_fuse_constant_ops_pass.py new file mode 100644 index 00000000000..c6ad4420327 --- /dev/null +++ b/backends/arm/test/passes/test_fuse_constant_ops_pass.py @@ -0,0 +1,117 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import operator +import unittest +from typing import Tuple + +import torch +from executorch.backends.arm._passes.fuse_constant_ops_pass import FuseConstantOpsPass +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + PassPipeline, + TosaPipelineBI, +) + +input_t = Tuple[torch.Tensor] # Input x + + +class FuseParameter(torch.nn.Module): + ops_before_pass = { + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_addmm_default": 1, + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + } + ops_after_pass = {"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1} + ops_not_after_pass = [ + "executorch_exir_dialects_edge__ops_aten_full_default", + "executorch_exir_dialects_edge__ops_aten_view_copy_default", + "executorch_exir_dialects_edge__ops_aten_permute_copy_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + + def __init__( + self, + in_features: int = 1, + out_features: int = 1, + bias: bool = True, + ): + super().__init__() + self.fc = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + bias=bias, + ) + + def forward(self, x): + return self.fc(torch.ones(1)) + x + + +class FuseBuffer(torch.nn.Module): + ops_before_pass = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + } + ops_after_pass = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + } + ops_not_after_pass = [ + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default" + ] + + def forward(self, x: torch.Tensor): + return (x + 1) * 2 + + +class FuseLiftedTensor(torch.nn.Module): + ops_before_pass = { + "executorch_exir_dialects_edge__ops_aten_select_copy_int": 1, + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + } + ops_after_pass = {"executorch_exir_dialects_edge__ops_aten_add_Tensor": 1} + ops_not_after_pass = ["executorch_exir_dialects_edge__ops_aten_select_copy_int"] + + def __init__( + self, + ): + super().__init__() + self.lifted_tensor = torch.rand(2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + sliced = self.lifted_tensor[0] + return operator.add(sliced, x) + + +modules = { + "fuse_parameter": FuseParameter(), + "fuse_buffer": FuseBuffer(), + "fuse_const_tensor": FuseLiftedTensor(), +} + + +@common.parametrize("module", modules) +def test_fuse_constant_ops_tosa_MI(module): + pipeline = PassPipeline[input_t]( + module=module, + test_data=(torch.rand(1),), + tosa_version="TOSA-0.80+MI", + ops_before_pass=module.ops_before_pass, + ops_after_pass=module.ops_after_pass, + ops_not_after_pass=module.ops_not_after_pass, + passes_with_exported_program=[FuseConstantOpsPass], + ) + pipeline.run() + + +@unittest.skip("Test failing on internal CI") +@common.parametrize("module", modules) +def test_fuse_constant_ops_tosa_BI(module): + pipeline = TosaPipelineBI[input_t]( + module, (torch.rand(10, 10),), [], [], use_to_edge_transform_and_lower=True + ) + pipeline.run() diff --git a/backends/arm/test/passes/test_insert_table_ops_pass.py b/backends/arm/test/passes/test_insert_table_ops_pass.py index 5c761c8bcb4..bdbcef3713d 100644 --- a/backends/arm/test/passes/test_insert_table_ops_pass.py +++ b/backends/arm/test/passes/test_insert_table_ops_pass.py @@ -11,7
@@ FoldAndAnnotateQParamsPass, ) from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[torch.Tensor] # Input x @@ -27,7 +27,7 @@ def get_inputs(self) -> input_t: def test_insert_table_tosa_BI(): module = Sigmoid() - pipeline = TestPassPipeline[input_t]( + pipeline = PassPipeline[input_t]( module, module.get_inputs(), tosa_version="TOSA-0.80+BI", diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py index 935085c66e4..66fdff6e532 100644 --- a/backends/arm/test/passes/test_meandim_to_averagepool2d.py +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -11,7 +11,7 @@ ConvertMeanDimToAveragePoolPass, ) from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[torch.Tensor, torch.Tensor] # Input x @@ -65,7 +65,7 @@ def test_meandim_to_avgpool_tosa_BI(module): Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d for the special case where dim is [-1, -2] and keepdim is True. """ - pipeline = TestPassPipeline[input_t]( + pipeline = PassPipeline[input_t]( module, module.get_inputs(), tosa_version="TOSA-0.80+BI", diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 25052c448d1..21317c23a8a 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -13,7 +13,6 @@ from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized -from torch.testing._internal import optests def test_rescale_op(): @@ -64,7 +63,7 @@ def test_nonzero_zp_for_int32(): ), ] for sample_input in sample_inputs: - with pytest.raises(optests.generate_tests.OpCheckError): + with pytest.raises(Exception, match="opcheck"): torch.library.opcheck(torch.ops.tosa._rescale, sample_input) @@ -87,7 +86,7 @@ def test_zp_outside_range(): ), ] for sample_input in sample_inputs: - with pytest.raises(optests.generate_tests.OpCheckError): + with pytest.raises(Exception, match="opcheck"): torch.library.opcheck(torch.ops.tosa._rescale, sample_input) @@ -116,7 +115,7 @@ def _test_rescale_pipeline( ): """Tests a model with many ops that requires rescales. 
As more ops are quantized to int32 and need the InsertRescalesPass, make sure that they play nicely together.""" - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -126,8 +125,9 @@ def _test_rescale_pipeline( .export() .to_edge_transform_and_lower() .to_executorch() - .run_method_and_compare_outputs(test_data) ) + if conftest.is_option_enabled("tosa_ref_model"): + tester.run_method_and_compare_outputs(test_data) def _test_rescale_pipeline_ethosu( @@ -152,6 +152,7 @@ def _test_rescale_pipeline_ethosu( class TestRescales(unittest.TestCase): @parameterized.expand(RescaleNetwork.test_parameters) + @pytest.mark.tosa_ref_model def test_quantized_rescale(self, x, y): _test_rescale_pipeline(RescaleNetwork(), (x, y)) diff --git a/backends/arm/test/passes/test_unsqueeze_before_repeat_pass.py b/backends/arm/test/passes/test_unsqueeze_before_repeat_pass.py index 8f4a9130cea..942d7decfba 100644 --- a/backends/arm/test/passes/test_unsqueeze_before_repeat_pass.py +++ b/backends/arm/test/passes/test_unsqueeze_before_repeat_pass.py @@ -10,7 +10,7 @@ UnsqueezeBeforeRepeatPass, ) from executorch.backends.arm.test import common -from executorch.backends.arm.test.tester.test_pipeline import TestPassPipeline +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline input_t = Tuple[ torch.Tensor, Dict[str, int], list[str] @@ -47,7 +47,7 @@ def test_unsqueeze_before_repeat_tosa_MI(test_data): """ module = Repeat() data, ops_after_pass, ops_not_after_pass = test_data - pipeline = TestPassPipeline( + pipeline = PassPipeline( module, data, tosa_version="TOSA-0.80+MI", diff --git a/backends/arm/test/pytest.ini b/backends/arm/test/pytest.ini index 3af1f0d0971..e73bd7dbb26 100644 --- a/backends/arm/test/pytest.ini +++ b/backends/arm/test/pytest.ini @@ -2,4 +2,5 @@ addopts = --strict-markers markers = slow: Tests that take long time - corstone_fvp: Tests that use Corstone300 or Corstone320 FVP \ No newline at end of file + corstone_fvp: Tests that use Corstone300 or Corstone320 FVP # And also uses TOSA reference model + tosa_ref_model: Tests that use TOSA reference model # Temporary! diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 65be0b88f7b..28bbee052f9 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -22,7 +22,6 @@ try: import tosa_reference_model except ImportError: - logger.warning("tosa_reference_model not found, can't run reference model tests") tosa_reference_model = None from executorch.backends.arm.arm_backend import get_tosa_spec, is_tosa @@ -35,12 +34,33 @@ from torch.fx.node import Node from torch.overrides import TorchFunctionMode -from torch.testing._internal.common_utils import torch_to_numpy_dtype_dict from tosa import TosaGraph logger = logging.getLogger(__name__) logger.setLevel(logging.CRITICAL) +# Copied from PyTorch. +# From torch/testing/_internal/common_utils.py:torch_to_numpy_dtype_dict +# To avoid a dependency on _internal stuff. 
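+# Note that numpy has no bfloat16 or complex32 dtypes, so the table below +# widens those to float32 and complex64 respectively.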
+_torch_to_numpy_dtype_dict = { + torch.bool: np.bool_, + torch.uint8: np.uint8, + torch.uint16: np.uint16, + torch.uint32: np.uint32, + torch.uint64: np.uint64, + torch.int8: np.int8, + torch.int16: np.int16, + torch.int32: np.int32, + torch.int64: np.int64, + torch.float16: np.float16, + torch.float32: np.float32, + torch.float64: np.float64, + torch.bfloat16: np.float32, + torch.complex32: np.complex64, + torch.complex64: np.complex64, + torch.complex128: np.complex128, +} + class QuantizationParams: __slots__ = ["node_name", "zp", "scale", "qmin", "qmax", "dtype"] @@ -181,7 +201,8 @@ def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs): def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) - if not self.ran_tosa_dispatch: + # Only raise this error if we ran the model without errors. + if not self.ran_tosa_dispatch and exc_type is None: raise RuntimeError( "Ran model with TosaReferenceModelDispatch but never ran TOSABackend delegate." ) @@ -336,7 +357,7 @@ def run_corstone( output_dtype = node.meta["val"].dtype tosa_ref_output = np.fromfile( os.path.join(intermediate_path, f"out-{i}.bin"), - torch_to_numpy_dtype_dict[output_dtype], + _torch_to_numpy_dtype_dict[output_dtype], ) output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape)) @@ -350,7 +371,7 @@ def prep_data_for_save( ): if isinstance(data, torch.Tensor): data_np = np.array(data.detach(), order="C").astype( - torch_to_numpy_dtype_dict[data.dtype] + _torch_to_numpy_dtype_dict[data.dtype] ) else: data_np = np.array(data) @@ -525,12 +546,12 @@ def corstone320_installed() -> bool: def get_elf_path(target_board): elf_path = os.path.join( - "cmake-out", + "arm_test", f"arm_semihosting_executor_runner_{target_board}", "arm_executor_runner", ) if not os.path.exists(elf_path): - raise RuntimeError( + raise FileNotFoundError( f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" ) else: diff --git a/backends/arm/test/setup_testing.sh b/backends/arm/test/setup_testing.sh index ebf9d799677..ea5e6a4f603 100755 --- a/backends/arm/test/setup_testing.sh +++ b/backends/arm/test/setup_testing.sh @@ -12,8 +12,8 @@ et_root_dir=$(cd ${script_dir}/../../.. && pwd) ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake -et_build_dir=${et_root_dir}/cmake-out -build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner +et_build_dir=${et_root_dir}/arm_test/cmake-out +build_root_test_dir=${et_root_dir}/arm_test/arm_semihosting_executor_runner # Build Arm Baremetal executor_runner in semihosting mode. # Put in backends/arm/test/res to be used by unit tests. 
@@ -38,16 +38,16 @@ function build_semihosting_executorch_runner() { -DTARGET_CPU=${target_cpu} \ -DSEMIHOSTING=ON \ -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \ - -B ${build_test_dir} \ -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ -DPYTHON_EXECUTABLE=$(which python3) \ - -DSYSTEM_CONFIG=${system_config} + -DSYSTEM_CONFIG=${system_config} \ + -B ${build_test_dir} echo "[${FUNCNAME[0]}] Configured CMAKE" n=$(nproc) - cmake --build ${build_test_dir} -- -j"$((n - 5))" arm_executor_runner + cmake --build ${build_test_dir} -j"$((n - 5))" -- arm_executor_runner echo "[${FUNCNAME[0]}] Generated baremetal elf file: with semihosting enabled" find ${build_test_dir} -name "arm_executor_runner" } diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl new file mode 100644 index 00000000000..e97b46cb977 --- /dev/null +++ b/backends/arm/test/targets.bzl @@ -0,0 +1,40 @@ +# load("//caffe2/test/fb:defs.bzl", "define_tests") +load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") +load("@bazel_skylib//lib:paths.bzl", "paths") + +def define_arm_tests(): + # TODO Add more tests + test_files = [] + + # Passes + test_files += native.glob(["passes/test_*.py"]) + # https://github.com/pytorch/executorch/issues/8606 + test_files.remove("passes/test_ioquantization_pass.py") + + # Operators + test_files += native.glob(["ops/test_linear.py"]) + + TESTS = {} + + for test_file in test_files: + test_file_name = paths.basename(test_file) + test_name = test_file_name.replace("test_", "").replace(".py", "") + + python_pytest( + name = test_name, + srcs = [test_file], + pytest_config = "pytest.ini", + resources = ["conftest.py"], + compile = "with-source", + typing = False, + preload_deps = [ + "//executorch/kernels/quantized:custom_ops_generated_lib", + ], + deps = [ + "//executorch/backends/arm/test:arm_tester", + "//executorch/backends/arm/test:conftest", + "//executorch/exir:lib", + "fbsource//third-party/pypi/pytest:pytest", + "fbsource//third-party/pypi/parameterized:parameterized", + ], + ) diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 9f2fa4c17d0..b995341a586 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -17,61 +17,67 @@ pwd TEST_SUITE=$1 help() { - echo "Usage:" - echo " $0 <TESTNAME>" - echo " where <TESTNAME> can be any of:" - # This will list all lines in this file that is starting with test_ remove () { and print it as a list. - # e,g, "test_pytest() { # Test ops and other things" -> test_pytest # Test ops and other things - echo "all # run all tests" - grep "^test_" $0 | sed 's/([^)]*)[[:space:]]*{*//g' - exit + echo "Usage:" + echo " $0 <TESTNAME>" + echo " where <TESTNAME> can be any of:" + # This will list all lines in this file that start with test_, remove "() {" and print them as a list. + # e.g. "test_pytest() { # Test ops and other things" -> test_pytest # Test ops and other things + echo "all # run all tests" + grep "^test_" $0 | sed 's/([^)]*)[[:space:]]*{*//g' + exit } if [[ -z "${TEST_SUITE:-}" ]]; then - echo "Missing test suite name, exiting..."
+ help else - echo "Run Arm baremetal test suite ${TEST_SUITE}" + echo "Run Arm baremetal test suite ${TEST_SUITE}" fi TEST_SUITE_NAME="$(basename "$0") ${TEST_SUITE}" all() { # Run all tests - # This will list all lines in this file that is starting with test_ remove () { and add this script name in - # front of it and execute it in a sub shell - # e.g. from this file: - # - # test_pytest() { # Test ops and other things - # bla bla bla - # } - # test_pytest_ethosu_fvp() { # Same as test_pytest but ... - # bla bla bla - # } - #... - # become a small script: - # ---- - # backends/arm/test/test_arm_baremetal.sh test_pytest # Test ops and other things - # backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp # Same as test_pytest but ... - # ... - # ---- - # That is executed - echo "${TEST_SUITE_NAME}: Run all tests" - grep "^test_" backends/arm/test/test_arm_baremetal.sh | sed 's/([^)]*)[[:space:]]*{*//g' | sed "s|^|$0 |" | sh + # This will list all lines in this file that start with test_, remove "() {", add this script name in + # front of them and execute the result in a subshell + # e.g. from this file: + # + # test_pytest() { # Test ops and other things + # bla bla bla + # } + # test_pytest_ethosu_fvp() { # Same as test_pytest but ... + # bla bla bla + # } + #... + # become a small script: + # ---- + # backends/arm/test/test_arm_baremetal.sh test_pytest # Test ops and other things + # backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp # Same as test_pytest but ... + # ... + # ---- + # That is executed + echo "${TEST_SUITE_NAME}: Run all tests" + grep "^test_" backends/arm/test/test_arm_baremetal.sh | sed 's/([^)]*)[[:space:]]*{*//g' | sed "s|^|$0 |" | sh + echo "${TEST_SUITE_NAME}: PASS" } test_pytest() { # Test ops and other things echo "${TEST_SUITE_NAME}: Run pytest" + + ./examples/models/llama3_2_vision/install_requirements.sh + cd "${et_root_dir}" source examples/arm/ethos-u-scratch/setup_path.sh backends/arm/scripts/build_quantized_ops_aot_lib.sh # Run arm baremetal pytest tests without FVP pytest --verbose --color=yes --numprocesses=auto backends/arm/test/ + echo "${TEST_SUITE_NAME}: PASS" } test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using Corstone FVP echo "${TEST_SUITE_NAME}: Run pytest with fvp" + ./examples/models/llama3_2_vision/install_requirements.sh source examples/arm/ethos-u-scratch/setup_path.sh # Prepare Corstone-3x0 FVP for pytest @@ -80,28 +86,68 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using # Run arm baremetal pytest tests with FVP pytest --verbose --color=yes --numprocesses=auto backends/arm/test/ --arm_run_corstoneFVP + echo "${TEST_SUITE_NAME}: PASS" } -test_run_ethosu_fvp() { # End to End model tests +test_run_ethosu_fvp() { # End to End model tests using run.sh echo "${TEST_SUITE_NAME}: Test ethos-u delegate examples with run.sh" source examples/arm/ethos-u-scratch/setup_path.sh # TOSA quantized echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" - examples/arm/run.sh --target=TOSA --model_name=mv2 - examples/arm/run.sh --target=TOSA --model_name=lstm - examples/arm/run.sh --target=TOSA --model_name=edsr + examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=TOSA --model_name=mul # Ethos-U55 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" - examples/arm/run.sh --target=ethos-u55-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm +
examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=mul # Ethos-U85 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" - examples/arm/run.sh --target=ethos-u85-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul + echo "${TEST_SUITE_NAME}: PASS" } -${TEST_SUITE} \ No newline at end of file +test_models_ethosu_fvp() { # End to End model tests using test_model.py + echo "${TEST_SUITE_NAME}: Test ethos-u delegate models with test_model.py" + + source examples/arm/ethos-u-scratch/setup_path.sh + + # Build common libs once + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --build_libs + + # TOSA quantized + echo "${TEST_SUITE_NAME}: Test ethos-u target TOSA" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=mv2 + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=mv3 + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=lstm + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=TOSA --model=edsr + + # Ethos-U55 + echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-128 --model=mv2 --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-64 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-256 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03" + + # Ethos-U85 + echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-256 --model=mv2 --extra_flags="-DET_ATOL=2.00 -DET_RTOL=2.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-1024 --model=mv3 --extra_flags="-DET_ATOL=5.00 -DET_RTOL=5.00" + python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=lstm --extra_flags="-DET_ATOL=0.03 -DET_RTOL=0.03" + echo "${TEST_SUITE_NAME}: PASS" + } + +test_full_ethosu_fvp() { # All End to End model tests + echo "${TEST_SUITE_NAME}: Test ethos-u delegate models and examples on fvp" + + test_models_ethosu_fvp + test_run_ethosu_fvp + echo "${TEST_SUITE_NAME}: PASS" + } + + + +${TEST_SUITE} diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py new file mode 100755 index 00000000000..b94a5f65256 --- /dev/null +++ b/backends/arm/test/test_model.py @@ -0,0 +1,265 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
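+# +# Helper to build and test a model end to end: it compiles the model with +# examples.arm.aot_arm_compiler, builds the bare metal arm_executor_runner +# and, for Ethos-U targets, runs the resulting elf on a Corstone FVP, +# e.g. (as invoked from test_arm_baremetal.sh above): +# python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u55-128 --model=mv2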
+ +import argparse +import os +import platform +import subprocess +import sys + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--build_libs", + action="store_true", + required=False, + default=False, + help="Flag for building executorch libs needed for this testing", + ) + parser.add_argument( + "--model", + required=False, + default=None, + help="Model that aot_arm_compiler.py can handle; can be a builtin, examples/models or a filename.", + ) + parser.add_argument( + "--target", + required=False, + default=None, + help="Target name", + ) + parser.add_argument( + "--test_output", + required=False, + default="arm_test", + help="Output folder used for build and test, defaults to arm_test", + ) + parser.add_argument( + "--system_config", + required=False, + default=None, + help="Target specific system_config (See Vela compiler)", + ) + parser.add_argument( + "--memory_mode", + required=False, + default=None, + help="Target specific memory_mode (See Vela compiler)", + ) + parser.add_argument( + "--no_intermediate", + action="store_true", + required=False, + default=False, + help="Don't save temporary files during compilation", + ) + parser.add_argument( + "--extra_flags", + required=False, + default=None, + help="Extra cmake flags to pass when building the executor_runner", + ) + args = parser.parse_args() + + if args.model and args.target and "ethos-u" in args.target and args.system_config is None: + if "u55" in args.target: + args.system_config = "Ethos_U55_High_End_Embedded" + elif "u85" in args.target: + args.system_config = "Ethos_U85_SYS_DRAM_Mid" + else: + raise RuntimeError(f"Invalid target name {args.target}") + + if args.model and args.target and "ethos-u" in args.target and args.memory_mode is None: + if "u55" in args.target: + args.memory_mode = "Shared_Sram" + elif "u85" in args.target: + args.memory_mode = "Sram_Only" + else: + raise RuntimeError(f"Invalid target name {args.target}") + + return args + + +def run_external_cmd(cmd: list[str]): + print("CALL:", *cmd, sep=" ") + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as err: + print("ERROR called: ", *cmd, sep=" ") + print(f"Failed with: {err.returncode}") + sys.exit(err.returncode) + + +def build_libs(et_build_root: str, script_path: str): + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_executorch.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + "--devtools", + "--etdump", + ] + ) + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_portable_kernels.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + "--portable_kernels=aten::_softmax.out", + ] + ) + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_quantized_ops_aot_lib.sh"), + f"--et_build_root={et_build_root}", + "--build_type=Release", + ] + ) + + +def build_pte( + et_build_root: str, + model_name: str, + target: str, + system_config: str, + memory_mode: str, + build_output: str, + no_intermediate: bool, +): + soext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get( + platform.system(), None + ) + solibs_path = os.path.join( + et_build_root, + "cmake-out-aot-lib", + "kernels", + "quantized", + f"libquantized_ops_aot_lib.{soext}", + ) + solibs = f"--so_library={solibs_path}" + + intermediate = "" + if not no_intermediate: + intermediate = f"--intermediate={build_output}" + + run_external_cmd( + [ + "python3", + "-m", + "examples.arm.aot_arm_compiler", + "--delegate", + "--quantize", + "--bundleio", + intermediate, +
f"--model_name={model_name}", + f"--target={target}", + f"--output={build_output}", + f"--system_config={system_config}", + f"--memory_mode={memory_mode}", + solibs, + ] + ) + + pte_file = os.path.join(output, f"{model_name}_arm_delegate_{args.target}.bpte") + return pte_file + + +def build_ethosu_runtime( + et_build_root: str, + script_path: str, + pte_file: str, + target: str, + system_config: str, + extra_flags: str, + elf_build_path: str, +): + + extra_build_flag = "" + if extra_flags: + extra_build_flag = f"--extra_build_flags={extra_flags}" + + run_external_cmd( + [ + "bash", + os.path.join(script_path, "build_executorch_runner.sh"), + f"--et_build_root={et_build_root}", + f"--pte={pte_file}", + "--bundleio", + "--etdump", + f"--target={target}", + "--build_type=Release", + f"--system_config={system_config}", + extra_build_flag, + f"--output={elf_build_path}", + ] + ) + + elf_file = os.path.join(elf_build_path, "cmake-out", "arm_executor_runner") + return elf_file + + +def run_elf_with_fvp(script_path: str, elf_file: str, target: str): + run_external_cmd( + [ + "bash", + os.path.join(script_path, "run_fvp.sh"), + f"--elf={elf_file}", + f"--target={target}", + ] + ) + + +if __name__ == "__main__": + + args = get_args() + script_path = os.path.join("backends", "arm", "scripts") + + if args.build_libs: + build_libs(args.test_output, script_path) + + if args.model: + model_name = args.model.split(" ")[0].split(";")[0] + if not model_name: + print("ERROR: Bad --model specified") + if not args.target: + print("ERROR: --model need --target to also be set") + + output = os.path.join( + args.test_output, f"{model_name}_arm_delegate_{args.target}" + ) + + pte_file = build_pte( + args.test_output, + model_name, + args.target, + args.system_config, + args.memory_mode, + output, + args.no_intermediate, + ) + print(f"PTE file created: {pte_file} ") + + if "ethos-u" in args.target: + elf_build_path = os.path.join( + output, f"{model_name}_arm_delegate_{args.target}" + ) + + elf_file = build_ethosu_runtime( + args.test_output, + script_path, + pte_file, + args.target, + args.system_config, + args.extra_flags, + elf_build_path, + ) + print(f"ELF file created: {elf_file} ") + + run_elf_with_fvp(script_path, elf_file, args.target) + print(f"Model: {model_name} on {args.target} -> PASS") diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py index 3436bfe618a..1ec0f2304aa 100644 --- a/backends/arm/test/tester/analyze_output_utils.py +++ b/backends/arm/test/tester/analyze_output_utils.py @@ -137,6 +137,8 @@ def print_error_diffs( N, C, H, W = (1, 1, shape[0], shape[1]) case 1: N, C, H, W = (1, 1, 1, shape[0]) + case 0: + N, C, H, W = (1, 1, 1, 1) case _: raise ValueError("Invalid tensor rank") diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index a6da2accd1d..7b74603cfb2 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -3,12 +3,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import copy + import logging import os from collections import Counter from pprint import pformat -from typing import Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union import executorch.backends.xnnpack.test.tester.tester as tester @@ -16,6 +18,7 @@ import torch.fx import torch.utils._pytree as pytree +from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager from executorch.backends.arm.arm_backend import ( get_intermediate_path, @@ -48,20 +51,26 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.devtools.backend_debug import get_delegation_info + from executorch.exir import ( EdgeCompileConfig, EdgeProgramManager, ExecutorchProgramManager, ExportedProgram, + to_edge_transform_and_lower, ) from executorch.exir.backend.backend_api import validation_disabled from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.backend.partitioner import Partitioner from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass -from executorch.exir.program._program import _update_exported_program_graph_module +from executorch.exir.program._program import ( + _copy_module, + _update_exported_program_graph_module, +) from tabulate import tabulate + from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec from torch.fx import Graph from torch.utils._pytree import tree_flatten @@ -122,10 +131,28 @@ def dump_artifact(self, path_to_dump: Optional[str]): class ToEdgeTransformAndLower(tester.ToEdgeTransformAndLower): + def __init__( + self, + partitioners: Optional[List[Partitioner]] = None, + edge_compile_config: Optional[EdgeCompileConfig] = None, + constant_methods: Optional[Dict[str, Any]] = None, + ): + super().__init__(partitioners, edge_compile_config) + self.constant_methods = constant_methods + def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) _dump_lowered_modules_artifact(path_to_dump, self.artifact, self.graph_module) + def run(self, artifact: ExportedProgram, inputs=None) -> None: + artifact_to_run = copy.deepcopy(artifact) + self.edge_dialect_program = to_edge_transform_and_lower( + artifact_to_run, + compile_config=self.edge_compile_conf, + partitioner=self.partitioners, + constant_methods=self.constant_methods, + ) + class Serialize(tester.Serialize): def __init__(self, compile_spec: list[CompileSpec], timeout): @@ -181,6 +208,7 @@ def __init__( """Passes are run in the order they are passed: first pass_list, second pass_functions, and lastly passes_with_exported_program.""" self.pass_with_exported_program = passes_with_exported_program + super().__init__(pass_list, pass_functions) def run( @@ -236,6 +264,9 @@ def __init__( model: torch.nn.Module, example_inputs: Tuple, compile_spec: List[CompileSpec], + tosa_ref_model_path: str | None = None, + dynamic_shapes: Optional[Tuple[Any]] = None, + constant_methods: Optional[Dict[str, Any]] = None, ): """ Args: @@ -244,8 +275,9 @@ def __init__( compile_spec (List[CompileSpec]): The compile spec to use """ + self.constant_methods = constant_methods self.compile_spec = compile_spec - super().__init__(model, example_inputs) + super().__init__(model, example_inputs, dynamic_shapes) self.pipeline[self.stage_name(InitialModel)] = [ self.stage_name(tester.Quantize), self.stage_name(tester.Export), @@ -310,7 +342,9 @@ def to_edge_transform_and_lower( raise 
ValueError("compile spec doesn't target any Arm Partitioner") partitioners = [arm_partitioner] to_edge_and_lower_stage = ToEdgeTransformAndLower( - partitioners, edge_compile_config + partitioners, + edge_compile_config, + constant_methods=self.constant_methods, ) else: if partitioners is not None: @@ -347,6 +381,7 @@ def run_method_and_compare_outputs( rtol=1e-03, qtol=0, error_callbacks=None, + run_eager_mode=False, ): """ Compares the run_artifact output of 'stage' with the output of a reference stage. @@ -362,12 +397,23 @@ def run_method_and_compare_outputs( inputs (Optional[Tuple[torch.Tensor]]): Allows you to input custom input data. The default is random data. """ - edge_stage = self.stages[self.stage_name(tester.ToEdge)] - if edge_stage is None: - edge_stage = self.stages[self.stage_name(tester.ToEdgeTransformAndLower)] - assert ( - edge_stage is not None - ), "To compare outputs, at least the ToEdge or ToEdgeTransformAndLower stage needs to be run." + + if not run_eager_mode: + edge_stage = self.stages[self.stage_name(tester.ToEdge)] + if edge_stage is None: + edge_stage = self.stages[ + self.stage_name(tester.ToEdgeTransformAndLower) + ] + assert ( + edge_stage is not None + ), "To compare outputs, at least the ToEdge or ToEdgeTransformAndLower stage needs to be run." + else: + # Run models in eager mode. We do this when we want to check that the passes + # are numerically accurate and the exported graph is correct. + export_stage = self.stages[self.stage_name(tester.Export)] + assert ( + export_stage is not None + ), "To compare outputs in eager mode, the model must be at Export stage" stage = stage or self.cur test_stage = self.stages[stage] @@ -380,6 +426,7 @@ def run_method_and_compare_outputs( exported_program = self.stages[self.stage_name(tester.Export)].artifact output_nodes = get_output_nodes(exported_program) + output_qparams = get_output_quantization_params(output_nodes) quantization_scales = [] @@ -404,9 +451,19 @@ def run_method_and_compare_outputs( reference_outputs, _ = pytree.tree_flatten( reference_stage.run_artifact(reference_input) ) - test_outputs, _ = pytree.tree_flatten( - test_stage.run_artifact(reference_input) - ) + + if run_eager_mode: + # Run exported module directly + test_outputs, _ = pytree.tree_flatten( + self._calculate_reference_output( + exported_program.module(), reference_input + ) + ) + else: + # Run lowered model with target + test_outputs, _ = pytree.tree_flatten( + test_stage.run_artifact(reference_input) + ) for reference_output, test_output, quantization_scale in zip( reference_outputs, test_outputs, quantization_scales @@ -533,6 +590,32 @@ def dump_dtype_distribution( _dump_str(to_print, path_to_dump) return self + def run_transform_for_annotation_pipeline( + self, stage: str | None = None + ) -> torch.fx.GraphModule: + """Run transform_for_annotation_pipeline on exported program to ensure + passes do not break the initial model before quantization. + + There are caveats to this however. As we register buffers to the graph modules + the resulting exported graph can fail. Use this only to compare numerical correctness + in eager mode. + + Returns exported program with passes applied. + """ + + if stage is None: + stage = self.cur + # We need to clone the artifact in order to ensure that the state_dict is preserved after passes are run. 
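+        # (After the passes run, _copy_module() writes the transformed graph
+        # back into that artifact, so the returned program has them applied.)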
+        artifact = self.get_artifact(stage)
+        if self.cur == self.stage_name(tester.Export):
+            new_gm = ArmPassManager(get_tosa_spec(self.compile_spec)).transform_for_annotation_pipeline(  # type: ignore[arg-type]
+                graph_module=artifact.graph_module
+            )
+        else:
+            raise RuntimeError("Can only run passes on Export stage.")
+        _copy_module(artifact.graph_module, new_gm)
+        return artifact
+
     @staticmethod
     def _calculate_reference_output(
         module: Union[torch.fx.GraphModule, torch.nn.Module], inputs
diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py
index 0f079b3a6fd..01707b6364a 100644
--- a/backends/arm/test/tester/test_pipeline.py
+++ b/backends/arm/test/tester/test_pipeline.py
@@ -4,11 +4,19 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar
+from typing import Callable, Dict, Generic, List, Optional, Type, TypeVar
 
 import torch
+
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    EthosUQuantizer,
+    get_symmetric_quantization_config,
+    TOSAQuantizer,
+)
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester, RunPasses
+
+from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.pass_base import ExportPass
 from torch._export.pass_base import PassType
@@ -20,16 +28,19 @@
 
 class BasePipelineMaker(Generic[T]):
     """
-    The BasePiplineMaker defines a list of stages to be applied to a torch.nn.module for lowering it in the Arm backend. To be inherited and adjusted for particular targets.
-    Importantly, the pipeline list can be modified before running the pipeline to support various pipeline extensions and debugging usecases.
+    The BasePipelineMaker defines a list of stages to be applied to a torch.nn.Module for lowering
+    it in the Arm backend. To be inherited and adjusted for particular targets. Importantly, the
+    pipeline list can be modified before running the pipeline to support various pipeline
+    extensions and debugging use cases.
 
     Attributes:
         module: The module which the pipeline is applied to.
         test_data: Data used for quantizing and testing the module.
         aten_ops: Aten dialect ops expected to be found in the graph after export.
-        exir_ops: Exir dialect ops expected to be found in the graph after to_edge.
-        compile_spec: The compile spec used in the lowering process
-        use_edge_to_transform_and_lower: Selects betweeen two possible routes for lowering the module:
+        compile_spec: The compile spec used in the lowering process.
+        exir_ops: Exir dialect ops expected to be found in the graph after to_edge if not using
+            use_edge_to_transform_and_lower.
+        use_edge_to_transform_and_lower: Selects between two possible routes for lowering:
             tester.to_edge_transform_and_lower() or tester.to_edge().check(exir_ops).partition()
 
@@ -40,11 +51,11 @@ class PipelineStage:
         Helper class to store a pipeline stage as a function call + args for calling later on.
 
         Attributes:
-            id: name of the function to be called, used for refering to stages in the pipeline
-            func: handle to the function to be called
-            args: args used when called
-            kwargs: kwargs used when called
-            is_called: keeps track of if the function has been called
+            id: name of the function to be called, used for referring to stages in the pipeline.
+            func: handle to the function to be called.
+            args: args used when called.
+            kwargs: kwargs used when called.
+            is_called: keeps track of whether the function has been called.
        """

        def __init__(self, func: Callable, id: str, *args, **kwargs):
@@ -73,9 +84,9 @@ def __init__(
        module: torch.nn.Module,
        test_data: T,
        aten_ops: str | List[str],
-        exir_ops: str | List[str],
        compile_spec: List[CompileSpec],
-        use_to_edge_transform_and_lower: bool = False,
+        exir_ops: Optional[str | List[str]] = None,
+        use_to_edge_transform_and_lower: bool = True,
    ):

        self.tester = ArmTester(
@@ -83,7 +94,12 @@ def __init__(
        )

        self.aten_ops = aten_ops if isinstance(aten_ops, list) else [aten_ops]
-        self.exir_ops = exir_ops if isinstance(exir_ops, list) else [exir_ops]
+        if exir_ops is None:
+            self.exir_ops = []
+        elif isinstance(exir_ops, list):
+            self.exir_ops = exir_ops
+        else:
+            self.exir_ops = [exir_ops]

        self.test_data = test_data
        self._stages = []
@@ -152,10 +168,11 @@ def add_stage(self, func: Callable, *args, **kwargs):

        suffix = str(len(stages_containing_stage_id))

-        stage_id = stage_id + "." + suffix
+        if suffix != "0":
+            stage_id = stage_id + "." + suffix

-        if stage_id in id_list:
-            raise ValueError("Suffix must be unique in pipeline")
+            if stage_id in id_list:
+                raise ValueError("Suffix must be unique in pipeline")

        pipeline_stage = self.PipelineStage(func, stage_id, *args, **kwargs)
        self._stages.insert(pos, pipeline_stage)
@@ -230,29 +247,55 @@ def run(self):

class TosaPipelineBI(BasePipelineMaker, Generic[T]):
-    """Lowers a graph to BI TOSA spec (with quantization) and tests it with the TOSA reference model."""
+    """
+    Lowers a graph to BI TOSA spec (with quantization) and tests it with the TOSA reference model.
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data used for quantizing and testing the module.
+
+        aten_op: Aten dialect ops expected to be found in the graph after export.
+        exir_op: Exir dialect ops expected to be found in the graph after to_edge
+            if not using use_edge_to_transform_and_lower.
+
+        tosa_version: A string for identifying the TOSA version, see common.get_tosa_compile_spec
+            for options.
+        symmetric_io_quantization: If true, quantize the model IO with a symmetric
+            quantization config.
+        use_edge_to_transform_and_lower: Selects between two possible ways of lowering the module.
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+    """

    def __init__(
        self,
        module: torch.nn.Module,
-        test_data: Any,
-        aten_op: str,
-        exir_op: str,
+        test_data: T,
+        aten_op: str | List[str],
+        exir_op: Optional[str | List[str]] = None,
        tosa_version: str = "TOSA-0.80+BI",
-        use_to_edge_transform_and_lower: bool = False,
+        symmetric_io_quantization: bool = False,
+        use_to_edge_transform_and_lower: bool = True,
+        custom_path: Optional[str] = None,
    ):
        compile_spec = common.get_tosa_compile_spec(
-            tosa_version,
+            tosa_version, custom_path=custom_path
+        )
+        quant_stage = (
+            Quantize(
+                TOSAQuantizer(compile_spec).set_io(get_symmetric_quantization_config()),
+                get_symmetric_quantization_config(),
+            )
+            if symmetric_io_quantization
+            else None
        )
        super().__init__(
            module,
            test_data,
            aten_op,
-            exir_op,
            compile_spec,
+            exir_op,
            use_to_edge_transform_and_lower,
        )
-        self.add_stage(self.tester.quantize, pos=0)
+        self.add_stage(self.tester.quantize, quant_stage, pos=0)
+
        self.add_stage_after(
            "quantize",
            self.tester.check,
@@ -284,26 +327,42 @@ def __init__(

class TosaPipelineMI(BasePipelineMaker, Generic[T]):
-    """Lowers a graph to MI TOSA spec and tests it with the TOSA reference model"""
+    """
+    Lowers a graph to MI TOSA spec and tests it with the TOSA reference model.
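+
+    Example (editor's illustration; ``MyModule``, ``input_t`` and ``test_data``
+    are placeholders for a test's own module, input type and inputs):
+
+        pipeline = TosaPipelineMI[input_t](
+            MyModule(), test_data, aten_op="torch.ops.aten.add.Tensor"
+        )
+        pipeline.run()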
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data used for quantizing and testing the module.
+
+        aten_op: Aten dialect ops expected to be found in the graph after export.
+        exir_op: Exir dialect ops expected to be found in the graph after to_edge
+            if not using use_edge_to_transform_and_lower.
+
+        tosa_version: A string for identifying the TOSA version, see common.get_tosa_compile_spec
+            for options.
+        use_edge_to_transform_and_lower: Selects between two possible ways of lowering the module.
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+    """

    def __init__(
        self,
        module: torch.nn.Module,
-        test_data: Any,
-        aten_op: str,
-        exir_op: str,
+        test_data: T,
+        aten_op: str | List[str],
+        exir_op: Optional[str | List[str]] = None,
        tosa_version: str = "TOSA-0.80+MI",
-        use_to_edge_transform_and_lower: bool = False,
+        use_to_edge_transform_and_lower: bool = True,
+        custom_path: Optional[str] = None,
    ):
        compile_spec = common.get_tosa_compile_spec(
-            tosa_version,
+            tosa_version, custom_path=custom_path
        )
        super().__init__(
            module,
            test_data,
            aten_op,
-            exir_op,
            compile_spec,
+            exir_op,
            use_to_edge_transform_and_lower,
        )
        self.add_stage_after(
@@ -322,27 +381,54 @@ def __init__(

class EthosU55PipelineBI(BasePipelineMaker, Generic[T]):
-    """Lowers a graph to u55 BI TOSA spec and tests it on the Corstone300 FVP, if run_on_fvp is true."""
+    """
+    Lowers a graph to u55 BI TOSA spec and tests it on the Corstone-300 FVP if run_on_fvp is true.
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data used for quantizing and testing the module.
+        aten_ops: Aten dialect ops expected to be found in the graph after export.
+
+        exir_ops: Exir dialect ops expected to be found in the graph after to_edge
+            if not using use_edge_to_transform_and_lower.
+        run_on_fvp: Set to true to test the pte file on an FVP simulator.
+        symmetric_io_quantization: If true, quantize the model IO with a symmetric
+            quantization config.
+        use_edge_to_transform_and_lower: Selects between two possible ways of lowering the module.
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+    """

    def __init__(
        self,
        module: torch.nn.Module,
        test_data: T,
        aten_ops: str | List[str],
-        exir_ops: str | List[str],
+        exir_ops: Optional[str | List[str]] = None,
        run_on_fvp: bool = False,
+        symmetric_io_quantization: bool = False,
        use_to_edge_transform_and_lower: bool = False,
+        custom_path: Optional[str] = None,
    ):
-        compile_spec = common.get_u55_compile_spec()
+        compile_spec = common.get_u55_compile_spec(custom_path=custom_path)
+        quant_stage = (
+            Quantize(
+                EthosUQuantizer(compile_spec).set_io(
+                    get_symmetric_quantization_config()
+                ),
+                get_symmetric_quantization_config(),
+            )
+            if symmetric_io_quantization
+            else None
+        )
        super().__init__(
            module,
            test_data,
            aten_ops,
-            exir_ops,
            compile_spec,
+            exir_ops,
            use_to_edge_transform_and_lower,
        )
-        self.add_stage(self.tester.quantize, pos=0)
+
+        self.add_stage(self.tester.quantize, quant_stage, pos=0)
+
        self.add_stage_after(
            "quantize",
            self.tester.check,
@@ -378,27 +464,54 @@ def __init__(

class EthosU85PipelineBI(BasePipelineMaker, Generic[T]):
-    """Lowers a graph to u85 BI TOSA spec and tests it on the Corstone320 FVP, if run_on_fvp is true."""
+    """
+    Lowers a graph to u85 BI TOSA spec and tests it on the Corstone-320 FVP if run_on_fvp is true.
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data used for quantizing and testing the module.
+        aten_ops: Aten dialect ops expected to be found in the graph after export.
+
+        exir_ops: Exir dialect ops expected to be found in the graph after to_edge
+            if not using use_edge_to_transform_and_lower.
+        run_on_fvp: Set to true to test the pte file on an FVP simulator.
+        symmetric_io_quantization: If true, quantize the model IO with a symmetric
+            quantization config.
+        use_edge_to_transform_and_lower: Selects between two possible ways of lowering the module.
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+    """

    def __init__(
        self,
        module: torch.nn.Module,
        test_data: T,
        aten_ops: str | List[str],
-        exir_ops: str | List[str],
+        exir_ops: Optional[str | List[str]] = None,
        run_on_fvp: bool = False,
+        symmetric_io_quantization: bool = False,
        use_to_edge_transform_and_lower: bool = False,
+        custom_path: Optional[str] = None,
    ):
-        compile_spec = common.get_u85_compile_spec()
+        compile_spec = common.get_u85_compile_spec(custom_path=custom_path)
+        quant_stage = (
+            Quantize(
+                EthosUQuantizer(compile_spec).set_io(
+                    get_symmetric_quantization_config()
+                ),
+                get_symmetric_quantization_config(),
+            )
+            if symmetric_io_quantization
+            else None
+        )
        super().__init__(
            module,
            test_data,
            aten_ops,
-            exir_ops,
            compile_spec,
+            exir_ops,
            use_to_edge_transform_and_lower,
        )
-        self.add_stage(self.tester.quantize, pos=0)
+
+        self.add_stage(self.tester.quantize, quant_stage, pos=0)
+
        self.add_stage_after(
            "quantize",
            self.tester.check,
@@ -433,7 +546,7 @@ def __init__(
        )


-class TestPassPipeline(BasePipelineMaker, Generic[T]):
+class PassPipeline(BasePipelineMaker, Generic[T]):
    """
    Runs single passes directly on an edge_program and checks operators before/after.

@@ -450,6 +563,7 @@ class TestPassPipeline(BasePipelineMaker, Generic[T]):
        pass_list: List of regular passes.
        pass_functions: List of functions applied directly to the exported program.
        passes_with_exported_program: List of passes initiated with an exported_program.
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.

    Passes are run in order pass_list -> pass_functions -> passes_with_exported_program.
    See arm_tester.RunPasses() for more information.
@@ -467,16 +581,17 @@ def __init__(
        pass_list: Optional[List[Type[PassType]]] = None,
        pass_functions: Optional[List[Callable]] = None,
        passes_with_exported_program: Optional[List[Type[ExportPass]]] = None,
+        custom_path: Optional[str] = None,
    ):
        compile_spec = common.get_tosa_compile_spec(
-            tosa_version,
+            tosa_version, custom_path=custom_path
        )
        super().__init__(
            module,
            test_data,
            None,
-            None,
            compile_spec,
+            None,
            use_to_edge_transform_and_lower=False,
        )

@@ -507,3 +622,100 @@ def __init__(
        if ops_not_after_pass:
            self.add_stage(self.tester.check_not, ops_not_after_pass, suffix="after")
        self.add_stage(self.tester.run_method_and_compare_outputs)
+
+
+class TransformAnnotationPassPipeline(BasePipelineMaker, Generic[T]):
+    """
+    Runs transform_for_annotation_pipeline passes directly on an exported program and checks output.
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data used for testing the module.
+        tosa_version: The TOSA version to test for.
+
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+
+    """
+
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        test_data: T,
+        tosa_version: str,
+        custom_path: Optional[str] = None,
+    ):
+        compile_spec = common.get_tosa_compile_spec(
+            tosa_version, custom_path=custom_path
+        )
+        super().__init__(
+            module,
+            test_data,
+            None,
+            compile_spec,
+            None,
+            use_to_edge_transform_and_lower=True,
+        )
+        self.add_stage_after(
+            "export", self.tester.run_transform_for_annotation_pipeline
+        )
+
+        # Delete most of the pipeline
+        self.pop_stage("check_not.exir")
+        self.pop_stage("check_count.exir")
+        self.pop_stage("to_executorch")
+        self.pop_stage("to_edge_transform_and_lower")
+        self.pop_stage("check.aten")
+        self.add_stage(
+            self.tester.run_method_and_compare_outputs,
+            inputs=test_data,
+            run_eager_mode=True,
+        )
+
+
+class OpNotSupportedPipeline(BasePipelineMaker, Generic[T]):
+    """
+    Runs the partitioner on a module and checks that the given ops are not delegated,
+    to test SupportedTOSAOperatorChecks.
+
+    Attributes:
+        module: The module which the pipeline is applied to.
+        test_data: Data with a representative shape on which the operator check is performed.
+        tosa_version: The TOSA version to test for.
+
+        non_delegated_ops: Exir ops expected not to be delegated.
+        n_expected_delegates: Number of delegate calls (0 in the usual case).
+        custom_path: Path to dump intermediate artifacts such as tosa and pte to.
+    """
+
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        test_data: T,
+        tosa_version: str,
+        non_delegated_ops: Dict[str, int],
+        n_expected_delegates: int = 0,
+        custom_path: Optional[str] = None,
+    ):
+        compile_spec = common.get_tosa_compile_spec(
+            tosa_version, custom_path=custom_path
+        )
+        super().__init__(
+            module,
+            test_data,
+            [],
+            compile_spec,
+            [],
+        )
+
+        if "BI" in tosa_version:
+            self.add_stage(self.tester.quantize, pos=0)
+
+        self.change_args("check_not.exir", [])
+        self.change_args(
+            "check_count.exir",
+            {
+                "torch.ops.higher_order.executorch_call_delegate": n_expected_delegates,
+                **non_delegated_ops,
+            },
+        )
+        self.pop_stage("to_executorch")
diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py
index d1849a7f477..b75f0e88fde 100644
--- a/backends/arm/tosa_mapping.py
+++ b/backends/arm/tosa_mapping.py
@@ -11,7 +11,7 @@
 # the standardised TOSA representation.
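+# (Editor's note: map_dtype() below translates torch dtypes to the TOSA
+# serializer's DType enum, e.g. torch.int8 is expected to map to ts.DType.INT8.)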
# -from typing import Sequence +from typing import Any, Sequence import serializer.tosa_serializer as ts # type: ignore import torch @@ -44,7 +44,7 @@ } -def map_dtype(data_type): +def map_dtype(data_type: torch.dtype) -> ts.DType: if data_type in UNSUPPORTED_DTYPES: raise ValueError(f"Unsupported type: {data_type}") if data_type not in DTYPE_MAP: @@ -79,22 +79,16 @@ def extract_tensor_meta(meta): # Class to capture arguments and turn into tensor references for TOSA OPs class TosaArg: def __process_node(self, argument: torch.fx.Node): - self.name = argument.name + self.name: str = argument.name self.dtype, self.shape, self.dim_order = extract_tensor_meta(argument.meta) def __process_list(self, argument): - self.special = list(argument) + self.special: list = list(argument) def __process_number(self, argument: float | int): - self.number = argument - - def __init__(self, argument) -> None: - self.name = None # type: ignore[assignment] - self.dtype = None - self.shape = None - self.dim_order = None - self.special = None + self.number: float | int = argument + def __init__(self, argument: Any) -> None: if argument is None: return @@ -107,7 +101,27 @@ def __init__(self, argument) -> None: if isinstance(argument, (int, float)): self.__process_number(argument) return + if isinstance(argument, torch.dtype): + # Dtype is parsed from fake tensor + return - RuntimeError( + raise RuntimeError( f"Unhandled node input argument: {argument}, of type {type(argument)}" ) + + def __repr__(self): + attrs = [] + if hasattr(self, "name"): + if self.name is not None: + attrs.append(f"name={self.name!r}") + if self.dtype is not None: + attrs.append(f"dtype={ts.DTypeNames[self.dtype]}") + if self.shape is not None: + attrs.append(f"shape={self.shape!r}") + if self.dim_order is not None: + attrs.append(f"dim_order={self.dim_order!r}") + if hasattr(self, "special") and self.special is not None: + attrs.append(f"special={self.special!r}") + if hasattr(self, "number") and self.number is not None: + attrs.append(f"number={self.number!r}") + return f"{self.__class__.__name__}({', '.join(attrs)})" diff --git a/backends/arm/tosa_partitioner.py b/backends/arm/tosa_partitioner.py index c2bc48d98d7..a53bf6fc725 100644 --- a/backends/arm/tosa_partitioner.py +++ b/backends/arm/tosa_partitioner.py @@ -14,6 +14,7 @@ get_tosa_spec, is_tosa, ) # usort: skip +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.operator_support.tosa_supported_operators import ( tosa_support_factory, ) @@ -24,7 +25,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, WhyNoPartitionReporter from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner @@ -32,7 +33,7 @@ logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) +logger.setLevel(logging.INFO) TOSA_DBG_VERBOSE = os.environ.get("TOSA_DBG_VERBOSE") == "1" if TOSA_DBG_VERBOSE: logging.basicConfig(level=logging.INFO) @@ -66,7 +67,7 @@ def __init__( self.delegation_spec = DelegationSpec(TOSABackend.__name__, compile_spec) self.additional_checks = additional_checks - def partition(self, exported_program: ExportedProgram) -> PartitionResult: + def partition(self, exported_program: ExportedProgram) -> PartitionResult: # noqa # Run the CapabilityBasedPartitioner to return the largest possible # 
subgraphs containing the nodes with the tags @@ -77,9 +78,13 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: logger.info(f"Partitioning for {self.delegation_spec.backend_id}: {tosa_spec}") + reporter = WhyNoPartitionReporter() + operator_support = tosa_support_factory( + tosa_spec, exported_program, reporter, self.additional_checks + ) capability_partitioner = CapabilityBasedPartitioner( exported_program.graph_module, - tosa_support_factory(tosa_spec, self.additional_checks), + operator_support, allows_single_node_partition=True, ) partition_list = capability_partitioner.propose_partitions() @@ -110,8 +115,25 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: del node.meta["delegation_tag"] break - tag_constant_data(exported_program) + if tosa_spec.support_float(): + continue + if is_partitioned(node): + for input in node.all_input_nodes: + if is_partitioned(input): + continue + if get_first_fake_tensor(input).dtype.is_floating_point: + reporter.report_reject( + node, + f"Was first node in partition and input {input.name} had fp dtype.", + ) + del node.meta["delegation_tag"] + break + + tag_constant_data(exported_program) + logger.info(f"The following nodes were rejected for {tosa_spec}:") + logger.info("\n" + reporter.get_table_report()) + logger.info("(Placeholders and outputs are not included in this list)") return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags ) diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index d53362cb363..3028ecce923 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -8,14 +8,18 @@ # Utiliy functions for TOSA quantized lowerings import math -from typing import cast, NamedTuple +from typing import cast, List, NamedTuple, Tuple + +import executorch.backends.arm.tosa_mapping import serializer.tosa_serializer as ts # type: ignore import torch.fx +import torch.fx.node import tosa.Op as TosaOp # type: ignore from executorch.backends.arm.tosa_mapping import TosaArg from executorch.exir.dialects._ops import ops as exir_ops -from serializer.tosa_serializer import TosaSerializerTensor +from serializer.tosa_serializer import TosaSerializer, TosaSerializerTensor +from torch import Tensor from torch.fx import Node @@ -65,7 +69,7 @@ def insert_rescale_ops_to_int32( tosa_graph, tensor, qarg.zp, - scale, + [scale], ) ) return rescaled_nodes, min_scale @@ -105,7 +109,7 @@ def insert_rescale_op_to_int8( last_tensor.name, node.name, qargs_out.zp, - output_rescale_scale, + [output_rescale_scale], ) @@ -116,7 +120,7 @@ class QuantArgs(NamedTuple): qmax: int dtype: torch.dtype - def quantize_value(self, x): + def quantize_value(self, x: torch.Tensor | float) -> Tensor: if not isinstance(x, torch.Tensor): x = torch.Tensor([x]) return torch.clip( @@ -144,7 +148,7 @@ def from_operator(cls, op, args): # Check if scale32 mode is used for given output element type -def is_scale32(type): +def is_scale32(type: int) -> ts.DType: return type == ts.DType.INT8 @@ -152,65 +156,73 @@ def is_scale32(type): # The RESCALE operator is defined using an integer multiply, add, and shift. # This utility function is for calculating the multier and shift given a scale. 
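+# Editor's illustration: for scale = 0.75, math.frexp(0.75) returns (0.75, 0);
+# with a 32-bit scale width the multiplier becomes round(0.75 * 2**31)
+# = 1610612736 and the shift 31 - 0 = 31, so 0.75 == multiplier * 2**-shift.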
# Ref: https://www.mlplatform.org/tosa/tosa_spec.html#_precision_scaling -def compute_multiplier_and_shift(scale, scaleWidth=32): +def compute_multiplier_and_shift( + scales: list[float], scaleWidth: int = 32 +) -> Tuple[list[int], list[int]]: if scaleWidth == 16: offset = 15 elif scaleWidth == 32: offset = 31 else: - raise AssertionError("unsupported scale width") - - assert isinstance(scale, float) + raise ValueError( + f"Unsupported scale width: {scaleWidth}, only 16 and 32 are valid values." + ) - mantissa, exponent = math.frexp(scale) - shift = exponent + multipliers = [] + shifts = [] + for scale in scales: + mantissa, exponent = math.frexp(scale) + shift = exponent - const_2_power_15_or_31 = 1 << offset - shifted_mantissa = round(mantissa * const_2_power_15_or_31) + const_2_power_15_or_31 = 1 << offset + shifted_mantissa = round(mantissa * const_2_power_15_or_31) - assert shifted_mantissa <= const_2_power_15_or_31 + assert shifted_mantissa <= const_2_power_15_or_31 - if shifted_mantissa == const_2_power_15_or_31: - shifted_mantissa = shifted_mantissa / 2 - shift += 1 + if shifted_mantissa == const_2_power_15_or_31: + shifted_mantissa = shifted_mantissa // 2 + shift += 1 - # TOSA expects right shift to be positive, and embed (1 << offset) into right shift bits. - shift = offset - shift + # TOSA expects right shift to be positive, and embed (1 << offset) into right shift bits. + shift = offset - shift - # INT32_MAX, 2^31 - 1 - assert shifted_mantissa <= (const_2_power_15_or_31 - 1) + # INT32_MAX, 2^31 - 1 + assert shifted_mantissa <= (const_2_power_15_or_31 - 1) - multiplier = shifted_mantissa + multiplier = shifted_mantissa - if shift > 62: - multiplier = multiplier >> min(31, shift - 62) - shift = 62 - return multiplier, shift + if shift > 62: + multiplier = multiplier >> min(31, shift - 62) + shift = 62 + multipliers.append(multiplier) + shifts.append(shift) + return multipliers, shifts def build_rescale( - tosa_fb, - scale, - input_node, - output_name, - output_type, - output_shape, - input_zp, - output_zp, - is_double_round=False, + tosa_fb: TosaSerializer, + scale: list[float], + input_node: TosaSerializerTensor, + output_name: str, + output_type: ts.DType, + output_shape: List[int], + input_zp: int, + output_zp: int, + is_double_round: bool = False, + per_channel=False, ): scale_width = 32 if is_scale32(output_type) else 16 - multiplier, shift = compute_multiplier_and_shift(scale, scale_width) + multipliers, shifts = compute_multiplier_and_shift(scale, scale_width) attr_rescale = ts.TosaSerializerAttribute() attr_rescale.RescaleAttribute( input_zp=input_zp, output_zp=output_zp, - multiplier=[multiplier], - shift=[shift], + multiplier=multipliers, + shift=shifts, scale32=is_scale32(output_type), double_round=is_double_round, - per_channel=False, + per_channel=per_channel, input_unsigned=False, output_unsigned=False, ) @@ -223,25 +235,31 @@ def build_rescale( def build_rescale_to_int32( - tosa_fb, input, input_zp, rescale_scale, is_scale32=True, is_double_round=False + tosa_fb: TosaSerializer, + input_arg: executorch.backends.arm.tosa_mapping.TosaArg, + input_zp: int, + rescale_scale: list[float], + is_scale32: bool = True, + is_double_round: bool = False, + per_channel: bool = False, ) -> TosaSerializerTensor: - multiplier, shift = compute_multiplier_and_shift(rescale_scale) + multipliers, shifts = compute_multiplier_and_shift(rescale_scale) attr_rescale = ts.TosaSerializerAttribute() attr_rescale.RescaleAttribute( input_zp=input_zp, output_zp=0, - multiplier=[multiplier], - 
shift=[shift], + multiplier=multipliers, + shift=shifts, scale32=is_scale32, double_round=is_double_round, - per_channel=False, + per_channel=per_channel, input_unsigned=False, output_unsigned=False, ) - input_A_rescaled_to_int32 = tosa_fb.addIntermediate(input.shape, ts.DType.INT32) + input_A_rescaled_to_int32 = tosa_fb.addIntermediate(input_arg.shape, ts.DType.INT32) tosa_fb.addOperator( TosaOp.Op().RESCALE, - [input.name], + [input_arg.name], [input_A_rescaled_to_int32.name], attr_rescale, ) @@ -250,24 +268,25 @@ def build_rescale_to_int32( def build_rescale_from_int32( - tosa_fb, - input_name, - output_name, - output_zp, - rescale_scale, - is_scale32=True, - is_double_round=False, + tosa_fb: TosaSerializer, + input_name: str, + output_name: str, + output_zp: int, + rescale_scale: list[float], + is_scale32: bool = True, + is_double_round: bool = False, + per_channel: bool = False, ) -> None: - multiplier, shift = compute_multiplier_and_shift(rescale_scale) + multipliers, shifts = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() attr_rescale_output.RescaleAttribute( input_zp=0, output_zp=output_zp, - multiplier=[multiplier], - shift=[shift], + multiplier=multipliers, + shift=shifts, scale32=is_scale32, double_round=is_double_round, - per_channel=False, + per_channel=per_channel, input_unsigned=False, output_unsigned=False, ) @@ -283,17 +302,19 @@ def build_rescale_from_int32( def build_rescale_conv_output( - tosa_fb, - op, - output_name, - output_type, - input_scale, - weight_scale, - output_scale, - output_zp, + tosa_fb: TosaSerializer, + op: TosaSerializerTensor, + output_name: str, + output_type: ts.DType, + input_scale: list[float], + weight_scale: list[float], + output_scale: list[float], + output_zp: int, ): # TODO add check to verify if this is a Per-channel quantization. - post_conv2d_scale = (input_scale * weight_scale) / output_scale + post_conv2d_scale = [ + (inp * w) / out for inp, w, out in zip(input_scale, weight_scale, output_scale) + ] # Since we assume the input tensor that is being rescaled is int32 date type, zero point must be 0. 
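+    # Editor's note: post_conv2d_scale above folds the input scale and the
+    # (possibly per-channel) weight scales into the output rescale,
+    # e.g. (0.02 * 0.01) / 0.05 = 0.004 for one channel.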
build_rescale( @@ -305,5 +326,7 @@ def build_rescale_conv_output( op.shape, 0, output_zp, + False, + isinstance(weight_scale, torch.Tensor), ) return diff --git a/backends/arm/tosa_specification.py b/backends/arm/tosa_specification.py index 225e1c5db58..94c307d440c 100644 --- a/backends/arm/tosa_specification.py +++ b/backends/arm/tosa_specification.py @@ -112,7 +112,7 @@ def __init__(self, version: Version, extras: List[str]): if len(extras) > 0: raise ValueError(f"Unhandled extras found: {extras}") - def __repr__(self): + def __repr__(self) -> str: extensions = "" if self.level_8k: extensions += "+8k" diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 0d4aeba2d55..556e30e2b7f 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -7,7 +7,7 @@ import logging import os -from typing import Any +from typing import Any, Tuple import serializer.tosa_serializer as ts # type: ignore import torch @@ -102,6 +102,45 @@ def build_reshape(tosa_fb, input_name, new_shape, output_name): tosa_fb.addOperator(TosaOp.Op().RESHAPE, [input_name], [output_name], attr) +def reshape_for_broadcast(tosa_fb, inputs, dim_order=None): + assert len(inputs) == 2 + input1 = inputs[0] + input2 = inputs[1] + + def get_new_shape(l_rank_in, h_rank_in): + rank_diff = len(h_rank_in.shape) - len(l_rank_in.shape) + new_shape = list(l_rank_in.shape) + + for _ in range(rank_diff): + new_shape.insert(0, 1) + return tuple(new_shape) + + if len(input1.shape) == len(input2.shape): + return input1, input2 + elif len(input1.shape) > len(input2.shape): + l_rank_in = input2 + h_rank_in = input1 + elif len(input1.shape) < len(input2.shape): + l_rank_in = input1 + h_rank_in = input2 + + new_shape = get_new_shape(l_rank_in, h_rank_in) + dim_order = h_rank_in.dim_order if dim_order is None else dim_order + new_shape = tosa_shape(new_shape, dim_order) + + reshaped = tosa_fb.addIntermediate( + new_shape, + inputs[0].dtype, + ) + + build_reshape(tosa_fb, l_rank_in.name, new_shape, reshaped.name) + + if len(input1.shape) > len(input2.shape): + return input1, reshaped + else: + return reshaped, input2 + + def is_consumer_node_depthwise_conv2d(node): consumer_node = list(node.users)[0] if consumer_node.target == exir_ops.edge.aten.convolution.default: @@ -153,7 +192,7 @@ def get_resize_parameters( output_size: torch.Tensor, resize_mode: int, align_corners: bool, -): +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """Get the tosa.resize parameters based on the input and output size. Args: diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index e2ac3de5cab..f8bb42cba50 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -19,13 +19,16 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. 
+ ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) + +add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) if(EXECUTORCH_CADENCE_CPU_RUNNER) - include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -74,13 +77,16 @@ endif() if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) elseif(EXECUTORCH_FUSION_G3_OPT) set(TARGET_DIR fusion_g3) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) else() set(TARGET_DIR reference) endif() -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 0590e694602..1c7a73f18c7 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -115,11 +115,23 @@ python_library( ], deps = [ "fbcode//caffe2:torch", - "fbcode//executorch/exir:scalar_type", "fbcode//executorch/backends/cadence/aot:utils", ], ) +python_library( + name = "ref_implementations", + srcs = [ + "ref_implementations.py", + ], + typing = True, + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/exir:scalar_type", + ], +) + + export_file(name = "functions.yaml") executorch_generated_lib( @@ -180,6 +192,7 @@ python_library( typing = True, deps = [ "//caffe2:torch", + ":ops_registrations", ":compiler_utils", "//executorch/backends/cadence/aot:pass_utils", "//executorch/backends/cadence/aot:utils", @@ -255,6 +268,7 @@ python_library( "//executorch/backends/cadence/aot:pass_utils", "//executorch/backends/cadence/aot:remove_ops", "//executorch/backends/cadence/aot:utils", + "//executorch/backends/transforms:replace_scalar_with_tensor", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/dialects/edge:lib", diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index b96a0c7ad39..d2b66a34be2 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -198,6 +198,8 @@ def export_to_edge( _skip_dim_order=True, # Allow specific non-core aten ops in the IR. 
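+            # (Editor's note: ops in this exception list are allowed through
+            # the core-ATen op verifier when lowering to edge.)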
_core_aten_ops_exception_list=[ + torch.ops.aten._linalg_det.default, + torch.ops.aten._linalg_svd.default, torch.ops.aten._native_batch_norm_legit_functional.default, torch.ops.aten.linear.default, torch.ops.aten.linalg_vector_norm.default, @@ -264,7 +266,6 @@ def export_to_executorch_gen_etrecord( alloc_graph_output: bool = True, memory_config: Optional[MemoryConfig] = None, dump_graphs: bool = False, - mem_alignment: int = 1, ) -> ExecutorchProgramManager: cadence_passes = get_cadence_passes(opt_level) edge_prog_manager = export_to_edge(model, inputs, dump_graphs) @@ -291,7 +292,6 @@ def export_to_executorch_gen_etrecord( mem_algo=mem_algo, alloc_graph_input=alloc_graph_input, alloc_graph_output=alloc_graph_output, - mem_alignment=mem_alignment, ) # Get executorch program after Cadence specific passes diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index 7166e3acf45..a860d567e0e 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -38,6 +38,8 @@ def export_model( example_inputs: Tuple[Any, ...], file_name: str = "CadenceDemoModel", run_and_compare: bool = True, + eps_error: float = 1e-1, + eps_warn: float = 1e-5, ): # create work directory for outputs and model binary working_dir = tempfile.mkdtemp(dir="/tmp") @@ -89,4 +91,6 @@ def export_model( inputs=example_inputs, ref_outputs=ref_outputs, working_dir=working_dir, + eps_error=eps_error, + eps_warn=eps_warn, ) diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index f1a5b6a50b0..2cfa4e7e2cc 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -214,6 +214,11 @@ - arg_meta: null kernel_name: impl::reference::quantized_relu_out +- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_relu_per_tensor_out + - func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -233,3 +238,18 @@ kernels: - arg_meta: null kernel_name: impl::reference::quantized_conv_per_tensor_out + +- func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_fully_connected_out + +- func: cadence::quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_fully_connected_per_tensor_out + +- func: cadence::requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::reference::requantize_out diff --git a/backends/cadence/aot/functions_fusion_g3.yaml b/backends/cadence/aot/functions_fusion_g3.yaml index 5ca05544806..0feb1e47891 100644 --- a/backends/cadence/aot/functions_fusion_g3.yaml +++ b/backends/cadence/aot/functions_fusion_g3.yaml @@ -42,6 +42,17 @@ - arg_meta: null kernel_name: cadence::impl::G3::cat_out +- op: clamp.out + cpp_no_default_args: ['min'] + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::clamp_out + +- op: clamp.Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::clamp_Tensor_out + - op: clone.out kernels: - arg_meta: null @@ -67,6 +78,16 @@ - arg_meta: null kernel_name: torch::executor::full_out +- op: lt.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::lt_Scalar_out + +- op: lt.Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::lt_Tensor_out + - op: mul.out kernels: - arg_meta: null @@ -81,10 +102,15 @@ - arg_meta: null kernel_name: cadence::impl::G3::permute_copy_out +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null - kernel_name: torch::executor::sigmoid_out + kernel_name: cadence::impl::G3::sigmoid_out - op: slice_copy.Tensor_out kernels: @@ -96,6 +122,11 @@ - arg_meta: null kernel_name: torch::executor::split_with_sizes_copy_out +- op: sqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::sqrt_out + - op: sub.out kernels: - arg_meta: null @@ -106,6 +137,16 @@ - arg_meta: null kernel_name: cadence::impl::G3::sub_scalar_out +- op: tanh.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::tanh_out + +- op: transpose_copy.int_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::transpose_copy_int_out + - op: view_copy.out kernels: - arg_meta: null @@ -114,7 +155,7 @@ - op: where.self_out kernels: - arg_meta: null - kernel_name: torch::executor::where_out + kernel_name: cadence::impl::G3::where_self_out - op: native_layer_norm.out kernels: @@ -124,7 +165,7 @@ - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::G3::mean_dim_out + kernel_name: cadence::impl::G3::mean_out - op: exp.out kernels: diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 7a98d704d87..b1c2e1e1597 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -20,7 +20,7 @@ - op: _softmax.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::softmax_out + kernel_name: cadence::impl::HiFi::_softmax_out - op: atan2.out kernels: @@ -100,7 +100,7 @@ - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_out - op: minimum.out kernels: @@ -175,7 +175,7 @@ - op: where.self_out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::where_out + kernel_name: cadence::impl::HiFi::where_self_out # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
@@ -189,6 +189,11 @@ kernels: - arg_meta: null kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -209,6 +214,11 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out +- func: cadence::quantized_relu_per_tensor.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out + - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -219,11 +229,6 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out -- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out - - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py index aa79b5582a7..f8a4b114e29 100644 --- a/backends/cadence/aot/fuse_ops.py +++ b/backends/cadence/aot/fuse_ops.py @@ -16,6 +16,9 @@ from numbers import Number from typing import cast, Sequence +# Import these for the cadence function signatures. +import executorch.backends.cadence.aot.ops_registrations # noqa: F401 + import torch import torch.fx from executorch.backends.cadence.aot.compiler_utils import ( @@ -849,7 +852,10 @@ def attempt_fusion( if isinstance(arg, torch.fx.Node) and isinstance(arg.target, EdgeOpOverload) and get_edge_overload_packet(arg.target) - == exir_ops.edge.quantized_decomposed.dequantize_per_tensor + in ( + exir_ops.edge.quantized_decomposed.dequantize_per_tensor, + exir_ops.edge.cadence.dequantize_per_tensor, + ) ] multiplier_nodes = [ arg diff --git a/backends/cadence/aot/memory_planning.py b/backends/cadence/aot/memory_planning.py index 77ae7eb7995..cfe1b9ab9d8 100644 --- a/backends/cadence/aot/memory_planning.py +++ b/backends/cadence/aot/memory_planning.py @@ -40,12 +40,19 @@ def get_size(memory_config: MemoryConfig, exir_id: int) -> int: return memory_config.memory_sizes[exir_id - 1] +def get_alignment(memory_config: MemoryConfig, exir_id: int) -> int: + # EXIR's spec.mem_id is indexed from 1..N. 
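+    # e.g. (hypothetical config) memory_alignments=[16, 32] gives a 32-byte
+    # alignment for exir_id == 2.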
+ assert memory_config.memory_alignments is not None + return memory_config.memory_alignments[exir_id - 1] + + def get_aligned_offset(pre_aligned_offset: int, alignment: int) -> int: return int(math.ceil(pre_aligned_offset / alignment) * alignment) def collect_specs_from_graph_module( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, ) -> Iterable[TensorSpec]: @@ -56,6 +63,7 @@ def collect_specs_from_graph_module( # Collect the specs from all the nodes in the graph module, and return it return collect_specs_from_nodes( graph_module.graph.nodes, + graph_signature, ignore_graph_input=not alloc_graph_input, ignore_graph_output=not alloc_graph_output, ) @@ -82,6 +90,10 @@ def position_based_greedy_with_hierarchy( ] ] = None, ) -> List[int]: + # We do not use the `alignment` parameter and instead use the per-memory alignment + # constraints from `memory_config`. + del alignment + num_memories = get_num_memories(memory_config) bufsizes = [0] * num_memories allocated_buffers: List[List[TensorSpec]] = [[] for _ in range(num_memories)] @@ -101,13 +113,14 @@ def overlap(spec: TensorSpec) -> Optional[TensorSpec]: def memory_available(spec: TensorSpec) -> bool: return get_aligned_offset( - spec.mem_offset + spec.allocated_memory, alignment + spec.mem_offset + spec.allocated_memory, + get_alignment(memory_config, spec.mem_id), ) <= get_size(memory_config, spec.mem_id) # Iterate over all the specs in sorted order for spec in sorted( collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ), key=lambda spec: spec.allocated_memory, reverse=True, @@ -122,7 +135,8 @@ def memory_available(spec: TensorSpec) -> bool: spec.mem_offset = 0 while memory_available(spec) and (overlapped := overlap(spec)): spec.mem_offset = get_aligned_offset( - overlapped.mem_offset + overlapped.allocated_memory, alignment + overlapped.mem_offset + overlapped.allocated_memory, + get_alignment(memory_config, spec.mem_id), ) if memory_available(spec): allocated_buffers[spec.mem_id].append(spec) @@ -170,6 +184,10 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( ] ] = None, ) -> List[int]: + # We do not use the `alignment` parameter and instead use the per-memory alignment + # constraints from `memory_config`. 
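+    # (Offsets below are rounded up with get_aligned_offset(), e.g.
+    # get_aligned_offset(13, 8) == 16.)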
+ del alignment + num_memories = get_num_memories(memory_config) bufsizes = [0] * num_memories allocated_buffers = [[] for _ in range(num_memories)] @@ -182,7 +200,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( # Iterate over all the specs in sorted order for spec in sorted( collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ), key=lambda spec: spec.allocated_memory, reverse=True, @@ -211,13 +229,14 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( prev_offset = max( get_aligned_offset( allocated_spec.mem_offset + allocated_spec.allocated_memory, - alignment, + get_alignment(memory_config, spec.mem_id), ), prev_offset, ) if spec.mem_offset is None: if get_aligned_offset( - prev_offset + spec.allocated_memory, alignment + prev_offset + spec.allocated_memory, + get_alignment(memory_config, spec.mem_id), ) > get_size(memory_config, spec.mem_id): continue else: @@ -250,6 +269,7 @@ def greedy_by_size_for_offset_calculation_with_hierarchy( def find_peak_memory_usages_per_memory( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, mem_constraints: Optional[MemConstraints] = None, @@ -265,7 +285,7 @@ def find_peak_memory_usages_per_memory( # go through all nodes in the graph, collect memory usage per spec.mem_id for spec in collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ): if mem_constraints is not None and mem_constraints.skipped_spec(spec): continue @@ -288,6 +308,7 @@ def find_peak_memory_usages_per_memory( def find_peak_memory_usage( graph_module: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, alloc_graph_input: bool, alloc_graph_output: bool, mem_constraints: Optional[MemConstraints] = None, @@ -303,7 +324,7 @@ def find_peak_memory_usage( # Iterate over all the node specs for spec in collect_specs_from_graph_module( - graph_module, alloc_graph_input, alloc_graph_output + graph_module, graph_signature, alloc_graph_input, alloc_graph_output ): if spec.lifetime[0] is None or ( mem_constraints is not None and mem_constraints.skipped_spec(spec) @@ -358,6 +379,7 @@ def print_memory_planning_info( # Get the peak memory usages per memory space peak_memory_usages_per_memory = find_peak_memory_usages_per_memory( executorch_prog.exported_program().graph_module, + executorch_prog.exported_program().graph_signature, alloc_graph_input, alloc_graph_output, mem_constraints, @@ -393,6 +415,7 @@ def print_memory_planning_info( # Get the total peak memory usage across all memory spaces total_peak_memory_usage = find_peak_memory_usage( executorch_prog.exported_program().graph_module, + executorch_prog.exported_program().graph_signature, alloc_graph_input, alloc_graph_output, mem_constraints, @@ -433,7 +456,6 @@ def __init__( ] ] ] = None, - mem_alignment: int = 1, ) -> None: self._init_mem_algos() @@ -444,16 +466,23 @@ def __init__( self.alloc_graph_output = alloc_graph_output self.additional_constraint_gen_passes = additional_constraint_gen_passes - assert mem_alignment > 0, "mem_alignment must be positive" - self.mem_alignment = mem_alignment - def _init_mem_algos(self) -> None: self.available_mem_algos = [ position_based_greedy_with_hierarchy, greedy_by_size_for_offset_calculation_with_hierarchy, ] - def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult: + def 
__call__( + self, + graph_module: torch.fx.GraphModule, + ) -> PassResult: + return self.run(graph_module) + + def run( + self, + graph_module: torch.fx.GraphModule, + graph_signature: Optional[ExportGraphSignature] = None, + ) -> PassResult: mem_constraints = MemConstraints( opt_level=self.opt_level, alloc_graph_input=self.alloc_graph_input, @@ -473,8 +502,7 @@ def __call__(self, graph_module: torch.fx.GraphModule) -> PassResult: allow_lifetime_and_storage_overlap=(self.opt_level >= 2), alloc_graph_input=self.alloc_graph_input, alloc_graph_output=self.alloc_graph_output, - alignment=self.mem_alignment, ) - mem_planning(graph_module) + mem_planning.run(graph_module, graph_signature) return PassResult(graph_module, True) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a8dd1315846..2e131bf6c77 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -94,11 +94,14 @@ "int[] dilation, SymInt[] output_padding, int groups, bool channel_last=False) -> (Tensor Y)" ) lib.define("dequantize(Tensor X, Tensor X_scale, Tensor X_zero_point) -> (Tensor Y)") -# cadence::quantized_relu is defined in OSS lib.define( "quantized_add(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" ) +lib.define( + "quantized_add.per_tensor(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, " + "int Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" +) lib.define( "quantized_mul(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)" @@ -115,8 +118,6 @@ "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, " "Tensor indices, bool pruned_weights=False) -> (Tensor X)" ) -# cadence::quantized_layer_norm is defined in OSS -# cadence::quantized_conv is defined is OSS lib.define( "quantized_transposed_conv(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, " "int[] dilation, SymInt[] output_padding, int groups, int input_zero_point, Tensor weight_zero_point, " @@ -152,7 +153,7 @@ ) # ------------------------------------ # -# Migrated from custom_ops.ymal # +# Migrated from custom_ops.yaml # # ------------------------------------ # # Migrated from the custom_ops.yaml files containing different operator variants (e.g., .out, .tensor_out) lib.define( @@ -163,7 +164,6 @@ "transposed_convolution.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, " "int[] dilation, SymInt[] output_padding, int groups, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" ) -# cadence::quantized_relu.out is defined in OSS lib.define( "quantized_relu.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor" ) @@ -175,6 +175,10 @@ "quantized_add.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_add.per_tensor_out(Tensor X, float X_scale, int X_zero_point, Tensor Y, float Y_scale, " + "int Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)" +) lib.define( "quantized_mul.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, " "Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) 
out) -> Tensor(a!)" @@ -257,14 +261,12 @@ "_cat_nop.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)" ) -# Custom ops with jarvis_nn_ops namespace +# Custom ops with cadence_nn_ops namespace jarvis_nn_lib = Library("jarvis_nn_ops", "DEF") jarvis_nn_lib.define( "attention_mask.out(Tensor input, Tensor start, Tensor stop, *, Tensor(a!) out) -> Tensor(a!)" ) -m = Library("cadence", "IMPL", "Meta") - @register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( @@ -290,6 +292,42 @@ def dequantize_per_tensor_meta( return input.new_empty(input.size(), dtype=torch.float) +@register_fake("cadence::quantized_add") +def quantized_add_meta( + X: torch.Tensor, + X_scale: torch.Tensor, + X_zero_point: torch.Tensor, + Y: torch.Tensor, + Y_scale: torch.Tensor, + Y_zero_point: torch.Tensor, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + out_size = X.size() + if list(X.size()) == [1]: + out_size = Y.size() + + return X.new_empty(out_size, dtype=X.dtype) + + +@register_fake("cadence::quantized_add.per_tensor") +def quantized_add_per_tensor_meta( + X: torch.Tensor, + X_scale: float, + X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + out_size = X.size() + if list(X.size()) == [1]: + out_size = Y.size() + + return X.new_empty(out_size, dtype=X.dtype) + + @register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index d0166061c7f..6b34021a20a 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -104,6 +104,16 @@ def count_node(graph_module: torch.fx.GraphModule, target: torch.fx.node.Target) return total +def op_counts_match( + graph_module: torch.fx.GraphModule, + expected_op_counts: dict[EdgeOpOverload, int], +) -> bool: + for op, count in expected_op_counts.items(): + if count_node(graph_module, op) != count: + return False + return True + + # Testing utils # Return the compute/function nodes in the graph def get_compute_nodes_in_gm(graph_module: torch.fx.GraphModule) -> List[torch.fx.Node]: diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index ab23149e60d..9c47eb4094f 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -6,7 +6,7 @@ # pyre-strict -from typing import Any, List, Optional, Type +from typing import Any, List, Optional import torch import torch.fx @@ -71,7 +71,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: Argument = Any # pyre-ignore -def get_passes_in_default_order() -> List[Type[PassType]]: +def get_passes_in_default_order() -> List[ExportPass]: passes = [ InitializePipeline, RemoveRedundantOps.passes, @@ -95,9 +95,8 @@ def get_cadence_passes( passes = get_passes_in_default_order() pass_filter = create_cadence_pass_filter(opt_level) filtered_passes = [ - # pyre-fixme[20]: Call `torch.fx.passes.infra.pass_base.PassBase.__call__` expects argument `graph_module`. + # pyre-ignore[20]: Expect argument graph_module filtered_pass() - # pyre-fixme[6]: In call `filter.__new__` ... got `List[Type[typing.Callable[[GraphModule], Optional[PassResult]]]]`. 
for filtered_pass in list(filter(pass_filter, passes)) ] return filtered_passes diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index 7c05e9b8678..a726f6c7fba 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -11,7 +11,9 @@ import torch from executorch.backends.cadence.aot.quantizer.patterns import ( AddmmPattern, + AddPattern, BmmPattern, + CatPattern, Conv1dPattern, Conv2dPattern, LayerNormPattern, @@ -41,6 +43,47 @@ ReluPatterns = (ReluPattern0, ReluPattern1) +def get_args_and_kwargs_add( + graph_module: GraphModule, + inputs_inputs: List[fx.Node], + dequants_inputs: List[fx.Node], + quant_node: fx.Node, +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + X_scale_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[0].args[1]), + {"dtype": torch.float}, + ) + X_zero_point_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[0].args[2]), + {"dtype": torch.int32}, + ) + Y_scale_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[1].args[1]), + {"dtype": torch.float}, + ) + Y_zero_point_ = graph_module.graph.call_function( + torch.ops.aten.full.default, + ([1], dequants_inputs[1].args[2]), + {"dtype": torch.int32}, + ) + args = ( + inputs_inputs[0], + X_scale_, + X_zero_point_, + inputs_inputs[1], + Y_scale_, + Y_zero_point_, + quant_node.args[1], + quant_node.args[2], + ) + + kwargs = {} + return args, kwargs + + # Helper function to get the args and kwargs for the linear replacement op def get_args_and_kwargs_linear( graph_module: GraphModule, @@ -204,6 +247,16 @@ def get_args_and_kwargs_matmul( return args, kwargs +def get_args_and_kwargs_cat( + inputs_inputs: List[fx.Node], other_inputs: List[fx.Node], op_node: fx.Node +) -> Tuple[Tuple[ArgsType], Dict[str, ArgsType]]: + args = tuple([inputs_inputs] + other_inputs) + dim = op_node.args[1] if len(op_node.args) > 1 else 0 + # pyre-fixme[6]: Incompatible parameter type + kwargs = {"dim": int(dim)} + return args, kwargs + + def get_args_and_kwargs_conv( graph_module: GraphModule, inputs_inputs: List[fx.Node], @@ -339,7 +392,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) for fused_partition in fused_partitions: anchors = pattern.get_anchors(graph_module, fused_partition) - if not anchors: + if not anchors or anchors.empty: continue if any(self.is_fused(p.nodes) for p in fused_partition): continue @@ -348,12 +401,17 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 self.mark_fused(p.nodes) dequants_inputs = [] - for node, idx in anchors.inputs: + for node, idx, *_spec in anchors.inputs: + arg = ( + node.args[idx] + if isinstance(idx, int) + else node.args[idx[0]][idx[1]] + ) if ( - node.args[idx].target + arg.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default ): - dequants_inputs.append(node.args[idx]) + dequants_inputs.append(arg) dequants_weights = [] for node, idx in anchors.weights: if ( @@ -385,7 +443,18 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 inputs_inputs + weights_inputs + other_inputs + bias_inputs ) kwargs = {} - if isinstance(pattern, (Conv1dPattern, Conv2dPattern)): + if isinstance(pattern, AddPattern): + args, kwargs = get_args_and_kwargs_add( + graph_module, + inputs_inputs, + dequants_inputs, + quant_node, + ) + elif isinstance(pattern, CatPattern): + args, kwargs = 
get_args_and_kwargs_cat( + inputs_inputs, other_inputs, op_node + ) + elif isinstance(pattern, (Conv1dPattern, Conv2dPattern)): args, kwargs = get_args_and_kwargs_conv( graph_module, inputs_inputs, diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 0dee8ebcd1d..66f6772d942 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -33,7 +33,17 @@ class PartitionAnchors: is used for other types of input values as well as handling default parameters. """ - inputs: List[Tuple[fx.Node, int]] = field(default_factory=list) + # Inputs can share quantization parameters + inputs: List[ + Union[ + Tuple[fx.Node, Union[int, Tuple[int, int]]], + Tuple[ + fx.Node, + Union[int, Tuple[int, int]], + SharedQuantizationSpec, + ], + ] + ] = field(default_factory=list) weights: List[Tuple[fx.Node, int]] = field(default_factory=list) biases: List[ Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]] @@ -43,6 +53,7 @@ class PartitionAnchors: output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( default_factory=list ) + empty: bool = False class QuantizationPattern(ABC): @@ -101,6 +112,38 @@ def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_linear +class AddPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.add.Tensor] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + add_node = fused_partition[0].nodes[-1] + + # Bail if: + # - the add node is not a tensor add + # - the add node has kwargs (e.g. alpha) + is_tensor_add = isinstance(add_node.args[0], fx.Node) and isinstance( + add_node.args[1], fx.Node + ) + if not is_tensor_add or len(add_node.kwargs) > 0: + return PartitionAnchors( + empty=True, + ) + + return PartitionAnchors( + inputs=[(add_node, 0), (add_node, 1)], + weights=[], + biases=[], + output=[(add_node,)], + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_add.default + + class BmmPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.bmm.default] @@ -122,6 +165,52 @@ def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_matmul.default +class CatPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.cat.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + cat_node = fused_partition[0].nodes[-1] + + # Create args. The first argument does not have quant spec and + # will inherit from the overall quant spec. All subsequent args + # will share that spec. + # Note that outputs also share that spec.
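+ # Illustrative (hypothetical) anchor layout for cat((a, b, c)): inputs become + # [(cat, (0, 0)), (cat, (0, 1), SharedQuantizationSpec((a, cat))), + # (cat, (0, 2), SharedQuantizationSpec((a, cat)))], where (0, i) indexes + # element i of the tensor-list argument.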
+ args: List[ + Union[ + Tuple[fx.Node, Union[int, Tuple[int, int]]], + Tuple[ + fx.Node, + Union[int, Tuple[int, int]], + SharedQuantizationSpec, + ], + ] + ] = [(cat_node, (0, 0))] + for i in range(1, len(cat_node.args[0])): + args.append( + ( + cat_node, + (0, i), + SharedQuantizationSpec((cat_node.args[0][0], cat_node)), + ) + ) + + return PartitionAnchors( + inputs=args, + weights=[], + biases=[], + output=[ + (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node))) + ], + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.aten.cat.default + + class Conv1dPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv1d.default] diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index d6765d2ad30..62727985452 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -6,12 +6,15 @@ # pyre-strict +from dataclasses import dataclass from typing import List, Optional, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.patterns import ( AddmmPattern, + AddPattern, BmmPattern, + CatPattern, Conv1dPattern, Conv2dPattern, LayerNormPattern, @@ -108,7 +111,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: continue anchors = self.pattern.get_anchors(model, fused_partition) - if not anchors: + if not anchors or anchors.empty: continue if is_annotated( [ @@ -142,17 +145,38 @@ def annotate_inputs( "quantization_annotation", QuantizationAnnotation(_annotated=True), ) + arg = ( + # pyre-ignore[16]: no attribute + node.args[idx] + if isinstance(idx, int) + # pyre-ignore[16]: no attribute + else node.args[idx[0]][idx[1]] + ) + annotation.input_qspec_map[arg] = ( + custom_spec[0] if custom_spec else spec + ) # pyre-ignore[16]: no attribute + node.meta["quantization_annotation"] = annotation + + def annotate_weights_or_biases( + weights_or_biases: List[Tuple[fx.Node, int]], + spec: Optional[QuantizationSpec], + ) -> None: + for node, idx, *custom_spec in weights_or_biases: + annotation = node.meta.get( + "quantization_annotation", + QuantizationAnnotation(_annotated=True), + ) annotation.input_qspec_map[node.args[idx]] = ( custom_spec[0] if custom_spec else spec ) - # pyre-ignore[16]: no attribute node.meta["quantization_annotation"] = annotation + # pyre-ignore[6]: incompatible parameter type annotate_inputs(anchors.inputs, input_act_qspec) - annotate_inputs(anchors.weights, weight_qspec) + annotate_weights_or_biases(anchors.weights, weight_qspec) # pyre-ignore[6]: incompatible parameter type - annotate_inputs(anchors.biases, bias_qspec) + annotate_weights_or_biases(anchors.biases, bias_qspec) return model def validate(self, model: fx.GraphModule) -> None: @@ -177,6 +201,8 @@ def get_cadence_default_quantizers() -> List[Quantizer]: ] +# Note: need dataclass to be used in CI configs through OmegaConf and Hydra +@dataclass class CadenceQuantizer(ComposableQuantizer): """ Generic CadenceQuantizer. 
Although it can be used directly, it is typically a base @@ -208,3 +234,16 @@ def __init__( self, ) -> None: super().__init__([]) + + +class CadenceWakeWordQuantizer(CadenceQuantizer): + """ + Quantizer for WakeWord, including add and cat + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8uW8u)) + quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8uW8u)) + super().__init__(quantizers) diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py new file mode 100644 index 00000000000..9eaac004bcf --- /dev/null +++ b/backends/cadence/aot/ref_implementations.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import torch +from executorch.exir.scalar_type import ScalarType +from torch.library import impl, Library + + +m = Library("cadence", "IMPL", "CompositeExplicitAutograd") + +qdtype_map: dict[ScalarType, torch.dtype] = { + ScalarType.QINT8: torch.qint8, + ScalarType.QUINT8: torch.quint8, + ScalarType.QINT32: torch.qint32, +} + + +@impl(m, "requantize") +def requantize( + input: torch.Tensor, + in_scale: torch.Tensor, + in_zero_point: torch.Tensor, + out_scale: torch.Tensor, + out_zero_point: torch.Tensor, + dtype: ScalarType, +) -> torch.Tensor: + if dtype in qdtype_map: + # Old quantization mechanism + return torch.quantize_per_tensor( + torch.dequantize(input), out_scale, out_zero_point, qdtype_map[dtype] + ) + + # A non-scalar in_scale or out_scale would require per-channel + # quant/dequant, but the channel dimension is not available here + if in_scale.numel() > 1 or out_scale.numel() > 1: + raise NotImplementedError("Only scalar scales are supported") + + quant_min = torch.iinfo(input.dtype).min + quant_max = torch.iinfo(input.dtype).max + # pyre-fixme[6]: This dtype is actually the right one. + out_quant_min = torch.iinfo(dtype).min + # pyre-fixme[6]: This dtype is actually the right one.
+ out_quant_max = torch.iinfo(dtype).max + return torch.ops.quantized_decomposed.quantize_per_tensor( + torch.ops.quantized_decomposed.dequantize_per_tensor( + input, + in_scale.flatten()[0], + in_zero_point.flatten()[0], + quant_min, + quant_max, + input.dtype, + ), + out_scale.flatten()[0], + out_zero_point.flatten()[0], + out_quant_min, + out_quant_max, + dtype, + ) diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py index caceabfba82..06e83d92695 100644 --- a/backends/cadence/aot/remove_ops.py +++ b/backends/cadence/aot/remove_ops.py @@ -33,7 +33,7 @@ from executorch.backends.cadence.aot.utils import get_edge_overload_packet from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue from executorch.exir.pass_manager import PassManager, PassType from executorch.exir.passes import dead_code_elimination_pass @@ -569,6 +569,8 @@ class Subgraph: exir_ops.edge.aten.hardtanh.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.cadence.quantize_per_tensor.default, + exir_ops.edge.cadence.dequantize_per_tensor.default, } # must be initialized in the constructor @@ -743,6 +745,136 @@ def permute_shape( return [shape[p] for p in permute_dims] +@register_cadence_pass(CadencePassAttribute(opt_level=1)) +class RemoveBranchedQuantDequant(ExportPass): + """ + This pass looks for adjacent quant and dequant nodes with identical + parameters, where the quant node has other users in addition to the + dequant. The quant and dequant pair would be removed by the + FuseQuantDequantToRequantizePass if not for the multiple users. 
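+ Example (hypothetical graph): q = quantize_per_tensor(x, s, z) feeding both + abs(q) and dequantize_per_tensor(q, s, z), so the pair cannot be fused away + wholesale.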
This pass + removes just the dequant node by rerouting its users to the quant's parent + node. + """ + + quantize_op_packets: set[EdgeOpOverloadPacket] = { + exir_ops.edge.cadence.quantize_per_tensor, + exir_ops.edge.quantized_decomposed.quantize_per_tensor, + } + dequantize_op_packets: set[EdgeOpOverloadPacket] = { + exir_ops.edge.cadence.dequantize_per_tensor, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor, + } + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + self.remove_branched( + graph_module, self.quantize_op_packets, self.dequantize_op_packets + ) + self.remove_branched( + graph_module, self.dequantize_op_packets, self.quantize_op_packets + ) + + graph_module.graph.eliminate_dead_code() + result = super().call(graph_module) + return result + + def remove_branched( + self, + graph_module: torch.fx.GraphModule, + producer_pkts: set[EdgeOpOverloadPacket], + consumer_pkts: set[EdgeOpOverloadPacket], + ) -> None: + for node in graph_module.graph.nodes: + if ( + node.op != "call_function" + or not isinstance(node.target, EdgeOpOverload) + or get_edge_overload_packet(node.target) not in producer_pkts + ): + continue + + if len(node.users) < 2: + continue + + for user in node.users: + if ( + not isinstance(user.target, EdgeOpOverload) + or get_edge_overload_packet(user.target) not in consumer_pkts + ): + continue + + # check qparams match + if node.args[1:] != user.args[1:]: + continue + + user.replace_all_uses_with(node.args[0]) + + +class RemoveCatFromSliceCopyPass(ExportPass): + def _remove_unused_cat( # noqa: C901 + self, graph_module: torch.fx.GraphModule + ) -> None: + slice_copy_nodes = [ + node + for node in graph_module.graph.nodes + if node.target == exir_ops.edge.aten.slice_copy.Tensor + ] + for slice_copy_node in slice_copy_nodes: + slice_dim, start_idx, end_idx, step = 0, 0, float("inf"), 1 + input_node, *other_args = slice_copy_node.args + if len(other_args) >= 1: + slice_dim = other_args[0] + if len(other_args) >= 2: + start_idx = other_args[1] + if len(other_args) >= 3: + end_idx = other_args[2] + if len(other_args) >= 4: + step = other_args[3] + if step != 1: + continue + slice_copy_dtype = slice_copy_node.meta["val"].dtype + if input_node.target != exir_ops.edge.aten.cat.default: + continue + cat_dtype = input_node.meta["val"].dtype + if slice_copy_dtype != cat_dtype: + continue + cat_dim = input_node.args[1] if len(input_node.args) > 1 else 0 + if cat_dim != slice_dim: + continue + cat_output_shape = input_node.meta["val"].shape + start_idx = ( + cat_output_shape[cat_dim] + start_idx if start_idx < 0 else start_idx + ) + end_idx = ( + cat_output_shape[cat_dim] + if end_idx > cat_output_shape[cat_dim] + else end_idx + ) + base_idx = 0 + cat_input_to_keep = None + for cat_input_node in input_node.args[0]: + cat_input_dtype = cat_input_node.meta["val"].dtype + if slice_copy_dtype != cat_input_dtype: + continue + cat_input_shape = cat_input_node.meta["val"].shape + + # check if the slice range overlaps with the cat range + if ( + base_idx <= start_idx + and end_idx <= list(cat_input_shape)[cat_dim] + base_idx + ): + cat_input_to_keep = cat_input_node + break + base_idx += list(cat_input_shape)[cat_dim] + if cat_input_to_keep is not None: + slice_copy_node.replace_input_with(input_node, cat_input_to_keep) + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + self._remove_unused_cat(graph_module) + graph_module.recompile() + graph_module.graph.eliminate_dead_code() + return super().call(graph_module) + + # The following class
consolidates functions to remove ops that are redundant # in Jarvis. Currently, each function in this class iterates over each node of # the graph module once. In future, we could consolidate them into a monolithic @@ -763,4 +895,5 @@ class CadenceRemoveNops: RemoveNopMulOpPass, RemoveNopAddOpPass, RemoveNopLinalgVectorNormOpPass, + RemoveBranchedQuantDequant, ] diff --git a/backends/cadence/aot/reorder_ops.py b/backends/cadence/aot/reorder_ops.py index 0fd7f0b61a4..e8a8e230531 100644 --- a/backends/cadence/aot/reorder_ops.py +++ b/backends/cadence/aot/reorder_ops.py @@ -118,6 +118,8 @@ def get_descendent_quant_ops(self, node: torch.fx.Node) -> List[torch.fx.Node]: if user_target in { torch.ops.quantized_decomposed.quantize_per_tensor, exir_ops.edge.quantized_decomposed.quantize_per_tensor, + torch.ops.cadence.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, }: descendent_quant_ops.append(user) # If the successor is a trivially quantizable op, consider its users @@ -300,6 +302,8 @@ def advance_quantize_op(self, graph_module: torch.fx.GraphModule): if get_overload_packet(node.target) not in ( exir_ops.edge.quantized_decomposed.quantize_per_tensor, torch.ops.quantized_decomposed.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, + torch.ops.cadence.quantize_per_tensor, ): continue @@ -413,6 +417,7 @@ def postponing_feasible(self, dequant_node: torch.fx.Node): in { exir_ops.edge.quantized_decomposed.quantize_per_tensor, exir_ops.edge.quantized_decomposed.quantize_per_channel, + exir_ops.edge.cadence.quantize_per_tensor, } for x in users ) @@ -422,6 +427,7 @@ def postpone_dequantize_op(self, graph_module: torch.fx.GraphModule) -> bool: packet_to_overload_map = { exir_ops.edge.quantized_decomposed.dequantize_per_tensor: "default", exir_ops.edge.quantized_decomposed.dequantize_per_channel: "default", + exir_ops.edge.cadence.dequantize_per_tensor: "default", } graph = graph_module.graph modified = False @@ -500,6 +506,7 @@ class SinkOpsCloserToUsePass(ExportPass): exir_ops.edge.aten.dequantize, exir_ops.edge.quantized_decomposed.dequantize_per_tensor, exir_ops.edge.quantized_decomposed.dequantize_per_channel, + exir_ops.edge.cadence.dequantize_per_tensor, } def sink_ops_closer_to_use(self, graph_module: torch.fx.GraphModule): @@ -558,6 +565,7 @@ class HoistOpsCloserToDefPass(ExportPass): hoistable_ops: Set[EdgeOpOverload] = { exir_ops.edge.quantized_decomposed.quantize_per_tensor, + exir_ops.edge.cadence.quantize_per_tensor, exir_ops.edge.aten.slice_copy, exir_ops.edge.aten.select_copy, } diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 487d374fb80..aa53750b64f 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -162,11 +162,12 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - if op not in {exir_ops.edge.quantized_decomposed.quantize_per_tensor.default}: + ns = exir_ops.edge if isinstance(op, EdgeOpOverload) else torch.ops + if op != ns.quantized_decomposed.quantize_per_tensor.default: return super().call_operator(op, args, kwargs, meta) return super().call_operator( - exir_ops.edge.cadence.quantize_per_tensor.default, + ns.cadence.quantize_per_tensor.default, args, kwargs, meta, @@ -188,11 +189,12 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: - if op not in {exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default}: + ns = exir_ops.edge if isinstance(op, EdgeOpOverload) else torch.ops + if 
op != ns.quantized_decomposed.dequantize_per_tensor.default: return super().call_operator(op, args, kwargs, meta) return super().call_operator( - exir_ops.edge.cadence.dequantize_per_tensor.default, + ns.cadence.dequantize_per_tensor.default, args, kwargs, meta, @@ -1717,9 +1719,10 @@ def call_operator(self, op, args, kwargs, meta): ) -@register_cadence_pass(CadencePassAttribute(opt_level=0))( - ReplaceScalarWithTensorArgPass() -) +# pyre-ignore[6]: Incompatible parameter type (doesn't get the inheritance) +register_cadence_pass(CadencePassAttribute(opt_level=0))(ReplaceScalarWithTensorArgPass) + + @register_cadence_pass(CadencePassAttribute(opt_level=0)) class ReplaceScalarTensorWithFullPass(ExportPass): """ @@ -1837,6 +1840,10 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): replaced_scalar_args: dict[ EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]] ] = { + exir_ops.edge.cadence.quantized_add: ( + exir_ops.edge.cadence.quantized_add.per_tensor, + [1, 2, 4, 5], + ), exir_ops.edge.cadence.quantized_conv: ( exir_ops.edge.cadence.quantized_conv.per_tensor, [8, 9, 12, 13], diff --git a/backends/cadence/aot/tests/test_fusion_ops_passes.py b/backends/cadence/aot/tests/test_fusion_ops_passes.py index 792a6ee4166..4af3eafb72a 100644 --- a/backends/cadence/aot/tests/test_fusion_ops_passes.py +++ b/backends/cadence/aot/tests/test_fusion_ops_passes.py @@ -20,7 +20,7 @@ FuseTransposeOpPairsPass, ) from executorch.backends.cadence.aot.graph_builder import GraphBuilder -from executorch.backends.cadence.aot.pass_utils import count_node +from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload from torch import nn @@ -32,8 +32,7 @@ def check_op_counts( graph_module: torch.fx.GraphModule, expected_op_counts: dict[EdgeOpOverload, int], ) -> None: - for op, count in expected_op_counts.items(): - self.assertEqual(count_node(graph_module, op), count) + self.assertTrue(op_counts_match(graph_module, expected_op_counts)) class TestFusionPasses(TestFusionPassesBase): diff --git a/backends/cadence/aot/tests/test_memory_passes.py b/backends/cadence/aot/tests/test_memory_passes.py index d50456796c9..245f3d64003 100644 --- a/backends/cadence/aot/tests/test_memory_passes.py +++ b/backends/cadence/aot/tests/test_memory_passes.py @@ -16,6 +16,7 @@ from executorch.backends.cadence.aot import compiler from executorch.backends.cadence.aot.memory_planning import find_peak_memory_usage from executorch.backends.cadence.aot.pass_utils import count_node +from executorch.backends.cadence.aot.utils import MemoryConfig from executorch.exir import memory from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.memory_planning import collect_specs_from_nodes @@ -46,14 +47,13 @@ def calculate_aligned_num_bytes(num: int, alignment: int = 16) -> int: inputs = (torch.ones(batch_size, input_dim),) model = PeakMemoryTestModel(input_dim, hidden_dim, output_dim) - graph_module = ( - compiler.export_to_executorch_gen_etrecord(model, inputs) - .exported_program() - .graph_module - ) + exported_program = compiler.export_to_executorch_gen_etrecord( + model, inputs + ).exported_program() peak_usage, _ = find_peak_memory_usage( - graph_module, + exported_program.graph_module, + exported_program.graph_signature, mem_constraints=None, alloc_graph_input=True, alloc_graph_output=True, @@ -73,14 +73,13 @@ def calculate_aligned_num_bytes(num: 
int, alignment: int = 16) -> int: input_dim, hidden_dim, hidden_dim, hidden_dim, output_dim ) - graph_module = ( - compiler.export_to_executorch_gen_etrecord(model, inputs) - .exported_program() - .graph_module - ) + exported_program = compiler.export_to_executorch_gen_etrecord( + model, inputs + ).exported_program() peak_usage, _ = find_peak_memory_usage( - graph_module, + exported_program.graph_module, + exported_program.graph_signature, mem_constraints=None, alloc_graph_input=True, alloc_graph_output=True, @@ -111,6 +110,7 @@ def forward(self, x): graph_module.graph.eliminate_dead_code() peak_usage, _ = find_peak_memory_usage( graph_module, + executorch_prog.exported_program().graph_signature, alloc_graph_input=False, alloc_graph_output=False, mem_constraints=None, @@ -793,7 +793,9 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): mem_algo=mem_algo, alloc_graph_input=False, alloc_graph_output=False, - mem_alignment=37, + memory_config=MemoryConfig( + memory_sizes=[0x1000000000], memory_alignments=[37] + ), ) .exported_program() .graph_module diff --git a/backends/cadence/aot/tests/test_remove_ops_passes.py b/backends/cadence/aot/tests/test_remove_ops_passes.py index 348e0b5de83..42f4b87bdcb 100644 --- a/backends/cadence/aot/tests/test_remove_ops_passes.py +++ b/backends/cadence/aot/tests/test_remove_ops_passes.py @@ -17,10 +17,12 @@ from executorch.backends.cadence.aot import compiler from executorch.backends.cadence.aot.compiler import export_to_edge -from executorch.backends.cadence.aot.pass_utils import count_node +from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer from executorch.backends.cadence.aot.remove_ops import ( RemoveAliasCopyOpPass, + RemoveBranchedQuantDequant, + RemoveCatFromSliceCopyPass, RemoveCloneOpPass, RemoveContiguousOpPass, RemoveDetachCopyPass, @@ -709,3 +711,85 @@ def forward(self, x): self.assertEqual( count_node(graph_module, exir_ops.edge.aten.permute_copy.default), 2 ) + + def test_remove_dequant_on_branch(self): + class M(torch.nn.Module): + def forward(self, x): + x = torch.abs(x) + x0 = torch.ops.quantized_decomposed.quantize_per_tensor( + x, 1.2, 3, 0, 127, torch.int8 + ) + x1 = torch.abs(x0) + y0 = torch.ops.quantized_decomposed.dequantize_per_tensor( + x0, 1.2, 3, 0, 127, torch.int8 + ) + y1 = y0.view(-1) + return x1, y1 + + inputs = torch.rand(1, 8, 4, 6) + model = M() + graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module + + graph_module = RemoveBranchedQuantDequant()(graph_module).graph_module + self.assertTrue( + op_counts_match( + graph_module, + expected_op_counts={ + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1, + # we expect the pass to remove the dequantize node + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0, + exir_ops.edge.aten.abs.default: 2, + }, + ) + ) + + def test_remove_cat_from_slice_copy_all_removal(self) -> None: + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + x1 = torch.cat((x, y), 0) # (2, 4) + return torch.slice_copy(x1, dim=0, start=0, end=1) + + inputs = tuple(torch.randn(2, 4) for _ in range(2)) + graph_module = export_to_edge(M(), inputs).exported_program().graph_module + p = RemoveCatFromSliceCopyPass() + graph_module = cast(PassResult, p(graph_module)).graph_module + + # Ensure the cat node was removed + self.assertEqual(count_node(graph_module,
exir_ops.edge.aten.cat.default), 0) + + def test_remove_cat_from_slice_copy_no_removal(self) -> None: + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + x1 = torch.cat((x, y), 0) # (2, 4) + return torch.slice_copy(x1, dim=0, start=0, end=3) + + inputs = tuple(torch.randn(2, 4) for _ in range(2)) + graph_module = export_to_edge(M(), inputs).exported_program().graph_module + p = RemoveCatFromSliceCopyPass() + graph_module = cast(PassResult, p(graph_module)).graph_module + + # Ensure the cat node was not removed (the slice spans both inputs) + self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 1) + + def test_remove_cat_from_slice_copy_zero_range(self) -> None: + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + x1 = torch.cat((x, y), 0) # (2, 4) + return torch.slice_copy(x1, dim=0, start=0, end=0) + + inputs = tuple(torch.randn(2, 4) for _ in range(2)) + graph_module = export_to_edge(M(), inputs).exported_program().graph_module + p = RemoveCatFromSliceCopyPass() + graph_module = cast(PassResult, p(graph_module)).graph_module + + # Ensure the cat node was removed + self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 0) diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index 468bbf4ae66..37011067897 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -256,6 +256,8 @@ def save_bpte_program( @dataclass class MemoryConfig: memory_sizes: List[int] + # Alignment constraint for each memory region in bytes. + memory_alignments: Optional[List[int]] = None # Optional fields for logs memory_names: Optional[List[str]] = None @@ -263,6 +265,10 @@ class MemoryConfig: memory_xml_path: Optional[str] = None MemorySpace: Optional[enum.Enum] = None + def __post_init__(self) -> None: + if self.memory_alignments is None: + self.memory_alignments = [1] * len(self.memory_sizes) + # get num memories indexed from 1..N, compatible with EXIR's spec.mem_id def get_num_memories(self) -> int: return len(self.memory_sizes) + 1 diff --git a/backends/cadence/build_cadence_fusionG3.sh b/backends/cadence/build_cadence_fusionG3.sh index b0e1e777065..081226bcd94 100644 --- a/backends/cadence/build_cadence_fusionG3.sh +++ b/backends/cadence/build_cadence_fusionG3.sh @@ -32,7 +32,6 @@ if $STEPWISE_BUILD; then -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_USE_DL=OFF \ -DEXECUTORCH_BUILD_CADENCE=OFF \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -DHAVE_FNMATCH_H=OFF \ -Bcmake-out .
@@ -45,7 +44,6 @@ if $STEPWISE_BUILD; then -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CADENCE=ON \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ -DEXECUTORCH_USE_DL=OFF \ @@ -71,9 +69,7 @@ else -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CPUINFO=OFF \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_CADENCE=ON \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ diff --git a/backends/cadence/build_cadence_hifi4.sh b/backends/cadence/build_cadence_hifi4.sh index 50fd5f032ab..ac1e03ba78e 100644 --- a/backends/cadence/build_cadence_hifi4.sh +++ b/backends/cadence/build_cadence_hifi4.sh @@ -32,7 +32,6 @@ if $STEPWISE_BUILD; then -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_USE_DL=OFF \ -DEXECUTORCH_BUILD_CADENCE=OFF \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -Bcmake-out . echo "Building any Cadence-specific binaries on top" @@ -44,7 +43,6 @@ if $STEPWISE_BUILD; then -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CADENCE=ON \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ -DEXECUTORCH_USE_DL=OFF \ @@ -69,9 +67,7 @@ else -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CPUINFO=OFF \ - -DEXECUTORCH_BUILD_FLATC=OFF \ -DEXECUTORCH_BUILD_CADENCE=ON \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt index cac16bddc50..561323e045e 100644 --- a/backends/cadence/fusion_g3/operators/CMakeLists.txt +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -11,8 +11,8 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -41,7 +41,15 @@ set(_aten_ops__srcs "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_transpose_copy.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_sigmoid.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_sqrt.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_rsqrt.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_tanh.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_lt.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_where.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_clamp.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" @@ -64,7 +72,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib) # Let files say "include <executorch/path/to/header.h>". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/..
${CMAKE_BINARY_DIR} diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp index 84224b37b04..7f8e1ee8710 100644 --- a/backends/cadence/fusion_g3/operators/op_cat.cpp +++ b/backends/cadence/fusion_g3/operators/op_cat.cpp @@ -115,7 +115,8 @@ Tensor& cat_out( (out.scalar_type() == ScalarType::Char) || (out.scalar_type() == ScalarType::UInt32) || (out.scalar_type() == ScalarType::UInt16) || - (out.scalar_type() == ScalarType::Byte)) { + (out.scalar_type() == ScalarType::Byte) || + (out.scalar_type() == ScalarType::Float)) { XT_KERNEL_CHECK( ctx, out, diff --git a/backends/cadence/fusion_g3/operators/op_clamp.cpp b/backends/cadence/fusion_g3/operators/op_clamp.cpp new file mode 100644 index 00000000000..fa8424e15eb --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_clamp.cpp @@ -0,0 +1,665 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::optional; +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::canCast; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +template <typename CTYPE_VAL, typename CTYPE_OUT, typename CTYPE_CAST> +/** Check if val, when cast to CTYPE_CAST, is not in the range of CTYPE_OUT */ +bool is_out_of_bounds(CTYPE_VAL val) { + const CTYPE_CAST val_cast = static_cast<CTYPE_CAST>(val); + return val_cast < std::numeric_limits<CTYPE_OUT>::lowest() || + val_cast > std::numeric_limits<CTYPE_OUT>::max(); +} + +ET_NODISCARD bool check_bounds( + const Scalar& val_scalar, + const ScalarType& val_type, + const ScalarType& out_type, + const char* val_name) { + auto is_valid = true; + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, "clamp.out", CTYPE_VAL, [&]() { + CTYPE_VAL val = 0; + torch::executor::native::utils::extract_scalar(val_scalar, &val); + if (executorch::runtime::isIntegralType(out_type, /*includeBool=*/false)) { + ET_SWITCH_INT_TYPES(out_type, ctx, "clamp.out", CTYPE_OUT, [&]() { + if (is_out_of_bounds<CTYPE_VAL, CTYPE_OUT, long>(val)) { + ET_LOG(Error, "%s value out of bounds", val_name); + is_valid = false; + } + }); + } else if (executorch::runtime::isFloatingType(out_type)) { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { + if (std::isfinite(val) && + is_out_of_bounds<CTYPE_VAL, CTYPE_OUT, double>(val)) { + ET_LOG(Error, "%s value out of bounds", val_name); + is_valid = false; + } + }); + } + }); + + return is_valid; +} + +} // namespace + +Tensor& clamp_out( + KernelRuntimeContext& ctx, + const Tensor& in, + const optional<Scalar>& min_opt, + const optional<Scalar>& max_opt, + Tensor& out) { + bool has_min = min_opt.has_value(); + bool has_max = max_opt.has_value(); + + ET_KERNEL_CHECK_MSG( + ctx, + has_min || has_max, + InvalidArgument, + out, + "At least one of 'min' or 'max' must not be None"); + + // Input Dtypes + ScalarType in_type = in.scalar_type(); + ScalarType min_type = has_min + ? torch::executor::native::utils::get_scalar_dtype(min_opt.value()) + : in_type; + ScalarType max_type = has_max + ?
torch::executor::native::utils::get_scalar_dtype(max_opt.value()) + : in_type; + ScalarType out_type = out.scalar_type(); + + // Check Scalar Bounds + if (has_min) { + ET_KERNEL_CHECK( + ctx, + check_bounds(min_opt.value(), min_type, out_type, "minimum"), + InvalidArgument, + out); + } + if (has_max) { + ET_KERNEL_CHECK( + ctx, + check_bounds(max_opt.value(), max_type, out_type, "maximum"), + InvalidArgument, + out); + } + +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "clamp.out"; + + bool optimized = true; + + if (!(((in_type == ScalarType::Float) || (in_type == ScalarType::Short) || + (in_type == ScalarType::Char) || (in_type == ScalarType::Byte)) && + (in_type == out_type))) { + optimized = false; + } + + if (has_max) { + if ((max_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Short)) { + optimized = false; + } + + if ((max_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Char)) { + optimized = false; + } + + if ((max_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Byte)) { + optimized = false; + } + } + + if (has_min) { + if ((min_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Short)) { + optimized = false; + } + + if ((min_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Char)) { + optimized = false; + } + + if ((min_opt.value().isFloatingPoint()) && + (in.scalar_type() == ScalarType::Byte)) { + optimized = false; + } + } + + if ((in_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = in.const_data_ptr<float>(); + float* const out_data = out.mutable_data_ptr<float>(); + float min_val, max_val; + + if (!has_min) { + min_val = std::numeric_limits<float>::lowest(); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } else if (!has_max) { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + max_val = std::numeric_limits<float>::max(); + } else { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_scalar_f32_f32, + out_data, + inp1_data, + min_val, + max_val, + out.numel()); + } else if ((in_type == ScalarType::Short) && (optimized)) { + const signed short* const inp1_data = in.const_data_ptr<signed short>(); + signed short* const out_data = out.mutable_data_ptr<signed short>(); + signed short min_val, max_val; + + if (!has_min) { + min_val = std::numeric_limits<signed short>::lowest(); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } else if (!has_max) { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + max_val = std::numeric_limits<signed short>::max(); + } else { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_scalar_16_16, + out_data, + inp1_data, + min_val, + max_val, + out.numel()); + } else if ((in_type == ScalarType::Char) && (optimized)) { + const signed char* const inp1_data = in.const_data_ptr<signed char>(); + signed char* const out_data = out.mutable_data_ptr<signed char>(); + signed char
min_val, max_val; + + if (!has_min) { + min_val = std::numeric_limits<signed char>::lowest(); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } else if (!has_max) { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + max_val = std::numeric_limits<signed char>::max(); + } else { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_scalar_8_8, + out_data, + inp1_data, + min_val, + max_val, + out.numel()); + } else if ((in_type == ScalarType::Byte) && (optimized)) { + const unsigned char* const inp1_data = in.const_data_ptr<unsigned char>(); + unsigned char* const out_data = out.mutable_data_ptr<unsigned char>(); + unsigned char min_val, max_val; + + if (!has_min) { + min_val = std::numeric_limits<unsigned char>::lowest(); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } else if (!has_max) { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + max_val = std::numeric_limits<unsigned char>::max(); + } else { + torch::executor::native::utils::extract_scalar(min_opt.value(), &min_val); + torch::executor::native::utils::extract_scalar(max_opt.value(), &max_val); + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_scalar_8u_8u, + out_data, + inp1_data, + min_val, + max_val, + out.numel()); + } else { + // Common Dtype + ScalarType common_type = in_type; + if (has_min) { + common_type = torch::executor::native::utils::promote_type_with_scalar( + common_type, min_opt.value()); + } + if (has_max) { + common_type = torch::executor::native::utils::promote_type_with_scalar( + common_type, max_opt.value()); + } + + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils:: + apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>( + [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + CTYPE_COMPUTE val_out = val_in; + if (has_min) { + val_out = torch::executor::native::utils::max_override( + val_out, + torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>( + min_opt.value())); + } + if (has_max) { + val_out = torch::executor::native::utils::min_override( + val_out, + torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>( + max_opt.value())); + } + return val_out; + }, + ctx, + in, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +Tensor& clamp_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& in, + const optional<Tensor>& min_opt, + const optional<Tensor>& max_opt, + Tensor& out) { + bool has_min = min_opt.has_value(); + bool has_max = max_opt.has_value(); + + ET_KERNEL_CHECK_MSG( + ctx, + has_min || has_max, + InvalidArgument, + out, + "At least one of 'min' or 'max' must not be None"); + + const Tensor& min = has_min ? min_opt.value() : in; + const Tensor& max = has_max ?
max_opt.value() : in; + +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, min, max, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(in, min, max, out) == + Error::Ok, + InvalidArgument, + out); +#endif + + static constexpr const char op_name[] = "clamp.Tensor_out"; + + int kTensorDimensionLimit = 5; + + int inp_shape[kTensorDimensionLimit]; + int min_shape[kTensorDimensionLimit]; + int max_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = false; + + int max1_dim = min.dim() > max.dim() ? min.dim() : max.dim(); + int max2_dim = in.dim() > out.dim() ? in.dim() : out.dim(); + int max_dim = max1_dim > max2_dim ? max1_dim : max2_dim; + + bool optimized = true; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + min_shape[i] = 1; + max_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp = max_dim - in.dim(); + int offset_min = max_dim - min.dim(); + int offset_max = max_dim - max.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < in.dim(); i++) { + inp_shape[i + offset_inp] = in.size(i); + } + if (has_min) { + for (int i = 0; i < min.dim(); i++) { + min_shape[i + offset_min] = min.size(i); + } + } + if (has_max) { + for (int i = 0; i < max.dim(); i++) { + max_shape[i + offset_max] = max.size(i); + } + } + + /*find broadcast*/ + for (int i = 0; i < max_dim; i++) { + if (((inp_shape[i]) != (out_shape[i])) || + ((min_shape[i]) != (out_shape[i])) || + ((max_shape[i]) != (out_shape[i]))) { + broadcast = true; + } + } + + if (((broadcast) && (max_dim > kTensorDimensionLimit)) || + (!(((in.scalar_type() == ScalarType::Float) || + (in.scalar_type() == ScalarType::Short) || + (in.scalar_type() == ScalarType::Char) || + (in.scalar_type() == ScalarType::Byte)) && + (in.scalar_type() == min.scalar_type()) && + (in.scalar_type() == max.scalar_type()) && + (in.scalar_type() == out.scalar_type())))) { + optimized = false; + } + + if ((in.scalar_type() == ScalarType::Float) && (optimized)) { + const float* const inp1_data = in.const_data_ptr<float>(); + const float* min_data = min.const_data_ptr<float>(); + const float* max_data = max.const_data_ptr<float>(); + float* const out_data = out.mutable_data_ptr<float>(); + float lowest_val, highest_val; + + if (broadcast || !has_min || !has_max) { + if (!has_min) { + lowest_val = std::numeric_limits<float>::lowest(); + min_data = &lowest_val; + } + + if (!has_max) { + highest_val = std::numeric_limits<float>::max(); + max_data = &highest_val; + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_broadcast_5D_f32_f32, + out_data, + out_shape, + inp1_data, + inp_shape, + min_data, + min_shape, + max_data, + max_shape, + max_dim); + + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_f32_f32, + out_data, + inp1_data, + min_data, + max_data, + out.numel()); + } + } else if ((in.scalar_type() == ScalarType::Short) && (optimized)) { + const signed short* const inp1_data = in.const_data_ptr<signed short>(); + const signed short* min_data = min.const_data_ptr<signed short>(); + const signed short* max_data = max.const_data_ptr<signed short>(); + signed short* const out_data = out.mutable_data_ptr<signed short>(); + signed short lowest_val, highest_val; + + if (broadcast || !has_min || !has_max) { + if (!has_min) { + lowest_val = std::numeric_limits<signed short>::lowest(); + min_data = &lowest_val; + } + + if (!has_max) { + highest_val = std::numeric_limits<signed short>::max();
+ max_data = &highest_val; + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_broadcast_5D_16_16, + out_data, + out_shape, + inp1_data, + inp_shape, + min_data, + min_shape, + max_data, + max_shape, + max_dim); + + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_16_16, + out_data, + inp1_data, + min_data, + max_data, + out.numel()); + } + } else if ((in.scalar_type() == ScalarType::Char) && (optimized)) { + const signed char* const inp1_data = in.const_data_ptr<signed char>(); + const signed char* min_data = min.const_data_ptr<signed char>(); + const signed char* max_data = max.const_data_ptr<signed char>(); + signed char* const out_data = out.mutable_data_ptr<signed char>(); + signed char lowest_val, highest_val; + + if (broadcast || !has_min || !has_max) { + if (!has_min) { + lowest_val = std::numeric_limits<signed char>::lowest(); + min_data = &lowest_val; + } + + if (!has_max) { + highest_val = std::numeric_limits<signed char>::max(); + max_data = &highest_val; + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_broadcast_5D_8_8, + out_data, + out_shape, + inp1_data, + inp_shape, + min_data, + min_shape, + max_data, + max_shape, + max_dim); + + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_8_8, + out_data, + inp1_data, + min_data, + max_data, + out.numel()); + } + } else if ((in.scalar_type() == ScalarType::Byte) && (optimized)) { + const unsigned char* const inp1_data = in.const_data_ptr<unsigned char>(); + const unsigned char* min_data = min.const_data_ptr<unsigned char>(); + const unsigned char* max_data = max.const_data_ptr<unsigned char>(); + unsigned char* const out_data = out.mutable_data_ptr<unsigned char>(); + unsigned char lowest_val, highest_val; + + if (broadcast || !has_min || !has_max) { + if (!has_min) { + lowest_val = std::numeric_limits<unsigned char>::lowest(); + min_data = &lowest_val; + } + + if (!has_max) { + highest_val = std::numeric_limits<unsigned char>::max(); + max_data = &highest_val; + } + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_broadcast_5D_8u_8u, + out_data, + out_shape, + inp1_data, + inp_shape, + min_data, + min_shape, + max_data, + max_shape, + max_dim); + + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_clamp_8u_8u, + out_data, + inp1_data, + min_data, + max_data, + out.numel()); + } + } else { + // Common Dtype + ScalarType common_type = in.scalar_type(); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out); + + if (has_min) { + common_type = + executorch::runtime::promoteTypes(common_type, min.scalar_type()); + } + if (has_max) { + common_type = + executorch::runtime::promoteTypes(common_type, max.scalar_type()); + } + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name>( + [has_min, has_max]( + const CTYPE_COMPUTE val_in, + const CTYPE_COMPUTE val_min, + const CTYPE_COMPUTE val_max) { + CTYPE_COMPUTE val_out = val_in; + if (has_min) { + val_out = torch::executor::native::utils::max_override( + val_out, val_min); + } + if (has_max) { + val_out = torch::executor::native::utils::min_override( + val_out, val_max); + } + return val_out; + }, + ctx, + in, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + min, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + max, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16); + }); + } + + return out; +}
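+// Note on dispatch: the NNLib fast paths above require float, int16, int8, or +// uint8 tensors with matching input/output dtypes (and, when broadcasting, +// rank within kTensorDimensionLimit); every other case falls through to the +// portable elementwise fallback at the end of each operator.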
+} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_lt.cpp b/backends/cadence/fusion_g3/operators/op_lt.cpp new file mode 100644 index 00000000000..3f6cdbe3505 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_lt.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& lt_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = false; + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = true; + + /* Added change to work with input dimensions more than 5 */ + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < max_dim; i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = true; + } + } + + if (((broadcast) && (max_dim > kTensorDimensionLimit)) || + (!((a.scalar_type() == ScalarType::Float) && + (b.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Bool)))) { + optimized = false; + } + + if (optimized) { + const float* const inp1_data = a.const_data_ptr<float>(); + const float* const inp2_data = b.const_data_ptr<float>(); + signed char* const out_data = out.mutable_data_ptr<signed char>(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_less_scalar_f32xf32_bool, + out_data, + inp1_data, + inp2_data[0], + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_less_broadcast_5D_f32xf32_bool, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim); + + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_less_f32xf32_bool, + out_data, + inp1_data, + inp2_data, + out.numel()); + } + } else { + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "lt.Tensor_out"; + torch::executor::native::internal::comparison_tensor_out<std::less, op_name>( + ctx, a, b, out); + } + + return out; +} + +Tensor& lt_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b,
+ Tensor& out) { +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + bool optimized = true; + + if (!((a.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Bool))) { + optimized = false; + } + + if (optimized) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + signed char* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_less_scalar_f32xf32_bool, + out_data, + inp1_data, + inp2_val, + out.numel()); + + } else { + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "lt.Scalar_out"; + torch::executor::native::internal::comparison_scalar_out( + ctx, a, b, out); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp index ae0cfd1e27b..48f691a145a 100644 --- a/backends/cadence/fusion_g3/operators/op_mean.cpp +++ b/backends/cadence/fusion_g3/operators/op_mean.cpp @@ -118,7 +118,7 @@ Tensor& mean_out( for (int i = 0; i < kNnlibMaxDim; i++) { out_shape[i] = 1; inp_shape[i] = 1; - p_axis[i] = 1; + p_axis[i] = -1; } int num_axis_dims = prepare_data( @@ -135,20 +135,10 @@ Tensor& mean_out( num_out_dims = 1; } - int inp_shape_max = inp_shape[p_axis[0]]; - for (int i = 1; i < num_axis_dims; i++) { - if (inp_shape[p_axis[i]] > inp_shape_max) { - inp_shape_max = inp_shape[p_axis[i]]; - } + if ((out.dim() == 0) && (out.numel())) { + num_out_dims = 1; } - int scratch_size = in.numel() / inp_shape_max; - - executorch::runtime::Result temp_mem = - ctx.allocate_temp(scratch_size * sizeof(float)); - - void* __restrict__ p_scratch_in = (void* __restrict__)(temp_mem.get()); - XT_KERNEL_CHECK( ctx, out, @@ -160,8 +150,7 @@ Tensor& mean_out( inp_shape, num_inp_dims, p_axis, - num_axis_dims, - p_scratch_in); + num_axis_dims); } else { ET_KERNEL_CHECK( ctx, diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp index b4f076e8100..09c7c00fd2c 100644 --- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp +++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp @@ -225,7 +225,10 @@ std::tuple native_layer_norm_out( if (weight.has_value()) { weight_data = weight.value().mutable_data_ptr(); } else { - weight_data = (float*)malloc(num_elm * sizeof(float)); + executorch::runtime::Result temp_mem_weight = + ctx.allocate_temp(num_elm * sizeof(float)); + weight_data = (float*)(temp_mem_weight.get()); + for (int i = 0; i < num_elm; i++) { weight_data[i] = 1; } @@ -234,7 +237,10 @@ std::tuple native_layer_norm_out( if (bias.has_value()) { bias_data = bias.value().mutable_data_ptr(); } else { - bias_data = (float*)malloc(num_elm * sizeof(float)); + executorch::runtime::Result temp_mem_bias = + ctx.allocate_temp(num_elm * sizeof(float)); + bias_data = (float*)(temp_mem_bias.get()); + for (int i = 0; i < num_elm; i++) { bias_data[i] = 0; } @@ -255,12 +261,6 @@ std::tuple native_layer_norm_out( bias_data, (float)eps); - if (!bias.has_value()) { - free(bias_data); - } - if (!weight.has_value()) { - free(weight_data); - } } else { 
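+ // Fallback path when the optimized NNLib kernel cannot be used.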
ET_KERNEL_CHECK( ctx, diff --git a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp index 34def4fd1bf..204882f3da9 100644 --- a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp +++ b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp @@ -89,6 +89,7 @@ Tensor& permute_copy_out( int inp_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; + int permute_vec[kTensorDimensionLimit]; /* input shapes and output shapes */ for (auto i = 0; i < in_size.size(); i++) { @@ -99,7 +100,6 @@ Tensor& permute_copy_out( out_shape[i] = out_size[i]; } - int permute_vec[in.dim()]; for (int i = 0; i < in.dim(); i++) { permute_vec[i] = (int)dims[i]; } @@ -112,7 +112,8 @@ Tensor& permute_copy_out( (out.scalar_type() == ScalarType::Char) || (out.scalar_type() == ScalarType::UInt32) || (out.scalar_type() == ScalarType::UInt16) || - (out.scalar_type() == ScalarType::Byte)) && + (out.scalar_type() == ScalarType::Byte) || + (out.scalar_type() == ScalarType::Float)) && (in.dim() <= 5)) { XT_KERNEL_CHECK( ctx, @@ -156,4 +157,4 @@ Tensor& permute_copy_out( } // namespace native } // namespace G3 } // namespace impl -} // namespace cadence \ No newline at end of file +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_rsqrt.cpp b/backends/cadence/fusion_g3/operators/op_rsqrt.cpp new file mode 100644 index 00000000000..5a869fadd09 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_rsqrt.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { +#ifdef OP_ARG_CHECK + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + if ((in.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) { + float* const out_data = out.mutable_data_ptr(); + const float* const in_data = in.const_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_elm_rsqrt_f32_f32, out_data, in_data, out.numel()); + + return out; + } else { + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); + } +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_sigmoid.cpp b/backends/cadence/fusion_g3/operators/op_sigmoid.cpp new file mode 100644 index 00000000000..00149ab7e85 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_sigmoid.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { + (void)ctx; + +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_floating_type(out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); +#endif + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sigmoid.out"; + + if ((in.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) { + const float* const in_data = in.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_sigmoid_f32_f32, out_data, in_data, out.numel()); + } else { + ET_KERNEL_CHECK( + ctx, in.scalar_type() != ScalarType::Bool, InvalidArgument, out); + + ScalarType compute_type = + executorch::runtime::isFloatingType(in.scalar_type()) + ? in.scalar_type() + : ScalarType::Float; + compute_type = + torch::executor::native::utils::get_compute_type(compute_type); + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name>( + [](const CTYPE_COMPUTE val_in) { + CTYPE_COMPUTE out_val = static_cast(1.0) / + (static_cast(1.0) + exp(-val_in)); + return out_val; + }, + ctx, + in, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp index 9158eecf133..249da9144a9 100644 --- a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp +++ b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp @@ -101,7 +101,8 @@ Tensor& slice_copy_Tensor_out( (out.scalar_type() == ScalarType::Char) || (out.scalar_type() == ScalarType::UInt32) || (out.scalar_type() == ScalarType::UInt16) || - (out.scalar_type() == ScalarType::Byte))) { + (out.scalar_type() == ScalarType::Byte) || + (out.scalar_type() == ScalarType::Float))) { XT_KERNEL_CHECK( ctx, out, @@ -132,4 +133,4 @@ Tensor& slice_copy_Tensor_out( } // namespace native } // namespace G3 } // namespace impl -} // namespace cadence \ No newline at end of file +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_sqrt.cpp b/backends/cadence/fusion_g3/operators/op_sqrt.cpp new file mode 100644 index 00000000000..c6a5a29fab8 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_sqrt.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& sqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { +#ifdef OP_ARG_CHECK + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + if ((in.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) { + float* const out_data = out.mutable_data_ptr(); + const float* const in_data = in.const_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_elm_sqrt_f32_f32, out_data, in_data, out.numel()); + + return out; + } else { + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(std::sqrt, ctx, in, out); + } +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_sub.cpp b/backends/cadence/fusion_g3/operators/op_sub.cpp index 9bafec5df91..9f4c2c3a5c3 100644 --- a/backends/cadence/fusion_g3/operators/op_sub.cpp +++ b/backends/cadence/fusion_g3/operators/op_sub.cpp @@ -87,22 +87,21 @@ Tensor& sub_out( } /*find broadcast*/ - for (int i = 0; i < out.dim(); i++) { + for (int i = 0; i < max_dim; i++) { if (((inp1_shape[i]) != (out_shape[i])) || ((inp2_shape[i]) != (out_shape[i]))) { broadcast = true; } } - if (((broadcast) && (max_dim > kTensorDimensionLimit)) || - (!(((a.scalar_type() == ScalarType::Int) || - (a.scalar_type() == ScalarType::Float)) && - (a.scalar_type() == b.scalar_type()) && - (a.scalar_type() == out.scalar_type())))) { + if ((broadcast) && (max_dim > kTensorDimensionLimit)) { optimized = false; } - if ((a.scalar_type() == ScalarType::Int) && (optimized)) { + if (((a.scalar_type() == ScalarType::Int) && + (b.scalar_type() == ScalarType::Int) && + (out.scalar_type() == ScalarType::Int)) && + (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -144,7 +143,11 @@ Tensor& sub_out( alpha_val, out.numel()); } - } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) { + } else if ( + ((a.scalar_type() == ScalarType::Float) && + (b.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) && + (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); @@ -186,6 +189,174 @@ Tensor& sub_out( alpha_val, out.numel()); } + } else if ( + ((a.scalar_type() == ScalarType::Int) && + (b.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) && + (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (alpha.isFloatingPoint()) { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + if (b.numel() == 1) { 
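+ // b has a single element: use the scalar kernel and broadcast b[0] (out = a - alpha * b[0]).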
+ XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32xf32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_32xf32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_32xf32xf32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else { + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32xf32x32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_32xf32x32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_32xf32x32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } + } else if ( + ((a.scalar_type() == ScalarType::Float) && + (b.scalar_type() == ScalarType::Int) && + (out.scalar_type() == ScalarType::Float)) && + (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (alpha.isFloatingPoint()) { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32x32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_f32x32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_f32x32xf32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else { + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32x32x32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_f32x32x32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_f32x32x32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } } else { // Common Dtype ScalarType common_type = @@ -254,19 +425,8 @@ Tensor& sub_scalar_out( // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "sub.Scalar_out"; - bool optimized = true; - - if (!(((a.scalar_type() == ScalarType::Int) || - (a.scalar_type() == ScalarType::Float)) && - (a.scalar_type() == out.scalar_type()))) { - optimized = false; - } - - if ((b.isFloatingPoint()) && (a.scalar_type() == ScalarType::Int)) { - optimized = false; - } - - if ((a.scalar_type() == ScalarType::Int) && (optimized)) { + if ((a.scalar_type() == ScalarType::Int) && (b.isIntegral(false)) && + (out.scalar_type() == ScalarType::Int)) { const int* const inp1_data = a.const_data_ptr(); int inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); @@ -285,7 +445,9 @@ Tensor& sub_scalar_out( inp2_val, 
alpha_val, out.numel()); - } else if ((a.scalar_type() == ScalarType::Float) && (optimized)) { + } else if ( + (a.scalar_type() == ScalarType::Float) && (b.isFloatingPoint()) && + (out.scalar_type() == ScalarType::Float)) { const float* const inp1_data = a.const_data_ptr(); float inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); @@ -304,6 +466,76 @@ Tensor& sub_scalar_out( inp2_val, alpha_val, out.numel()); + } else if ( + (a.scalar_type() == ScalarType::Int) && (b.isFloatingPoint()) && + (out.scalar_type() == ScalarType::Float)) { + const int* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + float* const out_data = out.mutable_data_ptr(); + + if (alpha.isFloatingPoint()) { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32xf32xf32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32xf32x32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } + } else if ( + (a.scalar_type() == ScalarType::Float) && (b.isIntegral(false)) && + (out.scalar_type() == ScalarType::Float)) { + const float* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + float* const out_data = out.mutable_data_ptr(); + + if (alpha.isFloatingPoint()) { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32x32xf32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else { + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32x32x32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } } else { // Common Dtype ScalarType common_type = diff --git a/backends/cadence/fusion_g3/operators/op_tanh.cpp b/backends/cadence/fusion_g3/operators/op_tanh.cpp new file mode 100644 index 00000000000..05f39f1361e --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_tanh.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& tanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { +#ifdef OP_ARG_CHECK + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + if ((in.scalar_type() == ScalarType::Float) && + (out.scalar_type() == ScalarType::Float)) { + float* const out_data = out.mutable_data_ptr(); + const float* const in_data = in.const_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_tanh_f32_f32, out_data, in_data, out.numel()); + + return out; + } else { + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(std::tanh, ctx, in, out); + } +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp b/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp new file mode 100644 index 00000000000..734fdcb2cd8 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::SizesType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +/** + * Swaps dimension 'dim0' of 'a' with 'dim1', copying + * that mutation into `out` in a manner such that the data is densely packed + * and is_contiguous() would return true (stride dim[size-1] = 1). + * + * transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!)
out) + */ +Tensor& transpose_copy_int_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim0, + int64_t dim1, + Tensor& out) { + (void)ctx; + int kTensorDimensionLimit = 5; + + if (dim0 < 0) { + dim0 += executorch::runtime::nonzero_dim(in); + } + if (dim1 < 0) { + dim1 += executorch::runtime::nonzero_dim(in); + } + +#ifdef OP_ARG_CHECK + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + torch::executor::get_transpose_out_target_size( + in, dim0, dim1, expected_out_size, &expected_out_dim); + + // Resize for dynamic shape + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor( + out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + int inp_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + int permute_vec[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for (int i = 0; i < in.dim(); i++) { + inp_shape[i] = in.size(i); + } + for (int i = 0; i < out.dim(); i++) { + out_shape[i] = out.size(i); + } + + for (int i = 0; i < in.dim(); i++) { + permute_vec[i] = i; + } + + permute_vec[dim0] = dim1; + permute_vec[dim1] = dim0; + + signed char* const out_data = out.mutable_data_ptr(); + const signed char* const inp_data = in.const_data_ptr(); + + if ((in.scalar_type() == out.scalar_type()) && + ((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) && + (in.dim() <= kTensorDimensionLimit)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_permute, + out_data, + out_shape, + inp_data, + inp_shape, + permute_vec, + in.dim(), + get_element_size(out.scalar_type())); + } else { + ET_KERNEL_CHECK( + ctx, + torch::executor::check_transpose_copy_args(in, dim0, dim1, out), + InvalidArgument, + out); + + ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { + torch::executor::transpose_tensors(in, dim0, dim1, out); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_where.cpp b/backends/cadence/fusion_g3/operators/op_where.cpp new file mode 100644 index 00000000000..54966c4574b --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_where.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& where_self_out( + KernelRuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(cond, a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, cond, out) == + Error::Ok, + InvalidArgument, + out); +#endif + + static constexpr const char op_name[] = "where.self_out"; + + int kTensorDimensionLimit = 5; + + int cond_shape[kTensorDimensionLimit]; + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = false; + + int max1_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + int max2_dim = cond.dim() > out.dim() ? cond.dim() : out.dim(); + int max_dim = max1_dim > max2_dim ? max1_dim : max2_dim; + + bool optimized = true; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + cond_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_cond = max_dim - cond.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < cond.dim(); i++) { + cond_shape[i + offset_cond] = cond.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < max_dim; i++) { + if (((cond_shape[i]) != (out_shape[i])) || + ((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = true; + } + } + + if (((broadcast) && (max_dim > kTensorDimensionLimit)) || + (!((a.scalar_type() == ScalarType::Float) && + (b.scalar_type() == ScalarType::Float) && + (cond.scalar_type() == ScalarType::Bool) && + (out.scalar_type() == ScalarType::Float)))) { + optimized = false; + } + + if (optimized) { + const unsigned char* const cond_data = cond.const_data_ptr(); + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_where_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + cond_data, + cond_shape, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_where_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + cond_data, + out.numel()); + } + } else { + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, common_type == out.scalar_type(), InvalidArgument, out); + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name>( + [](const CTYPE_COMPUTE val_a, + const 
CTYPE_COMPUTE val_b, + const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + cond, + torch::executor::native::utils::SupportedTensorDtypes::BOOL_OR_BYTE, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/fusion_g3/operators/operators.h b/backends/cadence/fusion_g3/operators/operators.h index e1c0d08f44a..52330f39ab3 100644 --- a/backends/cadence/fusion_g3/operators/operators.h +++ b/backends/cadence/fusion_g3/operators/operators.h @@ -184,6 +184,66 @@ ::executorch::aten::Tensor& sub_scalar_out( const ::executorch::aten::Scalar& alpha, ::executorch::aten::Tensor& out); +::executorch::aten::Tensor& sigmoid_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& sqrt_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& rsqrt_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& tanh_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& lt_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& lt_tensor_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& where_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& cond, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& clamp_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + const ::executorch::aten::optional<::executorch::aten::Scalar>& min_opt, + const ::executorch::aten::optional<::executorch::aten::Scalar>& max_opt, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& clamp_tensor_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + const ::executorch::aten::optional<::executorch::aten::Tensor>& min_opt, + const ::executorch::aten::optional<::executorch::aten::Tensor>& max_opt, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& transpose_copy_int_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim0, + int64_t dim1, + ::executorch::aten::Tensor& out); + } // namespace native } // namespace G3 } // namespace impl diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl index e1e7c9a8491..fffeee0d7b3 100644 --- a/backends/cadence/fusion_g3/operators/targets.bzl +++ b/backends/cadence/fusion_g3/operators/targets.bzl @@ -35,6 +35,14 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: 
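# Each entry below becomes an op_<name>.cpp target via define_operator().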
OPERATORS = [ "add", "cat", + "clamp", + "lt", + "rsqrt", + "sigmoid", + "sqrt", + "tanh", + "transpose_copy", + "where", "dequantize", "mul", "native_layer_norm", diff --git a/backends/cadence/fusion_g3/operators/xt_utils.h b/backends/cadence/fusion_g3/operators/xt_utils.h index 443d68d0609..14b9b6f4981 100644 --- a/backends/cadence/fusion_g3/operators/xt_utils.h +++ b/backends/cadence/fusion_g3/operators/xt_utils.h @@ -19,6 +19,8 @@ inline int get_element_size(ScalarType dtype) { return sizeof(short); } else if ((dtype == ScalarType::Char) || (dtype == ScalarType::Byte)) { return sizeof(char); + } else if (dtype == ScalarType::Float) { + return sizeof(float); } return 0; } diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9bbd386c75c..270835dbb74 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -25,7 +25,8 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c ) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( cadence_kernels diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index d6820c0700d..b5aacf3b0f7 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -11,8 +11,8 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -67,7 +67,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} @@ -78,6 +79,7 @@ target_include_directories( add_library( custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp" "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" + "op_quantized_conv_out.cpp" "op_quantized_fully_connected_out.cpp" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/..
${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp index 3a590ea0711..c8feea37f7d 100644 --- a/backends/cadence/hifi/operators/op_add.cpp +++ b/backends/cadence/hifi/operators/op_add.cpp @@ -138,8 +138,21 @@ Tensor& add_out( if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) optimized = 0; - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; + bool float_types = + (a_type == ScalarType::Float) && (b_type == ScalarType::Float); + + if ((a_dim == 0) && float_types) { + for (int i = 0; i < b.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[0] + b.const_data_ptr()[i]; + return out; + } + if ((b_dim == 0) && float_types) { + for (int i = 0; i < a.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[i] + b.const_data_ptr()[0]; + return out; + } if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) optimized = 0; diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 05c8659cbcb..785e6f015d5 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -328,7 +328,7 @@ Tensor& clamp_tensor_out( const executorch::aten::optional& min_opt, const executorch::aten::optional& max_opt, Tensor& out) { - clamp_Tensor_out(ctx, in, min_opt, max_opt, out); + return clamp_Tensor_out(ctx, in, min_opt, max_opt, out); } } // namespace native diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp index 816422858b1..da3f4ac2d07 100644 --- a/backends/cadence/hifi/operators/op_div.cpp +++ b/backends/cadence/hifi/operators/op_div.cpp @@ -86,8 +86,21 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) optimized = 0; - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; + bool float_types = + (a_type == ScalarType::Float) && (b_type == ScalarType::Float); + + if ((a_dim == 0) && float_types) { + for (int i = 0; i < b.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[0] / b.const_data_ptr()[i]; + return out; + } + if ((b_dim == 0) && float_types) { + for (int i = 0; i < a.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[i] / b.const_data_ptr()[0]; + return out; + } if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) optimized = 0; @@ -201,8 +214,21 @@ Tensor& div_out_mode( if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) optimized = 0; - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; + bool float_types = + (a_type == ScalarType::Float) && (b_type == ScalarType::Float); + + if ((a_dim == 0) && float_types) { + for (int i = 0; i < b.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[0] / b.const_data_ptr()[i]; + return out; + } + if ((b_dim == 0) && float_types) { + for (int i = 0; i < a.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[i] / b.const_data_ptr()[0]; + return out; + } if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) optimized = 0; diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index 59cf8581583..4b93e55047b 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -175,7 +175,7 @@ Tensor& mean_dim_out( bool keepdim, optional dtype, Tensor& out) { - mean_out(ctx, in, dim_list, keepdim, dtype, out); + return mean_out(ctx, in, dim_list, keepdim, dtype, out); } } // namespace native 
diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp index b8c3ab7c02b..6eb79545be7 100644 --- a/backends/cadence/hifi/operators/op_mul.cpp +++ b/backends/cadence/hifi/operators/op_mul.cpp @@ -104,10 +104,23 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); max_dim = out.dim() > max_dim ? out.dim() : max_dim; - if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) - optimized = 0; + bool float_types = + (a_type == ScalarType::Float) && (b_type == ScalarType::Float); + + if ((a_dim == 0) && float_types) { + for (int i = 0; i < b.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[0] * b.const_data_ptr()[i]; + return out; + } + if ((b_dim == 0) && float_types) { + for (int i = 0; i < a.numel(); i++) + out.mutable_data_ptr()[i] = + a.const_data_ptr()[i] * b.const_data_ptr()[0]; + return out; + } - if ((a_dim == 0) || (b_dim == 0)) + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) optimized = 0; if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) diff --git a/backends/cadence/hifi/operators/op_quantized_conv_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_out.cpp new file mode 100644 index 00000000000..a24bad5f9a5 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv_out.cpp @@ -0,0 +1,1117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. +// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } +} + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nhwc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t h, + int32_t w, + int32_t c, + int32_t oc, + int32_t wh, + int32_t ww, + int32_t wc, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * h * w * c; + OT* out_batch = p_out + _n * oh * ow * oc; + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + OT* out_line = out_batch + (_oh * ow + _ow) * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * wh * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size h x w x icpg, with a stencil of size wh x ww x icpg, to + // compute an output channel of size oh x ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = + in_batch + (_h + _wh) * w * c + (_w + _ww) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ?
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + const IT* in_line = in_batch + + (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } + } +} + +void xa_opt_quantized_conv_nhwc( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + constexpr int kNnlibMaxDim = 4; + + if (input.scalar_type() == ScalarType::Char) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = dilation[1]; + WORD32 dilation_height = dilation[0]; + + // WORD32* kernel_bias_ptr = + // (WORD32*)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD32* ptr_scratch; + + WORD32 scratch_size = 0; + + if (groups == 1) { + WORD32 out_data_format = 1; + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + + ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = + p_inp + _n * input_channels * input_height * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_height * out_width; + + xa_nn_conv2d_per_chan_sym8sxasym8s( + out_batch, + in_batch, + p_kernel, + p_bias, + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + out_channels, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + input_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } + return; + } + + if (groups == input_channels) { + WORD32 channels_multiplier = out_channels / input_channels; + + scratch_size = xa_nn_conv2d_depthwise_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + channels_multiplier, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + inp_precision, + 0); // NHWC + + scratch_size = scratch_size < 0 ? 0 : scratch_size; + + ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batches * out_channels * out_height * out_width) + 8) * + sizeof(WORD8)); + + WORD8* p_out_temp = (WORD8*)ALIGN_PTR(ptr1, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = + p_inp + _n * input_channels * input_height * input_width; + WORD8* out_batch = + p_out_temp + _n * out_channels * out_height * out_width; + + xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s( + out_batch, + p_kernel, + in_batch, + p_bias, + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + channels_multiplier, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + input_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + 0, // NHWC + 0, // NHWC + p_scratch); + } + + return; + } + } +} + +void xa_opt_quantized_conv_nchw( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + constexpr int kNnlibMaxDim = 4; + + if (input.scalar_type() == ScalarType::Char) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 input_height = conv1d ? 1 : input.size(2); + WORD32 input_width = conv1d ? input.size(2) : input.size(3); + WORD32 input_channels = input.size(1); + WORD32 kernel_height = conv1d ? 1 : weight.size(2); + WORD32 kernel_width = conv1d ? weight.size(2) : weight.size(3); + WORD32 kernel_channels = weight.size(1); + WORD32 out_channels = weight.size(0); + WORD32 out_height = conv1d ? 1 : out.size(2); + WORD32 out_width = conv1d ? 
out.size(2) : out.size(3); + WORD32 batches = input.size(0); + + WORD32 x_stride = stride[1]; + WORD32 y_stride = stride[0]; + WORD32 x_padding = padding[1]; + WORD32 y_padding = padding[0]; + WORD32 dilation_width = dilation[1]; + WORD32 dilation_height = dilation[0]; + + // WORD32* kernel_bias_ptr = + // (WORD32*)weight_zero_point.const_data_ptr(); + + WORD32 input_zero_bias = -in_zero_point; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 inp_precision = 8; + WORD32 kernel_precision = 8; + pVOID p_scratch = nullptr; + WORD32* ptr_scratch; + + WORD32 scratch_size = 0; + + if (groups == 1) { + WORD32 out_data_format = 1; + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batches * input_channels * input_height * input_width) + 8) * + sizeof(WORD8)); + + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_height * kernel_width) + + 8) * + sizeof(WORD8)); + + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_height; + p_inp_shape[3] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = input_height; + p_out_shape[2] = input_width; + p_out_shape[3] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 3, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, // input dimensions + kNnlibMaxDim); // output dimensions + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_height; + p_inp_shape1[3] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_height; + p_out_shape1[2] = kernel_width; + p_out_shape1[3] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, // input dimensions + kNnlibMaxDim); // output dimensions + + scratch_size = xa_nn_conv2d_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + y_stride, + y_padding, + x_stride, + x_padding, + out_height, + out_width, + out_channels, + inp_precision, + kernel_precision, + out_data_format); + + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + + ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = + pin + _n * input_channels * input_height * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_height * out_width; + + xa_nn_conv2d_per_chan_sym8sxasym8s( + out_batch, + in_batch, + pkernel, + p_bias, + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + kernel_channels, + dilation_height, + dilation_width, + out_channels, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + input_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } + return; + } + + if (groups == input_channels) { + WORD32 channels_multiplier = out_channels / input_channels; + + scratch_size = xa_nn_conv2d_depthwise_getsize( + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + channels_multiplier, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + inp_precision, + 1); // NCHW + + scratch_size = scratch_size < 0 ? 0 : scratch_size; + + ptr_scratch = (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + + p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batches * out_channels * out_height * out_width) + 8) * + sizeof(WORD8)); + + WORD8* p_out_temp = (WORD8*)ALIGN_PTR(ptr1, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = + p_inp + _n * input_channels * input_height * input_width; + WORD8* out_batch = + p_out_temp + _n * out_channels * out_height * out_width; + + xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s( + out_batch, + p_kernel, + in_batch, + p_bias, + input_height, + input_width, + input_channels, + kernel_height, + kernel_width, + channels_multiplier, + x_stride, + y_stride, + x_padding, + y_padding, + out_height, + out_width, + input_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + 1, // NCHW + 0, // NHWC + p_scratch); + } + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = out_height; + p_inp_shape[2] = out_width; + p_inp_shape[3] = out_channels; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = out_channels; + p_out_shape[2] = out_height; + p_out_shape[3] = out_width; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 3, 1, 2}; + + xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_out_temp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, // input dimensions + kNnlibMaxDim); // output dimensions + + return; + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. +void quantized_conv_nchw( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? 
input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? out.size(2) : out.size(3); + +#define typed_quantized_conv2d_nchw(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nchw_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + h, \ + w, \ + oc, \ + wc, \ + wh, \ + ww, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nchw +} + +void quantized_conv_nhwc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, h, w, c] + const int n = input.size(0); + const int h = conv1d ? 1 : input.size(1); + const int w = conv1d ? input.size(1) : input.size(2); + const int c = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wh, ww, wc] + const int oc = weight.size(0); + const int wh = conv1d ? 1 : weight.size(1); + const int ww = conv1d ? weight.size(1) : weight.size(2); + const int wc = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oh, ow, oc] + const int oh = conv1d ? 1 : out.size(1); + const int ow = conv1d ? 
out.size(1) : out.size(2); + +#define typed_quantized_conv2d_nhwc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nhwc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + h, \ + w, \ + c, \ + oc, \ + wh, \ + ww, \ + wc, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nhwc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nhwc +} + +void quantized_conv_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + bool channel_last, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + weight_zero_point.const_data_ptr()[0]; + + bool optimized = 0; + + if ((input.scalar_type() == ScalarType::Char) || + (input.scalar_type() == ScalarType::Byte)) + optimized = 1; + + if ((dilation[0] != 1) || (dilation[1] != 1)) + optimized = 0; + + if (channel_last) { + if (optimized) { + xa_opt_quantized_conv_nhwc( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } + } else { + if (optimized) { + xa_opt_quantized_conv_nchw( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } + } +} + +void quantized_conv_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + bool channel_last, + Tensor& out) { + bool optimized = 0; + + if ((input.scalar_type() == ScalarType::Char) || + (input.scalar_type() == ScalarType::Byte)) + optimized = 1; + + if ((dilation[0] != 1) || (dilation[1] != 1)) + optimized = 0; + + if (channel_last) { + if (optimized) { + xa_opt_quantized_conv_nhwc( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + 
quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } + } else { + if (optimized) { + xa_opt_quantized_conv_nchw( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_relu_out.cpp b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp index b8baa946b98..9b65751da71 100644 --- a/backends/cadence/hifi/operators/op_quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp @@ -100,10 +100,10 @@ void quantized_relu_per_tensor_out( void quantized_relu_out( KernelRuntimeContext& ctx, const Tensor& input, - const int64_t in_zero_point, + const Tensor& in_zero_point, const int64_t out_zero_point, - const int64_t out_multiplier, - const int64_t out_shift, + const Tensor& out_multiplier, + const Tensor& out_shift, Tensor& output) { quantized_relu_per_tensor_out( ctx, diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp index 852479ed935..25d3ad7d389 100644 --- a/backends/cadence/hifi/operators/op_softmax.cpp +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -200,7 +200,7 @@ Tensor& softmax_out( int64_t dim, bool half_to_float, Tensor& out) { - _softmax_out(ctx, in, dim, half_to_float, out); + return _softmax_out(ctx, in, dim, half_to_float, out); } } // namespace native diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp index 02c8c60eac5..d1035c2fb1d 100644 --- a/backends/cadence/hifi/operators/op_sub.cpp +++ b/backends/cadence/hifi/operators/op_sub.cpp @@ -133,8 +133,21 @@ Tensor& sub_out( if ((out_type != ScalarType::Float) || (alpha_val != 1.0)) optimized = 0; - if ((a_dim == 0) || (b_dim == 0)) - optimized = 0; + bool float_types = + (a_type == ScalarType::Float) && (b_type == ScalarType::Float); + + if ((a_dim == 0) && float_types) { + for (int i = 0; i < b.numel(); i++) + out.mutable_data_ptr<float>()[i] = + a.const_data_ptr<float>()[0] - b.const_data_ptr<float>()[i]; + return out; + } + if ((b_dim == 0) && float_types) { + for (int i = 0; i < a.numel(); i++) + out.mutable_data_ptr<float>()[i] = + a.const_data_ptr<float>()[i] - b.const_data_ptr<float>()[0]; + return out; + } if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) optimized = 0; diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index ac7559691ae..94c1684fe09 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -183,6 +183,15 @@ Tensor& where_self_out( return out; } +Tensor& where_out( + RuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { + return where_self_out(ctx, cond, a, b, out); +} + } // namespace native } // namespace HiFi } // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index c94b9a3b19e..61a526b4073 100644 ---
b/backends/cadence/hifi/operators/targets.bzl @@ -47,6 +47,7 @@ OPERATORS = [ "pow", "quantized_fully_connected_out", "quantize_per_tensor", + "quantized_conv_out", "quantized_layer_norm", "quantized_linear_out", "quantized_relu_out", diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c index 244f404d2ea..3b73e30db42 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -1,3 +1,26 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + + #include "xa_type_def.h" #include "xa_nn_common.h" #include "xa_nnlib_kernels_api.h" diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c index 65ff9f735ad..2f1d2071777 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c @@ -1,27 +1,24 @@ -/* ------------------------------------------------------------------------ */ -/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ -/* These coded instructions, statements, and computer programs ("Cadence */ -/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ -/* Cadence IP is licensed for use with Cadence processor cores only and */ -/* must not be used for any other processors and platforms. Your use of the */ -/* Cadence Libraries is subject to the terms of the license agreement you */ -/* have entered into with Cadence Design Systems, or a sublicense granted */ -/* to you by a direct Cadence licensee. */ -/* ------------------------------------------------------------------------ */ -/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ -/* */ -/* DSP Library */ -/* */ -/* This library contains copyrighted materials, trade secrets and other */ -/* proprietary information of IntegrIT, Ltd. This software is licensed for */ -/* use with Cadence processor cores only and must not be used for any other */ -/* processors and platforms. The license to use these sources was given to */ -/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ -/* between Cadence, Inc. and IntegrIT, Ltd. */ -/* ------------------------------------------------------------------------ */ -/* Copyright (C) 2015-2018 IntegrIT, Limited. */ -/* All Rights Reserved. 
*/ -/* ------------------------------------------------------------------------ */ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ #include #include "../include/NatureDSP_Signal_math.h" diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c index 03b8d625186..16fc23f59de 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c @@ -1,3 +1,25 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + #include "xa_type_def.h" #include "xa_nnlib_common_fpu.h" #include "xa_nn_common.h" diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c index 17c619d150e..722a41b6040 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -1,3 +1,25 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + #include "xa_type_def.h" #include "xa_nnlib_common_fpu.h" #include "xa_nn_common.h" diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c index 4dcec52f973..aa81d695784 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -1,33 +1,24 @@ -/* ------------------------------------------------------------------------ */ -/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ -/* These coded instructions, statements, and computer programs ("Cadence */ -/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ -/* Cadence IP is licensed for use with Cadence processor cores only and */ -/* must not be used for any other processors and platforms. Your use of the */ -/* Cadence Libraries is subject to the terms of the license agreement you */ -/* have entered into with Cadence Design Systems, or a sublicense granted */ -/* to you by a direct Cadence licensee. */ -/* ------------------------------------------------------------------------ */ -/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ -/* */ -/* DSP Library */ -/* */ -/* This library contains copyrighted materials, trade secrets and other */ -/* proprietary information of IntegrIT, Ltd. This software is licensed for */ -/* use with Cadence processor cores only and must not be used for any other */ -/* processors and platforms. The license to use these sources was given to */ -/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ -/* between Cadence, Inc. and IntegrIT, Ltd. */ -/* ------------------------------------------------------------------------ */ -/* Copyright (C) 2015-2018 IntegrIT, Limited. */ -/* All Rights Reserved. */ -/* ------------------------------------------------------------------------ */ -/* - NatureDSP Signal Processing Library. Vector mathematics - Vector operations - code optimized for HiFi4 core - IntegrIT, 2006-2018 -*/ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ #include "../include/NatureDSP_Signal_math.h" #include "NatureDSP_types.h" diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index 07394cbe834..3fe0fe2101f 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -8,7 +8,8 @@ add_library(cadence_kernels kernels.cpp) # Let files say "include <executorch/path/to/header.h>". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories(cadence_kernels PUBLIC . ${_common_include_directories} diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp index faac3d7cb27..0961b1ac658 100644 --- a/backends/cadence/reference/kernels/kernels.cpp +++ b/backends/cadence/reference/kernels/kernels.cpp @@ -58,6 +58,36 @@ void dequantize( } } +// Requantize the int8_t/uint8_t in value to a uint8_t/int8_t out value. +// The scale and zero_point for requantization are in the args. +template <typename IT, typename OT> +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point) { + float dequant = dequantize(in, in_scale, in_zero_point); + return quantize<OT>(dequant, inv_out_scale, out_zero_point); +} + +// Requantize the int8_t/uint8_t in array to a uint8_t/int8_t out array. +// The scale and zero_point for requantization are in the args.
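For intuition, here is a minimal standalone sketch of the dequantize-then-quantize composition the scalar requantize above performs. All names are illustrative, not part of the kernels API, and it assumes round-to-nearest with saturation; the real kernels::quantize/dequantize helpers may differ in rounding and clamping details. The vectorized variant, which the diff adds next, simply applies this per element.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Map a quantized value back to a real number: real = scale * (q - zero_point).
static float dequantize_sketch(int8_t q, float scale, int32_t zero_point) {
  return scale * (static_cast<float>(q) - static_cast<float>(zero_point));
}

// Map a real number to uint8_t using the *inverse* output scale.
static uint8_t quantize_sketch(float x, float inv_scale, int32_t zero_point) {
  float q = std::round(x * inv_scale) + static_cast<float>(zero_point);
  return static_cast<uint8_t>(std::min(std::max(q, 0.0f), 255.0f)); // saturate
}

// requantize = dequantize with the input params, then quantize with the
// output params.
static uint8_t requantize_sketch(
    int8_t in,
    float in_scale,
    int32_t in_zero_point,
    float inv_out_scale,
    int32_t out_zero_point) {
  return quantize_sketch(
      dequantize_sketch(in, in_scale, in_zero_point),
      inv_out_scale,
      out_zero_point);
}

Taking 1/out_scale rather than out_scale matches the inv_out_scale argument of the kernel above and turns a per-element division into a multiplication.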
+template <typename IT, typename OT> +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + out[i] = requantize<IT, OT>( + in[i], in_scale, in_zero_point, inv_out_scale, out_zero_point); + } +} + // explicit template instantiation #define typed_quantize_val(dtype) \ @@ -106,6 +136,58 @@ typed_dequantize_vec(uint16_t); typed_dequantize_vec(int32_t); #undef typed_dequantize_vec +#define typed_requantize_val(itype, otype) \ + template otype requantize<itype, otype>( \ + const itype in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point); +typed_requantize_val(int8_t, int8_t); +typed_requantize_val(int8_t, uint8_t); +typed_requantize_val(int8_t, int16_t); +typed_requantize_val(int8_t, uint16_t); +typed_requantize_val(uint8_t, int8_t); +typed_requantize_val(uint8_t, uint8_t); +typed_requantize_val(uint8_t, int16_t); +typed_requantize_val(uint8_t, uint16_t); +typed_requantize_val(int16_t, int8_t); +typed_requantize_val(int16_t, uint8_t); +typed_requantize_val(int16_t, int16_t); +typed_requantize_val(int16_t, uint16_t); +typed_requantize_val(uint16_t, int8_t); +typed_requantize_val(uint16_t, uint8_t); +typed_requantize_val(uint16_t, int16_t); +typed_requantize_val(uint16_t, uint16_t); +#undef typed_requantize_val + +#define typed_requantize_vec(itype, otype) \ + template void requantize( \ + otype* __restrict__ out, \ + const itype* __restrict__ in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point, \ + size_t size); +typed_requantize_vec(int8_t, int8_t); +typed_requantize_vec(int8_t, uint8_t); +typed_requantize_vec(int8_t, int16_t); +typed_requantize_vec(int8_t, uint16_t); +typed_requantize_vec(uint8_t, int8_t); +typed_requantize_vec(uint8_t, uint8_t); +typed_requantize_vec(uint8_t, int16_t); +typed_requantize_vec(uint8_t, uint16_t); +typed_requantize_vec(int16_t, int8_t); +typed_requantize_vec(int16_t, uint8_t); +typed_requantize_vec(int16_t, int16_t); +typed_requantize_vec(int16_t, uint16_t); +typed_requantize_vec(uint16_t, int8_t); +typed_requantize_vec(uint16_t, uint8_t); +typed_requantize_vec(uint16_t, int16_t); +typed_requantize_vec(uint16_t, uint16_t); +#undef typed_requantize_vec + }; // namespace kernels }; // namespace reference }; // namespace impl diff --git a/backends/cadence/reference/kernels/kernels.h b/backends/cadence/reference/kernels/kernels.h index 76400405144..de6ae9486f5 100644 --- a/backends/cadence/reference/kernels/kernels.h +++ b/backends/cadence/reference/kernels/kernels.h @@ -36,6 +36,24 @@ void dequantize( int32_t zero_point, size_t size); +template <typename IT, typename OT> +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point); + +template <typename IT, typename OT> +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size); + }; // namespace kernels }; // namespace reference }; // namespace impl diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index a2d51af2c0c..e0a10c6fa36 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -11,8 +11,8 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -71,7 +71,8 @@ target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +set(_common_include_directories ${EXECUTORCH_ROOT}/.. +${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) target_include_directories( aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} @@ -86,8 +87,10 @@ add_library( "quantized_relu_out.cpp" "quantized_layer_norm.cpp" "quantize_per_tensor.cpp" + "quantized_fully_connected_out.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" + "requantize_out.cpp" "im2row_out.cpp" ) target_include_directories( diff --git a/backends/cadence/reference/operators/quantized_fully_connected_out.cpp b/backends/cadence/reference/operators/quantized_fully_connected_out.cpp new file mode 100644 index 00000000000..77b7dd94e9d --- /dev/null +++ b/backends/cadence/reference/operators/quantized_fully_connected_out.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include + +namespace impl { +namespace reference { +namespace native { + +using ::executorch::aten::optional; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantized_fully_connected_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point_t, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +void quantized_fully_connected_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + int64_t weight_zero_point, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_per_tensor_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +}; // namespace native +}; // namespace reference +}; // namespace impl diff --git 
a/backends/cadence/reference/operators/quantized_relu_out.cpp b/backends/cadence/reference/operators/quantized_relu_out.cpp index 19b971405c9..7a385849aee 100644 --- a/backends/cadence/reference/operators/quantized_relu_out.cpp +++ b/backends/cadence/reference/operators/quantized_relu_out.cpp @@ -7,6 +7,7 @@ */ #include +#include #include namespace impl { @@ -75,6 +76,59 @@ void quantized_relu_out( } } +template <typename T> +void quantized_relu_per_tensor_out_( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& output) { + const T* __restrict__ in = input.const_data_ptr<T>(); + T* __restrict__ out = output.mutable_data_ptr<T>(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; + out[i] = kernels::quantize<T>(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& output) { +#define typed_quantized_relu(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_relu_per_tensor_out_<ctype>( \ + ctx, \ + input, \ + in_zero_point, \ + out_zero_point, \ + out_multiplier, \ + out_shift, \ + output); \ + break; \ + } + + executorch::aten::ScalarType dtype = input.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu) + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_relu +} + }; // namespace native }; // namespace reference }; // namespace impl diff --git a/backends/cadence/reference/operators/requantize_out.cpp b/backends/cadence/reference/operators/requantize_out.cpp new file mode 100644 index 00000000000..10e5a588785 --- /dev/null +++ b/backends/cadence/reference/operators/requantize_out.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace reference { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. +// The scale and zero_point for requantization are in the args.
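An aside before the requantize_out entry point: the (out_multiplier, out_shift) pair consumed by quantized_relu_per_tensor_out_ above encodes a float scale as a Q31 fixed-point multiplier plus a power-of-two shift, scale = multiplier / 2^31 * 2^shift. A hedged sketch of that decoding follows; the kernel above additionally negates the multiplier, and the name here is illustrative only. Using 1LL sidesteps the signed overflow that a plain int 1 << 31 would risk.

#include <cmath>
#include <cstdint>

// Decode a Q31 fixed-point multiplier and a power-of-two shift into a float
// scale. Example: multiplier = 1 << 30 (0.5 in Q31) with shift = 1 gives
// 0.5 * 2 = 1.0.
static float scale_from_q31(int64_t multiplier, int64_t shift) {
  return static_cast<float>(multiplier) /
      static_cast<float>(1LL << 31) *
      std::pow(2.0f, static_cast<float>(shift));
}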
+Tensor& requantize_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale_t, + const Tensor& in_zero_point_t, + const Tensor& out_scale_t, + const Tensor& out_zero_point_t, + const ScalarType out_dtype, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + in_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "In scale is not a float: %s", + torch::executor::toString(in_scale_t.scalar_type())); + float in_scale = in_scale_t.const_data_ptr<float>()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + in_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "In zero point is not an int: %s", + torch::executor::toString(in_zero_point_t.scalar_type())); + int32_t in_zero_point = in_zero_point_t.const_data_ptr<int32_t>()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "Out scale is not a float: %s", + torch::executor::toString(out_scale_t.scalar_type())); + float out_scale = out_scale_t.const_data_ptr<float>()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "Out zero point is not an int: %s", + torch::executor::toString(out_zero_point_t.scalar_type())); + int32_t out_zero_point = out_zero_point_t.const_data_ptr<int32_t>()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == out_dtype, + InvalidArgument, + out, + "Out tensor dtype (%s) does not match the passed in out dtype (%s)", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + + const size_t numel = out.numel(); + ScalarType in_dtype = input.scalar_type(); + + // Assert that the output tensor's dtype is same as out_dtype. + ET_KERNEL_CHECK_MSG( + ctx, + out_dtype == out.scalar_type(), + InvalidArgument, + out, + "Out dtype %s does not match requant dtype %s", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + +#define typed_requantize(ctype, dtype) \ + const ctype* input_data = input.const_data_ptr<ctype>(); \ + dtype* out_data = out.mutable_data_ptr<dtype>(); \ + kernels::requantize( \ + out_data, \ + input_data, \ + in_scale, \ + in_zero_point, \ + 1.0 / out_scale, \ + out_zero_point, \ + numel); + +#define typed_requantize_in(ctype) \ + switch (out_dtype) { \ + case ScalarType::Byte: { \ + typed_requantize(ctype, uint8_t); \ + break; \ + } \ + case ScalarType::Char: { \ + typed_requantize(ctype, int8_t); \ + break; \ + } \ + case ScalarType::UInt16: { \ + typed_requantize(ctype, uint16_t); \ + break; \ + } \ + case ScalarType::Short: { \ + typed_requantize(ctype, int16_t); \ + break; \ + } \ + default: \ + ET_KERNEL_CHECK_MSG( \ + ctx, \ + false, \ + InvalidArgument, \ + out, \ + "Unhandled output dtype %s", \ + torch::executor::toString(out_dtype)); \ + } + + switch (in_dtype) { + case ScalarType::Byte: { + typed_requantize_in(uint8_t); + break; + } + case ScalarType::Char: { + typed_requantize_in(int8_t); + break; + } + case ScalarType::UInt16: { + typed_requantize_in(uint16_t); + break; + } + case ScalarType::Short: { + typed_requantize_in(int16_t); + break; + } + default: + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + out, + "Unhandled input dtype %s", + torch::executor::toString(in_dtype)); + } +#undef typed_requantize_in +#undef typed_requantize + return out; +} + +}; // namespace native +}; // namespace reference +}; // namespace impl diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 2fa0787a08e..9b50b469627 100644 ---
a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -18,12 +18,33 @@ # seed to generate identical cases every run to reproduce from bisect random_manager.seed(1729) +MAX_CASES = 50 def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> None: + additional_tensor_constraints = [ + cp.Dtype.In(lambda deps: [torch.int, torch.float]), + cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**9), + ] + match op_name: + case "where.self": + additional_tensor_constraints = [ + cp.Dtype.In(lambda deps: [torch.float, torch.int, torch.bool]), + cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**9), + ] case "sigmoid.default" | "rsqrt.default": - tensor_constraints.extend( + additional_tensor_constraints.extend( [ cp.Dtype.In(lambda deps: [torch.float]), cp.Rank.Le(lambda deps: 2**2), @@ -32,37 +53,35 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N ] ) case "mean.dim": - tensor_constraints.extend( + additional_tensor_constraints.extend( [ cp.Dtype.In(lambda deps: [torch.float]), cp.Rank.Le(lambda deps: 2**2), ] ) case "exp.default": - tensor_constraints.extend( + additional_tensor_constraints.extend( [ cp.Rank.Le(lambda deps: 2**3), cp.Value.Ge(lambda deps, dtype, struct: -(2**2)), cp.Value.Le(lambda deps, dtype, struct: 2**2), ] ) + case "slice_copy.Tensor": + additional_tensor_constraints.extend( + [ + cp.Rank.Le(lambda deps: 2), + cp.Value.Ge(lambda deps, dtype, struct: 1), + cp.Value.Le(lambda deps, dtype, struct: 2), + ] + ) case _: - tensor_constraints.extend( + additional_tensor_constraints.extend( [ cp.Rank.Le(lambda deps: 2**2), ] ) - tensor_constraints.extend( - [ - cp.Dtype.In(lambda deps: [torch.int, torch.float]), - cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), - cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), - cp.Value.Le(lambda deps, dtype, struct: 2**4), - cp.Rank.Ge(lambda deps: 1), - cp.Size.Ge(lambda deps, r, d: 1), - cp.Size.Le(lambda deps, r, d: 2**9), - ] - ) + tensor_constraints.extend(additional_tensor_constraints) def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: @@ -124,4 +143,4 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s return [ (posargs, inkwargs) for posargs, inkwargs, _ in ArgumentTupleGenerator(spec).gen() - ] + ][:MAX_CASES] diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md index 5d478da3f00..ec4c392eb46 100644 --- a/backends/mediatek/README.md +++ b/backends/mediatek/README.md @@ -37,6 +37,7 @@ Download [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/publ - `libneuron_buffer_allocator.so`: This utility library is designed for allocating DMA buffers necessary for model inference. - `mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`: This library preprocess the model into a MediaTek representation. - `mtk_neuron-8.2.2-py3-none-linux_x86_64.whl`: This library converts the model to binaries. +- Copy `NeuronAdapter.h` under `backends/mediatek/runtime/include/api/`. 
### Setup @@ -76,4 +77,4 @@ Follow the steps below to setup your build environment: export LD_LIBRARY_PATH=::$LD_LIBRARY_PATH ``` -Please refer to `executorch/examples/mediatek/` for export and execution examples of various of models. \ No newline at end of file +Please refer to `executorch/examples/mediatek/` for export and execution examples of various of models. diff --git a/backends/mediatek/runtime/NeuronBackend.cpp b/backends/mediatek/runtime/NeuronBackend.cpp index fa3d0d9dc63..ecbd5ca3cd2 100644 --- a/backends/mediatek/runtime/NeuronBackend.cpp +++ b/backends/mediatek/runtime/NeuronBackend.cpp @@ -68,8 +68,12 @@ Result NeuronBackend::init( processed->size()); MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); - NeuronExecuTorchDelegate* delegate = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( - runtime_allocator, NeuronExecuTorchDelegate); + NeuronExecuTorchDelegate* delegate = + runtime_allocator->allocateInstance(); + if (delegate == nullptr) { + return Error::MemoryAllocationFailed; + } + new (delegate) NeuronExecuTorchDelegate(); if (delegate == nullptr) { diff --git a/backends/mediatek/runtime/include/api/NeuronAdapter.h b/backends/mediatek/runtime/include/api/NeuronAdapter.h deleted file mode 100644 index 3a4af8299b0..00000000000 --- a/backends/mediatek/runtime/include/api/NeuronAdapter.h +++ /dev/null @@ -1,2385 +0,0 @@ -/* Copyright Statement: - * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2020. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. 
All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. - */ - -/** - * @file NeuronAdapter.h - */ - -#pragma once - -#ifdef __ANDROID__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnullability-extension" -#include -#pragma clang diagnostic pop -#endif - -#include -#include -#include - -__BEGIN_DECLS - -/** - * NeuronModel is an opaque type that contains a description of the mathematical - * operations that constitute the model. - */ -typedef struct NeuronModel NeuronModel; - -/** - * NeuronCompilation is an opaque type that can be used to compile a machine - * learning model. - */ -typedef struct NeuronCompilation NeuronCompilation; - -/** - * NeuronExecution is an opaque type that can be used to apply a machine - * learning model to a set of inputs. - */ -typedef struct NeuronExecution NeuronExecution; - -/** - * NeuronDevice is an opaque type that represents a device. - * - * This type is used to query basic properties and supported operations of the - * corresponding device, and control which device(s) a model is to be run on. - * - * Available since 4.1.0 - */ -typedef struct NeuronDevice NeuronDevice; - -/** - * This type is used to represent shared memory, memory mapped files, and - * similar memories. - * - * It is the application's responsibility to ensure that there are no uses of - * the memory after calling NeuronMemory_free. This includes the execution which - * references this memory because of a call to - * NeuronExecution_setInputFromMemory or NeuronExecution_setOutputFromMemory. - * - * Available since 4.1.0 - */ -typedef struct NeuronMemory NeuronMemory; - -/** - * NeuronEvent is an opaque type that represents an event - * that will be signaled once an execution completes. - * - * Available since 5.0.0 - */ -typedef struct NeuronEvent NeuronEvent; - -/** - * Result codes. - */ -typedef enum { - NEURON_NO_ERROR = 0, - NEURON_OUT_OF_MEMORY = 1, - NEURON_INCOMPLETE = 2, - NEURON_UNEXPECTED_NULL = 3, - NEURON_BAD_DATA = 4, - NEURON_OP_FAILED = 5, - NEURON_UNMAPPABLE = 6, - NEURON_BAD_STATE = 7, - NEURON_BAD_VERSION = 8, - - // Available since 5.0.0 - NEURON_OUTPUT_INSUFFICIENT_SIZE = 9, - NEURON_UNAVAILABLE_DEVICE = 10, - NEURON_MISSED_DEADLINE_TRANSIENT = 11, - NEURON_MISSED_DEADLINE_PERSISTENT = 12, - NEURON_RESOURCE_EXHAUSTED_TRANSIENT = 13, - NEURON_RESOURCE_EXHAUSTED_PERSISTENT = 14, - NEURON_DEAD_OBJECT = 15, -} NeuronAdapterResultCode; - -/** - * Operand values with size in bytes that are smaller or equal to this will be - * immediately copied into the model. - */ -enum { NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; - -/** - * Size of the cache token, in bytes, required from the application. - */ -enum { NEURON_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; - -/** - * Operand types. - * The type of operands that can be added to a model. - * - * Some notes on quantized tensors - * - * NEURON_TENSOR_QUANT8_ASYMM - * Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 255]. - * The formula is: real_value = (integer_value - zero_value) * scale. - * - * NEURON_TENSOR_QUANT16_SYMM - * Attached to this tensor is a number representing real value scale that is - * used to convert the 16 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL - * This tensor is associated with additional fields that can be used to - * convert the 8 bit signed integer to the real value and vice versa. These - * fields are: - * - channelDim: a 32 bit unsigned integer indicating channel dimension. - * - scales: an array of positive 32 bit floating point values. - * The size of the scales array must be equal to dimensions[channelDim]. - * NeuronModel_setOperandSymmPerChannelQuantParams must be used to set the - * parameters for an Operand of this type. The channel dimension of this tensor - * must not be unknown (dimensions[channelDim] != 0). The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] where C is an - * index in the Channel dimension. - * - * NEURON_TENSOR_QUANT16_ASYMM - * Attached to this tensor are two numbers that can be used to convert the 16 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 65535]. - * The formula is: real_value = (integer_value - zeroPoint) * scale. - * - * NEURON_TENSOR_QUANT8_SYMM - * Attached to this tensor is a number representing real value scale that is - * used to convert the 8 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - * NEURON_TENSOR_QUANT8_ASYMM_SIGNED - * Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [-128, 127]. - *
The formula is: real_value = (integer_value - zeroPoint) * scale. - */ -enum { - /** A 32 bit floating point scalar value. */ - NEURON_FLOAT32 = 0, - /** A signed 32 bit integer scalar value. */ - NEURON_INT32 = 1, - /** An unsigned 32 bit integer scalar value. */ - NEURON_UINT32 = 2, - /** A tensor of 32 bit floating point values. */ - NEURON_TENSOR_FLOAT32 = 3, - /** A tensor of 32 bit integer values. */ - NEURON_TENSOR_INT32 = 4, - /** A tensor of 8 bit integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM = 5, - /** An 8 bit boolean scalar value. */ - NEURON_BOOL = 6, - /** A tensor of 16 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_SYMM = 7, - /** A tensor of IEEE 754 16 bit floating point values. */ - NEURON_TENSOR_FLOAT16 = 8, - /** A tensor of 8 bit boolean values. */ - NEURON_TENSOR_BOOL8 = 9, - /** An IEEE 754 16 bit floating point scalar value. */ - NEURON_FLOAT16 = 10, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, - /** A tensor of 16 bit unsigned integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_ASYMM = 12, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM = 13, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM_SIGNED = 14, - /** A reference to a model. */ - NEURON_MODEL = 15, - /** Extended data type - tensor uint32 */ - NEURON_EXT_TENSOR_UINT32 = 9001, - /** Extended data type -A tensor of 8 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_PER_CHANNEL = 9002, - /** Extended data type -A tensor of 4 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM = 9003, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM_SIGNED = 9004, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_SYMM = 9005, - /** Extended data type -A tensor of 16 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT16_ASYMM_SIGNED = 9006, - /** Extended data type -A raw tensor. */ - NEURON_EXT_TENSOR_RAW = 9007, - /** Extended data type -A tensor of 8 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_SIGNED_PER_CHANNEL = 9008, -}; - -/** - * NeuronOperandType describes the type of an operand. - * This structure is used to describe both scalars and tensors. - */ -typedef struct NeuronOperandType { - /** The data type, e.g NEURON_INT8. */ - int32_t type; - /** The number of dimensions. It should be 0 for scalars. */ - uint32_t dimensionCount; - /** The dimensions of the tensor. It should be nullptr for scalars. */ - const uint32_t* dimensions; - /** - * These two fields are only used for quantized tensors. - * They should be zero for scalars and non-fixed point tensors. - * The dequantized value of each entry is (value - zeroPoint) * scale. - */ - float scale; - /** Only used with scale for quantized tensors */ - int32_t zeroPoint; -} NeuronOperandType; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. - */ -typedef struct NeuronSymmPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. 
*/ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; -} NeuronSymmPerChannelQuantParams; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL and - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL operand. - */ -typedef struct NeuronPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; - /** The size of the zeroPoints. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t zeroPointCount; - /** The array of zero point values for each channel. */ - const int32_t* zeroPoints; -} NeuronPerChannelQuantParams; - -/** - * Operation Types - * - * Supported operations are listed with available versions. See - * Neuron_getVersion for querying version number. - * - * Attempting to compile models with operations marked as not available - * will get a compilation failure. - * - * Refer to the operation support status of each hardware platform. - * Attempting to compile models with operations supported by this library but - * not supported by the underlying hardware platform will get a compilation - * failure too. - * - * Compatible NNAPI levels are also listed. - */ -typedef enum { - NEURON_ADD = 0, ///< Available since 4.1.0. NNAPI level 30. - NEURON_AVERAGE_POOL_2D = 1, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONCATENATION = 2, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONV_2D = 3, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTHWISE_CONV_2D = 4, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTH_TO_SPACE = 5, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEQUANTIZE = 6, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EMBEDDING_LOOKUP = 7, ///< Not available. - NEURON_FLOOR = 8, ///< Available since 4.1.0. NNAPI level 30. - NEURON_FULLY_CONNECTED = 9, ///< Available since 4.1.0. NNAPI level 30. - NEURON_HASHTABLE_LOOKUP = 10, ///< Not available. - NEURON_L2_NORMALIZATION = 11, ///< Available since 4.1.0. NNAPI level 30. - NEURON_L2_POOL_2D = 12, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOCAL_RESPONSE_NORMALIZATION = 13, ///< Not available. - NEURON_LOGISTIC = 14, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LSH_PROJECTION = 15, ///< Not available. - NEURON_LSTM = 16, ///< Not available. - NEURON_MAX_POOL_2D = 17, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MUL = 18, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU = 19, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU1 = 20, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU6 = 21, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESHAPE = 22, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESIZE_BILINEAR = 23, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RNN = 24, ///< Not available. - NEURON_SOFTMAX = 25, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPACE_TO_DEPTH = 26, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SVDF = 27, ///< Not available. - NEURON_TANH = 28, ///< Available since 4.1.0. NNAPI level 30. - NEURON_BATCH_TO_SPACE_ND = 29, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DIV = 30, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MEAN = 31, ///< Available since 4.1.0. NNAPI level 30. 
-  NEURON_PAD = 32,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SPACE_TO_BATCH_ND = 33,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SQUEEZE = 34,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_STRIDED_SLICE = 35,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SUB = 36,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_TRANSPOSE = 37,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ABS = 38,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ARGMAX = 39,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ARGMIN = 40,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_AXIS_ALIGNED_BBOX_TRANSFORM = 41,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_BIDIRECTIONAL_SEQUENCE_LSTM = 42,  ///< Not available.
-  NEURON_BIDIRECTIONAL_SEQUENCE_RNN = 43,  ///< Not available.
-  NEURON_BOX_WITH_NMS_LIMIT = 44,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_CAST = 45,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_CHANNEL_SHUFFLE = 46,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_DETECTION_POSTPROCESSING = 47,  ///< Not available.
-  NEURON_EQUAL = 48,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_EXP = 49,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_EXPAND_DIMS = 50,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_GATHER = 51,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_GENERATE_PROPOSALS = 52,  ///< Not available.
-  NEURON_GREATER = 53,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_GREATER_EQUAL = 54,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_GROUPED_CONV_2D = 55,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_HEATMAP_MAX_KEYPOINT = 56,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_INSTANCE_NORMALIZATION = 57,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LESS = 58,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LESS_EQUAL = 59,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LOG = 60,  ///< Not available.
-  NEURON_LOGICAL_AND = 61,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LOGICAL_NOT = 62,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LOGICAL_OR = 63,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_LOG_SOFTMAX = 64,  ///< Not available.
-  NEURON_MAXIMUM = 65,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_MINIMUM = 66,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_NEG = 67,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_NOT_EQUAL = 68,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_PAD_V2 = 69,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_POW = 70,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_PRELU = 71,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_QUANTIZE = 72,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_QUANTIZED_16BIT_LSTM = 73,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_RANDOM_MULTINOMIAL = 74,  ///< Not available.
-  NEURON_REDUCE_ALL = 75,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_REDUCE_ANY = 76,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_REDUCE_MAX = 77,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_REDUCE_MIN = 78,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_REDUCE_PROD = 79,  ///< Not available.
-  NEURON_REDUCE_SUM = 80,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ROI_ALIGN = 81,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ROI_POOLING = 82,  ///< Not available.
-  NEURON_RSQRT = 83,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SELECT = 84,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SIN = 85,  ///< Not available.
-  NEURON_SLICE = 86,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SPLIT = 87,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_SQRT = 88,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_TILE = 89,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_TOPK_V2 = 90,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_TRANSPOSE_CONV_2D = 91,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_UNIDIRECTIONAL_SEQUENCE_LSTM = 92,  ///< Not available.
-  NEURON_UNIDIRECTIONAL_SEQUENCE_RNN = 93,  ///< Not available.
-  NEURON_RESIZE_NEAREST_NEIGHBOR = 94,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_QUANTIZED_LSTM = 95,  ///< Not available.
-  NEURON_IF = 96,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_WHILE = 97,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_ELU = 98,  ///< Not available.
-  NEURON_HARD_SWISH = 99,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_FILL = 100,  ///< Available since 4.1.0. NNAPI level 30.
-  NEURON_RANK = 101,  ///< Not available.
-  NEURON_BATCH_MATMUL = 102,  ///< Available since 5.1.2. NNAPI FL6.
-  NEURON_PACK = 103,  ///< Not available.
-  NEURON_MIRROR_PAD = 104,  ///< Not available.
-  NEURON_MIRROR_REVERSE = 105,  ///< Not available.
-  /**
-   * Decompress HyFBC to a YUV420 frame; supports both YUV420_8BITS and
-   * YUV420_10BITS formats. HyFBC (Hybrid Frame Buffer Compression) is a
-   * compressed format used by the video decoder (VDEC). This format uses
-   * YUV420 to compress.
-   *
-   * For the input part, two inputs with different shapes need to be set,
-   * representing the Y and UV planes respectively. The same HyFBC data will
-   * be used for both inputs. Similarly, two outputs need to be set,
-   * representing the Y and UV planes respectively.
-   *
-   * The shapes of the two inputs/outputs (inputY, inputUV, outputY,
-   * outputUV) depend on the original image's shape ([batches, height,
-   * width, channels]). Both height and width should follow the 64 alignment
-   * rule. For example, if the original height is 480, its 64 alignment is
-   * 512. For the Y plane, the channel size should be 1; for the UV plane,
-   * the channel size should be 2. Besides, the height and width of the UV
-   * plane should be half of Y's height and width. Example:
-   *
-   *   original_img.shape = [1, 384, 640, 3]
-   *   inputY.shape = [1, 384, 640, 1]
-   *   inputUV.shape = [1, 192, 320, 2]
-   *   outputY.shape = [1, 384, 640, 1]
-   *   outputUV.shape = [1, 192, 320, 2]
-   *
-   * Supported tensor {@link OperandCode}:
-   * * {@link NEURON_EXT_TENSOR_RAW} (for inputY, inputUV)
-   * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for outputY, outputUV)
-   * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for outputY, outputUV)
-   * Note:
-   * If the image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if
-   * the mode is YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM.
-   *
-   * Tensor rank: both input and output require rank 4, with "NHWC" data
-   * layout.
-   *
-   * Inputs:
-   * * 0: inputY, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor.
-   * * 1: inputUV, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor.
-   * * 2: YHeaderAlignment, an {@link NEURON_INT32} scalar, specifying
-   *      the header alignment in the HyFBC format.
-   * * 3: UVHeaderAlignment, an {@link NEURON_INT32} scalar, specifying
-   *      the header alignment in the HyFBC format.
-   * * 4: xAlign, an {@link NEURON_INT32} scalar, specifying the frame
-   *      width alignment of the video decoder.
-   * * 5: yAlign, an {@link NEURON_INT32} scalar, specifying the frame
-   *      height alignment of the video decoder.
-   * * 6: xOffset, an {@link NEURON_INT32} scalar, specifying the frame
-   *      width offset of the video decoder.
-   * * 7: yOffset, an {@link NEURON_INT32} scalar, specifying the frame
-   *      height offset of the video decoder.
-   * * 8: mode, an {@link NEURON_INT32} scalar. Set to 0 for
-   *      YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b and 10b
-   *      here mean the compressed bit width in the HyFBC frame, where the
-   *      decompressed YUV420 is 8b for HyFBC_8b and 16b for HyFBC_10b.
-   * * 9: outPitchN, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 N-axis pitch. Must be set to 1, because only a single
-   *      batch is supported for HyfbcDecompress.
-   * * 10: outPitchH, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 H-axis pitch. Set to the original compressed image height
-   *      with video codec alignment.
-   * * 11: outPitchW, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 W-axis pitch. Set to the original compressed image width
-   *      with video codec alignment.
-   * * 12: outPitchC, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 C-axis pitch. Set to 1 for interleaved YUV420.
-   *
-   * Outputs:
-   * * 0: output Y, a 4-D tensor. The tensor type can be either {@link
-   *      NEURON_TENSOR_QUANT8_ASYMM} or {@link NEURON_TENSOR_QUANT16_ASYMM},
-   *      depending on the YUV420 bit mode.
-   * * 1: output UV, a 4-D tensor. The tensor type can be either {@link
-   *      NEURON_TENSOR_QUANT8_ASYMM} or {@link NEURON_TENSOR_QUANT16_ASYMM},
-   *      depending on the YUV420 bit mode.
-   *
-   * Available since NeuroPilot 7.0.0.
-   */
-  NEURON_HYFBCTOYUV420 = 106,
-  /**
-   * Compress YUV420 to an AFBC frame; supports both YUV420_8BITS and
-   * YUV420_10BITS formats. AFBC (Arm Frame Buffer Compression) is a
-   * lossless compressed image format, created by Arm to reduce the size of
-   * images.
-   *
-   * For the input part, two inputs with different shapes need to be set,
-   * representing the Y and UV planes respectively. For the output part, one
-   * output needs to be set for AFBC.
-   *
-   * The shapes of the two inputs (inputY, inputUV) and the output (AFBC)
-   * depend on the original image's shape ([batches, height, width,
-   * channels]). Both height and width should follow the 64 alignment rule.
-   * For example, if the original height is 480, its 64 alignment is 512.
-   * For the Y plane, the channel size should be 1; for the UV plane, the
-   * channel size should be 2. Besides, the height and width of the UV plane
-   * should be half of Y's height and width. For the AFBC output, its height
-   * should be 3/2 of Y's height, and its width equals Y's width. Example:
-   *
-   *   original_img.shape = [1, 384, 640, 3]
-   *   inputY.shape = [1, 384, 640, 1]
-   *   inputUV.shape = [1, 192, 320, 2]
-   *   output.shape = [1, 576, 640, 1]
-   *
-   * Supported tensor {@link OperandCode}:
-   * * {@link NEURON_EXT_TENSOR_RAW} (for output)
-   * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for inputY, inputUV)
-   * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for inputY, inputUV)
-   * Note:
-   * If the image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if
-   * the mode is YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM.
-   *
-   * Tensor rank: both input and output require rank 4, with "NHWC" data
-   * layout.
-   *
-   * Inputs:
-   * * 0: inputY, a 4-D tensor. The tensor type can be either {@link
-   *      NEURON_TENSOR_QUANT8_ASYMM} or {@link NEURON_TENSOR_QUANT16_ASYMM},
-   *      depending on the YUV420 bit mode.
-   * * 1: inputUV, a 4-D tensor. The tensor type can be either {@link
-   *      NEURON_TENSOR_QUANT8_ASYMM} or {@link NEURON_TENSOR_QUANT16_ASYMM},
-   *      depending on the YUV420 bit mode.
-   * * 2: HeaderAlignment, an {@link NEURON_INT32} scalar, specifying
-   *      the header alignment in the AFBC format.
-   * * 3: xAlign, an {@link NEURON_INT32} scalar, specifying the frame
-   *      width alignment of the AFBC format.
-   * * 4: yAlign, an {@link NEURON_INT32} scalar, specifying the frame
-   *      height alignment of the AFBC format.
-   * * 5: xOffset, an {@link NEURON_INT32} scalar, specifying the frame
-   *      width offset of the AFBC format.
-   * * 6: yOffset, an {@link NEURON_INT32} scalar, specifying the frame
-   *      height offset of the AFBC format.
-   * * 7: mode, an {@link NEURON_INT32} scalar. Set to 0 for
-   *      YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b and 10b
-   *      here mean the compressed bit width in the AFBC frame, where the
-   *      YUV420 must be 8b for AFBC_8b and 16b for AFBC_10b.
-   * * 8: inPitchN, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 N-axis pitch. Must be set to 1, because only a single
-   *      batch is supported for AfbcCompress.
-   * * 9: inPitchH, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 H-axis pitch. Set to the expected compressed image height.
-   * * 10: inPitchW, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 W-axis pitch. Set to the expected compressed image width.
-   * * 11: inPitchC, an {@link NEURON_INT32} scalar, specifying the
-   *      YUV420 C-axis pitch. Set to 1 for interleaved YUV420.
-   *
-   * Outputs:
-   * * 0: output, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor.
-   *
-   * Available since NeuroPilot 7.0.0.
-   */
-  NEURON_YUV420TOAFBC = 107,
-  NEURON_NUMBER_OF_OPERATIONS,
-} NeuronOperationType;
-
-/**
- * Fused activation function types.
- */
-typedef enum {
-  // No fused activation function.
-  NEURON_FUSED_NONE = 0,
-  // Fused ReLU activation function.
-  NEURON_FUSED_RELU = 1,
-  // Fused ReLU1 activation function.
-  NEURON_FUSED_RELU1 = 2,
-  // Fused ReLU6 activation function.
-  NEURON_FUSED_RELU6 = 3,
-} NeuronAdapterFuseCode;
-
-/**
- * Implicit padding algorithms.
- */
-typedef enum {
-  /**
-   * SAME padding.
-   * The padding on both ends is the "same":
-   *   padding_to_beginning = total_padding / 2
-   *   padding_to_end = (total_padding + 1) / 2
-   * i.e., for an even amount of total padding, the padding on both ends is
-   * exactly the same; for an odd amount, the end gets one more than the
-   * beginning.
-   *
-   * total_padding is a function of the input, stride, and filter size.
-   * It can be computed as follows:
-   *   out_size = (input_size + stride - 1) / stride
-   *   needed_input = (out_size - 1) * stride + filter_size
-   *   total_padding = max(0, needed_input - input_size)
-   * The computation is the same for the horizontal and vertical directions.
-   */
-  NEURON_PADDING_SAME = 1,
-
-  /**
-   * VALID padding.
-   * No padding. When the input size is not evenly divisible by the filter
-   * size, the input at the end that could not fill the whole filter tile
-   * will simply be ignored.
-   */
-  NEURON_PADDING_VALID = 2,
-} NeuronAdapterPaddingCode;
-
-/**
- * Execution preferences.
- */
-typedef enum {
-  /* Prefer executing in a way that minimizes battery drain. */
-  NEURON_PREFER_LOW_POWER = 0,
-  /* Prefer executing as fast as possible (more power consumption). */
-  NEURON_PREFER_FAST_SINGLE_ANSWER = 1,
-  /* Prefer maximizing the throughput of successive frames. */
-  NEURON_PREFER_SUSTAINED_SPEED = 2,
-  /* Prefer executing with turbo boost (most power consumption). */
-  NEURON_PREFER_TURBO_BOOST = 3,
-} NeuronAdapterPreferenceCode;
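To make the SAME-padding arithmetic above concrete: for input_size = 224, stride = 2, and filter_size = 3, out_size = (224 + 2 - 1) / 2 = 112, needed_input = 111 * 2 + 3 = 225, and total_padding = 1, so padding_to_beginning = 0 and padding_to_end = 1. A small sketch of the same computation in C (the helper name is ours, not part of the header):

    #include <stdint.h>

    /* Sketch of the SAME-padding rule quoted in the comment above. */
    static void same_padding(uint32_t input_size, uint32_t stride,
                             uint32_t filter_size,
                             uint32_t* pad_begin, uint32_t* pad_end) {
      uint32_t out_size = (input_size + stride - 1) / stride;
      uint32_t needed_input = (out_size - 1) * stride + filter_size;
      uint32_t total =
          needed_input > input_size ? needed_input - input_size : 0;
      *pad_begin = total / 2;      /* padding_to_beginning */
      *pad_end = (total + 1) / 2;  /* padding_to_end */
    }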
-
-/**
- * Relative execution priority.
- */
-typedef enum {
-  NEURON_PRIORITY_LOW = 90,
-  NEURON_PRIORITY_MEDIUM = 100,
-  NEURON_PRIORITY_HIGH = 110,
-  NEURON_PRIORITY_DEFAULT = NEURON_PRIORITY_MEDIUM,
-} NeuronAdapterPriorityCode;
-
-/**
- * Compiler optimization hint.
- */
-typedef enum {
-  /**
-   * Normal optimization.
-   * Available since 4.3.1
-   */
-  NEURON_OPTIMIZATION_NORMAL = 0,
-  /**
-   * Reduce latency by utilizing as many APU cores as possible.
-   * Available since 4.3.1
-   */
-  NEURON_OPTIMIZATION_LOW_LATENCY = 1 << 0,
-  /**
-   * Reduce DRAM access as much as possible.
-   * Available since 4.4.0
-   */
-  NEURON_OPTIMIZATION_DEEP_FUSION = 1 << 1,
-  /**
-   * Reduce latency by using as many APU cores as possible in the batch
-   * dimension (for models with batch > 1).
-   * Available since 4.4.0
-   */
-  NEURON_OPTIMIZATION_BATCH_PROCESSING = 1 << 2,
-  /**
-   * Default optimization setting.
-   * Available since 4.3.1
-   */
-  NEURON_OPTIMIZATION_DEFAULT = NEURON_OPTIMIZATION_NORMAL,
-} OptimizationCode;
-
-/**
- * CPU cache flush hint.
- */
-typedef enum {
-  /**
-   * Sync the input buffer and invalidate the output buffer.
-   * Available since 5.0.1
-   */
-  NEURON_CACHE_FLUSH_ENABLE_ALL = 0,
-  /**
-   * Disable syncing the input buffer.
-   * Available since 5.0.1
-   */
-  NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT = 1 << 0,
-  /**
-   * Disable invalidating the output buffer.
-   * Available since 5.0.1
-   */
-  NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT = 1 << 1,
-  /**
-   * Default cache flush setting.
-   * Available since 5.0.1
-   */
-  NEURON_CACHE_FLUSH_DEFAULT = NEURON_CACHE_FLUSH_ENABLE_ALL,
-} CacheFlushCode;
-
-/**
- * Compilation Type.
- */
-typedef enum {
-  /* Normal compilation. Available since 7.0.0 */
-  COMPILATION_TYPE_NORMAL = 0,
-  /* @deprecated */
-  COMPILATION_TYPE_DEBUG_PLUS = 1,
-  /* Batched execution: set input/output from memory every time.
-   * Available since 7.0.0
-   */
-  COMPILATION_TYPE_BATCHED = 2,
-  /* One compilation from which multiple executions can be created.
-   * Available since 7.0.0
-   */
-  COMPILATION_TYPE_MULTI_EXECUTIONS = 3,
-  /* Batched execution: set input/output from memory the first time and
-   * memcpy on subsequent runs. Available since 7.0.1
-   */
-  COMPILATION_TYPE_EXECUTION_CONTROLLER = 4,
-} CompilationType;
-
-/**
- * Supported Feature
- */
-typedef enum {
-  NEURON_FEATURE_NONE = 0,
-  NEURON_THROUGHPUT_MODE = 1,
-} NeuronFeatureType;
-
-/**
- * The structure to represent the Neuron version.
- */
-typedef struct {
-  uint8_t major;  ///< major version
-  uint8_t minor;  ///< minor version
-  uint8_t patch;  ///< patch version
-} NeuronRuntimeVersion;
-
-/**
- * Get the version of the Neuron runtime library.
- *
- * @param version The version of the Neuron runtime library.
- * @return NEURON_NO_ERROR
- */
-int Neuron_getVersion(NeuronRuntimeVersion* version);
-
-/**
- * Get the supported status of a feature.
- *
- * Available since 7.0.0
- *
- * @param type The input feature (NeuronFeatureType) to check for support.
- * @param supported Returns the supported status.
- * @return NEURON_NO_ERROR if successful.
- */
-int Neuron_getFeatureSupportedStatus(NeuronFeatureType type, bool* supported);
-
-/**
- * Get the size of L1 memory in the APU.
- *
- * Available since 4.3.0
- *
- * @param sizeKb L1 memory size in KB
- * @return NEURON_NO_ERROR if successful.
- */
-int Neuron_getL1MemorySizeKb(uint32_t* sizeKb);
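A small usage sketch for the query entry points above; it assumes NEURON_NO_ERROR compares equal to 0, which the header's error-code section (not shown in this hunk) is expected to define:

    #include <stdint.h>
    #include <stdio.h>

    static void print_runtime_info(void) {
      NeuronRuntimeVersion v;
      uint32_t l1_kb = 0;
      if (Neuron_getVersion(&v) == NEURON_NO_ERROR) {
        printf("Neuron runtime %u.%u.%u\n", v.major, v.minor, v.patch);
      }
      if (Neuron_getL1MemorySizeKb(&l1_kb) == NEURON_NO_ERROR) {
        printf("APU L1 memory: %u KB\n", l1_kb);
      }
    }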
-
-/**
- * Creates a shared memory object from a file descriptor.
- *
- * For an ion descriptor, the application should create the ion memory and
- * descriptor first, then use them in this function.
- *
- * Available since 4.1.0. Only supports ion fd.
- *
- * @param size The requested size in bytes. Must not be larger than the file
- * size.
- * @param protect The desired memory protection for the mapping. It is
- * either PROT_NONE or the bitwise OR of one or more of the following flags:
- * PROT_READ, PROT_WRITE.
- * @param fd The requested file descriptor. The file descriptor has to be
- * mmap-able.
- * @param offset The offset to the beginning of the file of the area to map.
- * @param memory The memory object to be created. Set to NULL if
- * unsuccessful.
- */
-int NeuronMemory_createFromFd(
-    size_t size,
-    int protect,
-    int fd,
-    size_t offset,
-    NeuronMemory** memory);
-
-#ifdef __ANDROID__
-/**
- * Creates a shared memory object from an AHardwareBuffer handle.
- *
- * We only support AHardwareBuffer with format AHARDWAREBUFFER_FORMAT_BLOB,
- * and it can only be used for Model inputs and outputs.
- *
- * The AHardwareBuffer with AHARDWAREBUFFER_FORMAT_BLOB format can be used
- * the same way as shared memory created from a file handle. See
- * NeuronMemory for a description of how to use this shared memory.
- *
- * The provided AHardwareBuffer must outlive the NeuronMemory object.
- *
- * Available since 5.0.0
- *
- * @param ahwb The AHardwareBuffer handle.
- * @param memory The memory object to be created.
- * Set to NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if the request completed normally.
- */
-int NeuronMemory_createFromAHardwareBuffer(
-    const AHardwareBuffer* ahwb,
-    NeuronMemory** memory);
-
-#else  // __ANDROID__
-
-/**
- * Not supported on non-Android platforms.
- *
- * @return NEURON_BAD_STATE
- */
-int NeuronMemory_createFromAHardwareBuffer();
-
-#endif
-
-/**
- * Delete a memory object.
- *
- * For ion memory, this function cleans up the internal resource associated
- * with this memory. Applications should clean up the allocated ion memory
- * after this function.
- *
- * Available since 4.1.0
- */
-void NeuronMemory_free(NeuronMemory* memory);
-
-/**
- * Create an empty NeuronModel. The model should be constructed with calls
- * to NeuronModel_addOperation and NeuronModel_addOperand.
- *
- * Available since 4.1.0
- *
- * @param model The NeuronModel to be created. Set to NULL if unsuccessful.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_create(NeuronModel** model);
-
-/**
- * Destroy a model. The model need not have been finished by a call to
- * NeuronModel_finish.
- *
- * Available since 4.1.0
- *
- * @param model The model to be destroyed.
- */
-void NeuronModel_free(NeuronModel* model);
-
-/**
- * Indicate that we have finished modifying a model. Required before calling
- * NeuronCompilation_compile.
- *
- * Available since 4.1.0
- *
- * @param model The model to be finished.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_finish(NeuronModel* model);
-
-/**
- * Add an operand to a model. The order in which the operands are added is
- * important. The first one added to a model will have the index value 0,
- * the second 1, etc. These indexes are used as operand identifiers in
- * NeuronModel_addOperation.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param type The NeuronOperandType that describes the shape of the
- * operand. Neither the NeuronOperandType nor the dimensions it points to
- * need to outlive the call to NeuronModel_addOperand.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type);
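The model-building functions above follow a create / populate / finish lifecycle. A minimal sketch (kQuantInput is the illustrative operand type from the earlier example; error handling elided):

    NeuronModel* model = NULL;
    if (NeuronModel_create(&model) != NEURON_NO_ERROR) {
      /* handle failure */
    }

    /* Operands receive indexes 0, 1, 2, ... in the order they are added. */
    NeuronModel_addOperand(model, &kQuantInput);  /* index 0 */

    /* ... add the remaining operands/operations, identify inputs/outputs ... */

    NeuronModel_finish(model);  /* the model is immutable from here on */
    /* ... compile and execute ... */
    NeuronModel_free(model);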
-
-/**
- * Sets an operand to a constant value.
- * Values of length smaller than or equal to
- * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES are immediately copied into
- * the model. For values of length greater than
- * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES, a pointer to the buffer is
- * stored within the model. The application must not change the content of
- * this region until all executions using this model have completed. As the
- * data may be copied during processing, modifying the data after this call
- * yields undefined results.
- *
- * Attempting to modify a model once NeuronModel_finish has been called will
- * return an error.
- *
- * A special notice on the buffer lifetime when the length is greater than
- * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES: the provided buffer must
- * outlive the compilation of this model, i.e. the user must keep the buffer
- * unchanged until NeuronCompilation_finish of this model. This is an
- * internal optimization compared to NNAPI. In NNAPI, the NN runtime will
- * copy the buffer to a shared memory between the NN runtime and the NNAPI
- * HIDL service during ANNModel_finish, and it will be copied again into the
- * compiled result during ANNCompilation_finish. In Neuron Adapter, there is
- * only one copy, during NeuronCompilation_finish, so the buffer must be
- * kept alive until NeuronCompilation_finish has returned.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param buffer A pointer to the data to use.
- * @param length The size in bytes of the data value.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_setOperandValue(
-    NeuronModel* model,
-    int32_t index,
-    const void* buffer,
-    size_t length);
-
-/**
- * Sets an operand to a value that is a reference to another NeuronModel.
- *
- * The referenced model must already have been finished by a call to
- * NeuronModel_finish.
- *
- * The NeuronModel_relaxComputationFloat32toFloat16 setting of referenced
- * models is overridden by that setting of the main model of a compilation.
- *
- * The referenced model must outlive the model referring to it.
- *
- * Attempting to modify a model once NeuronModel_finish has been called will
- * return an error.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param value The model to be referenced.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_setOperandValueFromModel(
-    NeuronModel* model,
-    int32_t index,
-    const NeuronModel* value);
-
-/**
- * Sets an operand's per-channel quantization parameters.
- * Sets parameters required by a tensor of type
- * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL. This function must be called for
- * every tensor of type NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL before calling
- * NeuronModel_finish.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param channelQuant The per-channel quantization parameters for the
- * operand. No memory in this struct needs to outlive the call to this
- * function.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_setOperandSymmPerChannelQuantParams(
-    NeuronModel* model,
-    int32_t index,
-    const NeuronSymmPerChannelQuantParams* channelQuant);
-
-/**
- * Sets an operand's per-channel quantization parameters.
- * Sets parameters required by a tensor of type
- * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or
- * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL.
- * This function must be called for every tensor of type
- * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or
- * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL before calling NeuronModel_finish.
- *
- * Available since 6.0.0
- *
- * @param model The model to be modified.
- * @param index The index of the model operand we're setting.
- * @param channelQuant The per-channel quantization parameters (including
- * the per-channel offset) for the operand. No memory in this struct needs
- * to outlive the call to this function.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_setOperandPerChannelQuantParams(
-    NeuronModel* model,
-    int32_t index,
-    const NeuronPerChannelQuantParams* channelQuant);
-
-/**
- * Add an operation to a model.
- * The operands specified by inputs and outputs must have been previously
- * added by calls to NeuronModel_addOperand.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param type The NeuronOperationType of the operation.
- * @param inputCount The number of entries in the inputs array.
- * @param inputs An array of indexes identifying each operand.
- * @param outputCount The number of entries in the outputs array.
- * @param outputs An array of indexes identifying each operand.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_addOperation(
-    NeuronModel* model,
-    NeuronOperationType type,
-    uint32_t inputCount,
-    const uint32_t* inputs,
-    uint32_t outputCount,
-    const uint32_t* outputs);
-
-/**
- * Add an operation extension to a model.
- * The operands specified by inputs and outputs must have been previously
- * added by calls to NeuronModel_addOperand. The user needs to specify the
- * operation extension name and the desired device which will execute the
- * operation extension.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param name The name of the operation extension.
- * @param vendor The name of the vendor which will implement the operation
- * extension.
- * @param device The device which will execute the operation extension.
- * @param inputCount The number of entries in the inputs array.
- * @param inputs An array of indexes identifying each operand.
- * @param outputCount The number of entries in the outputs array.
- * @param outputs An array of indexes identifying each operand.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_addOperationExtension(
-    NeuronModel* model,
-    const char* name,
-    const char* vendor,
-    const NeuronDevice* device,
-    uint32_t inputCount,
-    const uint32_t* inputs,
-    uint32_t outputCount,
-    const uint32_t* outputs);
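As a sketch of how the operand and operation calls compose into a graph: a single NEURON_ADD whose third input is a fused-activation scalar, following the NNAPI-style operand layout this header mirrors (treat the exact input layout of NEURON_ADD as an assumption; kQuantInput is from the earlier example):

    /* Hypothetical scalar operand type for the fuse-code input. */
    static const NeuronOperandType kScalarInt32 = {
        .type = NEURON_INT32, .dimensionCount = 0, .dimensions = NULL,
        .scale = 0.0f, .zeroPoint = 0,
    };
    static const int32_t kFuseNone = NEURON_FUSED_NONE;

    NeuronModel_addOperand(model, &kQuantInput);   /* 0: input A */
    NeuronModel_addOperand(model, &kQuantInput);   /* 1: input B */
    NeuronModel_addOperand(model, &kScalarInt32);  /* 2: activation scalar */
    NeuronModel_addOperand(model, &kQuantInput);   /* 3: output */

    NeuronModel_setOperandValue(model, 2, &kFuseNone, sizeof(kFuseNone));

    const uint32_t inputs[3] = {0, 1, 2};
    const uint32_t outputs[1] = {3};
    NeuronModel_addOperation(model, NEURON_ADD, 3, inputs, 1, outputs);
    /* ... then identify the model's inputs/outputs and finish the model ... */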
-
-/**
- * Specifies which operands will be the model's inputs and outputs.
- * An operand cannot be used for both input and output. Doing so will return
- * an error.
- *
- * The operands specified by inputs and outputs must have been
- * previously added by calls to NeuronModel_addOperand.
- *
- * Attempting to modify a model once NeuronModel_finish has been
- * called will return an error.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param inputCount The number of entries in the inputs array.
- * @param inputs An array of indexes identifying the input operands.
- * @param outputCount The number of entries in the outputs array.
- * @param outputs An array of indexes identifying the output operands.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_identifyInputsAndOutputs(
-    NeuronModel* model,
-    uint32_t inputCount,
-    const uint32_t* inputs,
-    uint32_t outputCount,
-    const uint32_t* outputs);
-
-/**
- * Gets the supported operations in a model.
- * This function must be called after calling NeuronModel_finish.
- *
- * Available since 4.1.0
- *
- * @param model The model to be queried.
- * @param supported The boolean array to be filled. True means supported.
- * The size of the boolean array must be at least as large as the number of
- * operations in the model. The order of elements in the supported array
- * matches the order in which the corresponding operations were added to
- * the model.
- * @param operationCount The number of operations in the model.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_getSupportedOperations(
-    NeuronModel* model,
-    bool* supported,
-    uint32_t operationCount);
-
-/**
- * Get the supported operations for a specified set of devices.
- * If multiple devices are selected, the supported operation list is a union
- * of the supported operations of all selected devices.
- *
- * Available since 4.1.0
- *
- * @param model The model to be queried.
- * @param devices Selected devices.
- * @param numDevices Number of selected devices.
- * @param supportedOps The boolean array to be filled. True means supported.
- * The size of the boolean array must be at least as large as the number of
- * operations in the model. The order of elements in the supportedOps array
- * matches the order in which the corresponding operations were added to
- * the model.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_getSupportedOperationsForDevices(
-    const NeuronModel* model,
-    const NeuronDevice* const* devices,
-    uint32_t numDevices,
-    bool* supportedOps);
-
-/**
- * Specifies whether NEURON_TENSOR_FLOAT32 is allowed to be calculated with
- * range and/or precision as low as that of the IEEE 754 16-bit
- * floating-point format. By default, NEURON_TENSOR_FLOAT32 must be
- * calculated using at least the range and precision of the IEEE 754 32-bit
- * floating-point format.
- *
- * Available since 4.1.0
- *
- * @param model The model to be modified.
- * @param allow 'true' indicates NEURON_TENSOR_FLOAT32 may be calculated
- * with range and/or precision as low as that of the IEEE 754 16-bit
- * floating point format. 'false' indicates NEURON_TENSOR_FLOAT32 must be
- * calculated using at least the range and precision of the IEEE 754 32-bit
- * floating point format.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_relaxComputationFloat32toFloat16(
-    NeuronModel* model,
-    bool allow);
-
-/**
- * Hint the compiler to suppress the input data conversion; the user then
- * has to convert the input data into the platform-expected format before
- * inference.
- *
- * Available since 4.2.0
- *
- * @param model The model to be modified.
- * @param suppress True to suppress the input data conversion.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_suppressInputConversion(NeuronModel* model, bool suppress);
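A sketch of the support-query pattern (the caller is assumed to track how many operations it added, since this hunk shows no getter for the count):

    #include <stdbool.h>
    #include <stdint.h>

    /* num_ops: number of operations the caller added to the model;
     * 64 is an arbitrary illustrative upper bound. */
    bool supported[64];
    if (NeuronModel_getSupportedOperations(model, supported, num_ops) ==
        NEURON_NO_ERROR) {
      for (uint32_t i = 0; i < num_ops; ++i) {
        if (!supported[i]) {
          /* operation i needs a fallback path */
        }
      }
    }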
-
-/**
- * Hint the compiler to suppress the output data conversion; the user then
- * has to convert the output data from the platform-generated format after
- * inference.
- *
- * Available since 4.2.0
- *
- * @param model The model to be modified.
- * @param suppress True to suppress the output data conversion.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronModel_suppressOutputConversion(NeuronModel* model, bool suppress);
-
-/**
- * Restore the compiled network using a user-provided buffer.
- *
- * The restored NeuronCompilation can be used to create execution instances.
- * The restored NeuronModel cannot be recompiled.
- *
- * Available since 4.3.0
- *
- * @param model Restored model.
- * @param compilation Restored compilation.
- * @param buffer User-provided buffer holding the compiled network.
- * @param size Size of the user-provided buffer in bytes.
- * @return NEURON_NO_ERROR if the compiled network is successfully restored
- * from the user-provided buffer. NEURON_BAD_DATA if it fails to load the
- * compiled network, either because the version does not match or the data
- * is corrupted.
- */
-int NeuronModel_restoreFromCompiledNetwork(
-    NeuronModel** model,
-    NeuronCompilation** compilation,
-    const void* buffer,
-    const size_t size);
-
-/**
- * Restore the compiled network using a user-provided buffer.
- * Supports multiple compilation types; the choices are
- * COMPILATION_TYPE_BATCHED, COMPILATION_TYPE_EXECUTION_CONTROLLER, and
- * COMPILATION_TYPE_NORMAL.
- *
- * There are two ways to use batched compilation:
- * 1) load it from a DLA, or
- * 2) create the batched compilation directly.
- * To load a DLA, call NeuronCompilation_create and
- * NeuronModel_restoreFromCompiledNetworkV2. To create one directly, call
- * NeuronCompilation_createForBatch.
- *
- * The restored NeuronCompilation can be used to create execution instances.
- * The restored NeuronModel cannot be recompiled.
- *
- * Available since 7.0.0
- *
- * @param model Restored model.
- * @param compilation Restored compilation.
- * @param buffer User-provided buffer holding the compiled network.
- * @param size Size of the user-provided buffer in bytes.
- * @param type Type of the compilation to be restored.
- * @return NEURON_NO_ERROR if the compiled network is successfully restored
- * from the user-provided buffer. NEURON_BAD_DATA if it fails to load the
- * compiled network, either because the version does not match or the data
- * is corrupted.
- */
-int NeuronModel_restoreFromCompiledNetworkV2(
-    NeuronModel** model,
-    NeuronCompilation** compilation,
-    const void* buffer,
-    const size_t size,
-    const CompilationType& type);
-
-/**
- * Set a string on the model that users can use to recognize it.
- * It is only used for debugging; the string can be dumped into the log so
- * that users can check the model behavior easily.
- *
- * Available since 7.0.0
- *
- * @param model The model to be modified.
- * @param name The string; the user can free the buffer 'name' after calling
- * this API.
- * @return NEURON_NO_ERROR if the string is set successfully.
- * NEURON_UNEXPECTED_NULL if the input param is nullptr.
- */
-int NeuronModel_setName(NeuronModel* model, const char* name);
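A sketch of restoring a previously compiled network with the V2 entry point (dla_buf / dla_size are assumed to hold a blob produced earlier by NeuronCompilation_storeCompiledNetwork):

    NeuronModel* restored_model = NULL;
    NeuronCompilation* restored_comp = NULL;
    int err = NeuronModel_restoreFromCompiledNetworkV2(
        &restored_model, &restored_comp, dla_buf, dla_size,
        COMPILATION_TYPE_NORMAL);
    if (err != NEURON_NO_ERROR) {
      /* NEURON_BAD_DATA: version mismatch or corrupted blob */
    }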
-
-/**
- * Create a NeuronCompilation to compile the given model.
- *
- * This function only creates the object. Compilation is only performed once
- * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be
- * called once all desired properties have been set on the compilation.
- * NeuronCompilation_free should be called once the compilation is no longer
- * needed. The provided model must outlive the compilation. The model must
- * already have been finished by a call to NeuronModel_finish.
- *
- * Available since 4.1.0
- *
- * @param model The NeuronModel to be compiled.
- * @param compilation The newly created object or NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if successful
- */
-int NeuronCompilation_create(
-    NeuronModel* model,
-    NeuronCompilation** compilation);
-
-/**
- * Create a NeuronCompilation with a specific purpose to compile the given
- * model.
- *
- * This function only creates the object. Compilation is only performed once
- * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be
- * called once all desired properties have been set on the compilation.
- * NeuronCompilation_free should be called once the compilation is no longer
- * needed. The provided model must outlive the compilation. The model must
- * already have been finished by a call to NeuronModel_finish.
- *
- * Available since 7.0.1
- *
- * @param model The NeuronModel to be compiled.
- * @param type The type of the compilation to be created.
- * @param options The options used to create the compilation.
- * @param compilation The newly created object or NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if successful
- */
-int NeuronCompilation_createV2(
-    NeuronModel* model,
-    CompilationType type,
-    const char* options,
-    NeuronCompilation** compilation);
-
-/**
- * Destroy a compilation.
- *
- * Available since 4.1.0
- *
- * @param compilation The compilation to be destroyed.
- */
-void NeuronCompilation_free(NeuronCompilation* compilation);
-
-/**
- * Compilation is finished once NeuronCompilation_finish is invoked.
- * Required before calling NeuronExecution_create. This function must only
- * be called once for a given compilation.
- *
- * Available since 4.1.0
- *
- * @param compilation The compilation to be finished.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_finish(NeuronCompilation* compilation);
-
-/**
- * Gets the supported operations in a model with specific optimization
- * configurations. This function must be called before calling
- * NeuronCompilation_finish.
- *
- * Available since 7.0.0
- *
- * @param compilation The compilation to be queried.
- * @param operationCount The number of operations in the model.
- * @param supported The boolean array to be filled. True means supported.
- * The size of the boolean array must be at least as large as the number of
- * operations in the model. The order of elements in the supported array
- * matches the order in which the corresponding operations were added to
- * the model.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getSupportedOperations(
-    NeuronCompilation* compilation,
-    uint32_t operationCount,
-    bool* supported);
-
-/**
- * Provides optional caching information for faster re-compilation.
- *
- * Available since 4.1.0
- *
- * @param compilation The compilation to be cached.
- * @param cacheDir The cache directory for storing and retrieving caching
- * data. The user should choose a directory local to the application, and is
- * responsible for managing the cache entries.
- * @param token The token provided by the user to specify a model; it must
- * be of length NEURON_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that
- * the token is unique to a model within the application. Neuron cannot
- * detect token collisions; a collision will result in a failed execution or
- * in a successful execution that produces incorrect output values.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setCaching(
-    NeuronCompilation* compilation,
-    const char* cacheDir,
-    const uint8_t* token);
-
-/**
- * Hint the compiler with the size of L1 memory; this value should not be
- * larger than the real platform's setting. The user can get the platform's
- * L1 memory size in KB by calling Neuron_getL1MemorySizeKb.
- *
- * Available since 4.3.0
- *
- * @param compilation The compilation to be modified.
- * @param sizeKb L1 memory size in KB.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setL1MemorySizeKb(
-    NeuronCompilation* compilation,
-    uint32_t sizeKb);
-
-/**
- * Create a NeuronCompilation to compile the given model for a specified set
- * of devices. The user must handle all compilation and execution failures
- * from the specified set of devices. This is in contrast to a use of
- * NeuronCompilation_create, where Neuron will attempt to recover from such
- * failures.
- *
- * Available since 4.1.0
- *
- * @param model The NeuronModel to be compiled.
- * @param devices The set of devices. Must not contain duplicates.
- * @param numDevices The number of devices in the set.
- * @param compilation The newly created object or NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is
- * invalid.
- */
-int NeuronCompilation_createForDevices(
-    NeuronModel* model,
-    const NeuronDevice* const* devices,
-    uint32_t numDevices,
-    NeuronCompilation** compilation);
-
-/**
- * Create a NeuronCompilation which can divide one graph into several
- * subgraphs and use the information for debugging.
- *
- * Only to be used for debugging; neither performance nor thread safety is
- * guaranteed.
- *
- * Available since 5.0.0
- *
- * @param model The NeuronModel to be compiled.
- * @param compilation The newly created object or NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is
- * invalid.
- */
-int NeuronCompilation_createForDebug(
-    NeuronModel* model,
-    NeuronCompilation** compilation);
-
-/**
- * Sets the execution preference associated with this compilation.
- *
- * The default value of the preference is NEURON_PREFER_FAST_SINGLE_ANSWER.
- *
- * Available since 4.1.0
- *
- * @param compilation The compilation to be modified.
- * @param preference Either NEURON_PREFER_LOW_POWER,
- * NEURON_PREFER_FAST_SINGLE_ANSWER, or NEURON_PREFER_SUSTAINED_SPEED.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setPreference(
-    NeuronCompilation* compilation,
-    int32_t preference);
-
-/**
- * Sets the execution priority associated with this compilation.
- *
- * Execution priorities are relative to other executions created by the same
- * application (specifically the same uid) for the same device. Priorities
- * of executions from one application will not affect executions from
- * another application.
- *
- * Higher-priority executions may use more compute resources than
- * lower-priority executions, and may preempt or starve lower-priority
- * executions.
- *
- * Available since 4.1.0
- *
- * @param compilation The compilation to be modified.
- * @param priority The relative priority of the execution compared to other
- * executions created by the application. Must be one of NEURON_PRIORITY_*.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setPriority(NeuronCompilation* compilation, int priority);
-
-/**
- * Get the padded dimensional information of the specified input operand of
- * the compilation.
- * This function must be called after calling NeuronCompilation_finish. If
- * NeuronModel_suppressInputConversion was not applied to the model to be
- * compiled, the returned dimensions are the padded dimensions after
- * NeuronCompilation_finish, satisfying the optimization requirement from
- * the underlying hardware accelerators.
- * If NeuronModel_suppressInputConversion was applied to the model to be
- * compiled, the returned dimensions are the same as the original dimensions
- * given by the user.
- *
- * Available since 4.2.0
- *
- * @param compilation The compilation to be queried.
- * @param index The index of the input operand we are querying. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param dimensions The dimension array to be filled. The size of the array
- * must be exactly as large as the rank of the input operand to be queried
- * in the model.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getInputPaddedDimensions(
-    NeuronCompilation* compilation,
-    int32_t index,
-    uint32_t* dimensions);
-
-/**
- * Get the padded dimensional information of the specified output operand of
- * the compilation. This function must be called after calling
- * NeuronCompilation_finish. If NeuronModel_suppressOutputConversion was not
- * applied to the model to be compiled, the returned dimensions are the
- * padded dimensions after NeuronCompilation_finish, satisfying the
- * optimization requirement from the underlying hardware accelerators.
- * If NeuronModel_suppressOutputConversion was applied to the model to be
- * compiled, the returned dimensions are the same as the original dimensions
- * given by the user.
- *
- * Available since 4.2.0
- *
- * @param compilation The compilation to be queried.
- * @param index The index of the output operand we are querying. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param dimensions The dimension array to be filled. The size of the array
- * must be exactly as large as the rank of the output operand to be queried
- * in the model.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getOutputPaddedDimensions(
-    NeuronCompilation* compilation,
-    int32_t index,
-    uint32_t* dimensions);
-
-/**
- * Get the expected buffer size (bytes) of the specified input operand of
- * the compilation. If NeuronModel_suppressInputConversion was not applied
- * to the model to be compiled, the returned size is the padded size after
- * NeuronCompilation_finish, satisfying the optimization requirement from
- * the underlying hardware accelerators. If
- * NeuronModel_suppressInputConversion was applied to the model to be
- * compiled, the returned size is the same as the original size given by the
- * user.
- *
- * Available since 4.2.0
- *
- * @param compilation The compilation to be queried.
- * @param index The index of the input operand we are querying. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param size The expected buffer size in bytes.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getInputPaddedSize(
-    NeuronCompilation* compilation,
-    int32_t index,
-    size_t* size);
-
-/**
- * Get the expected buffer size (bytes) of the specified output operand of
- * the compilation.
- * If NeuronModel_suppressOutputConversion was not applied to the model to
- * be compiled, the returned size is the padded size after
- * NeuronCompilation_finish, satisfying the optimization requirement from
- * the underlying hardware accelerators. If
- * NeuronModel_suppressOutputConversion was applied to the model to be
- * compiled, the returned size is the same as the original size given by the
- * user.
- *
- * Available since 4.2.0
- *
- * @param compilation The compilation to be queried.
- * @param index The index of the output operand we are querying. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param size The expected buffer size in bytes.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getOutputPaddedSize(
-    NeuronCompilation* compilation,
-    int32_t index,
-    size_t* size);
-
-/**
- * Get the compiled network size of the compilation.
- *
- * This must be called after NeuronCompilation_finish and before
- * NeuronExecution_create. It is not allowed to call this with a compilation
- * restored from cache.
- *
- * Available since 4.3.0
- *
- * @param compilation The compilation to be queried.
- * @param size The compiled network size in bytes.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getCompiledNetworkSize(
-    NeuronCompilation* compilation,
-    size_t* size);
-
-/**
- * Store the compiled network.
- *
- * Users have to allocate the buffer with the specified size before calling
- * this function.
- *
- * This must be called after NeuronCompilation_finish and before
- * NeuronExecution_create. It is not allowed to call this with a compilation
- * restored from cache.
- *
- * Available since 4.3.0
- *
- * @param compilation The compilation to be queried.
- * @param buffer User-allocated buffer to store the compiled network.
- * @param size Size of the user-allocated buffer in bytes.
- * @return NEURON_NO_ERROR if the compiled network is successfully copied to
- * the user-allocated buffer.
- */
-int NeuronCompilation_storeCompiledNetwork(
-    NeuronCompilation* compilation,
-    void* buffer,
-    const size_t size);
-
-/**
- * Hint the compiler to apply the optimization strategy according to the
- * user-specified parameters.
- *
- * Available since 4.3.0
- *
- * @param compilation The compilation to be modified.
- * @param optimizationCode User-specified optimization strategy. Must be one
- * of NEURON_OPTIMIZATION_* or the inclusive OR value of multiple
- * NEURON_OPTIMIZATION_*.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setOptimizationHint(
-    NeuronCompilation* compilation,
-    uint32_t optimizationCode);
-
-/**
- * Hint the compiler to apply the optimization strategy according to the
- * user-specified arguments in a null-terminated string.
- *
- * Available since 4.6.0
- *
- * @param compilation The compilation to be modified.
- * @param optimizationString A null-terminated string representing the
- * user-specified optimization strategy.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setOptimizationString(
-    NeuronCompilation* compilation,
-    const char* optimizationString);
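The export path is a two-call pattern: query the blob size, then store into a caller-allocated buffer. A sketch (malloc-based; the compilation must already be finished and not restored from cache, per the docs above):

    #include <stdlib.h>

    size_t dla_size = 0;
    void* dla_buf = NULL;
    if (NeuronCompilation_getCompiledNetworkSize(compilation, &dla_size) ==
        NEURON_NO_ERROR) {
      dla_buf = malloc(dla_size);
      if (dla_buf != NULL &&
          NeuronCompilation_storeCompiledNetwork(compilation, dla_buf,
                                                 dla_size) == NEURON_NO_ERROR) {
        /* dla_buf now holds the blob that
         * NeuronModel_restoreFromCompiledNetworkV2 can reload later. */
      }
    }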
-
-/**
- * Only allow the user's optimization string (from
- * NeuronCompilation_setOptimizationString); the system won't set any
- * compiler options on its own.
- *
- * Available since 6.0.5
- *
- * @param compilation The compilation to be modified.
- * @param allow Whether to use only the user's settings.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setOnlyAllowOptimizationString(
-    NeuronCompilation* compilation,
-    bool allow);
-
-/**
- * Get the compiler hints used to apply the optimization strategy according
- * to the user-specified arguments in a null-terminated string.
- *
- * Available since 6.0.5
- *
- * @param compilation The compilation to be queried.
- * @param optimizationString A null-terminated string representing the
- * user-specified optimization strategy.
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_getOptimizationString(
-    NeuronCompilation* compilation,
-    const char** optimizationString);
-
-/**
- * Hint the compiler to trim the model IO alignment.
- *
- * Available since 4.4.8
- *
- * @param compilation The compilation to be modified.
- * @param enable 'true' for trimming the model IO alignment.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setTrimIOAlignment(
-    NeuronCompilation* compilation,
-    bool enable);
-
-/**
- * Hint the compiler to use software dilated convolution.
- *
- * Available since 4.4.8
- *
- * @param compilation The compilation to be modified.
- * @param enable 'true' indicates a hint to the compiler to use software
- * dilated convolution.
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronCompilation_setSWDilatedConv(
-    NeuronCompilation* compilation,
-    bool enable);
-
-/**
- * Create a new execution instance by calling the NeuronExecution_create
- * function. The provided compilation must outlive the execution.
- *
- * Available since 4.1.0
- *
- * @param compilation The NeuronCompilation to be evaluated.
- * @param execution The newly created object or NULL if unsuccessful.
- *
- * @return NEURON_NO_ERROR if successful
- */
-int NeuronExecution_create(
-    NeuronCompilation* compilation,
-    NeuronExecution** execution);
-
-/**
- * Destroy an execution.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be destroyed.
- */
-void NeuronExecution_free(NeuronExecution* execution);
-
-/**
- * Associate a user buffer with an input of the model of the
- * NeuronExecution. The provided buffer must outlive the execution.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be modified.
- * @param index The index of the input argument we are setting. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param type The NeuronOperandType of the operand. Currently NeuronAdapter
- * only takes NULL.
- * @param buffer The buffer containing the data.
- * @param length The length in bytes of the buffer.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not
- * recognized or the buffer is too small for the input.
- */
-int NeuronExecution_setInput(
-    NeuronExecution* execution,
-    int32_t index,
-    const NeuronOperandType* type,
-    const void* buffer,
-    size_t length);
-
-/**
- * Associate a user buffer with an output of the model of the
- * NeuronExecution. The provided buffer must outlive the execution.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be modified.
- * @param index The index of the output argument we are setting. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param type The NeuronOperandType of the operand. Currently NeuronAdapter
- * only takes NULL.
- * @param buffer The buffer where the data is to be written.
- * @param length The length in bytes of the buffer.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not
- * recognized or the buffer is too small for the output.
- */
-int NeuronExecution_setOutput(
-    NeuronExecution* execution,
-    int32_t index,
-    const NeuronOperandType* type,
-    void* buffer,
-    size_t length);
-
-/**
- * Associate part of a memory object with an input of the model of the
- * NeuronExecution.
- *
- * The provided memory must outlive the execution and should not be changed
- * during computation.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be modified.
- * @param index The index of the input argument we are setting. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param type The NeuronOperandType of the operand. Currently NeuronAdapter
- * only takes NULL.
- * @param memory The memory containing the data.
- * @param offset This specifies the location of the data within the memory.
- * The offset is in bytes from the start of memory.
- * @param length The size in bytes of the data value.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not
- * recognized or the buffer is too small for the input.
- */
-int NeuronExecution_setInputFromMemory(
-    NeuronExecution* execution,
-    uint32_t index,
-    const NeuronOperandType* type,
-    const NeuronMemory* memory,
-    size_t offset,
-    size_t length);
-
-/**
- * Associate part of a memory object with an output of the model of the
- * NeuronExecution.
- *
- * The provided memory must outlive the execution and should not be changed
- * during computation.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be modified.
- * @param index The index of the output argument we are setting. It is an
- * index into the lists passed to NeuronModel_identifyInputsAndOutputs. It
- * is not the index associated with NeuronModel_addOperand.
- * @param type The NeuronOperandType of the operand. Currently NeuronAdapter
- * only takes NULL.
- * @param memory The memory containing the data.
- * @param offset This specifies the location of the data within the memory.
- * The offset is in bytes from the start of memory.
- * @param length The size in bytes of the data value.
- *
- * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not
- * recognized or the buffer is too small for the output.
- */
-int NeuronExecution_setOutputFromMemory(
-    NeuronExecution* execution,
-    uint32_t index,
-    const NeuronOperandType* type,
-    const NeuronMemory* memory,
-    size_t offset,
-    size_t length);
-
-/**
- * Schedule synchronous evaluation of the execution.
- * Returns once the execution has completed and the outputs are ready to be
- * consumed.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be scheduled and executed.
- *
- * @return NEURON_NO_ERROR if the execution completed normally.
- * NEURON_BAD_STATE if the inference fails. Two return codes were added in
- * 5.0.0: NEURON_MISSED_DEADLINE_TRANSIENT if the inference times out, and
- * NEURON_OUTPUT_INSUFFICIENT_SIZE if the given output size is not
- * sufficient for the real output.
- */
-int NeuronExecution_compute(NeuronExecution* execution);
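Putting the execution half together, a minimal synchronous run. compilation is a finished NeuronCompilation; in_buf / out_buf and their lengths are the caller's I/O buffers (the NULL type arguments follow the requirement stated above):

    NeuronExecution* exec = NULL;
    if (NeuronExecution_create(compilation, &exec) == NEURON_NO_ERROR) {
      NeuronExecution_setInput(exec, 0, NULL, in_buf, in_len);
      NeuronExecution_setOutput(exec, 0, NULL, out_buf, out_len);
      if (NeuronExecution_compute(exec) == NEURON_NO_ERROR) {
        /* out_buf is ready to be consumed */
      }
      NeuronExecution_free(exec);
    }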
-
-/**
- * Schedule asynchronous evaluation of the execution with dependencies.
- *
- * The execution will wait for all the depending events to be signaled
- * before starting the evaluation. Once the execution has completed and the
- * outputs are ready to be consumed, the returned event will be signaled.
- * Depending on which devices are handling the execution, the event could be
- * backed by a sync fence. Use NeuronEvent_wait to wait for that event.
- *
- * NeuronEvent_wait must be called to recuperate the resources used by the
- * execution.
- *
- * If parts of the execution are scheduled on devices that do not support
- * fenced execution, the function call may wait for such parts to finish
- * before returning.
- *
- * The function will return an error if any of the events in dependencies is
- * already in a bad state. After the execution is scheduled, if any of the
- * events in dependencies does not complete normally, the execution will
- * fail, and NeuronEvent_wait on the returned event will return an error.
- *
- * The function will return an error if any of the execution outputs has a
- * tensor operand type that is not fully specified.
- *
- * @param execution The execution to be scheduled and executed.
- * @param dependencies A set of depending events. The actual evaluation will
- * not start until all the events are signaled.
- * @param num_dependencies The number of events in the dependencies set.
- * @param duration Currently not used.
- * @param event The event that will be signaled on completion. event is set
- * to NULL if there's an error.
- *
- * @return NEURON_NO_ERROR if the evaluation is successfully scheduled.
- *
- * Available since 5.0.0
- */
-int NeuronExecution_startComputeWithDependencies(
-    NeuronExecution* execution,
-    const NeuronEvent* const* dependencies,
-    uint32_t num_dependencies,
-    uint64_t duration,
-    NeuronEvent** event);
-
-/**
- * Set the maximum duration of WHILE loops in the specified execution.
- *
- * @param execution The execution to be modified.
- * @param duration The maximum amount of time in nanoseconds.
- * @return NEURON_NO_ERROR if successful.
- *
- * Available since 5.0.0
- */
-int NeuronExecution_setLoopTimeout(
-    NeuronExecution* execution,
-    uint64_t duration);
-
-/**
- * Get the default timeout value for WHILE loops.
- *
- * @return The default timeout value in nanoseconds.
- *
- * Available since 5.0.0
- */
-uint64_t Neuron_getDefaultLoopTimeout();
-
-/**
- * Get the maximum timeout value for WHILE loops.
- *
- * @return The maximum timeout value in nanoseconds.
- *
- * Available since 5.0.0
- */
-uint64_t Neuron_getMaximumLoopTimeout();
-
-/**
- * Sets the execution boost hint associated with this execution. Required
- * before calling NeuronExecution_compute.
- *
- * The execution boost is a hint for the device frequency, ranging from 0
- * (lowest) to 100 (highest). For a compilation with the preference set to
- * NEURON_PREFER_SUSTAINED_SPEED, the scheduler guarantees that the
- * executing boost value equals the boost value hint.
- *
- * On the other hand, for a compilation with the preference set to
- * NEURON_PREFER_LOW_POWER, the scheduler tries to save power by configuring
- * the executing boost value with some value that is not higher than the
- * boost value hint.
- *
- * Available since 4.1.0
- *
- * @param execution The execution to be modified.
- * @param boostValue The hint for the device frequency, ranging from 0
- * (lowest) to 100 (highest).
- *
- * @return NEURON_NO_ERROR if successful.
- */
-int NeuronExecution_setBoostHint(
-    NeuronExecution* execution,
-    uint8_t boostValue);
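A sketch of the fenced path described above, reusing exec from the previous sketch. The NeuronEvent creation and signaling APIs are not part of this hunk, so deps / num_deps are assumed to come from elsewhere, and NeuronEvent_wait's exact signature is an assumption based on the doc text:

    NeuronEvent* done = NULL;
    int err = NeuronExecution_startComputeWithDependencies(
        exec, deps, num_deps, 0 /* duration: currently not used */, &done);
    if (err == NEURON_NO_ERROR) {
      NeuronEvent_wait(done);  /* also recuperates execution resources */
    }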
- * Required before calling NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The default value of the hint is NEURON_CACHE_FLUSH_ENABLE_ALL. - * - * Available since 5.0.1 - * - * @param execution The execution to be modified. - * @param hint It is either NEURON_CACHE_FLUSH_ENABLE_ALL or the bitwise OR - * of one or more of the following flags: NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT, - * NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_setCacheFlushHint( - NeuronExecution* execution, - uint8_t flushHint); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param rank The rank of the output operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandRank( - NeuronExecution* execution, - int32_t index, - uint32_t* rank); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. The target - * output operand cannot be a scalar. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the output operand to be queried in - * the model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandDimensions( - NeuronExecution* execution, - int32_t index, - uint32_t* dimensions); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForBatch( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set the size of the runner pool, and create the same number of runners. - * - * The execution must be created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * The execution created from this compilation has to use - * NeuronExecution_setRunnerPoolSize to create a thread pool and then set a series - * of inputs & outputs into the execution. The execution will run inference with - * the series of inputs.
- * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * @param numRunners The number of runners to be created. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setRunnerPoolSize( - NeuronExecution* execution, - uint8_t numRunners); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must be created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setBatchDone(NeuronExecution* execution); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must be created by the following steps: - * 1. NeuronCompilation_createV2 with COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronCompilation_finish - * 3. NeuronExecution_create. - * or - * 1. NeuronModel_restoreFromCompiledNetworkV2 with - * COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronExecution_create. - * - * Available since 7.0.1 - * - * @param execution The NeuronExecution to be utilized. - * @param idx The index of the runner on which to set the previous inputs and - * outputs. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * COMPILATION_TYPE_EXECUTION_CONTROLLER. - */ -int NeuronExecution_setIODone(NeuronExecution* execution, int idx); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * The executions created from this compilation can be executed at the same - * time. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForMultiExecutions( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set report path for debug plus. - * - * Only used for debug purposes; the execution should be created from a - * NeuronCompilation_createForDebug compilation. - * - * Available since 5.0.0 - * - * @param model The model to be debugged. - * @param path The path of the execution report. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the path is empty. - */ -int NeuronDebug_setReportPath(NeuronModel* model, const char* path); - -/** - * Get the number of available devices. - * - * Available since 4.1.0 - * @param numDevices The number of devices returned. - * - * @return NEURON_NO_ERROR if successful.
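The batch APIs above are order-sensitive, so a call-order sketch may help. This is a hypothetical Python ctypes sequence, assuming a zero NEURON_NO_ERROR and the library name; NeuronCompilation_finish and NeuronExecution_create are the intermediate steps the docs above list.

import ctypes

NEURON_NO_ERROR = 0  # assumed zero-for-success
lib = ctypes.CDLL("libneuron_adapter.so")  # library name is an assumption
model = ctypes.c_void_p()  # stands in for a NeuronModel finished elsewhere

compilation = ctypes.c_void_p()
assert lib.NeuronCompilation_createForBatch(
    model, ctypes.byref(compilation)) == NEURON_NO_ERROR
assert lib.NeuronCompilation_finish(compilation) == NEURON_NO_ERROR
execution = ctypes.c_void_p()
assert lib.NeuronExecution_create(
    compilation, ctypes.byref(execution)) == NEURON_NO_ERROR
assert lib.NeuronExecution_setRunnerPoolSize(execution, 4) == NEURON_NO_ERROR
# ... bind the series of inputs/outputs with the *FromMemory calls, then:
assert lib.NeuronExecution_setBatchDone(execution) == NEURON_NO_ERROR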
- */ -int Neuron_getDeviceCount(uint32_t* numDevices); - -/** - * Get the representation of the specified device. - * - * Available since 4.1.0 - * - * @param devIndex The index of the specified device. Must be less than the - * number of available devices. - * @param device The representation of the specified device. The same - * representation will always be returned for the specified device. - * - * @return NEURON_NO_ERROR if successful. - */ -int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device); - -/** - * Get the name of the specified device. - * - * Available since 4.1.0 - * - * @param device The representation of the specified device. - * @param name The returned name of the specified device. The name will remain - * valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getName(const NeuronDevice* device, const char** name); - -/** - * Get the description of the specified device. - * - * Available since 5.0.0 - * - * @param device The representation of the specified device. - * @param description The returned description of the specified device. The - * description will remain valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getDescription( - const NeuronDevice* device, - const char** description); - -/* - * Destroys the event. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_free(NeuronEvent* event); - -/* - * Force destroys the event without calling NeuronEvent_wait(). - * If the user wants to wait before destroying the event, they should use - * NeuronEvent_free. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 6.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_freeForce(NeuronEvent* event); - -/** - * Waits until the execution completes. - * - * More than one thread can wait on an event. When the execution completes, - * all threads will be released. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event that will be signaled on completion. - * @return NEURON_NO_ERROR if the execution completed normally. - * NEURON_UNMAPPABLE if the execution input or output memory cannot - * be properly mapped. - */ -int NeuronEvent_wait(NeuronEvent* event); - -/** - * Create a NeuronEvent from a sync_fence file descriptor. - * - * The newly created NeuronEvent does not take ownership of the provided - * sync_fence_fd, it will instead dup the provided sync_fence_fd and own the - * duplicate. - * - * @param sync_fence_fd The sync_fence file descriptor. - * @param event The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_createFromSyncFenceFd(int sync_fence_fd, NeuronEvent** event); - -/** - * Get sync_fence file descriptor from the event. - * - * If the NeuronEvent is not backed by a sync fence, the sync_fence_fd - * will be set to -1, and NEURON_BAD_DATA will be returned. - * - * See NeuronEvent_createFromSyncFenceFd and - * NeuronExecution_startComputeWithDependencies to see how to create an event - * backed by a sync fence.
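Because NeuronEvent_getSyncFenceFd dups the descriptor and transfers ownership to the caller, a small wrapper makes the close responsibility explicit. A Python ctypes sketch, with the zero NEURON_NO_ERROR value assumed:

import ctypes
import os

NEURON_NO_ERROR = 0  # assumed zero-for-success


def sync_fence_fd(lib: ctypes.CDLL, event: ctypes.c_void_p):
    fd = ctypes.c_int(-1)
    if lib.NeuronEvent_getSyncFenceFd(event, ctypes.byref(fd)) != NEURON_NO_ERROR:
        return None  # NEURON_BAD_DATA: the event is not backed by a sync fence
    return fd.value  # the caller owns this fd and must os.close() it when done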
- * - * The user takes ownership of the returned fd, and must close the returned file - * descriptor when it is no longer needed. - * - * @param event An event that is backed by a sync fence. - * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will - * be set to -1 if there is an error. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_getSyncFenceFd(const NeuronEvent* event, int* sync_fence_fd); - -/** - * Queries whether an extension is supported by the driver implementation of the - * specified device. - * - * @param extension The extension name. - * @param isExtensionSupported The boolean value indicating whether the - * extension is supported. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -// Note: Remove "device" -int NeuronDevice_getExtensionSupport( - const char* extensionName, - bool* isExtensionSupported); - -/** - * Creates an operand type from an extension name and an extension operand code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operand. - * @param extensionName The extension name. - * @param operandCodeWithinExtension The extension operand code. - * @param type The operand type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperandType( - NeuronModel* model, - const char* extensionName, - uint16_t operandCodeWithinExtension, - int32_t* type); - -/** - * Creates an operation type from an extension name and an extension operation - * code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operation. - * @param extensionName The extension name. - * @param operationCodeWithinExtension The extension operation code. - * @param type The operation type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperationType( - NeuronModel* model, - const char* extensionName, - uint16_t operationCodeWithinExtension, - int32_t* type); - -/** - * Sets extension operand parameters. - * - * Available since 5.0.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param data A pointer to the extension operand data. - * The data does not have to outlive the call to this function. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandExtensionData( - NeuronModel* model, - int32_t index, - const void* data, - size_t length); - -/** - * Gets the execution preference associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param preference The execution preference will be one of NEURON_PREFER_*. - * Ignore preference value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPreference( - NeuronCompilation* compilation, - int* preference); - -/** - * Gets the execution priority associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param priority The priority will be one of NEURON_PRIORITY_*. 
Ignore - * priority value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPriority( - NeuronCompilation* compilation, - int* priority); - -int NeuronCompilation_createWithOptions( - NeuronModel* model, - NeuronCompilation** compilation, - const char* options); -__END_DECLS diff --git a/backends/mediatek/runtime/include/api/NeuronAdapterShim.h b/backends/mediatek/runtime/include/api/NeuronAdapterShim.h index 3b955eb4976..b5c7cd4098f 100644 --- a/backends/mediatek/runtime/include/api/NeuronAdapterShim.h +++ b/backends/mediatek/runtime/include/api/NeuronAdapterShim.h @@ -1,38 +1,9 @@ -/* Copyright Statement: +/* + * Copyright (c) 2024 MediaTek Inc. * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2020. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. */ #pragma once diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index 7348ac94a6e..8d07cd9a366 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -26,7 +26,7 @@ endif() set(COMMON_INCLUDE_DIRS ${EXECUTORCH_ROOT}/..) 
# Include utility CMake scripts from ExecuTorch -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) # Find OpenVINO libraries find_package(OpenVINO REQUIRED) diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index 431ffcff67c..a3134f72b4b 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -82,8 +82,7 @@ exr::Result OpenvinoBackend::init( // Allocate execution handle exr::MemoryAllocator* allocator = context.get_runtime_allocator(); - ExecutionHandle* handle = - ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle); + ExecutionHandle* handle = allocator->allocateInstance<ExecutionHandle>(); new (handle) ExecutionHandle; handle->compiled_model = std::make_shared<ov::CompiledModel>(compiled_model); handle->infer_request = infer_request; diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index aefa929ee9f..f5adc84f903 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -39,16 +39,13 @@ if(${ANDROID}) find_library(android_log log) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - set(qcir_schema_include_dir ${CMAKE_CURRENT_LIST_DIR}/aot/ir) set(qcir_schema_output ${qcir_schema_include_dir}/qcir_generated.h) add_custom_command( OUTPUT qcir_schema_output COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o ${qcir_schema_include_dir} ${qcir_schema_include_dir}/qcir.fbs + DEPENDS flatc COMMENT "Generating qualcomm ir schema headers" VERBATIM ) @@ -100,6 +97,7 @@ add_custom_command( "${_qnn_schema__include_dir}/executorch/backends/qualcomm" ${_qnn_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_SOURCE_DIR} + DEPENDS flatc COMMENT "Generating qnn_schema headers" VERBATIM ) @@ -126,7 +124,6 @@ add_library(qnn_executorch_logging STATIC) add_library(qnn_factory STATIC) add_library(qnn_function_interface INTERFACE) add_library(qnn_graph STATIC) -add_library(qnn_header INTERFACE) add_library(qnn_implementation STATIC) add_library(qnn_logger STATIC) add_library(qnn_manager STATIC) @@ -143,16 +140,12 @@ add_library(utils STATIC) # declare dependency # target_link_libraries(qcir_utils PRIVATE qcir) -target_link_libraries(wrappers PRIVATE qnn_header qnn_executorch_logging) -target_link_libraries(qnn_function_interface INTERFACE qnn_header) +target_link_libraries(wrappers PRIVATE qnn_executorch_logging) target_link_libraries( - qnn_implementation PRIVATE qnn_function_interface qnn_header - qnn_executorch_logging ${CMAKE_DL_LIBS} + qnn_implementation PRIVATE qnn_function_interface qnn_executorch_logging ${CMAKE_DL_LIBS} ) -target_link_libraries(qnn_sys_function_interface INTERFACE qnn_header) target_link_libraries( - qnn_sys_implementation PRIVATE qnn_sys_function_interface qnn_header - qnn_executorch_logging ${CMAKE_DL_LIBS} + qnn_sys_implementation PRIVATE qnn_sys_function_interface qnn_executorch_logging ${CMAKE_DL_LIBS} ) target_link_libraries(qnn_executorch_logging PRIVATE qnn_schema) target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging) @@ -178,9 +171,7 @@ target_link_libraries( ) target_link_libraries( - qnn_factory - PUBLIC qnn_header - PRIVATE qnn_schema qnn_backend qnn_device qnn_context qnn_graph + qnn_factory PRIVATE qnn_schema qnn_backend qnn_device qnn_context qnn_graph qnn_mem_manager qnn_custom_protocol ) target_link_libraries( diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 9e1974bad6a..85019add313 100644 ---
a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -124,6 +124,7 @@ PRs are always welcome to help improve the codebase in a comprehensive manner. B - [shewu-quic](https://github.com/shewu-quic) - [chunit-quic](https://github.com/chunit-quic) - [winskuo-quic](https://github.com/winskuo-quic) + - [DannyYuyang-quic](https://github.com/DannyYuyang-quic) - [haowhsu-quic](https://github.com/haowhsu-quic) Thanks again for your contribution! diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 36e3fb4356a..fb65e6b5f75 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -1,11 +1,7 @@ -from .annotate_and_quant_scalar import AnnotateAndQuantScalar from .annotate_decomposed import AnnotateDecomposed from .annotate_quant_attrs import AnnotateQuantAttrs from .constant_i64_to_i32 import ConstantI64toI32 -from .convert_binary_op_with_scalar import ConvertBinaryOpsWithScalar from .convert_bmm_to_matmul import ConvertBmmToMatmul -from .convert_interpolate_with_upsample2d import ConvertInterpolateWithUpsample2D -from .convert_prelu import ConvertPReLU from .convert_to_linear import ConvertToLinear from .decompose_any import DecomposeAny from .decompose_einsum import DecomposeEinsum @@ -17,7 +13,9 @@ from .insert_io_qdq import InsertIOQDQ from .insert_requantize import InsertRequantize from .layout_transform import LayoutTransform +from .lift_constant_scalar_operands import LiftConstantScalarOperands from .recompose_pixel_unshuffle import RecomposePixelUnshuffle +from .recompose_prelu import RecomposePReLU from .recompose_rms_norm import RecomposeRmsNorm from .reduce_dynamic_range import ReduceDynamicRange from .remove_redundancy import RemoveRedundancy @@ -27,14 +25,11 @@ __all__ = [ - AnnotateAndQuantScalar, AnnotateDecomposed, AnnotateQuantAttrs, ConstantI64toI32, ConvertBmmToMatmul, - ConvertBinaryOpsWithScalar, - ConvertInterpolateWithUpsample2D, - ConvertPReLU, + RecomposePReLU, ConvertToLinear, DecomposeAny, DecomposeEinsum, @@ -46,6 +41,7 @@ InsertIOQDQ, InsertRequantize, LayoutTransform, + LiftConstantScalarOperands, RecomposePixelUnshuffle, RecomposeRmsNorm, ReduceDynamicRange, diff --git a/backends/qualcomm/_passes/annotate_and_quant_scalar.py b/backends/qualcomm/_passes/annotate_and_quant_scalar.py deleted file mode 100644 index 9daaa4aa624..00000000000 --- a/backends/qualcomm/_passes/annotate_and_quant_scalar.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import itertools -import operator -from typing import Dict - -import torch -from executorch.backends.qualcomm.builders.utils import get_parameter -from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - -from .utils import dq_ops, get_quant_attrs - - -class AnnotateAndQuantScalar(ExportPass): - """ - For binary operators who take constant scalar as one of its inputs, - will annotate encoding to the constant if necessary. 
- """ - - binary_op_sources = [ - operator.add, - operator.sub, - operator.mul, - operator.truediv, - torch.add, - torch.sub, - torch.mul, - torch.div, - torch.ops.aten.add.Scalar, - torch.ops.aten.sub.Scalar, - torch.ops.aten.mul.Scalar, - torch.ops.aten.div.Scalar, - torch.ops.aten.mul.Tensor, - "add", - "sub", - "mul", - "truediv", - ] - - def __init__(self, edge_program: torch.export.ExportedProgram): - super(AnnotateAndQuantScalar, self).__init__() - self.edge_program = edge_program - - def _get_source_scalar_node(self, node: torch.fx.Node) -> torch.fx.Node: - """ - This recursion function is specific for multiply followed by a cast - """ - if node.op == "placeholder": - if not (shape := node.meta["val"].size()): - return node - assert ( - not shape - ), f"The output of node {node} is not a scalar, but a tensor with shape {shape}" - return self._get_source_scalar_node(node.args[0]) - - def _update_scalar_node_attrs(self, node: torch.fx.Node, quant_attrs: Dict) -> Dict: - val = get_parameter(node, self.edge_program) - quant_range = quant_attrs["quant_max"] - quant_attrs["quant_min"] - # Use 0 as the zero_point for scalar - quant_attrs["zero_point"] = 0 if val >= 0 else quant_attrs["quant_max"] - quant_attrs["scale"] = ( - val.div(quant_range) if val >= 0 else -val.div(quant_range) - ) - return quant_attrs - - def _annotate_scalar_node( - self, - be_annotated_node: torch.fx.Node, - quant_attrs: Dict, - ) -> None: - """ - This recursion function is specific for multiply followed by a cast - """ - if be_annotated_node.meta["val"].dtype not in [ - float, - torch.float32, - torch.int32, - torch.int64, - ]: - return - - be_annotated_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): - src_partitions = get_source_partitions( - graph_module.graph, self.binary_op_sources - ) - src_partitions = list(itertools.chain(*src_partitions.values())) - processed = set() - for src_partition in src_partitions: - # need post process here to identify partitioned nodes: - src_fn_dict = {} - for n in src_partition.nodes: - # e.g. 
- # meta["source_fn_stack"]: [('mul', )] - # we'll use as grouping key - node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], []) - node_list.append(n) - - for nodes in src_fn_dict.values(): - output = [n for n in nodes if n in src_partition.output_nodes][0] - # if all args have been annotated, it shouldn't be a scalar operation - if all(arg.target in dq_ops for arg in output.args): - continue - - if output not in processed and QCOM_QUANT_ATTRS in output.meta: - dq_node = [n for n in output.args if n.target in dq_ops][0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: - continue - - scalar_node = scalar_nodes[0] - source_scalar_node = self._get_source_scalar_node(scalar_node) - # we'll abandon cast op here, since the constant scalar will - # be pre-loaded into QNN context binary - output.replace_input_with(scalar_node, source_scalar_node) - - scalar_quant_attrs = self._update_scalar_node_attrs( - source_scalar_node, q_node_attrs - ) - self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs) - processed.add(output) - - def call(self, graph_module: torch.fx.GraphModule): - self._traverse_binary_node(graph_module) - graph_module.recompile() - dead_code_elimination_pass(graph_module) - return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py index b9efcd7aa6c..ed19a54b7e7 100644 --- a/backends/qualcomm/_passes/annotate_quant_attrs.py +++ b/backends/qualcomm/_passes/annotate_quant_attrs.py @@ -10,6 +10,7 @@ from executorch.backends.qualcomm.builders.utils import get_parameter, set_parameter from executorch.backends.qualcomm.utils.constants import ( QCOM_AXIS, + QCOM_BLOCK_SIZE, QCOM_DTYPE, QCOM_ENCODING, QCOM_QUANT_ATTRS, @@ -122,13 +123,25 @@ def _dequant_fold_params(self, n, quant_attrs, param): scales = self._expand(quant_attrs[QCOM_SCALES], dim, axis) offsets = self._expand(quant_attrs[QCOM_ZERO_POINTS], dim, axis) param = param.sub(offsets).mul(scales).to(torch.float32).contiguous() - set_parameter(param, n.args[0], self.edge_program) + elif quant_attrs[QCOM_ENCODING] in [ + exir_ops.edge.pt2e_quant.dequantize_affine.default + ]: + param = torch.ops.pt2e_quant.dequantize_affine( + param, + block_size=quant_attrs[QCOM_BLOCK_SIZE], + scale=quant_attrs[QCOM_SCALE], + zero_point=quant_attrs[QCOM_ZERO_POINT], + input_dtype=quant_attrs[QCOM_DTYPE], + quant_min=quant_attrs[QCOM_QUANT_MIN], + quant_max=quant_attrs[QCOM_QUANT_MAX], + output_dtype=torch.float32, + ) else: scale = quant_attrs[QCOM_SCALE] offset = quant_attrs[QCOM_ZERO_POINT] param = param.sub(offset).mul(scale).to(torch.float32).contiguous() - set_parameter(param, n.args[0], self.edge_program) + set_parameter(param, n.args[0], self.edge_program) n.args[0].meta["val"] = param def _annotate_quant_attrs( diff --git a/backends/qualcomm/_passes/convert_binary_op_with_scalar.py b/backends/qualcomm/_passes/convert_binary_op_with_scalar.py deleted file mode 100644 index 22ce48800d0..00000000000 --- a/backends/qualcomm/_passes/convert_binary_op_with_scalar.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
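For reference, the scalar encoding computed by the AnnotateAndQuantScalar pass removed above can be reproduced standalone. This sketch mirrors _update_scalar_node_attrs: zero_point is pinned to 0 for a non-negative scalar (quant_max otherwise), and the scale spreads the value over the quant range.

import torch


def scalar_quant_attrs(val: torch.Tensor, quant_min: int, quant_max: int) -> dict:
    quant_range = quant_max - quant_min
    return {
        # Use 0 as the zero_point for a non-negative scalar.
        "zero_point": 0 if val >= 0 else quant_max,
        "scale": val.div(quant_range) if val >= 0 else -val.div(quant_range),
        "quant_min": quant_min,
        "quant_max": quant_max,
    }


print(scalar_quant_attrs(torch.tensor(0.5), quant_min=0, quant_max=255))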
-from typing import Dict, Tuple - -import torch -from executorch.exir.pass_base import ExportPass -from torch._export.pass_base import Argument -from torch._export.pass_infra.node_metadata import NodeMetadata -from torch._export.pass_infra.proxy_value import ProxyValue - - -class ConvertBinaryOpsWithScalar(ExportPass): - """ - Replace binary ops with scalar into binary ops with tensor. - Since torch.ops.aten.xxx.Scalar will not generate a placeholder node - for scalar after to_edge. - """ - - binary_ops_with_scalar = { - torch.ops.aten.add.Scalar: torch.ops.aten.add.Tensor, - torch.ops.aten.sub.Scalar: torch.ops.aten.sub.Tensor, - torch.ops.aten.div.Scalar: torch.ops.aten.div.Tensor, - torch.ops.aten.mul.Scalar: torch.ops.aten.mul.Tensor, - } - - def __init__(self): - super(ConvertBinaryOpsWithScalar, self).__init__() - - def call_operator( - self, - op, - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - meta: NodeMetadata, - ) -> ProxyValue: - return super().call_operator( - self.binary_ops_with_scalar.get(op, op), args, kwargs, meta - ) diff --git a/backends/qualcomm/_passes/convert_interpolate_with_upsample2d.py b/backends/qualcomm/_passes/convert_interpolate_with_upsample2d.py deleted file mode 100644 index 8fa73ebe49e..00000000000 --- a/backends/qualcomm/_passes/convert_interpolate_with_upsample2d.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class ConvertInterpolateWithUpsample2D(ExportPass): - """ - Merge decomposed operators from interpolate back to one super node. - TODO: Currently we only map to upsample2d version, should extend the - capability by reverse engineering the decomposition process. 
- """ - - def __init__(self): - super(ConvertInterpolateWithUpsample2D, self).__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - partitions = get_source_partitions(graph, [torch.nn.functional.interpolate]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - # TODO: robust way to get the configuration parameters and operator - # please check torch/_decomp/decomposition.py for details - if output_node.target.__name__ == "aten.index.Tensor": - # nearest_2d - # args: input, output_size, scales_h, scales_w - output_size = list(output_node.meta["val"].shape) - args = [input_node, output_size[-2:]] - upsample_op = exir_ops.edge.aten.upsample_nearest2d.default - else: - # upsample_2d - # args: input, output_size, aligned_corners, scales_h, scales_w - output_size = list(output_node.meta["val"].shape) - args = [input_node, output_size[-2:], False] - upsample_op = exir_ops.edge.aten.upsample_bilinear2d.default - - upsample2d_node = graph.create_node( - "call_function", upsample_op, tuple(args) - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, upsample2d_node) - # copy metadata - upsample2d_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_any.py b/backends/qualcomm/_passes/decompose_any.py index c0c65ee7040..e92bf11dd18 100644 --- a/backends/qualcomm/_passes/decompose_any.py +++ b/backends/qualcomm/_passes/decompose_any.py @@ -41,7 +41,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: keepdim = node.args[2] if len(node.args) > 2 else False model = Any(dim, keepdim) edge_mgr = to_edge( - torch.export.export(model, (node.args[0].meta["val"],)) + torch.export.export(model, (node.args[0].meta["val"],), strict=True) ) decomposed_module = edge_mgr.exported_program() diff --git a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py index 8006780863b..4a54c2aa50c 100644 --- a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py +++ b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py @@ -32,9 +32,9 @@ class DecomposeLinalgVectorNorm(ExportPass): Decompose for math equivalent op. 
""" - def __init__(self, quantization_capture=False) -> None: + def __init__(self, aten_dialect_capture=False) -> None: super().__init__() - self.quantization_capture = quantization_capture + self.aten_dialect_capture = aten_dialect_capture def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph = graph_module.graph @@ -44,13 +44,15 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: dim = node.args[2] if len(node.args) > 2 else None keepdim = node.args[3] if len(node.args) > 3 else False model = LinalgVectorNorm(ord, dim, keepdim) - if self.quantization_capture: + if self.aten_dialect_capture: decomposed_module = torch.export.export( - model, (node.args[0].meta["val"],) + model, (node.args[0].meta["val"],), strict=True ).module() else: edge_mgr = to_edge( - torch.export.export(model, (node.args[0].meta["val"],)) + torch.export.export( + model, (node.args[0].meta["val"],), strict=True + ) ) decomposed_module = edge_mgr.exported_program() diff --git a/backends/qualcomm/_passes/decompose_silu.py b/backends/qualcomm/_passes/decompose_silu.py index ca1a566be1e..96c48920419 100644 --- a/backends/qualcomm/_passes/decompose_silu.py +++ b/backends/qualcomm/_passes/decompose_silu.py @@ -30,13 +30,15 @@ def call(self, graph_module: torch.fx.GraphModule): silu_node_input = node.args[0] with graph_module.graph.inserting_after(silu_node_input): sigmoid_node = graph.create_node( - "call_function", torch.ops.aten.sigmoid, (silu_node_input,) + "call_function", + torch.ops.aten.sigmoid.default, + (silu_node_input,), ) sigmoid_node.meta = self._copy_meta(silu_node.meta) with graph_module.graph.inserting_after(sigmoid_node): mul_node = graph.create_node( "call_function", - torch.ops.aten.mul, + torch.ops.aten.mul.Tensor, (silu_node_input, sigmoid_node), ) mul_node.meta = self._copy_meta(silu_node.meta) diff --git a/backends/qualcomm/_passes/fuse_consecutive_transpose.py b/backends/qualcomm/_passes/fuse_consecutive_transpose.py index 16ce3803076..04d96462c9f 100644 --- a/backends/qualcomm/_passes/fuse_consecutive_transpose.py +++ b/backends/qualcomm/_passes/fuse_consecutive_transpose.py @@ -55,12 +55,6 @@ def _clone_transpose( clone_permute_node.meta = n.meta users[i].replace_input_with(n, clone_permute_node) - def _is_dispensable(self, axis_order): - for index, value in enumerate(axis_order): - if index != value: - return False - return True - def _traverse(self, node): if node in self.visited or node.target not in self.op_map: return @@ -87,25 +81,22 @@ def _fuse(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule: axis_order = torch.arange(len(input_shape)).tolist() for node in self.nodes: axis_order = [axis_order[i] for i in node.args[1]] - # If axis order is just [0,1,2,3], we ignore permute node - if self._is_dispensable(axis_order): - for user in output_node.users.copy(): - user.replace_input_with(output_node, n.args[0]) - else: - with graph.inserting_after(input_node): - permute_op = exir_ops.edge.aten.permute_copy.default - permute_node = graph.create_node( - "call_function", permute_op, (input_node, axis_order) - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, permute_node) - - # copy metadata - permute_node.meta = output_node.meta - # Without "qnn_permute", we might obtain wrong input shape - if [pn.meta.get(QCOM_INSERTED_PERMUTE) for pn in self.nodes]: - permute_node.meta[QCOM_INSERTED_PERMUTE] = True + + # Reserve [0,1,2,3] permute node to ensure the next node get the right axis order. 
+ with graph.inserting_after(input_node): + permute_op = exir_ops.edge.aten.permute_copy.default + permute_node = graph.create_node( + "call_function", permute_op, (input_node, axis_order) + ) + users = output_node.users.copy() + for user in users: + user.replace_input_with(output_node, permute_node) + + # copy metadata + permute_node.meta = output_node.meta + # Without "qnn_permute", we might obtain wrong input shape + if [pn.meta.get(QCOM_INSERTED_PERMUTE) for pn in self.nodes]: + permute_node.meta[QCOM_INSERTED_PERMUTE] = True # clear current stack self.nodes = [] diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py index e822a52d1cf..31bb936f3c4 100644 --- a/backends/qualcomm/_passes/layout_transform.py +++ b/backends/qualcomm/_passes/layout_transform.py @@ -19,8 +19,6 @@ from executorch.exir.pass_base import ExportPass, PassResult from executorch.exir.sym_util import eval_shape -from .utils import dq_ops, q_ops - class LayoutTransform(ExportPass): """ @@ -41,32 +39,30 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.pixel_shuffle.default, exir_ops.edge.aten.pixel_unshuffle.default, exir_ops.edge.aten.upsample_bilinear2d.default, + exir_ops.edge.aten.upsample_bilinear2d.vec, exir_ops.edge.aten.upsample_nearest2d.default, + exir_ops.edge.aten.upsample_nearest2d.vec, } layout_agnostic_ops = { exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.cat.default, exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.constant_pad_nd.default, exir_ops.edge.aten.div.Tensor, - exir_ops.edge.aten.eq.Scalar, exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, - exir_ops.edge.aten.ge.Scalar, exir_ops.edge.aten.ge.Tensor, exir_ops.edge.aten.gelu.default, - exir_ops.edge.aten.gt.Scalar, exir_ops.edge.aten.gt.Tensor, exir_ops.edge.aten.hardswish.default, exir_ops.edge.aten.hardsigmoid.default, exir_ops.edge.aten.hardtanh.default, - exir_ops.edge.aten.leaky_relu.default, - exir_ops.edge.aten.le.Scalar, exir_ops.edge.aten.le.Tensor, exir_ops.edge.aten.linear.default, exir_ops.edge.aten.log.default, @@ -94,8 +90,6 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.topk.default, exir_ops.edge.aten._to_copy.default, exir_ops.edge.aten.where.self, - *q_ops, - *dq_ops, _operator.getitem, } @@ -120,7 +114,6 @@ def __init__( super(LayoutTransform, self).__init__() self.edge_program = edge_program self.insert_permute = insert_permute - self.qdq_opset = {*q_ops, *dq_ops} self.transformed_tag = QCOM_AXIS_ORDER def mark_as_transformed(self, node: torch.fx.Node) -> None: diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py new file mode 100644 index 00000000000..749d30f3564 --- /dev/null +++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py @@ -0,0 +1,161 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from dataclasses import dataclass +from numbers import Number +from types import BuiltinFunctionType, BuiltinMethodType +from typing import Dict + +import torch +from executorch.backends.qualcomm._passes.utils import is_float_tensor +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch import fx +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.ops import aten as aten + + +@dataclass(frozen=True) +class TensorConstant: + tensor: torch.Tensor + name: str + + +@dataclass(frozen=True) +class TensorOpInfo: + target: torch._ops.OpOverload + use_schema_args: bool + + +SCALAR_OPS = { + aten.eq.Scalar: TensorOpInfo(aten.eq.Tensor, False), + aten.ge.Scalar: TensorOpInfo(aten.ge.Tensor, False), + aten.gt.Scalar: TensorOpInfo(aten.gt.Tensor, False), + aten.le.Scalar: TensorOpInfo(aten.le.Tensor, False), + aten.lt.Scalar: TensorOpInfo(aten.lt.Tensor, False), + aten.ne.Scalar: TensorOpInfo(aten.ne.Tensor, False), + aten.add.Scalar: TensorOpInfo(aten.add.Tensor, False), + aten.add_.Scalar: TensorOpInfo(aten.add_.Tensor, False), + aten.div.Scalar: TensorOpInfo(aten.div.Tensor, False), + aten.mul.Scalar: TensorOpInfo(aten.mul.Tensor, False), + aten.rsub.Scalar: TensorOpInfo(aten.rsub.Tensor, False), + aten.sub.Scalar: TensorOpInfo(aten.sub.Tensor, False), + aten.pow.Tensor_Scalar: TensorOpInfo(aten.pow.Tensor_Tensor, False), + # The scalar arg[1] is missing when using the default overload, resulting in a corner case to handle + aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True), +} + + +SKIP_LIFT_OPS = {aten.full_like.default, aten.arange.start_step} + + +class LiftConstantScalarOperands(ExportPass): + """ + Lift constant scalars so that we can use the quantizer's observers + """ + + def __init__(self): + super(LiftConstantScalarOperands, self).__init__() + + def _build_tensor_constant( + self, gm: torch.fx.GraphModule, node: fx.Node, const_val + ) -> TensorConstant: + tensor = torch.tensor( + [const_val], + dtype=( + node.args[0].meta["val"].dtype + if not is_float_tensor(node) + else node.meta["val"].dtype + ), + device=node.meta["val"].device, + ) + name = get_new_attr_name_with_prefix("_tensor_constant_")(gm) + tensor_constant = TensorConstant(tensor, name) + return tensor_constant + + def _register_tensor( + self, gm: torch.fx.GraphModule, node: fx.Node, tensor_constant: TensorConstant + ) -> fx.Node: + gm.register_buffer(tensor_constant.name, tensor_constant.tensor) + + fake_mode = node.meta["val"].fake_mode + with gm.graph.inserting_before(node): + get_attr_node = gm.graph.get_attr(tensor_constant.name) + get_attr_node.meta["val"] = fake_mode.from_tensor(tensor_constant.tensor) + return get_attr_node + + def _update_node(self, node: fx.Node, tensor_args: Dict) -> None: + new_args = list(node.args) + if (info := SCALAR_OPS.get(node.target)) and info.use_schema_args: + new_args += [None] * max( + 0, (len(node.target._schema.arguments) - len(new_args)) + ) + + for k, v in tensor_args.items(): + new_args[k] = v + node.args = tuple(new_args) + node.target = SCALAR_OPS.get(node.target, node).target + + def _create_tensor_args( + self, node: fx.Node, gm: torch.fx.graph_module + ) -> Dict[int, TensorConstant]: + tensor_args = {} + for i, arg in enumerate(node.args): + schema = node.target._schema.arguments[i] + is_tensor_arg_got_num = isinstance( + schema.type, torch.TensorType + ) and isinstance(arg, Number) + + is_scalar_arg = ( + isinstance(schema.type, torch.NumberType) and node.target
in SCALAR_OPS + ) + + # This warns about newly encountered ops that might need lifting + is_arg_num_type = ( + isinstance(schema.type, torch.NumberType) + and node.target not in SCALAR_OPS + ) + + if is_tensor_arg_got_num or is_scalar_arg: + tensor_constant = self._build_tensor_constant(gm, node, arg) + tensor_constant_node = self._register_tensor(gm, node, tensor_constant) + tensor_args[i] = tensor_constant_node + + elif is_arg_num_type: + print( + f"[WARNING] the {i}-th arg of node {node} is NumberType and might need to be lifted" + ) + + if (info := SCALAR_OPS.get(node.target)) and info.use_schema_args: + schema_args = list(node.target._schema.arguments) + for i, sa in enumerate(schema_args): + if isinstance(sa.type, torch.NumberType) and i not in tensor_args: + tensor_constant = self._build_tensor_constant( + gm, node, sa.default_value + ) + tensor_constant_node = self._register_tensor( + gm, node, tensor_constant + ) + tensor_args[i] = tensor_constant_node + return tensor_args + + def _lift(self, gm: torch.fx.GraphModule) -> None: + for n in gm.graph.nodes: + if ( + n.op != "call_function" + or isinstance(n.target, (BuiltinMethodType, BuiltinFunctionType)) + or n.target in SKIP_LIFT_OPS + ): + continue + + if tensor_args := self._create_tensor_args(n, gm): + self._update_node(n, tensor_args) + + def call(self, graph_module: torch.fx.GraphModule): + self._lift(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/convert_prelu.py b/backends/qualcomm/_passes/recompose_prelu.py similarity index 64% rename from backends/qualcomm/_passes/convert_prelu.py rename to backends/qualcomm/_passes/recompose_prelu.py index 6e2cd677781..082b9c83b27 100644 --- a/backends/qualcomm/_passes/convert_prelu.py +++ b/backends/qualcomm/_passes/recompose_prelu.py @@ -3,35 +3,48 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import List + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -class ConvertPReLU(ExportPass): +class RecomposePReLU(ExportPass): """ Merge decomposed operators from prelu back to one super node.
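The effect of LiftConstantScalarOperands above can be seen on a one-op module. A hedged sketch; the before/after graph shapes are inferred from the pass itself, not reproduced from its tests:

import torch


class Gate(torch.nn.Module):
    def forward(self, x):
        return x > 0.5  # typically captured as aten.gt.Scalar


gm = torch.export.export(Gate(), (torch.randn(4),), strict=True).module()
# Running LiftConstantScalarOperands on such a graph registers the 0.5 as a
# _tensor_constant_* buffer and retargets the node to aten.gt.Tensor, so the
# quantizer's observers can annotate the constant.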
""" def __init__(self, edge_program: torch.export.ExportedProgram): - super(ConvertPReLU, self).__init__() + super(RecomposePReLU, self).__init__() self.edge_program = edge_program + def _get_coeff_node(self, nodes: List[torch.fx.Node]): + for node in nodes: + if node.target == exir_ops.edge.aten.view_copy.default: + return node.args[0] + + def _get_input_node(self, nodes: List[torch.fx.Node], coeff_node): + return [n for n in nodes if n != coeff_node][0] + def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph - partitions = get_source_partitions(graph, [torch.nn.PReLU]) + partitions = get_source_partitions(graph, [torch.nn.PReLU, torch.nn.LeakyReLU]) for _, src_partitions in partitions.items(): for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] + # somehow op might not be decomposed, skip it + if len(src_partition.nodes) == 1: + continue + + coeff_node = self._get_coeff_node(src_partition.nodes) + input_node = self._get_input_node(src_partition.input_nodes, coeff_node) output_node = src_partition.output_nodes[0] - placeholders = [n for n in src_partition.nodes if n.op == "placeholder"] - assert len(placeholders) == 1 - with graph.inserting_after(input_node): + with graph.inserting_before(output_node): prelu_op = exir_ops.edge.aten.prelu.default prelu_node = graph.create_node( - "call_function", prelu_op, (input_node, placeholders[0]) + "call_function", prelu_op, (input_node, coeff_node) ) users = output_node.users.copy() for user in users: diff --git a/backends/qualcomm/_passes/recompose_rms_norm.py b/backends/qualcomm/_passes/recompose_rms_norm.py index bfaddfc47b5..77feecf9c1f 100644 --- a/backends/qualcomm/_passes/recompose_rms_norm.py +++ b/backends/qualcomm/_passes/recompose_rms_norm.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.qualcomm.builders.utils import get_parameter, is_parameter from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.passes.utils.source_matcher_utils import get_source_partitions @@ -16,8 +17,9 @@ class RecomposeRmsNorm(ExportPass): Merge decomposed operators back to one super node. 
""" - def __init__(self): - super().__init__() + def __init__(self, edge_program: torch.export.ExportedProgram): + super(RecomposeRmsNorm, self).__init__() + self.edge_program = edge_program def _get_eps_node(self, nodes): # eps: one of inputs of add node @@ -47,11 +49,15 @@ def call(self, graph_module: torch.fx.GraphModule): input_node = inp_0 if len(inp_0.users) == 2 else inp_1 else: raise RuntimeError( - f"Found a edge case of rms_node partitoin {src_partition}, which has {input_len} inputs" + f"Found a edge case of rms_node partition {src_partition}, which has {input_len} inputs" ) output_node = src_partition.output_nodes[0] - eps_node = self._get_eps_node(src_partition.nodes) + eps = self._get_eps_node(src_partition.nodes) + if isinstance(eps, torch.fx.Node) and is_parameter( + eps, self.edge_program + ): + eps = get_parameter(eps, self.edge_program).item() gamma_node = self._get_gamma_node(output_node) with graph.inserting_before(output_node): @@ -64,7 +70,7 @@ def call(self, graph_module: torch.fx.GraphModule): input_node, list(gamma_node.meta["val"].shape), gamma_node, - eps_node, + eps, ), ) users = output_node.users.copy() diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index febea6959db..23dfb569a8f 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -6,8 +6,9 @@ import torch from executorch.backends.qualcomm.builders.utils import get_parameter -from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING +from executorch.backends.qualcomm.utils.constants import QCOM_DTYPE, QCOM_ENCODING from executorch.exir.dialects._ops import ops as exir_ops +from torch._subclasses import FakeTensor q_ops = { @@ -41,6 +42,10 @@ def get_quant_attrs( value = get_parameter(attr_n, edge_program) quant_attrs[quant_attr_keys[i - 1]] = value + # remap key for compatibility - block quantization only + if dtype := quant_attrs.get("input_dtype", None): + quant_attrs[QCOM_DTYPE] = dtype + quant_attrs[QCOM_ENCODING] = quant_node.target return quant_attrs @@ -57,13 +62,10 @@ def get_passes_dependency_for_capture_program(): dict: A dictionary mapping each pass to its corresponding list of dependencies. 
""" from executorch.backends.qualcomm._passes import ( - AnnotateAndQuantScalar, AnnotateDecomposed, AnnotateQuantAttrs, ConstantI64toI32, ConvertBmmToMatmul, - ConvertInterpolateWithUpsample2D, - ConvertPReLU, ConvertToLinear, DecomposeAny, DecomposeLinalgVectorNorm, @@ -71,6 +73,7 @@ def get_passes_dependency_for_capture_program(): FoldQDQ, LayoutTransform, RecomposePixelUnshuffle, + RecomposePReLU, RecomposeRmsNorm, RemoveRedundancy, ReplaceIndexPutInput, @@ -78,34 +81,34 @@ def get_passes_dependency_for_capture_program(): ) return { - AnnotateAndQuantScalar: [ - AnnotateQuantAttrs, - ], AnnotateDecomposed: [RemoveRedundancy], AnnotateQuantAttrs: [ RecomposePixelUnshuffle, RecomposeRmsNorm, ConvertToLinear, - ConvertPReLU, + RecomposePReLU, ConvertBmmToMatmul, - ConvertInterpolateWithUpsample2D, ], - ConstantI64toI32: [ConvertInterpolateWithUpsample2D], + ConstantI64toI32: [RemoveRedundancy], ConvertBmmToMatmul: [ConvertToLinear], - ConvertInterpolateWithUpsample2D: [RemoveRedundancy], - ConvertPReLU: [RemoveRedundancy], ConvertToLinear: [RecomposePixelUnshuffle], DecomposeAny: [RemoveRedundancy], DecomposeLinalgVectorNorm: [RemoveRedundancy], ExpandBroadcastTensorShape: [RemoveRedundancy], - FoldQDQ: [AnnotateQuantAttrs, AnnotateAndQuantScalar, AnnotateDecomposed], + FoldQDQ: [AnnotateQuantAttrs, AnnotateDecomposed], LayoutTransform: [ AnnotateQuantAttrs, - AnnotateAndQuantScalar, ExpandBroadcastTensorShape, ], RecomposePixelUnshuffle: [RemoveRedundancy], + RecomposePReLU: [RemoveRedundancy], RecomposeRmsNorm: [RemoveRedundancy], ReplaceIndexPutInput: [LayoutTransform], TensorI64toI32: [RemoveRedundancy], } + + +def is_float_tensor(node: torch.fx.Node) -> bool: + if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): + return False + return node.meta["val"].dtype == torch.float32 diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs index f38b57d0665..82e56c405cc 100755 --- a/backends/qualcomm/aot/ir/qcir.fbs +++ b/backends/qualcomm/aot/ir/qcir.fbs @@ -54,6 +54,13 @@ enum QuantizeType : byte { AXIS_SCALE_OFFSET, BW_SCALE_OFFSET, BW_AXIS_SCALE_OFFSET, + BLOCKWISE_EXPANSION, + UNDEFINED, +} + +enum BlockScaleStorageType: byte { + BITWIDTH_SCALE_STORAGE_8 = 0, + BITWIDTH_SCALE_STORAGE_16, UNDEFINED, } @@ -72,6 +79,10 @@ table QuantizeParam { offsets: [int]; // used by general quantization data: [ScaleOffset]; + // used by block quantization + num_blocks_per_axis: uint; + block_scale_storage_type: BlockScaleStorageType; + block_scale: [ubyte]; } table Tensor { diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index bff8655a1eb..de9e349abe7 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -118,17 +118,22 @@ flatbuffers::Offset ToQuantizeParam( qcir::QuantizeType::BW_SCALE_OFFSET}, {QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET, qcir::QuantizeType::BW_AXIS_SCALE_OFFSET}, + {QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION, + qcir::QuantizeType::BLOCKWISE_EXPANSION}, {QNN_QUANTIZATION_ENCODING_UNDEFINED, qcir::QuantizeType::UNDEFINED}, }; int32_t axis = 0; - uint32_t bitwidth = 0; + uint32_t bitwidth = 0, num_blocks_per_axis = 0; auto param = QNN_TENSOR_VER_PTR(tensor)->quantizeParams; auto quant_type = type_map.at(param.quantizationEncoding); std::vector data; + std::vector block_scale; std::vector scales; std::vector offsets; + qcir::BlockScaleStorageType block_scale_storage_type = + qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_8; switch 
(quant_type) { case qcir::QuantizeType::SCALE_OFFSET: { data.emplace_back(qcir::ScaleOffset( @@ -160,6 +165,28 @@ flatbuffers::Offset ToQuantizeParam( offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]); } } break; + case qcir::QuantizeType::BLOCKWISE_EXPANSION: { + bitwidth = param.blockwiseExpansion->blockScaleBitwidth; + axis = param.blockwiseExpansion->axis; + uint num_channels = QNN_TENSOR_VER_PTR(tensor)->dimensions[axis]; + for (uint i = 0; i < num_channels; ++i) { + data.emplace_back(qcir::ScaleOffset( + param.blockwiseExpansion->scaleOffsets[i].scale, + param.blockwiseExpansion->scaleOffsets[i].offset)); + } + num_blocks_per_axis = param.blockwiseExpansion->numBlocksPerAxis; + uint multiplier = 1; + if (param.blockwiseExpansion->blockScaleStorageType == + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16) { + multiplier = 2; + block_scale_storage_type = + qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_16; + } + uint total_bytes = num_channels * num_blocks_per_axis * multiplier; + block_scale = std::vector( + param.blockwiseExpansion->blocksScale8, + param.blockwiseExpansion->blocksScale8 + total_bytes); + } break; default: // encodings are not required if lowering with floating point precision break; @@ -172,7 +199,10 @@ flatbuffers::Offset ToQuantizeParam( axis, &scales, &offsets, - &data); + &data, + num_blocks_per_axis, + block_scale_storage_type, + &block_scale); } Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { @@ -192,9 +222,14 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET}, {qcir::QuantizeType::BW_AXIS_SCALE_OFFSET, QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET}, + {qcir::QuantizeType::BLOCKWISE_EXPANSION, + QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION}, {qcir::QuantizeType::UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED}, }; + // Qnn_BlockwiseExpansion_t is a pointer type in Qnn_QuantizeParams_t + // need a bookkeeper for guarding life cycle + static std::vector> block_param; Qnn_QuantizeParams_t p = QNN_QUANTIZE_PARAMS_INIT; auto param = tensor->qparam(); @@ -226,6 +261,30 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { p.bwAxisScaleOffsetEncoding.offsets = const_cast(param->offsets()->data()); } break; + case QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION: { + block_param.emplace_back(std::make_unique()); + p.blockwiseExpansion = block_param.back().get(); + p.blockwiseExpansion->axis = param->axis(); + p.blockwiseExpansion->scaleOffsets = reinterpret_cast( + const_cast(param->data()->Data())); + p.blockwiseExpansion->numBlocksPerAxis = param->num_blocks_per_axis(); + switch (param->block_scale_storage_type()) { + case qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_8: + p.blockwiseExpansion->blockScaleStorageType = + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8; + break; + case qcir::BlockScaleStorageType::BITWIDTH_SCALE_STORAGE_16: + p.blockwiseExpansion->blockScaleStorageType = + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16; + break; + default: + p.blockwiseExpansion->blockScaleStorageType = + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED; + break; + } + p.blockwiseExpansion->blocksScale8 = + const_cast(param->block_scale()->Data()); + } break; default: // encodings are not required if lowering with floating point precision break; diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp index 2f3a094b3f8..39f1f3ee48f 100644 --- 
a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp @@ -59,6 +59,28 @@ std::unique_ptr CreateQuantizationParamWrapper( int32_t offset = quant_info["offset"].cast(); quantize_param_wrapper = std::make_unique(scale, offset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) { + int32_t axis = quant_info["axis"].cast(); + std::vector scale_offset = + quant_info["block_scale_offset"].cast>(); + uint32_t num_blocks_per_axis = + quant_info["num_blocks_per_axis"].cast(); + uint32_t block_scale_bitwidth = + quant_info["block_scale_bitwidth"].cast(); + Qnn_BlockwiseExpansionBlockScaleStorageType_t block_storage_type = + quant_info["block_storage_type"] + .cast(); + std::vector buf = + quant_info["block_scales"].cast>(); + quantize_param_wrapper = + std::make_unique( + axis, + scale_offset, + num_blocks_per_axis, + block_scale_bitwidth, + block_storage_type, + buf.data(), + buf.size()); } else { QNN_EXECUTORCH_LOG_ERROR( "Unknown the encoding of quantization: %d", encoding); @@ -179,9 +201,6 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) { .export_values(); py::enum_(m, "Qnn_QuantizationEncoding_t") - .value( - "QNN_QUANTIZATION_ENCODING_UNDEFINED", - Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_UNDEFINED) .value( "QNN_QUANTIZATION_ENCODING_SCALE_OFFSET", Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_SCALE_OFFSET) @@ -196,6 +215,29 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) { "QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET", Qnn_QuantizationEncoding_t:: QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) + .value( + "QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION", + Qnn_QuantizationEncoding_t:: + QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) + .value( + "QNN_QUANTIZATION_ENCODING_UNDEFINED", + Qnn_QuantizationEncoding_t::QNN_QUANTIZATION_ENCODING_UNDEFINED) + .export_values(); + + py::enum_( + m, "Qnn_BlockwiseExpansionBlockScaleStorageType_t") + .value( + "QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8", + Qnn_BlockwiseExpansionBlockScaleStorageType_t:: + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8) + .value( + "QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16", + Qnn_BlockwiseExpansionBlockScaleStorageType_t:: + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16) + .value( + "QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED", + Qnn_BlockwiseExpansionBlockScaleStorageType_t:: + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_UNDEFINED) .export_values(); py::class_>(m, "OpWrapper") @@ -476,7 +518,6 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) { return std::vector( aso.scaleOffset, aso.scaleOffset + aso.numScaleOffsets); }); - // op_wrapper.GetParams() get std::vector } } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index e1f5a6a8fc5..f29c02aa593 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -33,10 +33,10 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ - "pybind11", "libtorch_python", ], use_static_deps = True, @@ -66,10 +66,10 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", 
"//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ - "pybind11", "libtorch_python", ], use_static_deps = True, @@ -93,9 +93,7 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", + "fbsource//third-party/pybind11:pybind11", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], - external_deps = [ - "pybind11", - ], ) diff --git a/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.cpp b/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.cpp index cab6390d73e..81b6d04855c 100644 --- a/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.cpp @@ -6,13 +6,16 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include + namespace executorch { namespace backends { namespace qnn { std::unique_ptr CreateQuantizationParamWrapper( - const Qnn_QuantizeParams_t& quantization) { + const Qnn_Tensor_t& tensor) { std::unique_ptr quantize_param_wrapper; + auto& quantization = QNN_TENSOR_VER_PTR(tensor)->quantizeParams; if (quantization.quantizationEncoding == QNN_QUANTIZATION_ENCODING_UNDEFINED) { @@ -60,6 +63,29 @@ std::unique_ptr CreateQuantizationParamWrapper( quantize_param_wrapper = std::make_unique( quantization.scaleOffsetEncoding.scale, quantization.scaleOffsetEncoding.offset); + } else if ( + quantization.quantizationEncoding == + QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION) { + int ch_axis = quantization.blockwiseExpansion->axis; + int ele_sz = quantization.blockwiseExpansion->blockScaleStorageType == + QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_16 + ? 
2 + : 1; + size_t block_scales_sz = quantization.blockwiseExpansion->numBlocksPerAxis * + QNN_TENSOR_VER_PTR(tensor)->dimensions[ch_axis] * ele_sz; + std::vector scale_offsets( + quantization.blockwiseExpansion->scaleOffsets, + quantization.blockwiseExpansion->scaleOffsets + + QNN_TENSOR_VER_PTR(tensor)->dimensions[ch_axis]); + quantize_param_wrapper = + std::make_unique( + quantization.blockwiseExpansion->axis, + scale_offsets, + quantization.blockwiseExpansion->numBlocksPerAxis, + quantization.blockwiseExpansion->blockScaleBitwidth, + quantization.blockwiseExpansion->blockScaleStorageType, + quantization.blockwiseExpansion->blocksScale8, + block_scales_sz); } else { QNN_EXECUTORCH_LOG_ERROR( "Unknown the encoding of quantization: %d", diff --git a/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h b/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h index 88a90ca816e..c2532c96388 100644 --- a/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h +++ b/backends/qualcomm/aot/wrappers/QuantizeParamsWrapper.h @@ -281,9 +281,79 @@ class AxisScaleOffsetQuantizeParamsWrapper final std::vector scale_offsets_; }; +class BlockwiseExpansionQuantizeParamsWrapper final + : public QuantizeParamsWrapper { + public: + explicit BlockwiseExpansionQuantizeParamsWrapper( + std::int32_t axis, + const std::vector& scale_offsets, + std::uint32_t num_blocks_per_axis, + std::uint32_t block_scale_bitwidth, + Qnn_BlockwiseExpansionBlockScaleStorageType_t storage_type, + const uint8_t* block_scales_ptr, + std::uint32_t block_scales_size) + : QuantizeParamsWrapper( + QNN_DEFINITION_DEFINED, + QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION), + axis_(axis), + scale_offsets_(scale_offsets), + num_blocks_per_axis_(num_blocks_per_axis), + block_scale_bitwidth_(block_scale_bitwidth), + block_storage_type_(storage_type), + block_scales_(block_scales_ptr, block_scales_ptr + block_scales_size) {} + + BlockwiseExpansionQuantizeParamsWrapper( + const BlockwiseExpansionQuantizeParamsWrapper& rhs) + : QuantizeParamsWrapper( + rhs.GetEncodingDefinition(), + rhs.GetQuantizationEncoding()), + axis_(rhs.axis_), + scale_offsets_(rhs.scale_offsets_), + num_blocks_per_axis_(rhs.num_blocks_per_axis_), + block_scale_bitwidth_(rhs.block_scale_bitwidth_), + block_storage_type_(rhs.block_storage_type_), + block_scales_(rhs.block_scales_) {} + + BlockwiseExpansionQuantizeParamsWrapper( + BlockwiseExpansionQuantizeParamsWrapper&& rhs) = delete; + BlockwiseExpansionQuantizeParamsWrapper& operator=( + const BlockwiseExpansionQuantizeParamsWrapper& rhs) = delete; + BlockwiseExpansionQuantizeParamsWrapper& operator=( + BlockwiseExpansionQuantizeParamsWrapper&& rhs) = delete; + + ~BlockwiseExpansionQuantizeParamsWrapper() override = default; + + std::unique_ptr Clone() override { + return std::make_unique(*this); + } + + Qnn_QuantizeParams_t CreateQuantizeParams() override { + Qnn_QuantizeParams_t rval; + rval.encodingDefinition = GetEncodingDefinition(); + rval.quantizationEncoding = GetQuantizationEncoding(); + block_expansion_.axis = axis_; + block_expansion_.scaleOffsets = scale_offsets_.data(); + block_expansion_.numBlocksPerAxis = num_blocks_per_axis_; + block_expansion_.blockScaleBitwidth = block_scale_bitwidth_; + block_expansion_.blockScaleStorageType = block_storage_type_; + block_expansion_.blocksScale8 = block_scales_.data(); + rval.blockwiseExpansion = &block_expansion_; + return rval; + } + + private: + std::int32_t axis_; + std::vector scale_offsets_; + std::uint32_t num_blocks_per_axis_; + std::uint32_t 
block_scale_bitwidth_; + Qnn_BlockwiseExpansionBlockScaleStorageType_t block_storage_type_; + std::vector block_scales_; + Qnn_BlockwiseExpansion_t block_expansion_; +}; + // Factory function to create quantization param wrapper from QnnQuantization std::unique_ptr CreateQuantizationParamWrapper( - const Qnn_QuantizeParams_t& quantization); + const Qnn_Tensor_t& tensor); } // namespace qnn } // namespace backends } // namespace executorch diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index a0a61022094..17d76aac412 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -221,8 +221,7 @@ std::shared_ptr CreateTensorWrapper(const Qnn_Tensor_t& tensor) { std::string(QNN_TENSOR_VER_PTR(tensor)->name), QNN_TENSOR_VER_PTR(tensor)->type, QNN_TENSOR_VER_PTR(tensor)->dataType, - CreateQuantizationParamWrapper( - QNN_TENSOR_VER_PTR(tensor)->quantizeParams), + CreateQuantizationParamWrapper(tensor), QNN_TENSOR_VER_PTR(tensor)->rank, QNN_TENSOR_VER_PTR(tensor)->dimensions, tensor.version == QNN_TENSOR_VERSION_2 diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index a16d4fb5057..c5352a7fbee 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -52,6 +52,7 @@ op_mul, op_ne, op_neg, + op_or, op_pad, op_pow, op_prelu, @@ -131,6 +132,7 @@ op_mul, op_neg, op_ne, + op_or, op_pad, op_pow, op_prelu, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index f450811ab70..7965a30caea 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -15,8 +15,14 @@ QCOM_AXIS, QCOM_AXIS_ORDER, QCOM_BITWIDTH, + QCOM_BLOCK_SCALE_BITWIDTH, + QCOM_BLOCK_SCALE_OFFSET, + QCOM_BLOCK_SCALES, + QCOM_BLOCK_SIZE, + QCOM_BLOCK_STORAGE_TYPE, QCOM_DTYPE, QCOM_ENCODING, + QCOM_NUM_BLOCKS_PER_AXIS, QCOM_OFFSET, QCOM_QUANT_ATTRS, QCOM_QUANT_MAX, @@ -106,10 +112,56 @@ def _get_tensor(node, index): return node.meta["val"] tensor = _get_tensor(input_node, idx) - if len(tensor.shape) != 0 and QCOM_AXIS_ORDER in op_node.meta: + if len(tensor.shape) > 1 and QCOM_AXIS_ORDER in op_node.meta: tensor = tensor.permute(dims=op_node.meta[QCOM_AXIS_ORDER]).contiguous() return tensor + def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict): + import math + + quant_config = copy.deepcopy(quant_attrs) + scales, scale_offset, quantized_scales = quant_attrs[QCOM_SCALE], [], [] + # channel in observers defaults to zero + num_channels = node.meta["val"].shape[0] + # TODO: expand this when QNN starts to support more configurations + bitwidth_of_scale = 4 + quant_scales_dtype = torch.uint8 + num_steps = 2**bitwidth_of_scale + scale_storage_type = ( + PyQnnWrapper.Qnn_BlockwiseExpansionBlockScaleStorageType_t.QNN_BLOCKWISE_EXPANSION_BITWIDTH_SCALE_STORAGE_8 + ) + + for ch in range(num_channels): + max_scale = scales[ch].reshape(1, -1).amax(dim=-1) / num_steps + q_scales = torch.clamp( + input=scales[ch] / max_scale, + min=torch.iinfo(quant_scales_dtype).min, + max=torch.iinfo(quant_scales_dtype).max, + ).to(quant_scales_dtype) + quantized_scales.append(torch.where(q_scales == 0, 1, q_scales)) + # symmetric quantization is required + scale_offset.append(PyQnnWrapper.Qnn_ScaleOffset_t(max_scale, 0)) + + if "convolution" in list(node.users)[0].target.__name__: + # OIHW (pytorch) -> HWIO (QNN) + quant_config[QCOM_AXIS] = 3 + 
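# Why axis 3: the per-block scales follow the output channels, which sit
+            # at dim 0 of PyTorch's OIHW weight but land at dim 3 once the weight
+            # is permuted to the HWIO layout QNN expects (order set just below).
+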
quant_config[QCOM_AXIS_ORDER] = (2, 3, 1, 0) + else: + raise AttributeError("undetermined axis for block quantization") + + quant_config[QCOM_NUM_BLOCKS_PER_AXIS] = quantized_scales[0].shape.numel() + quant_config[QCOM_BLOCK_SCALE_OFFSET] = scale_offset + quant_config[QCOM_BLOCK_SCALES] = torch.cat(quantized_scales).detach().numpy() + # e.g. if use 16 bit for quantized scales, we need to expand 16 - 4 = 12 bits + quant_config[QCOM_BLOCK_SCALE_BITWIDTH] = ( + int(math.log2(torch.iinfo(quant_scales_dtype).max + 1)) - bitwidth_of_scale + ) + quant_config[QCOM_BLOCK_STORAGE_TYPE] = scale_storage_type + return ( + PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION, + quant_config, + ) + def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): quant_config = copy.deepcopy(quant_attrs) @@ -188,6 +240,14 @@ def get_quant_encoding_conf( and target_node.name in node.meta[QCOM_REQUANTIZE] else node.meta[QCOM_QUANT_ATTRS] ) + # TODO: refactor this when target could be correctly detected + per_block_encoding = { + exir_ops.edge.pt2e_quant.quantize_affine.default, + exir_ops.edge.pt2e_quant.dequantize_affine.default, + } + if quant_attrs[QCOM_ENCODING] in per_block_encoding: + return self.make_qnn_per_block_config(node, quant_attrs) + if quant_attrs[QCOM_ENCODING] in PER_CHANNEL_ENCODING: return self.make_qnn_per_channel_config(node, quant_attrs) @@ -196,16 +256,33 @@ def get_quant_encoding_conf( def get_quant_tensor_value( self, tensor: torch.Tensor, quant_attrs: Dict, quant_configs: Dict ) -> torch.Tensor: + dtype = quant_configs[QCOM_DTYPE] if quant_attrs[QCOM_ENCODING] in PER_TENSOR_ENCODING: scale = quant_attrs[QCOM_SCALE] zero_point = quant_attrs[QCOM_ZERO_POINT] - else: # per channel case + tensor = tensor.div(scale).add(zero_point).round().to(dtype) + elif quant_attrs[QCOM_ENCODING] in PER_CHANNEL_ENCODING: scale = quant_attrs[QCOM_SCALES] zero_point = quant_attrs[QCOM_ZERO_POINTS] + tensor = tensor.div(scale).add(zero_point).round().to(dtype) + else: # per_block + if axis_order := quant_configs.get(QCOM_AXIS_ORDER, None): + origin_order = tuple( + axis_order.index(x) for x in range(len(axis_order)) + ) + tensor = tensor.permute(origin_order) + tensor = torch.ops.pt2e_quant.quantize_affine( + tensor, + block_size=quant_attrs[QCOM_BLOCK_SIZE], + scale=quant_attrs[QCOM_SCALE], + zero_point=quant_attrs[QCOM_ZERO_POINT], + output_dtype=dtype, + quant_min=quant_attrs[QCOM_QUANT_MIN], + quant_max=quant_attrs[QCOM_QUANT_MAX], + ) + if axis_order: + tensor = tensor.permute(axis_order) - dtype = quant_configs[QCOM_DTYPE] - - tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_eq.py b/backends/qualcomm/builders/op_eq.py index ac682c3c1e2..855c5e13be6 100644 --- a/backends/qualcomm/builders/op_eq.py +++ b/backends/qualcomm/builders/op_eq.py @@ -8,14 +8,6 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWiseEqual, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -23,7 +15,7 @@ @register_node_visitor class Equal(NodeVisitor): - 
target = ["aten.eq.Tensor", "aten.eq.Scalar"] + target = ["aten.eq.Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -46,37 +38,8 @@ def define_node( input_tensors = [] for index in range(2): input_node = node.args[index] - if isinstance(input_node, torch.fx.Node): - input_tensor = self.get_tensor(input_node, node) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE - else: - scalar = input_node - input_tensor = torch.tensor(scalar, dtype=torch.float32) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - input_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - # Because the output data type of the eq node is boolean. - # We need to take the quant attr from the non-scalar node. - if quant_attrs := node.args[index ^ 1].meta.get(QCOM_QUANT_ATTRS): - quant_attrs = quant_attrs.copy() - quant_range = ( - quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - ) - quant_attrs[QCOM_ZERO_POINT] = ( - 0 if scalar >= 0 else quant_attrs[QCOM_QUANT_MAX] - ) - quant_attrs[QCOM_SCALE] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE input_tensor_wrapper = self.define_tensor( input_node, diff --git a/backends/qualcomm/builders/op_ge.py b/backends/qualcomm/builders/op_ge.py index 552cab659cc..6784167aa5b 100644 --- a/backends/qualcomm/builders/op_ge.py +++ b/backends/qualcomm/builders/op_ge.py @@ -8,14 +8,6 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWiseGreaterEqual, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -23,7 +15,7 @@ @register_node_visitor class GreaterEqual(NodeVisitor): - target = ["aten.ge.Tensor", "aten.ge.Scalar"] + target = ["aten.ge.Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -46,37 +38,8 @@ def define_node( input_tensors = [] for index in range(2): input_node = node.args[index] - if isinstance(input_node, torch.fx.Node): - input_tensor = self.get_tensor(input_node, node) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE - else: - scalar = input_node - input_tensor = torch.tensor(scalar, dtype=torch.float32) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - input_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - # Because the output data type of the ge node is boolean. - # We need to take the quant attr from the non-scalar node. 
- if quant_attrs := node.args[index ^ 1].meta.get(QCOM_QUANT_ATTRS): - quant_attrs = quant_attrs.copy() - quant_range = ( - quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - ) - quant_attrs[QCOM_ZERO_POINT] = ( - 0 if scalar >= 0 else quant_attrs[QCOM_QUANT_MAX] - ) - quant_attrs[QCOM_SCALE] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE input_tensor_wrapper = self.define_tensor( input_node, diff --git a/backends/qualcomm/builders/op_group_norm.py b/backends/qualcomm/builders/op_group_norm.py index d498b202d71..26700216b53 100644 --- a/backends/qualcomm/builders/op_group_norm.py +++ b/backends/qualcomm/builders/op_group_norm.py @@ -10,6 +10,7 @@ import numpy as np import torch +from executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpGroupNorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -81,12 +82,12 @@ def define_node( group_norm_op.AddScalarParam( OpGroupNorm.param_epsilon, PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, - {"data": np.float32(epsilon)}, + {QCOM_DATA: np.float32(epsilon)}, ) group_norm_op.AddScalarParam( OpGroupNorm.param_group, PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - {"data": np.uint32(group)}, + {QCOM_DATA: np.uint32(group)}, ) return group_norm_op diff --git a/backends/qualcomm/builders/op_gt.py b/backends/qualcomm/builders/op_gt.py index 443017b7b0d..6c311f42b7f 100644 --- a/backends/qualcomm/builders/op_gt.py +++ b/backends/qualcomm/builders/op_gt.py @@ -8,14 +8,6 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWiseGreater, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -23,7 +15,7 @@ @register_node_visitor class GreaterThan(NodeVisitor): - target = ["aten.gt.Tensor", "aten.gt.Scalar"] + target = ["aten.gt.Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -46,37 +38,8 @@ def define_node( input_tensors = [] for index in range(2): input_node = node.args[index] - if isinstance(input_node, torch.fx.Node): - input_tensor = self.get_tensor(input_node, node) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE - else: - scalar = input_node - input_tensor = torch.tensor(scalar, dtype=torch.float32) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - input_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - # Because the output data type of the gt node is boolean. - # We need to take the quant attr from the non-scalar node. 
- if quant_attrs := node.args[index ^ 1].meta.get(QCOM_QUANT_ATTRS): - quant_attrs = quant_attrs.copy() - quant_range = ( - quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - ) - quant_attrs[QCOM_ZERO_POINT] = ( - 0 if scalar >= 0 else quant_attrs[QCOM_QUANT_MAX] - ) - quant_attrs[QCOM_SCALE] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE input_tensor_wrapper = self.define_tensor( input_node, diff --git a/backends/qualcomm/builders/op_index.py b/backends/qualcomm/builders/op_index.py index 4ddab23aeae..ff039f9d7a8 100644 --- a/backends/qualcomm/builders/op_index.py +++ b/backends/qualcomm/builders/op_index.py @@ -9,6 +9,7 @@ import numpy as np import torch +from executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpGather, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -37,11 +38,11 @@ def define_node( nodes_to_wrappers, ) - if len(node.args[1]) > 1: - # TODO consider to implement it in a recursive way. - raise NotImplementedError("Not support tuple of tensor.") - - indices_node = node.args[1][0] + # e.g. x[:, index]: + # > node.args[1] = [None, indices] + # > axis = 1 + axis = len(node.args[1]) - 1 + indices_node = node.args[1][axis] indices_tensor = self.get_tensor(indices_node, node).to(torch.int32) assert indices_tensor.size(0) != 0, "Not support empty indices list" @@ -77,7 +78,7 @@ def define_node( gather_op.AddScalarParam( OpGather.param_axis, PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32, - {"data": np.int32(0)}, + {QCOM_DATA: np.int32(axis)}, ) return gather_op diff --git a/backends/qualcomm/builders/op_le.py b/backends/qualcomm/builders/op_le.py index d057c04708a..1dd2a06b777 100644 --- a/backends/qualcomm/builders/op_le.py +++ b/backends/qualcomm/builders/op_le.py @@ -8,14 +8,6 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWiseLessEqual, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -23,7 +15,7 @@ @register_node_visitor class LessEqual(NodeVisitor): - target = ["aten.le.Tensor", "aten.le.Scalar"] + target = ["aten.le.Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -46,37 +38,8 @@ def define_node( input_tensors = [] for index in range(2): input_node = node.args[index] - if isinstance(input_node, torch.fx.Node): - input_tensor = self.get_tensor(input_node, node) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE - else: - scalar = input_node - input_tensor = torch.tensor(scalar, dtype=torch.float32) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - input_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - # Because the output data type of the le node is boolean. - # We need to take the quant attr from the non-scalar node. 
- if quant_attrs := node.args[index ^ 1].meta.get(QCOM_QUANT_ATTRS): - quant_attrs = quant_attrs.copy() - quant_range = ( - quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - ) - quant_attrs[QCOM_ZERO_POINT] = ( - 0 if scalar >= 0 else quant_attrs[QCOM_QUANT_MAX] - ) - quant_attrs[QCOM_SCALE] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE input_tensor_wrapper = self.define_tensor( input_node, diff --git a/backends/qualcomm/builders/op_lt.py b/backends/qualcomm/builders/op_lt.py index 6275478254e..b4a080efc38 100644 --- a/backends/qualcomm/builders/op_lt.py +++ b/backends/qualcomm/builders/op_lt.py @@ -8,14 +8,6 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWiseLess, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -23,7 +15,7 @@ @register_node_visitor class LessThan(NodeVisitor): - target = ["aten.lt.Tensor", "aten.lt.Scalar"] + target = ["aten.lt.Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -46,37 +38,8 @@ def define_node( input_tensors = [] for index in range(2): input_node = node.args[index] - if isinstance(input_node, torch.fx.Node): - input_tensor = self.get_tensor(input_node, node) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE - else: - scalar = input_node - input_tensor = torch.tensor(scalar, dtype=torch.float32) - tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - input_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - # Because the output data type of the lt node is boolean. - # We need to take the quant attr from the non-scalar node. - if quant_attrs := node.args[index ^ 1].meta.get(QCOM_QUANT_ATTRS): - quant_attrs = quant_attrs.copy() - quant_range = ( - quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - ) - quant_attrs[QCOM_ZERO_POINT] = ( - 0 if scalar >= 0 else quant_attrs[QCOM_QUANT_MAX] - ) - quant_attrs[QCOM_SCALE] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE input_tensor_wrapper = self.define_tensor( input_node, diff --git a/backends/qualcomm/builders/op_or.py b/backends/qualcomm/builders/op_or.py new file mode 100644 index 00000000000..c2751744788 --- /dev/null +++ b/backends/qualcomm/builders/op_or.py @@ -0,0 +1,59 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
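The comparison builders above (eq, ge, gt, le, lt) drop their aten.*.Scalar
targets: scalar operands are now lifted into constant tensors before
partitioning by the LiftConstantScalarOperands pass that the quantizer
registers further down in this patch. A minimal sketch of that lifting idea
(the helper name and structure here are illustrative, not the actual pass):

```python
import torch
from torch.fx import GraphModule

_SCALAR_TO_TENSOR_OVERLOAD = {
    torch.ops.aten.eq.Scalar: torch.ops.aten.eq.Tensor,
    torch.ops.aten.ge.Scalar: torch.ops.aten.ge.Tensor,
    torch.ops.aten.gt.Scalar: torch.ops.aten.gt.Tensor,
    torch.ops.aten.le.Scalar: torch.ops.aten.le.Tensor,
    torch.ops.aten.lt.Scalar: torch.ops.aten.lt.Tensor,
}


def lift_comparison_scalars(gm: GraphModule) -> GraphModule:
    """Illustrative only: rewrite cmp(x, scalar) as cmp(x, scalar_tensor)."""
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target in _SCALAR_TO_TENSOR_OVERLOAD:
            lhs, scalar = node.args
            with gm.graph.inserting_before(node):
                # materialize the python scalar as a constant tensor node
                const = gm.graph.call_function(
                    torch.ops.aten.scalar_tensor.default, (scalar,)
                )
            node.target = _SCALAR_TO_TENSOR_OVERLOAD[node.target]
            node.args = (lhs, const)
    gm.graph.lint()
    gm.recompile()
    return gm
```

With such a pass in place, every builder can assume node.args[index] is a
torch.fx.Node, which is exactly what the simplified define_node bodies above
rely on.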
+from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpElementWiseOr, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class OpOr(NodeVisitor): + target = ["aten.bitwise_or.Tensor"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + out_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + or_output_tensors = [output_tensor_wrapper] + + or_input_tensors = [] + for index in range(2): + input_node = node.args[index] + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE + + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + tensor_type, + nodes_to_wrappers, + ) + or_input_tensors.append(input_tensor_wrapper) + or_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpElementWiseOr.op_name, + ) + or_op.AddInputTensors(or_input_tensors) + or_op.AddOutputTensors(or_output_tensors) + return or_op diff --git a/backends/qualcomm/builders/op_pow.py b/backends/qualcomm/builders/op_pow.py index cf5b7595697..3e89bdcfc4d 100644 --- a/backends/qualcomm/builders/op_pow.py +++ b/backends/qualcomm/builders/op_pow.py @@ -8,17 +8,15 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS -from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpElementWisePower, QNN_OP_PACKAGE_NAME_QTI_AISW -# TODO Add more class Like PowTensorTensor if needed +# pow.Tensor_Scalar should fall in this visitor because LiftConstantScalarOperands pass @register_node_visitor -class PowTensorScalar(NodeVisitor): - target = ["aten.pow.Tensor_Scalar"] +class PowTensorTensor(NodeVisitor): + target = ["aten.pow.Tensor_Tensor"] def __init__(self, *args) -> None: super().__init__(*args) @@ -52,38 +50,18 @@ def define_node( nodes_to_wrappers, ) - # scalar input - scalar = node.args[1] - scalar_tensor = torch.tensor(scalar).to(torch.float32) - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - scalar_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.scalar_tensor.default, - (), # args - {}, # kwargs - ) - - if pow_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): - quant_attrs = pow_quant_attrs.copy() - quant_range = quant_attrs["quant_max"] - quant_attrs["quant_min"] - quant_attrs["zero_point"] = 0 if scalar >= 0 else quant_attrs["quant_max"] - quant_attrs["scale"] = ( - scalar / quant_range if scalar >= 0 else -scalar / quant_range - ) - scalar_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - scalar_tensor_wrapper = self.define_tensor( - scalar_node, + # exp input + exp_node = node.args[1] + exp_tensor = self.get_tensor(exp_node, node) + exp_tensor_wrapper = self.define_tensor( + exp_node, node, - scalar_tensor, + exp_tensor, PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, nodes_to_wrappers, ) - pow_input_tensors = [input_tensor_wrapper, scalar_tensor_wrapper] + 
pow_input_tensors = [input_tensor_wrapper, exp_tensor_wrapper] pow_op = PyQnnWrapper.PyQnnOpWrapper( node.name, diff --git a/backends/qualcomm/builders/op_prelu.py b/backends/qualcomm/builders/op_prelu.py index 4057b3d5559..e35839f535e 100644 --- a/backends/qualcomm/builders/op_prelu.py +++ b/backends/qualcomm/builders/op_prelu.py @@ -8,15 +8,7 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_AXIS_ORDER, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER from .node_visitor import get_parameter, NodeVisitor, register_node_visitor from .qnn_constants import OpPRelu, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -24,7 +16,7 @@ @register_node_visitor class PReLU(NodeVisitor): - target = ["aten.leaky_relu.default", "aten.prelu.default"] + target = ["aten.prelu.default"] def __init__(self, *args) -> None: super().__init__(*args) @@ -44,57 +36,32 @@ def define_node( nodes_to_wrappers, ) - if node.target.__name__ == "aten.leaky_relu.default": - coeff = 1e-2 if len(node.args) < 2 else node.args[1] - coeff_tensor = torch.full(input_tensor.shape, coeff).to(torch.float32) + coeff_node = node.args[1] + coeff_tensor = torch.zeros(input_node.meta["val"].shape) + coeff = get_parameter(coeff_node, self.edge_program) + # param nodes will be FakeTensor when doing partition + # fill in random numeric for validation + if isinstance(coeff, torch._subclasses.fake_tensor.FakeTensor): + coeff = torch.ones(coeff.shape) + # per-channel activation + if coeff_node.meta["val"].shape[0] > 1: + for i in range(input_node.meta["val"].shape[1]): + coeff_tensor = coeff_tensor.index_fill(1, torch.tensor([i]), coeff[i]) + if QCOM_AXIS_ORDER in input_node.meta: + axis_order = input_node.meta[QCOM_AXIS_ORDER] + coeff_tensor = coeff_tensor.permute(dims=axis_order).contiguous() else: - coeff_node = node.args[1] - coeff_tensor = torch.zeros(input_node.meta["val"].shape) - coeff = get_parameter(coeff_node, self.edge_program) - # param nodes will be FakeTensor when doing partition - # fill in random numeric for validation - if isinstance(coeff, torch._subclasses.fake_tensor.FakeTensor): - coeff = torch.ones(coeff.shape) - # per-channel activation - if coeff_node.meta["val"].shape[0] > 1: - for i in range(input_node.meta["val"].shape[1]): - coeff_tensor = coeff_tensor.index_fill( - 1, torch.tensor([i]), coeff[i] - ) - if QCOM_AXIS_ORDER in input_node.meta: - axis_order = input_node.meta[QCOM_AXIS_ORDER] - coeff_tensor = coeff_tensor.permute(dims=axis_order).contiguous() - # simple min-max quantization - coeff = torch.max(coeff).item() - else: - coeff = coeff.item() - coeff_tensor = torch.full(input_tensor.shape, coeff).to(torch.float32) - - # 'graph', 'name', 'op', 'target', 'args', and 'kwargs' - scalar_node = torch.fx.Node( - node.graph, - node.name + "_runtime_scalar", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - if pow_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): - quant_attrs = pow_quant_attrs.copy() - quant_range = quant_attrs[QCOM_QUANT_MAX] - quant_attrs[QCOM_QUANT_MIN] - # coeff is guaranteed to be positive - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = coeff / quant_range - scalar_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + coeff = coeff.item() + coeff_tensor = torch.full(input_tensor.shape, 
coeff).to(torch.float32) - scalar_tensor_wrapper = self.define_tensor( - scalar_node, + coeff_tensor_wrapper = self.define_tensor( + coeff_node, node, coeff_tensor, PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, nodes_to_wrappers, ) - prelu_input_tensors = [prelu_inp_tensor_wrapper, scalar_tensor_wrapper] + prelu_input_tensors = [prelu_inp_tensor_wrapper, coeff_tensor_wrapper] output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py index e5b4778312e..d224e34feb5 100644 --- a/backends/qualcomm/builders/op_rms_norm.py +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -12,7 +12,11 @@ import torch from executorch.backends.qualcomm.builders.utils import get_parameter -from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.backends.qualcomm.utils.constants import ( + QCOM_DATA, + QCOM_QUANT_ATTRS, + QCOM_ZERO_POINT, +) from executorch.exir.dialects._ops import ops as exir_ops from .node_visitor import NodeVisitor, register_node_visitor @@ -66,7 +70,7 @@ def define_node( nodes_to_wrappers, ) - # Fake node, nn module seems to be inconsistant with document + # Fake node, nn module seems to be inconsistent with document bias_tensor = torch.zeros(weight_tensor.shape) bias_node = torch.fx.Node( node.graph, @@ -78,6 +82,7 @@ def define_node( ) if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + bias_node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINT] = 0 bias_tensor_wrapper = self.define_tensor( bias_node, node, @@ -87,14 +92,6 @@ def define_node( ) epsilon = node.args[3] - if isinstance(epsilon, torch.fx.Node): - epsilon = get_parameter(epsilon, self.edge_program) - epsilon = ( - epsilon - if isinstance(epsilon, float) - else torch.finfo(epsilon.dtype).eps - ) - output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( node, diff --git a/backends/qualcomm/builders/op_topk.py b/backends/qualcomm/builders/op_topk.py index 1bbf19c84bd..745cf7b9935 100644 --- a/backends/qualcomm/builders/op_topk.py +++ b/backends/qualcomm/builders/op_topk.py @@ -10,7 +10,11 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA +from executorch.backends.qualcomm.utils.constants import ( + QCOM_AXIS_ORDER, + QCOM_DATA, + QCOM_QUANT_ATTRS, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpTopK, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -60,7 +64,7 @@ def define_node( output_idx_tensor = self.get_tensor(node, node, 1).to(torch.int32) # QNN constraint, topk output_0 requires having the same quant config as input - node.meta["quant_attrs"] = input_node.meta.get("quant_attrs") + node.meta[QCOM_QUANT_ATTRS] = input_node.meta.get(QCOM_QUANT_ATTRS) output_val_tensor_wrapper = self.define_tensor( node, node, @@ -70,7 +74,7 @@ def define_node( ) # topk output_1 is index, do not quantize it. 
- node.meta.pop("quant_attrs", None) + node.meta.pop(QCOM_QUANT_ATTRS, None) output_index_tensor_wrapper = self.define_tensor( node, node, @@ -92,10 +96,10 @@ def define_node( topk_op.AddScalarParam( OpTopK.param_k, PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - {"data": np.uint32(k)}, + {QCOM_DATA: np.uint32(k)}, ) - # As of QNN 2.26, QNN HTP backend only allows users to set this value to 1, or else it will fail at op validation + # As of QNN 2.26, QNN HTP backend only allows users to set this value to 1, or it will fail at op validation if len(node.args) > 3: largest = cast(bool, node.args[3]) topk_op.AddScalarParam( diff --git a/backends/qualcomm/builders/op_upsample_bilinear2d.py b/backends/qualcomm/builders/op_upsample_bilinear2d.py index 160f15494d8..654fb934571 100644 --- a/backends/qualcomm/builders/op_upsample_bilinear2d.py +++ b/backends/qualcomm/builders/op_upsample_bilinear2d.py @@ -16,7 +16,7 @@ @register_node_visitor class ResizeBilinear(NodeVisitor): - target = ["aten.upsample_bilinear2d.default"] + target = ["aten.upsample_bilinear2d.default", "aten.upsample_bilinear2d.vec"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/op_upsample_nearest2d.py b/backends/qualcomm/builders/op_upsample_nearest2d.py index 6b7949716cb..c4b353fd3e9 100644 --- a/backends/qualcomm/builders/op_upsample_nearest2d.py +++ b/backends/qualcomm/builders/op_upsample_nearest2d.py @@ -16,7 +16,7 @@ @register_node_visitor class ResizeBilinear(NodeVisitor): - target = ["aten.upsample_nearest2d.default"] + target = ["aten.upsample_nearest2d.default", "aten.upsample_nearest2d.vec"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 5e0b63d6d19..1d55d56de0f 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -168,6 +168,11 @@ class OpElementWiseNotEqual: op_name: str = "ElementWiseNotEqual" +@dataclass(init=False, frozen=True) +class OpElementWiseOr: + op_name: str = "ElementWiseOr" + + @dataclass(init=False, frozen=True) class OpElementWisePower: op_name: str = "ElementWisePower" diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 3f27dbdb163..c1e1aa25b08 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -194,39 +194,37 @@ def annotate_sub(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.eq.Scalar, torch.ops.aten.eq.Tensor]) +@register_annotator([torch.ops.aten.eq.Tensor]) def annotate_eq(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.ne.Scalar, torch.ops.aten.ne.Tensor]) +@register_annotator([torch.ops.aten.ne.Tensor]) def annotate_ne(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.ge.Scalar, torch.ops.aten.ge.Tensor]) +@register_annotator([torch.ops.aten.ge.Tensor]) def annotate_ge(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.gt.Scalar, torch.ops.aten.gt.Tensor]) +@register_annotator([torch.ops.aten.gt.Tensor]) def annotate_gt(node: Node, quantization_config: QuantizationConfig) -> None: 
annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.le.Scalar, torch.ops.aten.le.Tensor]) +@register_annotator([torch.ops.aten.le.Tensor]) def annotate_le(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator([torch.ops.aten.lt.Scalar, torch.ops.aten.lt.Tensor]) +@register_annotator([torch.ops.aten.lt.Tensor]) def annotate_lt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) -@register_annotator( - [torch.ops.aten.mul, torch.ops.aten.mul.Tensor, torch.ops.aten.mul.Scalar] -) +@register_annotator([torch.ops.aten.mul, torch.ops.aten.mul.Tensor]) def annotate_mul(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) @@ -308,7 +306,7 @@ def _derive_div_qparams_fn( raise NotImplementedError(f"No quant annotation is implemented for {node}.") -@register_annotator([torch.ops.aten.rsub.Scalar]) +@register_annotator([torch.ops.aten.rsub.Tensor]) def annotate_rsub(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) @@ -460,15 +458,9 @@ def annotate_permute(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) -@register_annotator( - [ - torch.ops.aten.leaky_relu.default, - torch.ops.aten.leaky_relu_.default, - torch.ops.aten.prelu.default, - ] -) +@register_annotator([torch.ops.aten.prelu.default]) def annotate_prelu(node: Node, quantization_config: QuantizationConfig) -> None: - annotate_single_in_single_out(node, quantization_config) + annotate_binary(node, quantization_config) @register_annotator([torch.ops.aten.view.default, torch.ops.aten._unsafe_view.default]) @@ -688,7 +680,12 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non ) -@register_annotator([torch.ops.aten.pow.Tensor_Scalar]) +@register_annotator([torch.ops.aten.bitwise_or.Tensor, torch.ops.aten.__or__.Tensor]) +def annotate_bitwise_or(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + +@register_annotator([torch.ops.aten.pow.Tensor_Tensor]) def annotate_pow(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -903,6 +900,12 @@ def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None if _is_annotated([node]): return + # block quantization + if quantization_config.block_size is not None: + quantization_config.weight.observer_or_fake_quant_ctr.p.keywords.update( + {"block_size": quantization_config.block_size} + ) + input_qspec_map = {} input_act = node.args[0] assert isinstance(input_act, Node) @@ -933,6 +936,7 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None act_node = node.args[0] weight_node = node.args[1] bias_node = None + if len(node.args) > 2: bias_node = node.args[2] diff --git a/backends/qualcomm/quantizer/observers/per_block_param_observer.py b/backends/qualcomm/quantizer/observers/per_block_param_observer.py new file mode 100644 index 00000000000..e60f15c6d9c --- /dev/null +++ b/backends/qualcomm/quantizer/observers/per_block_param_observer.py @@ -0,0 +1,101 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
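The new PerBlockParamObserver below tracks running min/max per block rather
than per channel. For intuition, a self-contained sketch of the same
reduction with hypothetical shapes (the observer itself derives the reshape
via _get_reduction_params):

```python
import torch

# hypothetical conv weight: O=4 output channels, I=8 input channels, 3x3 kernel
w = torch.randn(4, 8, 3, 3)
block_size = (1, 4, 3, 3)  # one block = 1 out-channel x 4 in-channels x full kernel

# reshape so each block forms one reduction group:
# (4, 8, 3, 3) -> (4, 2, 4, 3, 3): 8 // 4 = 2 blocks along the input-channel dim
wv = w.view(4, 2, 4, 3, 3)
min_val = wv.amin(dim=(2, 3, 4))  # shape (4, 2): one value per (channel, block)
max_val = wv.amax(dim=(2, 3, 4))
print(min_val.shape, max_val.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```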
+ +from typing import Tuple + +import torch +from torch.ao.quantization.observer import MappingType, PerBlock +from torch.ao.quantization.pt2e._affine_quantization import ( + _get_reduction_params, + AffineQuantizedMinMaxObserver, + choose_qparams_affine_with_min_max, +) + + +class PerBlockParamObserver(AffineQuantizedMinMaxObserver): + def __init__( + self, + dtype: torch.dtype, + block_size: torch.Size, + quant_min=None, + quant_max=None, + eps=torch.finfo(torch.float32).eps, # noqa: B008 + **kwargs, + ): + super().__init__( + mapping_type=MappingType.SYMMETRIC, + target_dtype=dtype, + granularity=PerBlock, + quant_min=quant_min, + quant_max=quant_max, + eps=eps, + **kwargs, + ) + self.block_size = block_size + # TODO: expand this when QNN starts to support more configurations + self.bitwidth_of_scale = 4 + self.quant_scales_dtype = torch.uint8 + + def forward(self, input: torch.Tensor): + if input.numel() == 0: + return input + + input_detached = input.detach() + self.original_dtype = input_detached.dtype + shape_for_reduction, reduction_dims = _get_reduction_params( + self.block_size, input_detached.size() + ) + input_detached = input_detached.view(shape_for_reduction) + min_val = torch.amin(input_detached, dim=reduction_dims) + max_val = torch.amax(input_detached, dim=reduction_dims) + if not hasattr(self, "min_val") or not hasattr(self, "max_val"): + self.min_val = min_val + self.max_val = max_val + else: + assert ( + self.min_val.shape == min_val.shape + ), f"Can't update existing min_val - shape mismatch, self.min_val:{self.min_val.shape} != min_val:{min_val.shape}" + assert ( + self.max_val.shape == max_val.shape + ), f"Can't update existing max_val - shape mismatch, self.max_val {self.max_val.shape} != max_val:{max_val.shape}" + min_val = torch.min(self.min_val, min_val) + max_val = torch.max(self.max_val, max_val) + self.min_val.copy_(min_val) + self.max_val.copy_(max_val) + + return input + + def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]: + assert hasattr(self, "min_val") and hasattr( + self, "max_val" + ), "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams" + scales, offsets = choose_qparams_affine_with_min_max( + self.min_val, + self.max_val, + self.mapping_type, + [], + self.target_dtype, + self.quant_min, + self.quant_max, + self.eps, + self.scale_dtype, + self.zero_point_dtype, + self.preserve_zero, + self.zero_point_domain, + ) + num_channels = scales.shape[0] + num_steps = 2**self.bitwidth_of_scale + for ch in range(num_channels): + max_scale = scales[ch].reshape(1, -1).amax(dim=-1) / num_steps + q_scales = torch.clamp( + input=scales[ch] / max_scale, + min=torch.iinfo(self.quant_scales_dtype).min, + max=torch.iinfo(self.quant_scales_dtype).max, + ).to(self.quant_scales_dtype) + # compensate the error from double quantization + scales[ch] = q_scales * max_scale + + return scales, offsets diff --git a/backends/qualcomm/quantizer/observers/per_channel_param_observer.py b/backends/qualcomm/quantizer/observers/per_channel_param_observer.py index f04a88d7d7a..0bba4d5ffeb 100644 --- a/backends/qualcomm/quantizer/observers/per_channel_param_observer.py +++ b/backends/qualcomm/quantizer/observers/per_channel_param_observer.py @@ -1,3 +1,9 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
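calculate_qparams above "double quantizes" the block scales: each channel's
scales are re-expressed as a 4-bit multiplier of a per-channel maximum, and
the reconstructed values are written back so downstream consumers see the
error-compensated scales. A tiny worked example with made-up numbers:

```python
import torch

bitwidth_of_scale = 4
num_steps = 2 ** bitwidth_of_scale  # 16
# observed per-block scales for one channel (hypothetical values)
scales_ch = torch.tensor([0.0125, 0.0500, 0.1000, 0.1600])

max_scale = scales_ch.amax() / num_steps          # 0.16 / 16 = 0.01
q_scales = torch.clamp(scales_ch / max_scale, 0, 255).to(torch.uint8)
# -> tensor([ 1,  5, 10, 16], dtype=torch.uint8); 1.25 truncates to 1
reconstructed = q_scales * max_scale
# -> tensor([0.0100, 0.0500, 0.1000, 0.1600]); block 0 absorbed the rounding error
```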
+ import torch from torch.ao.quantization.observer import UniformQuantizationObserverBase diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py index a6c551e2413..67968363eb6 100644 --- a/backends/qualcomm/quantizer/qconfig.py +++ b/backends/qualcomm/quantizer/qconfig.py @@ -2,6 +2,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch + +from executorch.backends.qualcomm.quantizer.observers.per_block_param_observer import ( + PerBlockParamObserver, +) from torch import Tensor from torch.ao.quantization.fake_quantize import ( FakeQuantize, @@ -17,12 +21,13 @@ from torch.fx import Node -@dataclass(eq=True, frozen=True) +@dataclass(eq=True) class QuantizationConfig: input_activation: Optional[QuantizationSpec] output_activation: Optional[QuantizationSpec] weight: Optional[QuantizationSpec] bias: Optional[QuantizationSpec | Callable] + block_size: Optional[Tuple] = None def _derived_bias_quant_spec(node: Node) -> DerivedQuantizationSpec: @@ -41,6 +46,10 @@ def _derive_bias_qparams_fn( ) derived_scale = (broadcast_act_scale * broadcast_weight_scale).to(torch.float32) derived_zero = torch.zeros(derived_scale.size()).to(torch.int32) + if isinstance(weight_obs_or_fq, PerBlockParamObserver): + # keep maximum scale of each channel for bias + derived_scale = derived_scale.view(derived_scale.size(0), -1).amax(dim=-1) + derived_zero = derived_zero.view(derived_zero.size(0), -1).amax(dim=-1) return (derived_scale, derived_zero) input_act = node.args[0] @@ -241,7 +250,7 @@ def get_ptq_per_channel_quant_config( weight_dtype in supported_weight_dtypes ), f"weight_dtype, {weight_dtype} is not one of supported types, {supported_weight_dtypes}" - # torch do not support uint16 quantization, use int32 to bypass + # torch does not support uint16 quantization, use int32 to bypass if act_symmetric: # If zero_point is 128, htp can do optimizations. # If we keep quant_min and quant_max none, observer will default use 128 as zero_point. 
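When the weight observer is the per-block one, the _derived_bias_quant_spec
change above collapses the (channel, block) scale grid to a single bias scale
per output channel by keeping the per-channel maximum. A small numeric
illustration (values made up):

```python
import torch

act_scale = torch.tensor(0.02)
weight_scales = torch.tensor([[0.010, 0.050],   # channel 0: two blocks
                              [0.080, 0.020]])  # channel 1: two blocks

derived = (act_scale * weight_scales).view(weight_scales.size(0), -1).amax(dim=-1)
print(derived)  # tensor([0.0010, 0.0016]): one bias scale per output channel
```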
@@ -286,6 +295,35 @@ def get_ptq_per_channel_quant_config( return quantization_config +def get_ptq_per_block_quant_config( + act_dtype=torch.uint8, + weight_dtype=torch.int8, + act_observer=MovingAverageMinMaxObserver, + act_symmetric: bool = False, +) -> QuantizationConfig: + extra_args: Dict[str, Any] = {"eps": 2**-12} + quantization_config = get_ptq_per_channel_quant_config( + act_dtype=act_dtype, + weight_dtype=weight_dtype, + act_observer=act_observer, + act_symmetric=act_symmetric, + ) + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8 if weight_dtype == "int4" else weight_dtype, + quant_min=-7 if weight_dtype == "int4" else torch.iinfo(weight_dtype).min + 1, + quant_max=7 if weight_dtype == "int4" else torch.iinfo(weight_dtype).max, + qscheme=torch.per_channel_symmetric, + ch_axis=0, + observer_or_fake_quant_ctr=PerBlockParamObserver.with_args(**extra_args), + ) + return QuantizationConfig( + input_activation=quantization_config.input_activation, + output_activation=quantization_config.output_activation, + weight=weight_quantization_spec, + bias=quantization_config.bias, + ) + + # TODO merge qat and ptq to a fucntion, and use a bool flag to control it def get_8a8w_qnn_qat_config( act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver @@ -434,7 +472,7 @@ def get_qat_per_channel_quant_config( weight_dtype in supported_weight_dtypes ), f"weight_dtype, {weight_dtype} is not one of supported types, {supported_weight_dtypes}" - # torch do not support uint16 quantization, use int32 to bypass + # torch does not support uint16 quantization, use int32 to bypass act_fake_quant_ctr = FakeQuantize.with_args( dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype, quant_min=torch.iinfo(act_dtype).min, diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 37c9e9ab21e..38570835bea 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -5,13 +5,14 @@ # LICENSE file in the root directory of this source tree. 
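The quantizer changes below wire all of this together: QuantDtype.use_16a4w_block
selects the per-block weight config, and conv nodes opt in through a block-size
map keyed by FX node name. A usage sketch (the QnnQuantizer class name and the
node name/shape values here are assumptions for illustration):

```python
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype

quantizer = QnnQuantizer()
quantizer.set_quant_config(QuantDtype.use_16a4w_block)
quantizer.set_per_block_conv_quant(enable=True)
# keys are FX node names, values are block shapes over the conv weight;
# e.g. a (32, 64, 3, 3) OIHW weight quantized in blocks of 16 input channels
quantizer.set_block_size_map({"conv2d": (1, 16, 3, 3)})
```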
from enum import IntEnum, unique from functools import partial -from typing import Callable, Optional, Sequence, Set +from typing import Callable, Dict, Optional, Sequence, Set, Tuple import torch from executorch.backends.qualcomm._passes import ( DecomposeEinsum, DecomposeLinalgVectorNorm, DecomposeSilu, + LiftConstantScalarOperands, RecomposePixelUnshuffle, ReduceDynamicRange, ReplaceInfBuffer, @@ -33,6 +34,7 @@ get_16a8w_qnn_ptq_config, get_8a8w_qnn_ptq_config, get_8a8w_qnn_qat_config, + get_ptq_per_block_quant_config, get_ptq_per_channel_quant_config, get_qat_per_channel_quant_config, QuantizationConfig, @@ -50,6 +52,7 @@ "get_8a8w_qnn_ptq_config", "get_8a8w_qnn_qat_config", "get_16a4w_qnn_qat_config", + "get_ptq_per_block_quant_config", ] @@ -62,7 +65,8 @@ class QuantDtype(IntEnum): use_16a16w = 0 use_16a8w = 1 use_16a4w = 2 - use_8a8w = 3 + use_16a4w_block = 3 + use_8a8w = 4 quant_config_dict = { @@ -74,6 +78,7 @@ class QuantDtype(IntEnum): act_dtype=torch.uint16, weight_dtype=torch.int16, ), + None, ), (QuantDtype.use_16a8w, False): ( get_16a8w_qnn_ptq_config, @@ -82,6 +87,7 @@ class QuantDtype(IntEnum): act_dtype=torch.uint16, weight_dtype=torch.int8, ), + None, ), (QuantDtype.use_16a4w, False): ( get_16a4w_qnn_ptq_config, @@ -90,10 +96,25 @@ class QuantDtype(IntEnum): act_dtype=torch.uint16, weight_dtype="int4", ), + None, + ), + (QuantDtype.use_16a4w_block, False): ( + get_16a4w_qnn_ptq_config, + partial( + get_ptq_per_channel_quant_config, + act_dtype=torch.uint16, + weight_dtype="int4", + ), + partial( + get_ptq_per_block_quant_config, + act_dtype=torch.uint16, + weight_dtype="int4", + ), ), (QuantDtype.use_8a8w, False): ( get_8a8w_qnn_ptq_config, partial(get_ptq_per_channel_quant_config), + None, ), # QAT, (QuantDtype.use_16a4w, True): ( @@ -103,10 +124,12 @@ class QuantDtype(IntEnum): act_dtype=torch.uint16, weight_dtype="int4", ), + None, ), (QuantDtype.use_8a8w, True): ( get_8a8w_qnn_qat_config, partial(get_qat_per_channel_quant_config), + None, ), } @@ -122,7 +145,10 @@ def __init__(self): self.quant_dtype = QuantDtype.use_8a8w self.quant_config: QuantizationConfig = get_8a8w_qnn_ptq_config() self.per_channel_quant_config = get_ptq_per_channel_quant_config() + self.per_block_quant_config = get_ptq_per_block_quant_config() + self.block_size_map = {} self.use_per_channel_weight_quant_ops: Set[OpOverload] = set() + self.use_per_block_weight_quant_ops: Set[OpOverload] = set() self.custom_quant_annotations: Sequence[Callable] = [] self.discard_nodes: Set[str] = set() @@ -132,7 +158,7 @@ def _annotate(self, gm: GraphModule) -> None: if node.name in self.discard_nodes: continue - quant_config = self._get_quant_config(node.target) + quant_config = self._get_quant_config(node) if quant_config: OP_ANNOTATOR[node.target](node, quant_config) @@ -140,23 +166,36 @@ def _annotate_custom_annotation(self, gm: GraphModule) -> None: for annotation_func in self.custom_quant_annotations: annotation_func(gm) - def _get_quant_config(self, op: str | OpOverload) -> Optional[QuantizationConfig]: + def _get_quant_config(self, op: torch.fx.Node) -> Optional[QuantizationConfig]: """ Priority: - 1. is one of use_per_channel_weight_quant_ops - 2. quant config + 1. is one of use_per_block_weight_quant_ops + 2. is one of use_per_channel_weight_quant_ops + 3. 
quant config """ - if isinstance(op, str): return - if op in self.use_per_channel_weight_quant_ops: + target = op.target + if isinstance(target, str): return + if target in self.use_per_block_weight_quant_ops: + if block_size := self.block_size_map.get(op.name): + self.per_block_quant_config.block_size = block_size + return self.per_block_quant_config + + if target in self.use_per_channel_weight_quant_ops: return self.per_channel_quant_config - if op in self.quant_ops: + if target in self.quant_ops: return self.quant_config print(f"No quant config is implemented for op, {op}") + def _update_per_block_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): + if enable: + self.use_per_block_weight_quant_ops.update(ops) + else: + self.use_per_block_weight_quant_ops.difference_update(ops) + def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: bool): if enable: self.use_per_channel_weight_quant_ops.update(ops) @@ -194,9 +233,9 @@ def set_quant_config( f"the quant config, (quant_dtype: {quant_dtype}, is_qat: {is_qat}) is not supported" ) - quant_config_fuc, per_channel_quant_config_fuc = quant_config_dict[ - (quant_dtype, is_qat) - ] + quant_config_fuc, per_channel_quant_config_fuc, per_block_quant_config_fuc = ( + quant_config_dict[(quant_dtype, is_qat)] + ) self.quant_config = ( quant_config_fuc(act_observer=act_observer) if act_observer @@ -207,6 +246,19 @@ if act_observer else per_channel_quant_config_fuc() ) + if per_block_quant_config_fuc is not None: + self.per_block_quant_config = ( + per_block_quant_config_fuc(act_observer=act_observer) + if act_observer + else per_block_quant_config_fuc() + ) + + def set_block_size_map(self, block_size_map: Dict[str, Tuple]) -> None: + self.block_size_map = block_size_map + + def set_per_block_conv_quant(self, enable: bool) -> None: + conv_ops = {torch.ops.aten.conv2d.default} + self._update_per_block_weight_quant_ops(conv_ops, enable) def set_per_channel_conv_quant(self, enable: bool) -> None: conv_ops = {torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default} @@ -224,8 +276,9 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule: model = DecomposeScaledDotProductAttention()(model).graph_module model = DecomposeSilu()(model).graph_module model = DecomposeEinsum()(model).graph_module - model = DecomposeLinalgVectorNorm(quantization_capture=True)(model).graph_module + model = DecomposeLinalgVectorNorm(aten_dialect_capture=True)(model).graph_module model = ReplaceInfBuffer()(model).graph_module + model = LiftConstantScalarOperands()(model).graph_module return model def validate(self, model: GraphModule) -> None: diff --git a/backends/qualcomm/quantizer/targets.bzl b/backends/qualcomm/quantizer/targets.bzl index a6689012b25..e7a94faa652 100644 --- a/backends/qualcomm/quantizer/targets.bzl +++ b/backends/qualcomm/quantizer/targets.bzl @@ -10,6 +10,7 @@ def define_common_targets(): name = "quantizer", srcs = glob([ "*.py", + "*/*.py", ]), visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 10b5bab15e0..32d82950908 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -66,8 +66,10 @@ Result QnnExecuTorchBackend::init( // Create QnnManager MemoryAllocator* runtime_allocator = context.get_runtime_allocator(); - QnnManager* qnn_manager = - ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, QnnManager); + QnnManager*
qnn_manager = runtime_allocator->allocateInstance(); + if (qnn_manager == nullptr) { + return Error::MemoryAllocationFailed; + } // NOTE: Since we use placement new and since this type is not trivially // destructible, we must call the destructor manually in destroy(). diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt index 81536d26f78..2df38086133 100644 --- a/backends/qualcomm/runtime/backends/CMakeLists.txt +++ b/backends/qualcomm/runtime/backends/CMakeLists.txt @@ -122,54 +122,3 @@ target_sources( PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnCustomProtocol.h PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnCustomProtocol.cpp ) - -set(qnn_header_basenames - QnnBackend.h - QnnCommon.h - QnnContext.h - QnnDevice.h - GPU/QnnGpuBackend.h - GPU/QnnGpuCommon.h - GPU/QnnGpuContext.h - GPU/QnnGpuGraph.h - QnnGraph.h - HTP/QnnHtpCommon.h - HTP/QnnHtpDevice.h - HTP/QnnHtpGraph.h - HTP/QnnHtpMem.h - HTP/QnnHtpPerfInfrastructure.h - HTP/QnnHtpProfile.h - HTP/QnnHtpProperty.h - HTP/QnnHtpSystemContext.h - QnnInterface.h - QnnLog.h - QnnMem.h - QnnOpDef.h - QnnOpPackage.h - QnnProfile.h - QnnProperty.h - Saver/QnnSaver.h - Saver/QnnSaverCommon.h - QnnSdkBuildId.h - QnnSignal.h - QnnTensor.h - QnnTypes.h - System/QnnSystemCommon.h - System/QnnSystemContext.h - System/QnnSystemInterface.h -) - -set(QNN_HEADER_DIR_DST ${CMAKE_CURRENT_BINARY_DIR}/QNN/include) - -# add the custom commands to copy each headers -foreach(_qnn_header ${qnn_header_basenames}) - # copy at generation time to make below target_sources(qnn_header) happy. - configure_file( - ${QNN_SDK_ROOT}/include/QNN/${_qnn_header} - ${QNN_HEADER_DIR_DST}/${_qnn_header} COPYONLY - ) - list(APPEND qnn_header_files ${QNN_HEADER_DIR_DST}/${_qnn_header}) -endforeach() - -# qnn_header -target_sources(qnn_header INTERFACE ${qnn_header_files}) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 506bb92752d..fef177fd300 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -144,6 +144,10 @@ if [ "$BUILD_X86_64" = true ]; then EXAMPLE_ROOT=examples/qualcomm CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" + echo "Update tokenizers submodule..." + pushd $PRJ_ROOT/extension/llm/tokenizers + git submodule update --init + popd cmake $PRJ_ROOT/$EXAMPLE_ROOT \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \ diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index 0ce41736394..3e78ba23ac8 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -32,16 +32,17 @@ table HtpInfo { /// to get SoC Model in supported snapdragon devices enum QcomChipset: int { UNKNOWN_SM = 0, + SA8295 = 39, SM8450 = 36, SM8475 = 42, SM8550 = 43, - SSG2115P = 46, SM8650 = 57, - SA8295 = 39, SM8750 = 69, + SSG2115P = 46, + SSG2125P = 58, SXR1230P = 45, SXR2230P = 53, - SSG2125P = 58, + SXR2330P = 75, } /// Indicate the information of the specified SoC. 
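(Aside: the schema edit above re-sorts the QcomChipset entries and adds SXR2330P. Flatbuffers enums serialize by numeric value, so re-sorting is backward compatible as long as each number stays fixed. A quick sanity check against the Python mirror of the schema, which follows next:)

from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset

assert QcomChipset.SA8295 == 39    # existing value, merely re-sorted
assert QcomChipset.SXR2330P == 75  # newly added SoC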
diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index a1ce2b2f53c..56ba27bb000 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -8,7 +8,7 @@ Please refer to executorch/backends/qualcomm/serialization/schema.fbs for the schema definitions """ -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import IntEnum, unique @@ -37,35 +37,37 @@ class HtpInfo: @unique class QcomChipset(IntEnum): UNKNOWN_SM = 0 + SA8295 = 39 # v68 SM8450 = 36 # v69 SM8475 = 42 # v69 SM8550 = 43 # v73 - SSG2115P = 46 # v73 SM8650 = 57 # v75 - SA8295 = 39 # v68 SM8750 = 69 # v79 - SXR1230P = 45 - SXR2230P = 53 - SSG2125P = 58 + SSG2115P = 46 # v73 + SSG2125P = 58 # v73 + SXR1230P = 45 # v73 + SXR2230P = 53 # v69 + SXR2330P = 75 # v79 @dataclass class SocInfo: soc_model: QcomChipset = QcomChipset.UNKNOWN_SM - htp_info: HtpInfo = HtpInfo() + htp_info: HtpInfo = field(default_factory=HtpInfo) _soc_info_table = { + QcomChipset.SA8295: SocInfo(QcomChipset.SA8295, HtpInfo(HtpArch.V68, 8)), QcomChipset.SM8450: SocInfo(QcomChipset.SM8450, HtpInfo(HtpArch.V69, 8)), QcomChipset.SM8475: SocInfo(QcomChipset.SM8475, HtpInfo(HtpArch.V69, 8)), QcomChipset.SM8550: SocInfo(QcomChipset.SM8550, HtpInfo(HtpArch.V73, 8)), QcomChipset.SM8650: SocInfo(QcomChipset.SM8650, HtpInfo(HtpArch.V75, 8)), QcomChipset.SM8750: SocInfo(QcomChipset.SM8750, HtpInfo(HtpArch.V79, 8)), QcomChipset.SSG2115P: SocInfo(QcomChipset.SSG2115P, HtpInfo(HtpArch.V73, 2)), - QcomChipset.SA8295: SocInfo(QcomChipset.SA8295, HtpInfo(HtpArch.V68, 8)), + QcomChipset.SSG2125P: SocInfo(QcomChipset.SSG2125P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR1230P: SocInfo(QcomChipset.SXR1230P, HtpInfo(HtpArch.V73, 2)), QcomChipset.SXR2230P: SocInfo(QcomChipset.SXR2230P, HtpInfo(HtpArch.V69, 8)), - QcomChipset.SSG2125P: SocInfo(QcomChipset.SSG2125P, HtpInfo(HtpArch.V73, 2)), + QcomChipset.SXR2330P: SocInfo(QcomChipset.SXR2330P, HtpInfo(HtpArch.V79, 8)), } diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS new file mode 100644 index 00000000000..b6a9664dcbf --- /dev/null +++ b/backends/qualcomm/tests/TARGETS @@ -0,0 +1,39 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") + +python_library( + name = "models", + srcs = ["models.py"], + deps = [ + "//caffe2:torch", + ] +) + +python_library( + name = "test_qnn_delegate", + srcs = [ + "test_qnn_delegate.py", + "utils.py", + ], + # env = { + # "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_verision()), + # }, + deps = [ + ":models", + "//caffe2:torch", + "//caffe2/functorch:functorch_src", + "//executorch/exir/backend:partitioner", + "//executorch/exir/dialects:lib", + "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pytree:pylib", + "//executorch/backends/qualcomm/partition:partition", + "//executorch/backends/qualcomm/quantizer:quantizer", + "//executorch/backends/qualcomm/serialization:serialization", + "//executorch/backends/qualcomm/utils:utils", + "//executorch/devtools:lib", + "//executorch/examples/qualcomm:utils", + "//executorch/examples/models:models", + "//executorch/backends/qualcomm/debugger:utils", + ], +) diff --git 
a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index bdb5541353b..e5a9be8e75b 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -441,13 +441,20 @@ def forward(self, x): class Conv2dSingle(torch.nn.Module): - def __init__(self, bias=True): + def __init__( + self, + bias=True, + in_channel=1, + out_channel=3, + kernel_size=(3, 3), + padding=1, + ): super().__init__() self.conv = torch.nn.Conv2d( - in_channels=1, - out_channels=3, - kernel_size=(3, 3), - padding=1, + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + padding=padding, bias=bias, ) @@ -746,13 +753,19 @@ def forward(self, x): class Index(torch.nn.Module): - def __init__(self): + def __init__(self, axis): super().__init__() self.idx0 = torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int32) self.idx1 = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=torch.int32) + self.axis = axis + self.dispatcher = { + 0: lambda x: x[self.idx0] + x[self.idx1], + 1: lambda x: x[:, self.idx0] + x[:, self.idx1], + 2: lambda x: x[:, :, self.idx0] + x[:, :, self.idx1], + } def forward(self, x): - return x[self.idx0] + x[self.idx1] + return self.dispatcher[self.axis](x) class IndexPut(torch.nn.Module): @@ -1025,6 +1038,28 @@ def forward(self, x): return x != self.constant +class OrBitWise(torch.nn.Module): + def __init__(self, pos, neg): + super().__init__() + self.pos = pos + self.neg = neg + + def forward(self, x, y): + bitwise_or = torch.bitwise_or(x, y).bool() + return torch.where(bitwise_or, self.pos, self.neg) + + +class OrOperator(torch.nn.Module): + def __init__(self, pos, neg): + super().__init__() + self.pos = pos + self.neg = neg + + def forward(self, x, y): + operator_or = x.to(torch.bool) | y.to(torch.bool) + return torch.where(operator_or, self.pos, self.neg) + + class Pad(torch.nn.Module): def __init__(self): super().__init__() @@ -1160,7 +1195,7 @@ def forward(self, x): output_shape = [dim * 2 for dim in x.shape[-2:]] return torch.nn.functional.interpolate( x, - size=list(torch.randn(output_shape).shape), + size=output_shape, mode="bilinear", align_corners=False, ) @@ -1174,11 +1209,22 @@ def forward(self, x): output_shape = [dim * 2 for dim in x.shape[-2:]] return torch.nn.functional.interpolate( x, - size=list(torch.randn(output_shape).shape), + size=output_shape, mode="nearest", ) +class UpsampleNearest2D(torch.nn.Module): + def __init__(self, sizes=None, scale_factor=None): + super().__init__() + self.upsample_neareast_2d = torch.nn.UpsamplingNearest2d( # noqa: TOR101 + size=sizes, scale_factor=scale_factor + ) + + def forward(self, x): + return self.upsample_neareast_2d(x) + + class RmsNorm(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 6ea94ba9e07..936b9c3efe4 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,7 +16,6 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, - QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -29,7 +28,6 @@ QCOM_QUANT_DTYPE, QCOM_SAMPLE_INPUTS, ) - from executorch.backends.qualcomm.utils.utils import ( capture_program, dump_context_from_pte, @@ -41,11 +39,15 @@ skip_annotation, update_spill_fill_size, ) + from executorch.examples.models.llama.llama_transformer import MOEFeedForward from executorch.examples.models.llama.model_args import ModelArgs -from 
executorch.examples.qualcomm.utils import setup_common_args_and_variables +from executorch.examples.qualcomm.utils import ( + make_quantizer, + setup_common_args_and_variables, +) from executorch.backends.qualcomm.tests.models import * # noqa: F403 @@ -68,13 +70,12 @@ from executorch.examples.models.inception_v3 import InceptionV3Model from executorch.examples.models.inception_v4 import InceptionV4Model -# from executorch.examples.models.llama import Llama2Model -from executorch.examples.models.mobilebert import MobileBertModelExample +# from executorch.examples.models.mobilebert import MobileBertModelExample from executorch.examples.models.mobilenet_v2 import MV2Model from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel -# from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.examples.models.wav2letter import Wav2LetterModel from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation from executorch.exir.passes import PassManager @@ -311,6 +312,33 @@ def test_qnn_backend_element_wise_mul(self): self.lower_module_and_test_output(module, sample_input) index += 1 + def test_qnn_backend_element_wise_or(self): + test_comb = [ + { + QCOM_MODULE: OrBitWise( # noqa: F405 + torch.tensor(1.7), torch.tensor(0.2) + ), + QCOM_SAMPLE_INPUTS: ( + torch.tensor([1, 0, 1, 0], dtype=torch.bool), + torch.tensor([1, 1, 0, 0], dtype=torch.bool), + ), + }, + { + QCOM_MODULE: OrOperator( # noqa: F405 + torch.tensor(1.5), torch.tensor(-1.2) + ), + QCOM_SAMPLE_INPUTS: ( + torch.full((3, 3), 1).triu(), + torch.full((3, 3), 1).tril(diagonal=0), + ), + }, + ] + for i, test in enumerate(test_comb): + with self.subTest(i=i): + self.lower_module_and_test_output( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) + def test_qnn_backend_element_wise_sqrt(self): modules = [Sqrt(), SqrtConstant()] # noqa: F405 for i, module in enumerate(modules): @@ -443,9 +471,11 @@ def test_qnn_backend_hardtanh(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_index(self): - module = Index() # noqa: F405 + modules = [Index(0), Index(1), Index(2)] # noqa: F405 sample_input = (torch.randn([8, 172, 64]),) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_index_put(self): module = IndexPut() # noqa: F405 @@ -472,6 +502,16 @@ def test_qnn_backend_interpolate_nearest_2d(self): sample_input = (torch.randn(2, 3, 4, 5),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_up_sampling_nearest_2d_with_scale_factor(self): + module = UpsampleNearest2D(scale_factor=2) # noqa: F405 + sample_input = (torch.randn(1, 16, 72, 104),) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_up_sampling_nearest_2d_with_size(self): + module = UpsampleNearest2D(sizes=(144, 208)) # noqa: F405 + sample_input = (torch.randn(1, 16, 72, 104),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_layer_norm(self): modules = [LayerNorm(), LayerNorm(bias=False)] # noqa: F405 sample_input = (torch.randn(196, 768),) @@ -892,19 +932,19 @@ def test_qnn_backend_view_permute_matmul(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_example_models(self): + # TODO Fix MobileBertModelExample and TorchVisionViTModel instances = 
[ DeepLabV3ResNet101Model(), EdsrModel(), InceptionV3Model(), InceptionV4Model(), # The module of llama is changing frequently. Reopen it when it's stable - # Llama2Model(), MV2Model(), MV3Model(), - MobileBertModelExample(), - TorchVisionViTModel(), - # Encountered undefined symbol in mainline. Reopen once resolved. - # Wav2LetterModel(), + # Fails during lowering. Reopen once resolved + # MobileBertModelExample(), + # TorchVisionViTModel(), + Wav2LetterModel(), ] expected_partitions = [ 1, @@ -913,9 +953,8 @@ def test_qnn_backend_example_models(self): 1, 1, 1, - 1, - 1, - 1, + # 1, + # 1, 1, ] # TODO: Due to trigger maximum recursion depth exceeded, need to check it. @@ -990,7 +1029,6 @@ def test_qnn_backend_16a4w_linear(self): ) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("segfault happens in QNN 2.26") def test_qnn_backend_16a4w_per_channel_linear(self): module = Linear(use_bias=False) # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -1108,6 +1146,55 @@ def test_qnn_backend_conv2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_block(self): + import numpy as np + + np.random.seed(1) + o_ch, i_ch, kernel, padding = 32, 512, (1, 1), 0 + input = ( + torch.from_numpy(np.random.uniform(-3, 3, size=(1, 1, 32, i_ch))) + .to(torch.float) + .permute(0, 3, 1, 2) + ) + weight = ( + torch.from_numpy(np.random.uniform(-3, 3, size=(1, 1, i_ch, o_ch))) + .to(torch.float) + .permute(3, 2, 0, 1) + ) + + modules = [ + Conv2dSingle( # noqa: F405 + bias=False, + in_channel=i_ch, + out_channel=o_ch, + kernel_size=kernel, + padding=padding, + ), + Conv2dSingle( # noqa: F405 + in_channel=i_ch, + out_channel=o_ch, + kernel_size=kernel, + padding=padding, + ), + ] + for module in modules: + module.conv.weight = torch.nn.Parameter(weight) + + sample_input = (input,) + for i, module in enumerate(modules): + with self.subTest(i=i): + # update block size for convolution weight (OIHW) + # channel dimension (O) is sliced by default in QNN + # divide dimension (I) into 4 groups + module = self.get_qdq_module( + module, + sample_input, + is_conv_per_block=True, + quant_dtype=QuantDtype.use_16a4w_block, + block_size_map={"conv2d": (1, 128, 1, 1)}, + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d_channel_last(self): modules = [ Conv2dSequential(channel_last=True), # noqa: F405 @@ -1244,6 +1331,34 @@ def test_qnn_backend_element_wise_mul(self): self.lower_module_and_test_output(module, sample_input) index += 1 + def test_qnn_backend_element_wise_or(self): + test_comb = [ + { + QCOM_MODULE: OrBitWise( # noqa: F405 + torch.tensor(1.7), torch.tensor(0.2) + ), + QCOM_SAMPLE_INPUTS: ( + torch.tensor([1, 0, 1, 0], dtype=torch.bool), + torch.tensor([1, 1, 0, 0], dtype=torch.bool), + ), + }, + { + QCOM_MODULE: OrOperator( # noqa: F405 + torch.tensor(1.5), torch.tensor(-1.2) + ), + QCOM_SAMPLE_INPUTS: ( + torch.full((3, 3), 1).triu(), + torch.full((3, 3), 1).tril(diagonal=0), + ), + }, + ] + for i, test in enumerate(test_comb): + with self.subTest(i=i): + module = self.get_qdq_module( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS]) + def test_qnn_backend_element_wise_sqrt(self): modules = [Sqrt(), SqrtConstant()] # noqa: F405 for i, module in enumerate(modules): @@ -1390,10 +1505,12 @@ def test_qnn_backend_hardtanh(self): self.lower_module_and_test_output(module, sample_input) def
test_qnn_backend_index(self): - module = Index() # noqa: F405 + modules = [Index(0), Index(1), Index(2)] # noqa: F405 sample_input = (torch.randn([8, 172, 64]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_index_put(self): module = IndexPut() # noqa: F405 @@ -1424,6 +1541,18 @@ def test_qnn_backend_interpolate_nearest_2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_up_sampling_nearest_2d_with_scale_factor(self): + module = UpsampleNearest2D(scale_factor=2) # noqa: F405 + sample_input = (torch.randn(1, 16, 72, 104),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_up_sampling_nearest_2d_with_size(self): + module = UpsampleNearest2D(sizes=(144, 208)) # noqa: F405 + sample_input = (torch.randn(1, 16, 72, 104),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_layer_norm(self): modules = [LayerNorm(), LayerNorm(bias=False)] # noqa: F405 sample_input = (torch.randn(196, 768),) @@ -1938,7 +2067,6 @@ def test_qnn_backend_example_models(self): QCOM_QUANT_DTYPE: QuantDtype.use_8a8w, }, # The module of llama is changing frequently. Reopen it when it's stable - # {QCOM_MODULE: Llama2Model(), QCOM_ANNOTATION: (), QCOM_QUANT_DTYPE: QuantDtype.use_8a8w}, { QCOM_MODULE: MV2Model(), QCOM_ANNOTATION: (), @@ -1956,12 +2084,11 @@ def test_qnn_backend_example_models(self): QCOM_ANNOTATION: (), QCOM_QUANT_DTYPE: QuantDtype.use_8a8w, }, - # Encountered undefined symbol in mainline. Reopen once resolved. - # { - # QCOM_MODULE: Wav2LetterModel(), - # QCOM_ANNOTATION: (), - # QCOM_QUANT_DTYPE: QuantDtype.use_8a8w, - # }, + { + QCOM_MODULE: Wav2LetterModel(), + QCOM_ANNOTATION: (), + QCOM_QUANT_DTYPE: QuantDtype.use_8a8w, + }, ] expected_partitions = [ 1, @@ -1970,11 +2097,10 @@ def test_qnn_backend_example_models(self): 1, 1, 1, - 1, # For MobileBertModelExample # 1, 1, - # 1, For Wav2LetterModel + 1, ] # TODO: Due to trigger maximum recursion depth exceeded, need to check it. 
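(Aside: a worked sketch of the block-size arithmetic behind test_qnn_backend_conv2d_block above; plain Python, with the shapes taken straight from that test.)

# An OIHW conv weight of shape (32, 512, 1, 1) quantized with
# block_size_map={"conv2d": (1, 128, 1, 1)}: the output-channel axis (O) is
# already sliced per channel, and the input-channel axis (I) is divided into
# 512 / 128 = 4 blocks, each carrying its own scale.
weight_shape = (32, 512, 1, 1)  # (O, I, H, W)
block_size = (1, 128, 1, 1)
blocks_per_axis = [dim // blk for dim, blk in zip(weight_shape, block_size)]
assert blocks_per_axis == [32, 4, 1, 1]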
disable_validation() @@ -2045,7 +2171,9 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + @unittest.expectedFailure def test_qnn_backend_spill_fill_buffer_size(self): + # TODO: Fix self.assertNotEqual(0, max_sf_size) module = LargeTensorLinear() # noqa: F405 sample_input = (torch.randn(1, 256, 512),) edge_prog = capture_program(module, sample_input) @@ -2199,7 +2327,9 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.expectedFailure def test_qnn_backend_context_direct(self): + # TODO: Fix once QNN tools pair with np 2.x with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 generate_context_binary( @@ -2573,7 +2703,7 @@ def test_qnn_backend_skip_node_id_quantizer(self): ) partitioner = QnnPartitioner(compiler_specs) # define quantizer - quantizer = QnnQuantizer() + quantizer = make_quantizer() # define calibration method def calibrator(gm): @@ -2620,7 +2750,7 @@ def test_qnn_backend_skip_node_op_quantizer(self): ) partitioner = QnnPartitioner(compiler_specs) # define quantizer - quantizer = QnnQuantizer() + quantizer = make_quantizer() # define calibration method def calibrator(gm): @@ -2642,7 +2772,9 @@ def calibrator(gm): ).to_executorch() self.verify_output(module, sample_input, exec_prog) + @unittest.expectedFailure def test_qnn_backend_spill_fill_buffer_size(self): + # TODO: Fix self.assertNotEqual(0, max_sf_size) module = LargeTensorLinear() # noqa: F405 sample_input = (torch.randn(1, 256, 512),) module = self.get_qdq_module(module, sample_input) @@ -2675,7 +2807,7 @@ def test_qnn_backend_graph_level_mixed_precision(self): ) partitioner = QnnPartitioner(compiler_specs) # define quantizer - quantizer = QnnQuantizer() + quantizer = make_quantizer() # define calibration method def calibrator(gm): @@ -2839,7 +2971,9 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.expectedFailure def test_qnn_backend_context_direct(self): + # TODO: Fix once QNN tools pair with np 2.x with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 generate_context_binary( @@ -3106,6 +3240,173 @@ def test_qnn_backend_draw_graph(self): ), "Generated .dot file does not match the golden file." +class TestExampleLLMScript(TestQNN): + def required_envs(self, conditions=None) -> bool: + conditions = [] if conditions is None else conditions + return all( + [ + self.executorch_root, + self.artifact_dir, + *conditions, + ] + ) + + def test_llama3_2_1b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + assert ( + self.llama_artifacts is not None + ), "Please provide path to llama artifacts" + + prompt = "What is the meaning of life?"
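(Aside: the quantizer hunks above replace direct QnnQuantizer construction with the make_quantizer helper. A minimal sketch of the equivalent call; the keyword names come from the get_qdq_module changes in backends/qualcomm/tests/utils.py further below, and the assumption that the defaults reproduce the old bare QnnQuantizer() is mine.)

from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
from executorch.examples.qualcomm.utils import make_quantizer

# Assumed equivalent of the old QnnQuantizer() default: 8-bit PTQ for
# activations and weights, with per-channel convolution weights.
quantizer = make_quantizer(quant_dtype=QuantDtype.use_8a8w)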
+ cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--checkpoint", + f"{self.llama_artifacts}/consolidated.00.pth", + "--params", + f"{self.llama_artifacts}/params.json", + "--tokenizer_model", + f"{self.llama_artifacts}/tokenizer.model", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--ptq", + "16a4w", + "--temperature", + "0", + "--llama_model", + "llama3_2", + "--model_mode", + "hybrid", + "--prefill_ar_len", + "32", + "--max_seq_len", + "512", + "--num_sharding", + "4", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + model_out.startswith(golden_start_with), + f"Expected Output: {golden_start_with}. Actual Output: {model_out}", + ) + # x86 does not allow weight sharing, so we don't check pte size. + # Inference speed on x86 is slow, so we only check when running on Android + if not self.enable_x86_64: + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, 1300000000) + if not self.compile_only and not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 66) # Lanai + + def test_llama_stories_110m(self): + if not self.required_envs(): + self.skipTest("missing required envs") + assert ( + self.llama_artifacts is not None + ), "Please provide path to llama artifacts" + + prompt = "Once" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--checkpoint", + f"{self.llama_artifacts}/stories110M.pt", + "--params", + f"{self.llama_artifacts}/params.json", + "--tokenizer_model", + f"{self.llama_artifacts}/tokenizer.model", + "--tokenizer_bin", + f"{self.llama_artifacts}/tokenizer.bin", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--ptq", + "16a4w", + "--temperature", + "0", + "--llama_model", + "stories110m", + "--model_mode", + "hybrid", + "--prefill_ar_len", + "32", + "--max_seq_len", + "128", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + golden_start_with = "Once upon a time," + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + model_out.startswith(golden_start_with), + f"Expected Output: {golden_start_with}. 
Actual Output: {model_out}", + ) + # x86 does not allow weight sharing, so we don't check pte size + if not self.enable_x86_64: + pte_size = msg["pte_size"] + self.assertLessEqual(pte_size, 130000000) + if not self.compile_only and not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai + + class TestExampleOssScript(TestQNN): def required_envs(self, conditions=None) -> bool: conditions = [] if conditions is None else conditions @@ -3153,6 +3454,7 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 80) + @unittest.skip("bicubic resize is not supported") def test_dino_v2(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -4001,72 +4303,6 @@ def test_deeplab_v3(self): self.assertGreaterEqual(msg["MPA"], 0.70) self.assertGreaterEqual(msg["MIoU"], 0.55) - def test_stories_single_llama(self): - if not self.required_envs(): - self.skipTest("missing required envs") - - cmds = [ - "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", - "--artifact", - self.artifact_dir, - "--build_folder", - self.build_folder, - "--model", - self.model, - "--checkpoint", - f"{self.artifact_dir}/stories110M.pt", - "--params", - f"{self.artifact_dir}/params.json", - "--tokenizer_model", - f"{self.artifact_dir}/tokenizer.model", - "--tokenizer_bin", - f"{self.artifact_dir}/tokenizer.bin", - "--ip", - self.ip, - "--port", - str(self.port), - "--prompt", - "Once", - "--ptq", - "16a4w", - "--temperature", - "0", - "--llama_model", - "stories110m", - "--model_mode", - "hybrid", - "--prefill_seq_len", - "32", - "--kv_seq_len", - "128", - ] - if self.compile_only: - cmds.extend(["--compile_only"]) - elif self.device: - cmds.extend(["--device", self.device]) - if self.host: - cmds.extend(["--host", self.host]) - elif self.enable_x86_64: - cmds.extend(["--enable_x86_64"]) - - golden_start_with = "Once upon a time," - p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) - with Listener((self.ip, self.port)) as listener: - conn = listener.accept() - p.communicate() - msg = json.loads(conn.recv()) - if "Error" in msg: - self.fail(msg["Error"]) - else: - if not self.compile_only: - model_out = msg["result"][0] - self.assertTrue(model_out.startswith(golden_start_with)) - # x86 does not allow weight sharing, so we don't check pte size - if not self.enable_x86_64: - pte_size = msg["pte_size"] - self.assertLessEqual(pte_size, 130000000) - @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): @@ -4271,6 +4507,18 @@ def setup_environment(): type=str, ) + parser.add_argument( + "--pre_gen_pte", + help="Run the pre-generated pte in the given directory.", + type=str, + ) + + parser.add_argument( + "--llama_artifacts", + help="A folder that contains: weight, tokenizer, and params.", + type=str, + ) + args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host TestQNN.device = args.device @@ -4289,6 +4537,8 @@ def setup_environment(): TestQNN.enable_x86_64 = args.enable_x86_64 TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs TestQNN.compile_only = args.compile_only + TestQNN.pre_gen_pte = args.pre_gen_pte + TestQNN.llama_artifacts = args.llama_artifacts return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 46cc9b65fcf..769f24ba0d8 100644 --- 
a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -17,7 +17,7 @@ from executorch import exir from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.qnn_preprocess import QnnBackend -from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer, QuantDtype +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset from executorch.backends.qualcomm.utils.constants import ( QCOM_DTYPE, @@ -32,6 +32,7 @@ from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, + make_quantizer, SimpleADB, ) @@ -188,6 +189,9 @@ class TestQNN(unittest.TestCase): shared_buffer: bool = False enable_x86_64: bool = False compile_only: bool = False + pre_gen_pte: str = "" + llama_artifacts: str = "" + dump_intermediate_outputs: bool = False def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) @@ -516,21 +520,28 @@ def get_qdq_module( self, module: torch.nn.Module, inputs: Tuple[torch.Tensor], + is_conv_per_block: Optional[bool] = False, is_conv_per_channel: Optional[bool] = True, is_linear_per_channel: Optional[bool] = False, custom_quant_annotations: Tuple[Callable] = (), quant_dtype: QuantDtype = QuantDtype.use_8a8w, dynamic_shapes: Dict = None, bypass_check: bool = False, + block_size_map: Dict[str, Tuple] = None, ) -> torch.fx.GraphModule: - m = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes).module() - - quantizer = QnnQuantizer() - quantizer.add_custom_quant_annotations(custom_quant_annotations) - quantizer.set_per_channel_conv_quant(is_conv_per_channel) - quantizer.set_per_channel_linear_quant(is_linear_per_channel) - quantizer.set_quant_config(quant_dtype) - + m = torch.export.export( + module, inputs, dynamic_shapes=dynamic_shapes, strict=True + ).module() + + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=custom_quant_annotations, + per_block_conv=is_conv_per_block, + per_channel_conv=is_conv_per_channel, + per_channel_linear=is_linear_per_channel, + ) + if block_size_map is not None: + quantizer.set_block_size_map(block_size_map) prepared = prepare_pt2e(m, quantizer) prepared(*inputs) quantized_module = convert_pt2e(prepared) @@ -540,6 +551,8 @@ def get_qdq_module( torch.ops.quantized_decomposed.dequantize_per_tensor.default, torch.ops.quantized_decomposed.quantize_per_channel.default, torch.ops.quantized_decomposed.dequantize_per_channel.default, + torch.ops.pt2e_quant.quantize_affine.default, + torch.ops.pt2e_quant.dequantize_affine.default, } if not bypass_check: self.assertTrue(nodes.intersection(q_and_dq)) @@ -556,10 +569,12 @@ def get_prepared_qat_module( ) -> torch.fx.GraphModule: m = torch.export.export_for_training(module, inputs).module() - quantizer = QnnQuantizer() - quantizer.add_custom_quant_annotations(custom_quant_annotations) - quantizer.set_per_channel_conv_quant(is_conv_per_channel) - quantizer.set_per_channel_linear_quant(is_linear_per_channel) + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=custom_quant_annotations, + per_channel_conv=is_conv_per_channel, + per_channel_linear=is_linear_per_channel, + ) if quant_dtype == QuantDtype.use_8a8w: quantizer.set_quant_config(quant_dtype, is_qat=True) diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py index c31e8d2f35d..2e364c37119 100644 --- 
a/backends/qualcomm/utils/constants.py +++ b/backends/qualcomm/utils/constants.py @@ -10,11 +10,17 @@ QCOM_AXIS = "axis" QCOM_AXIS_ORDER = "axis_order" QCOM_BITWIDTH = "bitwidth" +QCOM_BLOCK_SIZE = "block_size" +QCOM_BLOCK_SCALES = "block_scales" +QCOM_BLOCK_SCALE_BITWIDTH = "block_scale_bitwidth" +QCOM_BLOCK_SCALE_OFFSET = "block_scale_offset" +QCOM_BLOCK_STORAGE_TYPE = "block_storage_type" QCOM_DATA = "data" QCOM_DTYPE = "dtype" QCOM_ENCODING = "encoding" QCOM_INSERTED_PERMUTE = "qnn_permute" QCOM_LAYOUT_CHANGE = "layout_change" +QCOM_NUM_BLOCKS_PER_AXIS = "num_blocks_per_axis" QCOM_OFFSET = "offset" QCOM_ORIG_DTYPE = "orig_dtype" QCOM_QUANTIZED_IO = "q_tensor_io" diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 1da17cb25f6..8045e9e6443 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -17,21 +17,19 @@ import torch from executorch.backends.qualcomm._passes import ( - AnnotateAndQuantScalar, AnnotateDecomposed, AnnotateQuantAttrs, ConstantI64toI32, - ConvertBinaryOpsWithScalar, ConvertBmmToMatmul, - ConvertInterpolateWithUpsample2D, - ConvertPReLU, ConvertToLinear, DecomposeAny, DecomposeLinalgVectorNorm, ExpandBroadcastTensorShape, FoldQDQ, LayoutTransform, + LiftConstantScalarOperands, RecomposePixelUnshuffle, + RecomposePReLU, RecomposeRmsNorm, RemoveRedundancy, ReplaceIndexPutInput, @@ -73,6 +71,9 @@ QCOM_QNN_COMPILE_SPEC, QCOM_QUANTIZED_IO, ) +from executorch.backends.transforms.decompose_sdpa import ( + DecomposeScaledDotProductAttention, +) from executorch.exir import ( EdgeCompileConfig, @@ -330,6 +331,8 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.pt2e_quant.quantize_affine.default, + torch.ops.pt2e_quant.dequantize_affine.default, torch.ops.aten._safe_softmax.default, ] @@ -350,19 +353,17 @@ def get_capture_program_passes(): # The second value in each tuple in `default_passes_and_setting` indicates whether the corresponding pass is activated by default. # If a pass is activated, it will be executed by default. 
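(Aside: a hedged sketch of how a caller can flip one of the defaults described in the comment above. get_capture_program_passes, capture_program, and ExpandBroadcastTensorShape all appear in these hunks; the QCOM_PASS_ACTIVATE_KEY constant and the per-pass settings-dict return shape are assumptions about code not shown in this diff.)

from executorch.backends.qualcomm._passes import ExpandBroadcastTensorShape
from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY  # assumed name
from executorch.backends.qualcomm.utils.utils import (
    capture_program,
    get_capture_program_passes,
)

passes_job = get_capture_program_passes()
# ExpandBroadcastTensorShape ships disabled (False) in the table below; opt in:
passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
# edge_ep = capture_program(module, inputs, passes_job=passes_job)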
default_passes_and_setting = [ - (AnnotateAndQuantScalar, True), (AnnotateDecomposed, True), (AnnotateQuantAttrs, True), (ConstantI64toI32, True), (ConvertBmmToMatmul, True), - (ConvertInterpolateWithUpsample2D, True), - (ConvertPReLU, True), (ConvertToLinear, True), (DecomposeAny, True), (DecomposeLinalgVectorNorm, True), (ExpandBroadcastTensorShape, False), (FoldQDQ, True), (LayoutTransform, True), + (RecomposePReLU, True), (RecomposePixelUnshuffle, True), (RecomposeRmsNorm, True), (RemoveRedundancy, True), @@ -408,6 +409,13 @@ def _topological_sort_passes(passes: OrderedDict): def _transform( edge_program: ExportedProgram, passes_job: OrderedDict = None ) -> ExportedProgram: + # TODO: remove this workaround when the target can be correctly detected + from executorch.backends.qualcomm._passes import utils + from executorch.exir.dialects._ops import ops as exir_ops + + utils.q_ops.add(exir_ops.edge.pt2e_quant.quantize_affine.default) + utils.dq_ops.add(exir_ops.edge.pt2e_quant.dequantize_affine.default) + # currently ExirExportedProgram.transform does not accept # changes of input number which was caused by FoldQDQ # apply passes one by one here to avoid IR capture failure @@ -432,22 +440,29 @@ def _transform( return edge_program +# Modify the fx graph at the very beginning for floating-point models. +# The aim is to reduce registration of scalars in the graph module or program. +def _preprocess_module(module: torch.nn.Module, inputs: Tuple[torch.Tensor]): + if isinstance(module, torch.fx.graph_module.GraphModule): + return module + module = torch.export.export(module, inputs, strict=True).module() + module = DecomposeScaledDotProductAttention()(module).graph_module + module = DecomposeLinalgVectorNorm(True)(module).graph_module + module = LiftConstantScalarOperands()(module).graph_module + return module + + def capture_program( module: torch.nn.Module, inputs: Tuple[torch.Tensor], passes_job: OrderedDict = None, dynamic_shapes: Dict = None, ) -> exir.ExirExportedProgram: - ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes) + module = _preprocess_module(module, inputs) + ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes, strict=True) decomposed_ep = ep.run_decompositions(get_decomp_table()) - # We choose call_operator by target in ConvertBinaryOpsWithScalar - # because it is the same source_fn_stack for MultiheadAttention - # TODO: Should modify the scalar op in the op builder instead of - # using transformation core_ep = ExirExportedProgram(decomposed_ep, False) core_ep.transform( - TensorI64toI32(edge_program=core_ep), ConvertBinaryOpsWithScalar() - ) + core_ep.transform(TensorI64toI32(edge_program=core_ep)) edge_ep = core_ep.to_edge(qnn_edge_config()) _transform(edge_ep.exported_program, passes_job) return edge_ep @@ -1272,25 +1287,33 @@ def generate_qnn_executorch_compiler_spec( def get_soc_to_arch_map(): return { - "SSG2115P": HtpArch.V73, - "SM8750": HtpArch.V79, - "SM8650": HtpArch.V75, - "SM8550": HtpArch.V73, - "SM8475": HtpArch.V69, - "SM8450": HtpArch.V69, "SA8295": HtpArch.V68, + "SM8450": HtpArch.V69, + "SM8475": HtpArch.V69, + "SM8550": HtpArch.V73, + "SM8650": HtpArch.V75, + "SM8750": HtpArch.V79, + "SSG2115P": HtpArch.V73, + "SSG2125P": HtpArch.V73, + "SXR1230P": HtpArch.V73, + "SXR2230P": HtpArch.V69, + "SXR2330P": HtpArch.V79, } def get_soc_to_chipset_map(): return { - "SSG2115P": QcomChipset.SSG2115P, - "SM8750": QcomChipset.SM8750, - "SM8650": QcomChipset.SM8650, - "SM8550": QcomChipset.SM8550, - "SM8475": QcomChipset.SM8475, - "SM8450":
QcomChipset.SM8450, "SA8295": QcomChipset.SA8295, + "SM8450": QcomChipset.SM8450, + "SM8475": QcomChipset.SM8475, + "SM8550": QcomChipset.SM8550, + "SM8650": QcomChipset.SM8650, + "SM8750": QcomChipset.SM8750, + "SSG2115P": QcomChipset.SSG2115P, + "SSG2125P": QcomChipset.SSG2125P, + "SXR1230P": QcomChipset.SXR1230P, + "SXR2230P": QcomChipset.SXR2230P, + "SXR2330P": QcomChipset.SXR2330P, } diff --git a/backends/transforms/fuse_view_copy.py b/backends/transforms/fuse_view_copy.py index bbc155dc451..22e20d1c88b 100644 --- a/backends/transforms/fuse_view_copy.py +++ b/backends/transforms/fuse_view_copy.py @@ -40,7 +40,24 @@ def merge_view_copy_chains(graph: torch.fx.Graph) -> torch.fx.Graph: return graph +def remove_noop_view_copy(graph: torch.fx.Graph) -> torch.fx.Graph: + """ + Remove view_copy nodes that are no-ops. + """ + ops = exir_ops.edge + view_op = ops.aten.view_copy.default + for node in graph.nodes: + if node.op == "call_function" and node.target == view_op: + input_shape = list(node.args[0].meta["val"].shape) + target_shape = node.args[1] + if input_shape == target_shape: + node.replace_all_uses_with(node.args[0]) + graph.eliminate_dead_code() + return graph + + class FuseViewCopyTransform(ExportPass): def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph_module.graph = merge_view_copy_chains(graph_module.graph) + graph_module.graph = remove_noop_view_copy(graph_module.graph) return PassResult(graph_module, True) diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl index c532798546d..66ff9111f52 100644 --- a/backends/transforms/targets.bzl +++ b/backends/transforms/targets.bzl @@ -149,6 +149,9 @@ def define_common_targets(): runtime.python_library( name = "utils", srcs = ["utils.py"], + visibility = [ + "//executorch/backends/...", + ], deps = [ "//caffe2:torch", "//executorch/exir:lib", @@ -201,6 +204,20 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "replace_scalar_with_tensor", + srcs = [ + "replace_scalar_with_tensor.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir:pass_base", + ], + ) + runtime.python_test( name = "test_duplicate_dynamic_quant_chain", srcs = [ diff --git a/backends/transforms/test/test_create_delete_constant_placeholder.py b/backends/transforms/test/test_create_delete_constant_placeholder.py new file mode 100644 index 00000000000..ad24f8bfaaf --- /dev/null +++ b/backends/transforms/test/test_create_delete_constant_placeholder.py @@ -0,0 +1,122 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
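(Aside: a self-contained torch.fx illustration of the condition the new remove_noop_view_copy pass above checks. This is a sketch only: the real pass matches edge-dialect view_copy nodes and reads shapes from meta["val"], which symbolic_trace does not populate, so the shape comparison is hard-coded here.)

import torch
from torch.fx import symbolic_trace

class M(torch.nn.Module):
    def forward(self, x):
        return torch.reshape(x, (2, 3)) + 1  # reshape is a no-op for a (2, 3) input

gm = symbolic_trace(M())
for node in gm.graph.nodes:
    if node.op == "call_function" and node.target is torch.reshape:
        if list(node.args[1]) == [2, 3]:  # stands in for input_shape == target_shape
            node.replace_all_uses_with(node.args[0])  # bypass the no-op view
gm.graph.eliminate_dead_code()
gm.recompile()
x = torch.ones(2, 3)
assert torch.equal(gm(x), x + 1)  # behavior preserved with the view removed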
+ +import torch +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) +from executorch.exir import to_edge +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import export +from torch.export.graph_signature import InputKind + + +class EmptyNetwork(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + test_data: torch.Tensor = (torch.zeros(1),) + + +def _test_create_delete(kind: InputKind, persistent_buffer: bool = None): + """ + Tests the utility functions create_constant_placeholder and delete_constant_placeholder + """ + + # Toy network with two nodes, input and output + # The result should be 0 = 0 + module = EmptyNetwork() + exported_program = export(module, args=module.test_data, strict=True) + exported_program = to_edge(exported_program).exported_program() + graph = exported_program.graph_module.graph + assert len(graph.nodes) == 2 + assert exported_program.module()(torch.zeros(1)) == 0 + assert len(exported_program.graph_signature.input_specs) == 1 + assert len(exported_program.state_dict) == 0 + assert len(exported_program.constants) == 0 + + const_name = "test_node" + + # Create one const node with value 1 and add it to the input + input_node = list(graph.nodes)[0] + with graph.inserting_before(input_node): + const_node = create_constant_placeholder( + exp_program=exported_program, + graph=graph, + kind=kind, + name=const_name, + data=torch.ones(1), + persistent_buffer=persistent_buffer, + ) + assert "val" in const_node.meta + + with graph.inserting_after(input_node): + add_node = graph.create_node( + "call_function", + exir_ops.edge.aten.add.Tensor, + args=(input_node, const_node), + kwargs={}, + ) + + output_node = list(graph.nodes)[-1] + output_node.replace_input_with(input_node, add_node) + + # We should now have four nodes: test_node, input, add, output + # The result should be 0 + 1 = 1 + assert exported_program.module()(torch.zeros(1)) == 1 + assert len(graph.nodes) == 4 + + if kind == InputKind.PARAMETER: + assert const_name in exported_program.graph_signature.inputs_to_parameters + assert const_name in exported_program.state_dict + assert len(exported_program.constants) == 0 + elif kind == InputKind.BUFFER and persistent_buffer: + assert const_name in exported_program.graph_signature.inputs_to_buffers + assert const_name in exported_program.state_dict + assert len(exported_program.constants) == 0 + elif kind == InputKind.BUFFER and not persistent_buffer: + assert const_name in exported_program.graph_signature.inputs_to_buffers + assert len(exported_program.state_dict) == 0 + assert const_name in exported_program.constants + elif kind == InputKind.CONSTANT_TENSOR: + assert ( + const_name + in exported_program.graph_signature.inputs_to_lifted_tensor_constants + ) + assert len(exported_program.state_dict) == 0 + assert const_name in exported_program.constants + else: + raise RuntimeError("Wrong input kind") + + # Replacing the add op and using eliminate_dead_code() deletes the add op but not the input op + output_node.replace_input_with(add_node, input_node) + graph.eliminate_dead_code() + assert len(graph.nodes) == 3 + + # Delete the input op manually + # The result should again be 0 = 0 + delete_constant_placeholder(exported_program, const_node) + assert exported_program.module()(torch.zeros(1)) == 0 + assert len(graph.nodes) == 2 + assert len(exported_program.graph_signature.input_specs) == 1 + assert len(exported_program.state_dict) == 0 + assert 
len(exported_program.constants) == 0 + + +def test_create_delete_parameter(): + _test_create_delete(InputKind.PARAMETER) + + +def test_create_delete_persistent_buffer(): + _test_create_delete(InputKind.BUFFER, True) + + +def test_create_delete_non_persistent_buffer(): + _test_create_delete(InputKind.BUFFER, False) + + +def test_create_delete_constant_tensor(): + _test_create_delete(InputKind.CONSTANT_TENSOR) diff --git a/backends/transforms/test/test_rank_0_to_rank_1.py b/backends/transforms/test/test_rank_0_to_rank_1.py index 50c6357fb67..eddad536e06 100644 --- a/backends/transforms/test/test_rank_0_to_rank_1.py +++ b/backends/transforms/test/test_rank_0_to_rank_1.py @@ -17,7 +17,7 @@ def forward(self, x, y): model.eval() example_inputs = (torch.tensor(1.0), torch.tensor(2.0)) - aten = torch.export.export(model, example_inputs) + aten = torch.export.export(model, example_inputs, strict=True) # Check that the input rank is 0 for node in aten.graph.nodes: diff --git a/backends/transforms/utils.py b/backends/transforms/utils.py index 03c48039b93..4e451928ee4 100644 --- a/backends/transforms/utils.py +++ b/backends/transforms/utils.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,7 +9,6 @@ import torch from executorch.exir import ExportedProgram - from torch._export.utils import ( get_buffer, get_lifted_tensor_constant, @@ -17,6 +17,13 @@ is_lifted_tensor_constant, is_param, ) +from torch._subclasses.fake_tensor import FakeTensorConverter +from torch.export.graph_signature import ( + ExportGraphSignature, + InputKind, + InputSpec, + TensorArgument, +) def is_get_attr_node(node: torch.fx.Node) -> bool: @@ -53,3 +60,130 @@ def get_param_tensor( except AttributeError: return getattr(exp_prog.graph_module, node.target) raise RuntimeError(f"unsupported param type, {node.op}.") + + +def create_constant_placeholder( + exp_program: ExportedProgram, + graph: torch.fx.Graph, + name: str, + kind: InputKind, + data: torch.Tensor, + persistent_buffer: Optional[bool] = None, +) -> torch.fx.Node: + """ + Creates and returns a constant placeholder node, meaning that it is of type parameter, buffer, + or lifted constant tensor. graph.inserting_before/after() should be used before the call to + decide where to insert the node, at an insertion point before the first input node. + """ + + target = name + + # Add data to state_dict/ constants + match kind: + case InputKind.PARAMETER: + exp_program.state_dict[target] = torch.nn.Parameter( + data, requires_grad=False + ) + case InputKind.BUFFER: + if persistent_buffer is None: + raise RuntimeError( + "Must set persistent_buffer when creating a new buffer." 
+ ) + elif persistent_buffer: + exp_program.state_dict[target] = data + else: + exp_program.constants[target] = data + case InputKind.CONSTANT_TENSOR: + exp_program.constants[target] = data + case _: + raise RuntimeError("Can only create constant input nodes.") + + # Create fake tensor using the same fake_mode as the other fake tensors in the graph + example_node = list(graph.nodes)[0] + if isinstance( + example_node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + ): + example_fake_tensor = example_node.meta["val"][0] + else: + example_fake_tensor = example_node.meta["val"] + fake_tensor = FakeTensorConverter().from_real_tensor( + example_fake_tensor.fake_mode, t=data + ) + + # Create node + node = graph.create_node(op="placeholder", name=name, target=name) + node.meta["val"] = fake_tensor + + # Add tensor to graph_signature in the same order as nodes in the graph + node_names = [n.name for n in graph.nodes if n.op == "placeholder"] + node_index = node_names.index(name) + + input_specs = exp_program.graph_signature.input_specs + user_input_indices = [ + i for i, spec in enumerate(input_specs) if spec.kind == InputKind.USER_INPUT + ] + if not all( + (user_input_index >= node_index for user_input_index in user_input_indices) + ): + raise RuntimeError( + f"Failed to insert {name}; Const placeholder nodes must be inserted before user input nodes in the graph." + ) + + arg_spec = TensorArgument(name) + input_spec = InputSpec(kind, arg_spec, target, persistent_buffer) + input_specs.insert(node_index, input_spec) + + new_graph_signature = ExportGraphSignature( + input_specs, exp_program.graph_signature.output_specs + ) + exp_program._graph_signature = new_graph_signature + + return node + + +def delete_constant_placeholder(exp_program: ExportedProgram, node: torch.fx.Node): + """ + Deletes a node of type parameter, buffer, or lifted constant tensor and its related + graph signature and state_dict/constant entries. The node may not have any users. + """ + if not len(node.users) == 0: + raise RuntimeError( + f"Cannot delete input node {node.name} since it has users in the graph." + ) + + # Remove tensor from state_dict/ constants + if node.name in exp_program.graph_signature.inputs_to_parameters: + target = exp_program.graph_signature.inputs_to_parameters[node.name] + del exp_program.state_dict[target] + + elif node.name in exp_program.graph_signature.inputs_to_buffers: + target = exp_program.graph_signature.inputs_to_buffers[node.name] + + if target in exp_program.graph_signature.non_persistent_buffers: + del exp_program.constants[target] + else: + del exp_program.state_dict[target] + + elif node.name in exp_program.graph_signature.inputs_to_lifted_tensor_constants: + target = exp_program.graph_signature.inputs_to_lifted_tensor_constants[ + node.name + ] + del exp_program.constants[target] + else: + raise RuntimeError( + f"Cannot delete input node {node.name} since it is not a parameter, a buffer, nor a lifted tensor constant." 
+ ) + + # Remove input from graph signature + input_specs = [ + spec + for spec in exp_program.graph_signature.input_specs + if spec.arg.name != node.name + ] + new_graph_signature = ExportGraphSignature( + input_specs, exp_program.graph_signature.output_specs + ) + exp_program._graph_signature = new_graph_signature + + # Remove node from graph + node.graph.erase_node(node) diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py index f4a0670072c..08ed70b2fa8 100644 --- a/backends/transforms/view_copy_to_squeeze_unsqueeze.py +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -75,7 +75,11 @@ def find_unsqueeze_dim( j = 0 idx = -1 while j < len(view_shape): - if input_shape[i] != view_shape[j]: + # account for added dim being last dim in view_shape + if i == j and j == len(input_shape): + if view_shape[j] != 1: + return None + elif input_shape[i] != view_shape[j]: if view_shape[j] == 1: idx = j i -= 1 diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index fca34fdf6a4..ef499553c49 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -28,15 +28,11 @@ if(NOT PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Include this file to access target_link_options_shared_lib This is required to # provide access to target_link_options_shared_lib which allows libraries to be # linked with the --whole-archive flag. This is required for libraries that # perform dynamic registration via static initialization. -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(cmake/ShaderLibrary.cmake) @@ -92,6 +88,7 @@ add_custom_command( ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/" ${_vulkan_schema__srcs} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating vulkan_schema headers" VERBATIM ) diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index b428333c913..2cfff6a6eb6 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -1,4 +1,4 @@ -# ExecuTorch Vulkan Delegate +# Vulkan Backend The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is built on top of the cross-platform Vulkan GPU API standard. 
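(Aside: the view_copy_to_squeeze_unsqueeze fix above covers the case where the inserted singleton is the last dimension of the view shape, a case the previous search did not handle. A concrete instance in plain torch; the pass itself rewrites edge-dialect view_copy nodes into unsqueeze.)

import torch

x = torch.randn(2, 3)
# Viewing (2, 3) as (2, 3, 1) appends a trailing singleton; the fixed
# find_unsqueeze_dim maps this case to unsqueeze(dim=2).
assert torch.equal(x.view(2, 3, 1), x.unsqueeze(2))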
It is primarily diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index 59658e58f28..5478ad0eab6 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -31,14 +31,15 @@ runtime.python_library( ) runtime.python_library( - name = "squeeze_int4_linear_inputs", + name = "squeeze_unsqueeze_inputs", srcs = [ - "squeeze_int4_linear_inputs.py", + "squeeze_unsqueeze_inputs.py", ], visibility = [ "//executorch/backends/...", ], deps = [ + "//caffe2:torch", "//executorch/backends/vulkan:custom_ops_lib", "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", @@ -114,7 +115,7 @@ runtime.python_library( ":remove_asserts", ":remove_local_scalar_dense", ":remove_redundant_ops", - ":squeeze_int4_linear_inputs", + ":squeeze_unsqueeze_inputs", ":tag_memory_meta_pass", ] ) diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index 2a4a2b4b5c9..220afa6a35c 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -20,8 +20,8 @@ from executorch.backends.vulkan._passes.remove_redundant_ops import ( RemoveRedundantOpsTransform, ) -from executorch.backends.vulkan._passes.squeeze_int4_linear_inputs import ( - SqueezeInt4LinearInputs, +from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( + SqueezeUnsqueezeInputs, ) from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass @@ -32,6 +32,6 @@ "RemoveAssertsTransform", "RemoveLocalScalarDenseOpsTransform", "RemoveRedundantOpsTransform", - "SqueezeInt4LinearInputs", + "SqueezeUnsqueezeInputs", "TagMemoryMetaPass", ] diff --git a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py similarity index 80% rename from backends/vulkan/_passes/squeeze_int4_linear_inputs.py rename to backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index 95fcef7f754..a0160efa90f 100644 --- a/backends/vulkan/_passes/squeeze_int4_linear_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -6,16 +6,27 @@ # pyre-strict -from typing import Dict, List, Tuple +from typing import Dict, List, Set, Tuple, Union import executorch.backends.vulkan.custom_ops_lib # noqa: needed to access vk op from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue +from torch._ops import OpOverload + from torch.fx.node import Argument +OpType = Union[str, OpOverload, EdgeOpOverload] + + +class SqueezeUnsqueezeInputs(ExportPass): + _squeezable_ops: Set[OpType] = { + exir_ops.edge.et_vk.linear_weight_int4.default, + exir_ops.edge.aten.relu.default, + exir_ops.edge.aten.gelu.default, + } -class SqueezeInt4LinearInputs(ExportPass): def call_operator( self, op, # pyre-ignore @@ -26,7 +37,7 @@ def call_operator( def _squeezable(shape: List[int]) -> bool: return len(shape) > 2 and 1 in shape - if op != exir_ops.edge.et_vk.linear_weight_int4.default: + if op not in self._squeezable_ops: return super().call_operator(op, args, kwargs, meta) # pyre-ignore[16]: `None` has no attribute `node` diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index f2f54404ca8..03721066f1c 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -6,7 +6,7 @@ import logging from copy import deepcopy -from typing import Set +from typing 
import Any, Set import executorch.backends.vulkan.utils as utils @@ -190,20 +190,24 @@ def propose_node_layout( return next(iter(valid_layouts)) def should_annotate(self, node) -> bool: - if not isinstance(node, torch.fx.Node): - return False - - if not utils.is_tensor_node(node): - return False - - # Storage type and memory layout for tensorref will be determined at runtime - # so there's no use in setting those attributes ahead of time. - if node.meta.get("vkdg_tensorref", False): - return False - - # Skip annotating output node. The output tensors should be annotated by the - # time the output node is observed. - if node.op == "output": + if isinstance(node, torch.fx.Node): + if not utils.is_tensor_node(node): + return False + + # Storage type and memory layout for tensorref will be determined at runtime + # so there's no use in setting those attributes ahead of time. + if node.meta.get("vkdg_tensorref", False): + return False + + # Skip annotating output node. The output tensors should be annotated by the + # time the output node is observed. + if node.op == "output": + return False + elif isinstance(node, (list, tuple)): + return all( + isinstance(n, torch.fx.Node) and self.should_annotate(n) for n in node + ) + else: return False return True @@ -215,6 +219,70 @@ def should_delay_annotation(self, node: torch.fx.Node) -> bool: # time the prepack node is observed. return node.target == exir_ops.edge.et_vk.prepack.default + def set_or_transition_arg_node( + self, + i: int, + arg: torch.fx.Node, + node: torch.fx.Node, + graph_module: torch.fx.GraphModule, + dirty: bool, + ) -> bool: + assert isinstance(arg, torch.fx.Node) + + storage = utils.get_node_storage_type(node) + assert storage is not None + layout = utils.get_node_memory_layout(node) + assert layout is not None + + arg_storage = utils.get_node_storage_type(arg) + arg_layout = utils.get_node_memory_layout(arg) + + if arg_storage is None: + utils.set_node_spec_attr(arg, "vk_storage_type", storage) + arg_storage = storage + if arg_layout is None: + utils.set_node_spec_attr(arg, "vk_memory_layout", layout) + arg_layout = layout + + if arg_storage == storage and arg_layout == layout: + return False + + if not dirty: + logger.info( + f"[Vulkan Delegate] Inserting transition(s) for {node.format_node()}:" + ) + + insert_transition_node(graph_module, node, arg, storage, layout) + + logger.info( + f" args {i} ({arg}): ({arg_storage}, {arg_layout}) -> ({storage}, {layout})" + ) + + return True + + def set_or_transition_arg( + self, + i: int, + arg: Any, + node: torch.fx.Node, + graph_module: torch.fx.GraphModule, + dirty: bool, + ) -> bool: + if isinstance(arg, torch.fx.Node): + return self.set_or_transition_arg_node(i, arg, node, graph_module, dirty) + elif isinstance(arg, (list, tuple)): + need_transition = False + for arg_node in arg: + need_transition = ( + self.set_or_transition_arg_node( + i, arg_node, node, graph_module, need_transition + ) + or need_transition + ) + return need_transition + else: + return False + # noqa def call(self, graph_module: torch.fx.GraphModule) -> PassResult: for node in graph_module.graph.nodes: @@ -226,36 +294,16 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: set_memory_metadata(node, storage, layout) - inserting_transitions_for_node = False + need_transition = False for i, arg in enumerate(node.args): if not self.should_annotate(arg): continue - assert isinstance(arg, torch.fx.Node) - - arg_storage = utils.get_node_storage_type(arg) - arg_layout = utils.get_node_memory_layout(arg) - 
- if arg_storage is None: - utils.set_node_spec_attr(arg, "vk_storage_type", storage) - arg_storage = storage - if arg_layout is None: - utils.set_node_spec_attr(arg, "vk_memory_layout", layout) - arg_layout = layout - - if arg_storage == storage and arg_layout == layout: - continue - - if not inserting_transitions_for_node: - inserting_transitions_for_node = True - logger.info( - f"[Vulkan Delegate] Inserting transition(s) for {node.format_node()}:" + need_transition = ( + self.set_or_transition_arg( + i, arg, node, graph_module, need_transition ) - - insert_transition_node(graph_module, node, arg, storage, layout) - - logger.info( - f" args {i} ({arg}): ({arg_storage}, {arg_layout}) -> ({storage}, {layout})" + or need_transition ) return PassResult(graph_module, True) diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index b44736d20dd..bbf81e7bcba 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -41,7 +41,7 @@ else() endif() # Required to enable linking with --whole-archive -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) function(gen_vulkan_shader_lib_cpp shaders_path) set(VULKAN_SHADERGEN_ENV "") diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 25cf74dc8f2..5aa805dc1b3 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -522,19 +522,12 @@ def register_view_op(features: OpFeatures): @update_features( [ # Shape Manipulation - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.unsqueeze_copy.default, - exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.t_copy.default, # Indexing and lookup exir_ops.edge.aten.flip.default, exir_ops.edge.aten.index_select.default, exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.slice_copy.Tensor, # Tensor combination - exir_ops.edge.aten.cat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, exir_ops.edge.aten.repeat.default, # Tensor creation exir_ops.edge.aten.arange.start_step, @@ -557,6 +550,28 @@ def register_ported_op(features: OpFeatures): return features +# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions +@update_features( + [ + # Indexing and lookup + exir_ops.edge.aten.slice_copy.Tensor, + # Shape Manipulation + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.permute_copy.default, + # Tensor combination + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.split.Tensor, + ] +) +def register_ported_op_all_packed_dims(features: OpFeatures): + features.texture_impl = TextureImplFeatures( + valid_packed_dims=all_packed_dims, + ) + return features + + # Ported ops that support their own prepacking.
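For readers unfamiliar with the registration pattern in the op_registry.py hunk above, here is a simplified Python sketch of the mechanism (illustrative stand-ins, not the real OpFeatures/TextureImplFeatures classes): update_features is a decorator factory that runs one feature-configuring function over every op key in a list, so moving an op between decorated groups, as this diff does, changes which packed dimensions its texture implementation advertises.

from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Set

OP_FEATURES: Dict[Any, "OpFeatures"] = {}


@dataclass
class TextureImplFeatures:
    valid_packed_dims: Set[int] = field(default_factory=set)


@dataclass
class OpFeatures:
    texture_impl: TextureImplFeatures = field(default_factory=TextureImplFeatures)


def update_features(ops: List[Any]) -> Callable:
    def decorate(fn: Callable[[OpFeatures], OpFeatures]) -> Callable:
        for op in ops:
            OP_FEATURES[op] = fn(OpFeatures())
        return fn
    return decorate


# Every op listed gets the features returned by the decorated function.
@update_features(["aten.cat.default", "aten.permute_copy.default"])
def register_all_packed_dims(features: OpFeatures) -> OpFeatures:
    features.texture_impl = TextureImplFeatures(valid_packed_dims={0, 1, 2})
    return features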
@update_features( [ diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 3d249aab4a7..526a5df6f59 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -510,8 +510,11 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs) const override { - ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( - context.get_runtime_allocator(), ComputeGraph); + ComputeGraph* compute_graph = + context.get_runtime_allocator()->allocateInstance(); + if (compute_graph == nullptr) { + return Error::MemoryAllocationFailed; + } new (compute_graph) ComputeGraph(get_graph_config(compile_specs)); diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 8178ada3a45..258afba7502 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -74,7 +74,7 @@ void Context::cmd_reset_querypool() { void Context::report_shader_dispatch_start( const std::string& shader_name, const utils::uvec3& global_wg_size, - const utils::uvec3& local_wg_size, + const utils::WorkgroupSize& local_wg_size, const uint32_t dispatch_id) { if (querypool_) { querypool_.shader_profile_begin( @@ -82,7 +82,7 @@ void Context::report_shader_dispatch_start( dispatch_id, shader_name, vkapi::create_extent3d(global_wg_size), - vkapi::create_extent3d(local_wg_size)); + vkapi::create_extent3d((utils::uvec3)local_wg_size)); } } @@ -115,7 +115,7 @@ void Context::check_device_capabilities(const vkapi::ShaderInfo& shader) { vkapi::DescriptorSet Context::get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& local_workgroup_size, + const utils::WorkgroupSize& local_workgroup_size, const vkapi::SpecVarList& additional_constants, const uint32_t push_constants_size) { VkDescriptorSetLayout shader_layout = @@ -280,13 +280,13 @@ VkPipeline Context::get_shader_pipeline( VkPipelineLayout pipeline_layout = pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - vkapi::SpecVarList spec_constants_full_list = {4u, 4u, 1u}; - spec_constants_full_list.append(spec_constants); + const utils::WorkgroupSize local_workgroup_size(4u, 4u, 1u); VkPipeline pipeline = pipeline_cache().retrieve( {pipeline_layout, shader_cache().retrieve(shader), - spec_constants_full_list}); + spec_constants, + local_workgroup_size}); return pipeline; } diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index 8bbcf79b45c..6cfbc64f141 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include #include @@ -150,7 +151,7 @@ class Context final { void report_shader_dispatch_start( const std::string& shader_name, const utils::uvec3& global_wg_size, - const utils::uvec3& local_wg_size, + const utils::WorkgroupSize& local_wg_size, const uint32_t dispatch_id = UINT32_MAX); /* @@ -189,13 +190,13 @@ class Context final { vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo&, - const utils::uvec3&, + const utils::WorkgroupSize&, const vkapi::SpecVarList&, const uint32_t push_constants_size); inline vkapi::DescriptorSet get_descriptor_set( const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& local_work_group_size) { + const utils::WorkgroupSize& 
local_work_group_size) { return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u); } @@ -362,14 +363,17 @@ inline bool Context::submit_compute_job( report_shader_dispatch_start( shader.kernel_name, global_work_group, - local_work_group_size, + utils::WorkgroupSize(local_work_group_size), dispatch_id); // Factor out template parameter independent code to minimize code bloat. // Note that push constants are not exposed yet via this API, therefore the // push constants size is assumed to be 0. vkapi::DescriptorSet descriptor_set = get_descriptor_set( - shader, local_work_group_size, specialization_constants, 0u); + shader, + utils::WorkgroupSize(local_work_group_size), + specialization_constants, + 0u); detail::bind( descriptor_set, diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 856ff4d618a..4cbd1290401 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -673,7 +674,8 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t size_per_ubo = + storage_.context_->adapter_ptr()->min_ubo_alignment(); const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); @@ -691,7 +693,8 @@ const vkapi::BufferBindInfo vTensor::sizes_ubo() { } const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t size_per_ubo = + storage_.context_->adapter_ptr()->min_ubo_alignment(); const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); @@ -711,7 +714,8 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t size_per_ubo = + storage_.context_->adapter_ptr()->min_ubo_alignment(); const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); @@ -729,7 +733,8 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = context()->adapter_ptr()->min_ubo_alignment(); + const size_t size_per_ubo = + storage_.context_->adapter_ptr()->min_ubo_alignment(); const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { uniforms_ = ParamsBuffer(storage_.context_, max_ubo_size, true); diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 8238962ae31..d9cbadb46b9 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -245,7 +245,7 @@ class vTensor final { TextureLimits logical_limits; // Contains the number of elements in the tensor according to the canonical // sizes. 
- size_t numel; + int32_t numel; friend class vTensor; @@ -253,11 +253,11 @@ class vTensor final { const std::vector& sizes, const std::vector& strides, const TextureLimits& logical_limits, - const size_t numel) + const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), - numel(numel) {} + numel(utils::safe_downcast(numel_ll)) {} public: /* diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index 7d3d2d52950..a3d214f5ae8 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -549,7 +549,12 @@ def __init__( self.env = env self.glslc_path = glslc_path - self.glslc_flags = glslc_flags + self.glslc_flags = glslc_flags.split() + self.glslc_flags_no_opt = self.glslc_flags.copy() + if "-O" in self.glslc_flags_no_opt: + self.glslc_flags_no_opt.remove("-O") + if "-Os" in self.glslc_flags_no_opt: + self.glslc_flags_no_opt.remove("-Os") self.replace_u16vecn = replace_u16vecn self.glsl_src_files: Dict[str, str] = {} @@ -751,25 +756,37 @@ def process_shader(shader_paths_pair): if self.glslc_path is not None: spv_out_path = os.path.join(output_dir, f"{shader_name}.spv") - cmd = ( - [ - self.glslc_path, - "-fshader-stage=compute", - glsl_out_path, - "-o", - spv_out_path, - "--target-env=vulkan1.1", - "-Werror", - ] - + [ - arg - for src_dir_path in self.src_dir_paths - for arg in ["-I", src_dir_path] - ] - + self.glslc_flags.split() - ) + cmd_base = [ + self.glslc_path, + "-fshader-stage=compute", + glsl_out_path, + "-o", + spv_out_path, + "--target-env=vulkan1.1", + "-Werror", + ] + [ + arg + for src_dir_path in self.src_dir_paths + for arg in ["-I", src_dir_path] + ] + cmd = cmd_base + self.glslc_flags + + try: + subprocess.run(cmd, check=True, capture_output=True, text=True) + except subprocess.CalledProcessError as e: + opt_fail = "compilation succeeded but failed to optimize" + err_msg_base = f"Failed to compile {os.getcwd()}/{glsl_out_path}: " + if opt_fail in e.stderr or opt_fail in e.stdout: + cmd_no_opt = cmd_base + self.glslc_flags_no_opt + try: + subprocess.run(cmd_no_opt, check=True, capture_output=True) + except subprocess.CalledProcessError as e_no_opt: + raise RuntimeError( + f"{err_msg_base} {e_no_opt.stderr}" + ) from e_no_opt - subprocess.check_call(cmd) + else: + raise RuntimeError(f"{err_msg_base} {e.stderr}") from e return (spv_out_path, glsl_out_path) diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.cpp b/backends/vulkan/runtime/graph/containers/PushConstantData.cpp new file mode 100644 index 00000000000..7999118443b --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/PushConstantData.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace vkcompute { + +uint32_t PushConstantDataInfo::write( + void* dst, + const uint32_t dst_offset, + const uint32_t max_dst_size) const { + if (tensorUniformData != nullptr) { + return tensorUniformData->write_attribute( + dst, dst_offset, max_dst_size, payload_.attr); + } + + VK_CHECK_COND( + (dst_offset + payload_.dataSize) <= max_dst_size, + "Attempting to write push constant data outside data boundary."); + memcpy((uint8_t*)dst + dst_offset, payload_.data, payload_.dataSize); + return payload_.dataSize; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.h b/backends/vulkan/runtime/graph/containers/PushConstantData.h new file mode 100644 index 00000000000..39cde4722a7 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/PushConstantData.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +class ComputeGraph; + +constexpr uint32_t kMaxPushConstantSize = 128; +/* + * Represents a push constant data entry, which is either a shared pointer to + * a tensor's uniform data with an attribute, or raw data with a maximum size + * of 16 bytes. + */ +class PushConstantDataInfo { + std::shared_ptr tensorUniformData; + union Payload { + struct { + api::vTensor::Attribute attr; + }; + struct { + uint8_t data[16]; + uint32_t dataSize; + }; + }; + + Payload payload_; + + public: + explicit PushConstantDataInfo( + const std::shared_ptr& tensorUniformData, + api::vTensor::Attribute attr) + : tensorUniformData(tensorUniformData) { + payload_.attr = attr; + } + + explicit PushConstantDataInfo( + const void* data, + uint32_t dataLen, + uint32_t pushConstantLen = 0) + : tensorUniformData(nullptr) { + VK_CHECK_COND( + dataLen <= 16, "Single push constant data size must be <= 16 bytes"); + payload_.dataSize = pushConstantLen ?
pushConstantLen : dataLen; + memcpy(payload_.data, data, dataLen); + } + + /* + * Function writes push constant data to the destination buffer + */ + uint32_t write( + void* dst, + const uint32_t dst_offset, + const uint32_t max_dst_size) const; +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.cpp b/backends/vulkan/runtime/graph/ops/BlitNode.cpp index 463a2d19c36..03ee4caa51a 100644 --- a/backends/vulkan/runtime/graph/ops/BlitNode.cpp +++ b/backends/vulkan/runtime/graph/ops/BlitNode.cpp @@ -46,7 +46,7 @@ void BlitNode::encode(ComputeGraph* graph) { kernel_name += vkapi::to_string(dst_tensor->dtype()); context->report_shader_dispatch_start( - kernel_name, utils::uvec3(), utils::uvec3(), node_id_); + kernel_name, utils::uvec3(), utils::WorkgroupSize(), node_id_); context->register_blit( pipeline_barrier, diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp index 63b8798f2c1..6730d851483 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp @@ -14,22 +14,6 @@ namespace vkcompute { -uint32_t PushConstantDataInfo::write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const { - if (tensorUniformData != nullptr) { - return tensorUniformData->write_attribute( - dst, dst_offset, max_dst_size, payload_.attr); - } - - VK_CHECK_COND( - (dst_offset + payload_.dataSize) <= max_dst_size, - "Attempting to write push constant data outside data boundary."); - memcpy((uint8_t*)dst + dst_offset, payload_.data, payload_.dataSize); - return payload_.dataSize; -} - DispatchNode::DispatchNode( ComputeGraph& graph, const vkapi::ShaderInfo& shader, diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h index 7d04f7714e9..e3794e9a9e4 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h @@ -10,6 +10,7 @@ #include +#include #include #include @@ -18,54 +19,6 @@ namespace vkcompute { class ComputeGraph; -constexpr uint32_t kMaxPushConstantSize = 128; -/* - * Represents a push constant data entry - * Which is either shared pointer to a tensor's uniform data with an attribute - * Or data with a maximum size of 16 bytes - */ -class PushConstantDataInfo { - std::shared_ptr tensorUniformData; - union Payload { - struct { - api::vTensor::Attribute attr; - }; - struct { - uint8_t data[16]; - uint32_t dataSize; - }; - }; - - Payload payload_; - - public: - explicit PushConstantDataInfo( - const std::shared_ptr& tensorUniformData, - api::vTensor::Attribute attr) - : tensorUniformData(tensorUniformData) { - payload_.attr = attr; - } - - explicit PushConstantDataInfo( - const void* data, - uint32_t dataLen, - uint32_t pushConstantLen = 0) - : tensorUniformData(nullptr) { - VK_CHECK_COND( - dataLen <= 16, "Single push constant data size must be <= 16 bytes"); - payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen; - memcpy(payload_.data, data, dataLen); - } - - /* - * Function writes push constant data to the destination buffer - */ - uint32_t write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const; -}; - /* * Represents a single shader execution op in a ML model. 
*/ @@ -92,7 +45,7 @@ class DispatchNode final : public ExecuteNode { protected: const vkapi::ShaderInfo shader_; const utils::uvec3 global_workgroup_size_; - const utils::uvec3 local_workgroup_size_; + const utils::WorkgroupSize local_workgroup_size_; const vkapi::ParamsBindList params_; const vkapi::SpecVarList spec_vars_; const std::vector push_constants_; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index bf501296b1b..d84d893540c 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -32,7 +32,8 @@ PrepackNode::PrepackNode( const ValueRef tref, const ValueRef packed, const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars) + const vkapi::SpecVarList& spec_vars, + const std::vector& push_constants) : shader_(shader), noop_shader_(get_noop_shader(graph, packed)), global_workgroup_size_(global_workgroup_size), @@ -40,7 +41,8 @@ PrepackNode::PrepackNode( tref_(tref), packed_(packed), params_(params), - spec_vars_(spec_vars) { + spec_vars_(spec_vars), + push_constants_(push_constants) { graph.update_descriptor_counts(shader, /*execute = */ false); graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } @@ -75,10 +77,20 @@ void PrepackNode::encode(ComputeGraph* graph) { std::unique_lock cmd_lock = context->dispatch_lock(); + std::array push_constants_data; + uint32_t push_constants_offset = 0; + + for (const auto& push_constant : push_constants_) { + push_constants_offset += push_constant.write( + push_constants_data.data(), + push_constants_offset, + kMaxPushConstantSize); + } + { vkapi::PipelineBarrier pipeline_barrier{}; vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, 0u); + shader_, local_workgroup_size_, spec_vars_, push_constants_offset); uint32_t idx = 0; bind_tensor_to_descriptor_set( @@ -91,7 +103,12 @@ void PrepackNode::encode(ComputeGraph* graph) { bind_params_to_descriptor_set(params_, descriptor_set, idx); context->register_shader_dispatch( - descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); + descriptor_set, + pipeline_barrier, + shader_, + global_workgroup_size_, + push_constants_data.data(), + push_constants_offset); } // Submit a compute shader that performs a no-op with the packed tensor in @@ -100,8 +117,8 @@ void PrepackNode::encode(ComputeGraph* graph) { // bound with the correct image layout. 
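As a side note on the push-constant handling introduced above, PrepackNode::encode serializes its PushConstantDataInfo entries back-to-back into a single block before dispatch. The Python sketch below models that accumulation loop; it is illustrative only (PushConstantEntry is a hypothetical stand-in for the raw-data flavor of PushConstantDataInfo), with 128 mirroring kMaxPushConstantSize.

import struct

MAX_PUSH_CONSTANT_SIZE = 128  # mirrors kMaxPushConstantSize in PushConstantData.h


class PushConstantEntry:
    """Hypothetical stand-in for the raw-data flavor of PushConstantDataInfo."""

    def __init__(self, *ints: int):
        # Pack int32 values, e.g. four of them standing in for an ivec4 payload.
        self.payload = struct.pack(f"{len(ints)}i", *ints)

    def write(self, buf: bytearray, offset: int) -> int:
        if offset + len(self.payload) > MAX_PUSH_CONSTANT_SIZE:
            raise ValueError("push constant data outside data boundary")
        buf[offset:offset + len(self.payload)] = self.payload
        return len(self.payload)  # caller advances its running offset by this


buf = bytearray(MAX_PUSH_CONSTANT_SIZE)
offset = 0
for entry in (PushConstantEntry(8, 4, 2, 1), PushConstantEntry(1, 1, 1, 1)):
    offset += entry.write(buf, offset)
# `offset` now plays the role of push_constants_offset handed to the dispatch.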
{ vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = - context->get_descriptor_set(noop_shader_, {1, 1, 1}); + vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( + noop_shader_, utils::WorkgroupSize(1, 1, 1)); bind_tensor_to_descriptor_set( *packed, diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 3e713303c3d..a45deb9ff70 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -10,6 +10,7 @@ #include +#include #include namespace vkcompute { @@ -34,7 +35,8 @@ class PrepackNode final { const ValueRef tref, const ValueRef packed, const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars = {}); + const vkapi::SpecVarList& spec_vars = {}, + const std::vector& push_constants = {}); ~PrepackNode() = default; @@ -49,11 +51,12 @@ class PrepackNode final { const vkapi::ShaderInfo shader_; vkapi::ShaderInfo noop_shader_; const utils::uvec3 global_workgroup_size_; - const utils::uvec3 local_workgroup_size_; + const utils::WorkgroupSize local_workgroup_size_; const ValueRef tref_; const ValueRef packed_; const vkapi::ParamsBindList params_; const vkapi::SpecVarList spec_vars_; + const std::vector push_constants_; private: api::StagingBuffer create_staging_buffer(ComputeGraph* graph); diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl index deb03192af0..c2fc5a56754 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl @@ -13,24 +13,18 @@ layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D weight_in; -layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; -layout(set = 0, binding = 4) uniform PRECISION sampler3D mean_in; -layout(set = 0, binding = 5) uniform PRECISION sampler3D var_in; +#include "indexing_utils.h" -layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "weight_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "mean_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "var_in", DTYPE, STORAGE)} -layout(set = 0, binding = 7) uniform PRECISION restrict Params { - float eps; -}; - -layout(set = 0, binding = 8) uniform PRECISION restrict Params2 { - int num_texel_per_batch; -}; +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "float", "eps")} +${layout_declare_ubo(B, "int", "num_texel_per_batch")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -40,16 +34,16 @@ void main() { return; } - VEC4_T v = VEC4_T(texelFetch(image_in, pos, 0)); + VEC4_T v = VEC4_T(load_texel(t_in, pos)); ivec3 param_pos = ivec3(pos.z % num_texel_per_batch, 0, 0); - VEC4_T weight = VEC4_T(texelFetch(weight_in, param_pos, 0)); - VEC4_T bias = VEC4_T(texelFetch(bias_in, param_pos, 0)); - VEC4_T mean = VEC4_T(texelFetch(mean_in, param_pos, 0)); - VEC4_T var = VEC4_T(texelFetch(var_in, param_pos, 0)); + VEC4_T weight = VEC4_T(load_texel(weight_in, 
param_pos)); + VEC4_T bias = VEC4_T(load_texel(bias_in, param_pos)); + VEC4_T mean = VEC4_T(load_texel(mean_in, param_pos)); + VEC4_T var = VEC4_T(load_texel(var_in, param_pos)); v = ((v - mean) / sqrt(var + eps)) * weight + bias; - imageStore(image_out, pos, v); + write_texel(t_out, pos, v); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml index a92e44f636b..116773c816a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml @@ -2,6 +2,7 @@ batchnorm: parameter_names_with_default_values: DTYPE: float NDIM: 3 + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index 62aa2f810dc..ce986d4e12f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -11,37 +11,83 @@ #define PRECISION ${PRECISION} #define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} #define op(X, Y, A) ${OPERATOR} +${define_active_storage_type(STORAGE)} +${define_required_extensions(DTYPE)} + layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} +$if STORAGE == "buffer": + layout(push_constant) uniform restrict Block { + ivec4 in_sizes; + ivec4 other_sizes; + ivec4 out_strides; + ivec4 in_strides; + ivec4 other_strides; + int out_numel; + float alpha; + }; +$else: + layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; + ivec4 other_sizes; + ivec2 broadcast_params; + float alpha; + }; + #include "broadcasting_utils.h" #include "indexing_utils.h" layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); +$if STORAGE == "buffer": + ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} + ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} + ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} +$else: + ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); + const lowp int packed_dim = unhash_packed_dim(out_layout); -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); + ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 other_sizes; - ivec2 broadcast_params; - float alpha; -}; +#ifdef USING_BUFFER + +void main() { + const int out_bufi = ivec3(gl_GlobalInvocationID).x; + if (out_bufi >= out_numel) { + return; + } + + // Simple case; no broadcasting + if (in_sizes == other_sizes) { + t_out[out_bufi] = 
T(op(t_in[out_bufi], t_other[out_bufi], T(alpha))); + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 in_tidx = min(out_tidx, in_sizes - 1); + const ivec4 other_tidx = min(out_tidx, other_sizes - 1); + + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + + t_out[out_bufi] = T(op(t_in[in_bufi], t_other[other_bufi], T(alpha))); +} + +#else // USING_TEXTURE void main() { const ivec3 lpos = ivec3(gl_GlobalInvocationID); @@ -79,3 +125,5 @@ void main() { VEC4_T(op(in_texel, other_texel, alpha)), out_axis_map); } + +#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml index 2ef7681092d..c0efdd81eb9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml @@ -10,8 +10,10 @@ binary_op: NDIM: 3 DTYPE: float PACKING: C_packed - STORAGE: texture3d generate_variant_forall: + STORAGE: + - VALUE: texture3d + - VALUE: buffer DTYPE: - VALUE: half - VALUE: float diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index a42a592762b..178814a90c3 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -19,8 +19,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} layout(push_constant) uniform restrict Block { ivec3 range; - ivec3 src_offset; - ivec3 dst_offset; + // xyz is source offset w is channel size + ivec4 src_offset; + // xyz is destination offset w is channel size + ivec4 dst_offset; }; #include "indexing_utils.h" @@ -33,16 +35,31 @@ const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +${layout_declare_spec_const(C, "int", "batch_index_function", "0")} + void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec3 out_pos = pos + dst_offset; - const ivec3 in_pos = pos + src_offset; - if (any(greaterThanEqual(pos, range))) { return; } + ivec3 in_pos = pos + src_offset.xyz; + ivec3 out_pos = pos + dst_offset.xyz; + if (src_offset.w > 0) { + if (batch_index_function == 1) { + // batch index is calculated using source channel size + const int channel_index = pos.z % src_offset.w; + const int batch_index = pos.z / src_offset.w; + out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; + } else if (batch_index_function == 2) { + // batch index is calculated using destination channel size + const int channel_index = pos.z % dst_offset.w; + const int batch_index = pos.z / dst_offset.w; + in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; + } + } + write_texel_lpos( t_out, out_pos, diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl new file mode 100644 index 00000000000..e0f09f0be43 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl @@ -0,0 +1,140 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} + +layout(push_constant) uniform restrict Block { + ivec4 range; + // xyz is source offset w is channel size + ivec4 src_offset; + // xyz is destination offset w is channel size + ivec4 dst_offset; +}; + +#include "indexing_utils.h" + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, range.xyz))) { + return; + } + + // Position in input tensor + ivec3 in_pos = pos + src_offset.xyz; + in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); + + // Read input value mapping to this output texel + VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); + + // Starting offset to read from a texel + const int src_lane_offset = src_offset[packed_dim] & 0x3; + const bool has_src_lane_offset = src_lane_offset != 0; + + // If the input lane offset is non-zero, i.e. the packed texel is composed from multiple source texels + if (has_src_lane_offset) { + // Boundary values will come from the next input texel in the packed dim. + ivec3 next_in_pos = in_pos; + next_in_pos[packed_dim] = in_pos[packed_dim] + 1; + VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); + + // Keep input values from the end of the current input texel based on src_lane_offset + // offset 1 means the first lane of the current input texel is not a part of the output texel + // offset 2 means the first 2 lanes are not, and so on + if (src_lane_offset == 1) { + in_value.xyz = in_value.yzw; + } else if (src_lane_offset == 2) { + in_value.xy = in_value.zw; + } else { + in_value.x = in_value.w; + } + // Copy the next texel's values towards the end of the input texel, based on lane offset + // offset 1 means the first lane from the next texel is part of the input texel + // offset 2 means the first 2 lanes from the next texel are part of the input texel, and so on + if (src_lane_offset == 1) { + in_value.w = next_value.x; + } else if (src_lane_offset == 2) { + in_value.zw = next_value.xy; + } else { + in_value.yzw = next_value.xyz; + } + } + + // Starting offset to write at within a texel + const int out_lane_offset = dst_offset[packed_dim] & 0x3; + const bool has_dst_lane_offset = out_lane_offset != 0; + + ivec3 out_pos = pos + dst_offset.xyz; + out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); + + VEC4_T out_value; + + // If the output lane offset is non-zero, i.e. the packed texel is composed from multiple source texels + if (has_dst_lane_offset) { + // When position in packed dim is > 0 + if (pos[packed_dim] > 0) { + // Boundary values will come from the previous input texel in the packed dim.
+ ivec3 prev_in_pos = in_pos; + prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; + VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); + + // Shift values toward the beginning based on out_lane_offset + // offset 1 means the last lane from the previous texel is a part of the output texel + // offset 2 means last 2 lanes and so on + if (out_lane_offset == 1) { + out_value.x = prev_value.w; + } else if (out_lane_offset == 2) { + out_value.xy = prev_value.zw; + } else { + out_value.xyz = prev_value.yzw; + } + } else { + // When position in packed dim is == 0 + // Boundary values will be the previous texel values. + out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); + } + + // Copy input values towards the end of output array, based on lane offset + // offset 1 means the first lane from previous texel is part of the output texel starting at offset + // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on + if (out_lane_offset == 1) { + out_value.yzw = in_value.xyz; + } else if (out_lane_offset == 2) { + out_value.zw = in_value.xy; + } else { + out_value.w = in_value.x; + } + } else { + out_value = in_value; + } + + write_texel_lpos( + t_out, + out_pos, + out_value, + out_axis_map); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml new file mode 100644 index 00000000000..e872d64e3c3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml @@ -0,0 +1,12 @@ +copy_packed_dim_offset: + parameter_names_with_default_values: + DTYPE: float + NDIM: 3 + STORAGE: texture3d + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int + shader_variants: + - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 0b372ab70a4..2126104430f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -104,16 +104,19 @@ ivec4 tidx_to_4bufi( } ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { + const int nchwi_div_x = nchwi / sizes.x; + const int nchwi_div_y = nchwi_div_x / sizes.y; return ivec4( nchwi % sizes.x, - (nchwi / (sizes.x)) % sizes.y, - (nchwi / (sizes.x * sizes.y)) % sizes.z, - (nchwi / (sizes.x * sizes.y * sizes.z))); + nchwi_div_x % sizes.y, + nchwi_div_y % sizes.z, + nchwi_div_y / sizes.z); } int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { - return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y + - tidx.y * sizes.x + tidx.x; + const int sizes_xy = sizes.x * sizes.y; + return tidx.w * sizes_xy * sizes.z + tidx.z * sizes_xy + tidx.y * sizes.x + + tidx.x; } // TODO(ssjia): make this function use dim order so that it can work with any @@ -360,8 +363,8 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { // pos[4] is set to a placeholder value ivec4 pos = idx.xyzx; pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; - pos[packed_dim] /= 4; - pos.w = idx[packed_dim] % 4; + pos[packed_dim] >>= 2; + pos.w = idx[packed_dim] & 0x3; return pos; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl index 59d6aecdc15..d4ad736a563 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -21,56 +21,65 @@ layout(set = 0, binding = 1) 
uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_i layout(push_constant) uniform PRECISION restrict Block { ivec4 out_limits; - ivec4 sizes; + ivec4 in_sizes; // output dims ivec4 out_ndims; // x = output channels aligned to 4, y = input channels aligned to 4 - ivec2 ch_info; + ivec2 channel_info; }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const u16vec3 pos = u16vec3(gl_GlobalInvocationID); + ivec3 pos = ivec3(gl_GlobalInvocationID); if (any(greaterThanEqual(pos, out_limits.xyz))) { return; } - const int out_channel_4up = int(ch_info.x); - const int in_channel_4up = int(ch_info.y); - const int out_batch = int(sizes[3]); VEC4_T outval = VEC4_T(0.0); - ivec4 v = ivec4(0); // holds b,c,h,w - v[out_ndims[2]] = pos.y; - v[out_ndims[3]] = pos.x; + // scale up output position's packed dim + pos[packed_dim] <<= 2; + + // index of packed dim in bchw format + const int in_packed_dim_bchw_index = 3 - packed_dim; - const int dst_index = pos.z << 2; - int dst_out_index = dst_index / out_channel_4up; - int dst_out_lane = dst_index % out_channel_4up; + // determine input position based on output position and permute map + // out_ndims is in BCHW format + ivec4 in_bchw_pos = ivec4(0); // holds b,c,h,w + in_bchw_pos[out_ndims[0]] = (pos.z / channel_info.x); + in_bchw_pos[out_ndims[1]] = (pos.z % channel_info.x); + in_bchw_pos[out_ndims[2]] = pos.y; + in_bchw_pos[out_ndims[3]] = pos.x; - for (int j = 0; j < 4; ++j, ++dst_out_lane) { - if (dst_out_index >= out_batch) { - // out of range + for (int j = 0; j < 4; ++j) { + // terminate the loop if trying to access input texture out of bounds + if (any(greaterThanEqual(in_bchw_pos.wzyx, in_sizes.xyzw))) { break; } + ivec3 fetch_pos; - if (dst_out_lane == out_channel_4up) { - dst_out_lane = 0; - dst_out_index++; - } + fetch_pos.xy = in_bchw_pos.wz; + // calculate input position in z axis using batch and channel index which is in_bchw_pos.x and in_bchw_pos.y respectively + fetch_pos.z = in_bchw_pos.y + in_bchw_pos.x * channel_info.y; - v[out_ndims[0]] = dst_out_index; - v[out_ndims[1]] = dst_out_lane; + // input tensor's packed dim lane corresponding to output tensor's pos + const int in_packed_dim_lane_index = fetch_pos[packed_dim] & 0x3; - int src_index = v[0] * in_channel_4up + v[1]; + // scale down input tensor's packed dim pos to perform fetch + fetch_pos[packed_dim] >>= 2; - VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(v[3], v[2], src_index >> 2), 0)); - outval[j] = inval[src_index & 0x3]; + // fetch input texel + VEC4_T inval = VEC4_T(texelFetch(image_in, fetch_pos, 0)); + outval[j] = inval[in_packed_dim_lane_index]; + + // go to next position in the input, that is mapped to the packed dim in the output + in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]++; } + pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]); + imageStore(image_out, pos, outval); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index cd1a08909d0..e98d2e919b0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -52,19 +52,26 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #define FLOAT_T float #endif -FLOAT_T q_8w_linear(const ivec4 out_idx, const int K) { - const FLOAT_T scale = t_scales[out_idx.x]; +void main() { + 
const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); + + const FLOAT_T scale = t_scales[out_tidx.x]; FLOAT_T outval = FLOAT_T(0.0); - // Initial mat1 tensor idx will be (0, out_idx.y, out_idx.z, 0) - int mat1_offset = out_idx.y * mat1_strides.y + out_idx.z * qmat2_strides.z; - // Initial qmat2 tensor idx wil be (0, out_idx.x, 0, 0); note that the qmat2 + // Initial mat1 tensor idx will be (0, out_tidx.y, out_tidx.z, 0) + int mat1_offset = out_tidx.y * mat1_strides.y + out_tidx.z * qmat2_strides.z; + // Initial qmat2 tensor idx will be (0, out_tidx.x, 0, 0); note that the qmat2 // tensor is transposed - int qmat2_offset = out_idx.x * qmat2_strides.y; + int qmat2_offset = out_tidx.x * qmat2_strides.y; - // TODO(ssjia): optimize memory access pattern by traversing K in inner loop - for (int i = 0; i < K; i++) { + // TODO(ssjia): optimize memory access pattern by traversing mat1 x in inner loop + for (int i = 0; i < mat1_sizes.x; i++) { const FLOAT_T mat1_val = t_mat1[mat1_offset]; const FLOAT_T mat2_val = t_qmat2[qmat2_offset] * scale; @@ -74,33 +81,32 @@ FLOAT_T q_8w_linear(const ivec4 out_idx, const int K) { qmat2_offset++; } - return outval; -} - -void main() { - const int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); - - t_out[out_bufi] = q_8w_linear(out_tidx, mat1_sizes.x); + t_out[out_bufi] = outval; } #else // USING_TEXTURE #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require -VEC4_T q_8w_linear(const u16vec2 out_pos, const uint16_t K) { +void main() { + const u16vec2 out_pos = u16vec2( + gl_GlobalInvocationID.x / out_limits.y, + gl_GlobalInvocationID.x % out_limits.y); + if (out_pos.x >= out_limits.x) { + return; + } + const uint16_t qmat2_pos_y = out_pos.x * uint16_t(4); VEC4_T outtex = VEC4_T(0); - const u16vec3 scales_pos = u16vec3(out_pos.x, 0, 0); - const VEC4_T scales = load_texel(t_scales, scales_pos); + const VEC4_T scales = load_texel(t_scales, u16vec3(out_pos.x, 0, 0)); - for (uint16_t i = uint16_t(0), x = uint16_t(0); i < K; i += uint16_t(4), x++) { + for ( + uint16_t i = uint16_t(0), x = uint16_t(0); + i < uint16_t(mat1_sizes.x); + i += uint16_t(4), x++) + { const VEC4_T mat1_tex = load_texel(t_mat1, u16vec3(x, out_pos.y, 0)); const VEC4_T sums = VEC4_T( dot(mat1_tex, load_texel(t_qmat2, u16vec3(x, qmat2_pos_y, 0))), @@ -112,19 +118,6 @@ VEC4_T q_8w_linear(const u16vec2 out_pos, const uint16_t K) { } outtex *= scales; - - return outtex; -} - -void main() { - const u16vec2 out_pos = u16vec2( - gl_GlobalInvocationID.x / out_limits.y, - gl_GlobalInvocationID.x % out_limits.y); - if (out_pos.x >= out_limits.x) { - return; - } - - VEC4_T outtex = q_8w_linear(out_pos, uint16_t(mat1_sizes.x)); write_texel(t_out, u16vec3(out_pos, 0), outtex); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl similarity index 93% rename from backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl rename to backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl index 45e6c3358e8..0a6fa31a65f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl @@ -49,10 +49,10 @@ void main() { for (int i=0;i<4;i++) { ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); - int in_channel = user_coor.z; + int
in_dim = user_coor[packed_dim]; ivec4 in_user_coor = user_coor; - in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step; + in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step; ivec4 in_pow_elem = to_texture_elem_pos( in_user_coor, diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml similarity index 80% rename from backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml rename to backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml index 56317260170..718e7316824 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml @@ -1,4 +1,4 @@ -slice_channel: +slice_packed_dim: parameter_names_with_default_values: DTYPE: float NDIM: 3 @@ -8,4 +8,4 @@ slice_channel: - VALUE: half - VALUE: float shader_variants: - - NAME: slice_channel + - NAME: slice_packed_dim diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl similarity index 55% rename from backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl rename to backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl index 72594830cd4..54f0bd0b78c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl @@ -27,8 +27,7 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg { int dim; int offset; int step; - // Used when dim=batch. Stride is the # of plances for each batch value. - int stride; + int image_in_channel_size; } slice_arg; @@ -45,11 +44,24 @@ void main() { ivec3 in_pos = pos; - int index = pos[slice_arg.dim] / slice_arg.stride; - int within_stride = pos[slice_arg.dim] % slice_arg.stride; - - in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride + index * slice_arg.step * - slice_arg.stride + within_stride; + // slice along batch axis + if (slice_arg.dim == 3) { + // index of the channel inside a batch + const int chanl_index = pos.z % slice_arg.image_in_channel_size; + // index of batch + const int batch_index = pos.z / slice_arg.image_in_channel_size; + in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + chanl_index; + } else if (slice_arg.dim == C_DIM) { + // index of the channel inside a batch + const int chanl_index = pos.z % sizes.z; + // index of batch + const int batch_index = pos.z / sizes.z; + in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + chanl_index * slice_arg.step; + } else if (slice_arg.dim == H_DIM) { + in_pos.y = slice_arg.offset + pos.y * slice_arg.step; + } else { + in_pos.x = slice_arg.offset + pos.x * slice_arg.step; + } imageStore(image_out, pos, texelFetch(image_in, in_pos, 0)); diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml similarity index 72% rename from backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml rename to backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml index 9e69b09a304..0453bb707b1 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml @@ -1,4 +1,4 @@ -slice_batch_height_width: +slice_unpacked_dim: parameter_names_with_default_values: DTYPE: float NDIM: 3 @@ -7,4 +7,4 @@ 
slice_batch_height_width: - VALUE: half - VALUE: float shader_variants: - - NAME: slice_batch_height_width + - NAME: slice_unpacked_dim diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 7e88982aaee..c3c686772e1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -44,7 +44,7 @@ void resize_binary_op_node( out->virtual_resize(new_out_sizes); } -void add_binary_op_node( +void add_binary_op_texture_node( ComputeGraph& graph, const ValueRef in1, const ValueRef in2, @@ -75,6 +75,7 @@ void add_binary_op_node( std::string kernel_name("binary_"); kernel_name.reserve(kShaderNameReserve); kernel_name += op_name; + add_storage_type_suffix(kernel_name, *t_out); add_dtype_suffix(kernel_name, *t_out); graph.execute_nodes().emplace_back(new DispatchNode( @@ -98,6 +99,70 @@ void add_binary_op_node( PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}})); } +void add_binary_op_buffer_node( + ComputeGraph& graph, + const ValueRef in1, + const ValueRef in2, + const ValueRef alpha, + const ValueRef out, + const std::string& op_name) { + // check_binary_op_args(*t_in1, *t_in2, *t_out); + + float alpha_val = 1.0f; + // String is checked since floor_div passes in an unused string argument in + // place of alpha + if (is_valid(alpha) && !graph.val_is_string(alpha)) { + alpha_val = graph.extract_scalar(alpha); + } + + std::string kernel_name("binary_"); + kernel_name.reserve(kShaderNameReserve); + kernel_name += op_name; + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + graph.create_global_wg_size(out), + graph.create_local_wg_size(out), + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{in1, in2}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + {}, + // Specialization Constants + {graph.packed_dim_of(out), + graph.packed_dim_of(in1), + graph.packed_dim_of(in2)}, + // Resizing Logic + resize_binary_op_node, + {}, + {{ + graph.sizes_pc_of(in1), + graph.sizes_pc_of(in2), + graph.strides_pc_of(out), + graph.strides_pc_of(in1), + graph.strides_pc_of(in2), + graph.numel_pc_of(out), + PushConstantDataInfo(&alpha_val, sizeof(float)), + }})); +} + +void add_binary_op_node( + ComputeGraph& graph, + const ValueRef in1, + const ValueRef in2, + const ValueRef alpha, + const ValueRef out, + const std::string& op_name) { + if (graph.is_buffer_storage(out)) { + add_binary_op_buffer_node(graph, in1, in2, alpha, out, op_name); + } else { + add_binary_op_texture_node(graph, in1, in2, alpha, out, op_name); + } +} + #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ void op_name(ComputeGraph& graph, const std::vector& args) { \ return add_binary_op_node( \ diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index d5cfd5f4505..25a0ff9a7f5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -22,65 +22,68 @@ void add_cat_default_node( ValueRef dim_ref, ValueRef out) { ValueListPtr input_list = graph.get_value_list(in_list_ref); - - for (ValueRef input_ref : *input_list) { - vTensorPtr t_in = graph.get_tensor(input_ref); - VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); - } - int64_t dim = graph.extract_scalar(dim_ref); vTensorPtr t_out = 
graph.get_tensor(out); + const auto packed_dim = t_out->packed_dim(); + const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim); + DimIndex dim_index = normalize_to_dim_index(*t_out, dim); + // Index of dimension to be concatenated in (w, h, c * b) coordinate system + const auto dim_xyz_index = std::min(2, -dim_index - 1); - // TODO: Find ways to factor out the similar code for width, height, and batch - if (dim_index == kWidth4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); + if (dim_index > kWidth4D || dim_index < kBatch4D) { + VK_THROW("Unexpected value of dim_index=", dim_index); + } - for (ValueRef input_ref : *input_list) { - vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->logical_limits(); - add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset[0] += range[0]; - } + utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - } else if (dim_index == kHeight4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); + const bool is_concat_channel = (dim_index == kChannel4D); - for (ValueRef input_ref : *input_list) { - vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->logical_limits(); - add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset[1] += range[1]; - } - } else if (dim_index == kBatch4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); + // if concatenating channels + if (is_concat_channel) { + // set destination offset w as channel size of the output tensor + dst_offset[3] = dim_at(t_out->sizes(), kChannel4D); + } - for (ValueRef input_ref : *input_list) { - vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->logical_limits(); + for (ValueRef input_ref : *input_list) { + const vTensorPtr t_in = graph.get_tensor(input_ref); + const utils::ivec3 range = t_in->logical_limits(); + const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D); + // if concatenating along the same dimension as the packed dimension + if (dim_index == packed_dim_index) { + // if concatenating channels, use the add_copy_channel_offset_node function, + // since add_copy_packed_dim_offset_node does not support channel packing + if (is_concat_channel) { + add_copy_channel_offset_node( + graph, + input_ref, + in_channel_size, + src_offset[2], + dst_offset[2], + out); + dst_offset[dim_xyz_index] += in_channel_size; + } else { + // src_offset[3] is not used now but will be used in the future once + // add_copy_packed_dim_offset_node supports channel packing + // + // set source offset w as the channel size of the input tensor if + // concatenating channels + src_offset[3] = is_concat_channel ? in_channel_size : 0; + add_copy_packed_dim_offset_node( + graph, input_ref, range, src_offset, dst_offset, out); + dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index); + } + } else { + // set source offset w as the channel size of the input tensor if + // concatenating channels + src_offset[3] = is_concat_channel ?
in_channel_size : 0; add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset[2] += range[2]; - } - } else if (dim_index == kChannel4D) { - int32_t src_offset = 0; - int32_t dst_offset = 0; - - for (ValueRef input_ref : *input_list) { - vTensorPtr t_in = graph.get_tensor(input_ref); - int32_t range = dim_at<kChannel4D>(t_in->sizes()); - add_copy_channel_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset += range; + graph, input_ref, range, src_offset, dst_offset, out, true, false); + dst_offset[dim_xyz_index] += + is_concat_channel ? in_channel_size : range[dim_xyz_index]; } - } else { - VK_THROW("Unexpected value of dim_index=", dim_index); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 71b7ce80cc0..18599ed4ba6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -475,7 +475,12 @@ void add_conv1d_node( const ValueRef out, const bool clamp_out) { ValueRef arg_weight = prepack_standard( - graph, weight, graph.storage_type_of(out), utils::kChannelsPacked); + graph, + weight, + graph.storage_type_of(out), + utils::kChannelsPacked, + /* passthrough = */ false, + utils::kOptimizedAxisMap); ValueRef arg_bias = prepack_biases( graph, bias, diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index 69378524afb..5756d3a9052 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -16,15 +16,18 @@ namespace vkcompute { using utils::ivec3; +using utils::ivec4; using utils::uvec3; void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, const ivec3& range, - const ivec3& src_offset, - const ivec3& dst_offset, - const ValueRef out) { + const ivec4& src_offset, + const ivec4& dst_offset, + const ValueRef out, + bool calc_out_pos_using_src_chnl, + bool calc_in_pos_using_dst_chnl) { vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); @@ -48,15 +51,107 @@ void add_copy_offset_node( // Parameter buffers {}, // Specialization Constants + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + (calc_out_pos_using_src_chnl ? 1 + : calc_in_pos_using_dst_chnl ? 2 + : 0)}, + nullptr, + {}, + { + PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), + PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), + PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), + })); +} + +void add_copy_packed_dim_offset_node( + ComputeGraph& graph, + const ValueRef in, + const ivec3& range, + const ivec4& src_offset, + const ivec4& dst_offset, + const ValueRef out) { + vTensorPtr t_in = graph.get_tensor(in); + vTensorPtr t_out = graph.get_tensor(out); + + // Check that the packed dimension is the same for both tensors, and that it + // is Width or Height, since the function does not support channel + // packing.
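A note on the new third specialization constant above: the two boolean flags collapse into a single integer mode so the shader can branch on one value. A standalone C++ sketch of that encoding (the decode side is assumed to live in the copy shader; nothing here is new API):

```cpp
// Mode 0: plain offset copy, no channel-based index remapping.
// Mode 1: derive the destination texel position from the source channel size.
// Mode 2: derive the source texel position from the destination channel size.
// calc_out_pos_using_src_chnl takes precedence when both flags are set.
inline int copy_index_mode(
    bool calc_out_pos_using_src_chnl,
    bool calc_in_pos_using_dst_chnl) {
  return calc_out_pos_using_src_chnl ? 1
      : calc_in_pos_using_dst_chnl   ? 2
                                     : 0;
}
```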
+ VK_CHECK_COND( + check_same_packed_dim(*t_in, *t_out) && + (check_packed_dim_is(*t_in, WHCN::kWidthDim) || + check_packed_dim_is(*t_in, WHCN::kHeightDim))); + + std::string kernel_name = "copy_packed_dim_offset"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, *t_out); + + const auto packed_dim = t_in->packed_dim(); + // A copy of range with the last element set to batch size of the input tensor + ivec4 final_range = { + range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)}; + ivec3 global_wg_size = t_out->logical_limits(); + // The starting offset in a texel where this tensor will start copying from + const auto src_lane_offset = src_offset[packed_dim] & 0x3; + // The starting offset in a texel where this tensor will start copying to + const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; + + // The total packed texels this tensor will be copied from. + // The first texel of tensor data in the packed dimension will be copied from + // the remaining lanes of the current source. Hence (4 - src_lane_offset) is + // added to the tensor size in the packed dimension + const auto src_packed_size = utils::div_up_4( + (4 - src_lane_offset) + + dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim))); + + // The total packed texels this tensor will be copied to. + // The first texel of tensor data in the packed dimension will be copied to + // the remaining lanes of the previous write. Hence (4 - dst_lane_offset) is + // added to the tensor size in the packed dimension + const auto dst_packed_size = utils::div_up_4( + (4 - dst_lane_offset) + + dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim))); + + // If the starting src offset is not 0, and the total packed texel count is + // greater than the source texel range + const bool has_additional_src_work = + src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; + // If the starting dst offset is not 0, and the total packed texel count is + // greater than the source texel range + const bool has_additional_dst_work = + dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; + + if (has_additional_src_work || has_additional_dst_work) { + global_wg_size[packed_dim]++; // Increase the global work group size in + // packed dimension + final_range[packed_dim]++; // Increase the range in packed dimension + } + + auto shader = VK_KERNEL_FROM_STR(kernel_name); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + { + {out, vkapi::MemoryAccessType::WRITE}, + {out, vkapi::MemoryAccessType::READ}, + {in, vkapi::MemoryAccessType::READ}, + }, + // Parameter buffers + {}, + // Specialization Constants {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, nullptr, {}, { - PushConstantDataInfo(&range, sizeof(range), sizeof(utils::ivec4)), PushConstantDataInfo( - &src_offset, sizeof(src_offset), sizeof(utils::ivec4)), - PushConstantDataInfo( - &dst_offset, sizeof(dst_offset), sizeof(utils::ivec4)), + &final_range, sizeof(final_range), sizeof(ivec4)), + PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), + PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), })); } @@ -140,7 +235,7 @@ void add_copy_channel_offset_node( static_cast<int32_t>(global_size[2]), channel_range}; - const utils::ivec4 offset_params = { + const ivec4 offset_params = { dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -179,10
+274,14 @@ void add_copy_offset_node( ValueRef dst_offset_ref, ValueRef out) { ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src_offset = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst_offset = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); + ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); + ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); + + ivec4 src_offset = {src[0], src[1], src[2], 0}; + ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out, false, false); } void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h index 60bb20eedf0..e9388345afa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h @@ -17,16 +17,48 @@ namespace vkcompute { // add_copy_offset_node resembles the vkCmdCopyImage command. It copies the // texture extents specified by the range, src_offset, and dst_offset (all // in texture coordinates (x, y, z)) from the input image to the output image. +// src_offset.w and dst_offset.w may contain channel size information. // // It is possible for input and output to point to the same image // object, but when the source range and destination range overlap, the behavior // is undefined. +// +// The boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl +// select the indexing function used in the shader. +// If calc_out_pos_using_src_chnl is set to true, the channel and batch index +// will be calculated based on the source channel size and used to determine +// the destination texel position. +// +// If calc_in_pos_using_dst_chnl is set to true, the channel and batch index +// will be calculated based on the destination channel size and used to +// determine the source texel position. +// +// If both are true, calc_out_pos_using_src_chnl is picked; if both are false, +// no index calculation happens. void add_copy_offset_node( ComputeGraph& graph, const ValueRef in, const utils::ivec3& range, - const utils::ivec3& src_offset, - const utils::ivec3& dst_offset, + const utils::ivec4& src_offset, + const utils::ivec4& dst_offset, + const ValueRef out, + bool calc_out_pos_using_src_chnl, + bool calc_in_pos_using_dst_chnl); + +// add_copy_packed_dim_offset_node behaves similarly to add_copy_offset_node, +// except that it is used when copying along the packed dimension of a width- +// or height-packed tensor. +// src_offset.w and dst_offset.w may contain channel size information. +// +// It copies the texture extents specified by the range, src_offset, and +// dst_offset (all in texture coordinates (x, y, z)) from the input image to +// the output image.
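The src/dst packed-size arithmetic inside add_copy_packed_dim_offset_node is easiest to verify with concrete numbers. A minimal self-contained sketch (the example values are assumptions for illustration, not taken from the diff):

```cpp
#include <cstdint>

// A copy that starts mid-texel can spill into one extra texel column along
// the packed axis; this mirrors the has_additional_*_work checks above.
constexpr int32_t div_up_4(int32_t n) {
  return (n + 3) / 4;
}

constexpr bool needs_extra_texel(int32_t offset, int32_t size, int32_t range) {
  const int32_t lane = offset & 0x3; // starting lane within a texel
  const int32_t texels = div_up_4((4 - lane) + size); // packed texels touched
  return lane != 0 && texels > range;
}

// offset 6, 10 elements, range of 3 texels: lane = 2, texels = 3 -> fits;
// offset 6, 11 elements: texels = div_up_4(13) = 4 > 3 -> grow range by one.
static_assert(!needs_extra_texel(6, 10, 3));
static_assert(needs_extra_texel(6, 11, 3));
```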
+// +void add_copy_packed_dim_offset_node( + ComputeGraph& graph, + const ValueRef in, + const utils::ivec3& range, + const utils::ivec4& src_offset, + const utils::ivec4& dst_offset, const ValueRef out); // add_copy_channel_offset_node behaves similarly to add_copy_offset_node, except that it diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index a56925751e7..4352e98de0b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -28,8 +28,7 @@ void check_args( const api::vTensor& in, const std::vector<int64_t>& permute_dims, const api::vTensor& out) { - VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); - VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); + VK_CHECK_COND(check_same_packed_dim(in, out)); // This implementation does not require the input tensor to have the same // dim size as the argument. The code will work as long as the input tensor's @@ -72,10 +71,14 @@ void add_permute_node( int32_t out_channels = dim_at<kChannel4D>(t_out->sizes()); int32_t in_channels = dim_at<kChannel4D>(t_in->sizes()); - int32_t out_c_aligned = utils::align_up_4(out_channels); - int32_t in_c_aligned = utils::align_up_4(in_channels); + const auto packed_dim = graph.packed_dim_of(in); + ivec2 channel_info = {out_channels, in_channels}; + if (packed_dim == WHCN::kChannelsDim) { + channel_info[0] = utils::align_up_4(channel_info[0]); + channel_info[1] = utils::align_up_4(channel_info[1]); + } - const ivec2 ch_info = {out_c_aligned, in_c_aligned}; + const vkapi::SpecVarList spec_vars = {packed_dim}; graph.execute_nodes().emplace_back(new DispatchNode( graph, @@ -86,14 +89,14 @@ void add_permute_node( {in, vkapi::MemoryAccessType::READ}}, {}, // Specialization Constants - {}, + spec_vars, // Resizing Logic nullptr, {}, {{graph.logical_limits_pc_of(out), - graph.sizes_pc_of(out), + graph.sizes_pc_of(in), PushConstantDataInfo(&out_dims, sizeof(out_dims)), - PushConstantDataInfo(&ch_info, sizeof(ch_info))}})); + PushConstantDataInfo(&channel_info, sizeof(channel_info))}})); } void add_permute_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 00199ba7a80..3f4ed4f1090 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -148,10 +148,11 @@ void add_repeat_node( if (int64_t channel_repeat = dim_at<kChannel4D>(repeats); channel_repeat == 1) { // If no repeat, short-cut to a direct copy - utils::ivec3 src_offset{0, 0, 0}; - utils::ivec3 dst_offset{0, 0, 0}; + utils::ivec4 src_offset{0, 0, 0, 0}; + utils::ivec4 dst_offset{0, 0, 0, 0}; - add_copy_offset_node(graph, in, running_range, src_offset, dst_offset, out); + add_copy_offset_node( + graph, in, running_range, src_offset, dst_offset, out, false, false); } else { add_repeat_channel_node(graph, in, channel_repeat, out, running_range); @@ -160,13 +161,13 @@ void add_repeat_node( // TODO: refactor width, height, and batch into a common helper function.
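The width, height, and batch branches that follow this TODO share one pattern: copy the already-materialized output next to itself repeat - 1 times, then scale the running range. A small sketch of the width case (W and width_repeat are made-up example values):

```cpp
#include <array>
#include <cstdio>

int main() {
  const int W = 5;            // assumed logical width of the input
  const int width_repeat = 3; // assumed repeat factor
  std::array<int, 3> running_range{W, 8, 2};
  for (int i = 1; i < width_repeat; ++i) {
    // dst_offset = {i * W, 0, 0, 0}: append one more copy to the right
    std::printf("copy x[0,%d) -> x[%d,%d)\n", W, i * W, (i + 1) * W);
  }
  running_range[0] *= width_repeat; // range now covers all three copies
  return 0;
}
```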
// Width if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) { - utils::ivec3 src_offset{0, 0, 0}; + utils::ivec4 src_offset{0, 0, 0, 0}; for (int i = 1; i < width_repeat; ++i) { - utils::ivec3 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0}; + utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[0] = running_range[0] * width_repeat; @@ -174,13 +175,13 @@ void add_repeat_node( // Height if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) { - utils::ivec3 src_offset{0, 0, 0}; + utils::ivec4 src_offset{0, 0, 0, 0}; for (int i = 1; i < height_repeat; ++i) { - utils::ivec3 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0}; + utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[1] = running_range[1] * height_repeat; @@ -188,13 +189,13 @@ void add_repeat_node( // Batch if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) { - utils::ivec3 src_offset{0, 0, 0}; + utils::ivec4 src_offset{0, 0, 0, 0}; for (int i = 1; i < batch_repeat; ++i) { - utils::ivec3 dst_offset = {0, 0, i * running_range[2]}; + utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0}; add_copy_offset_node( - graph, out, running_range, src_offset, dst_offset, out); + graph, out, running_range, src_offset, dst_offset, out, true, false); } running_range[2] = running_range[2] * batch_repeat; diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 40603394660..efda6e04992 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -44,8 +44,7 @@ void add_slice_tensor_copy_node( vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); - VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); + VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out)); // Need to normalize the dim int64_t dim = graph.extract_scalar<int64_t>(dim_ref); @@ -76,9 +75,15 @@ void add_slice_tensor_copy_node( start = normalize_idx(start, in_sizes[dim], 0); end = normalize_idx(end, in_sizes[dim], in_sizes[dim]); - if (dim_index == kChannel4D) { + const vkapi::SpecVarList spec_vars = {t_in->packed_dim()}; + + const auto packed_dim_idx = + static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim()); + + // if the slice dim is the same as the packed dim, we can use the packed-dim slice + if (dim_index == packed_dim_idx) { // slice along the packed dim - std::string kernel_name = "slice_channel"; + std::string kernel_name = "slice_packed_dim"; kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); @@ -99,29 +104,22 @@ void add_slice_tensor_copy_node( {in, vkapi::MemoryAccessType::READ}}, {t_out->sizes_ubo(), t_in->sizes_ubo(), - graph.create_params_buffer(params)})); + graph.create_params_buffer(params)}, + spec_vars)); } else { - // GPU's coordinate is in x, y, z - int64_t gpu_dim = -1; - int64_t stride = 1; - if (dim_index == kWidth4D) { - gpu_dim = 0; // width: x dimension in gpu - VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); - } else if (dim_index == kHeight4D) { - gpu_dim = 1; // height: y dimension - VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); - } else if
(dim_index == kBatch4D) { - gpu_dim = 2; // batch: z dimension - - // Due to channel packing, each batch value is span over stride planes - int64_t n_channels = dim_at(in_sizes, kChannel4D); - stride = utils::div_up_4(n_channels); - } else { - VK_THROW("Unexpected ncwh_dim!"); + // GPU coordinates are x = 0, y = 1, z = 2, w = 3 + const int64_t gpu_dim = -(dim_index + 1); + // stride of input tensor's channel dimension + int64_t in_channel_stride = dim_at(in_sizes, kChannel4D); + VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); + + // Due to channel packing, each batch value spans multiple texel planes + if (dim_index == kBatch4D && packed_dim_idx == kChannel4D) { + in_channel_stride = utils::div_up_4(in_channel_stride); } - std::string kernel_name = "slice_batch_height_width"; + std::string kernel_name = "slice_unpacked_dim"; kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); @@ -137,7 +135,7 @@ void add_slice_tensor_copy_node( static_cast<int32_t>(gpu_dim), static_cast<int32_t>(start), static_cast<int32_t>(step), - static_cast<int32_t>(stride), + static_cast<int32_t>(in_channel_stride), }; graph.execute_nodes().emplace_back(new DispatchNode( @@ -147,7 +145,8 @@ void add_slice_tensor_copy_node( local_size, {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo(), graph.create_params_buffer(params)})); + {t_out->sizes_ubo(), graph.create_params_buffer(params)}, + spec_vars)); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index 39039e51025..8002dadc538 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,8 +25,6 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); - ValueListPtr out_list = graph.get_value_list(out_list_ref); DimIndex dim_index = normalize_to_dim_index(*t_in, dim); @@ -38,59 +36,60 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } - if (dim_index == kWidth4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); - - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size.
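The gpu_dim = -(dim_index + 1) mapping above works because the DimIndex constants count back from the innermost dimension as negative values, which is also what std::min(2, -dim_index - 1) in Cat.cpp and Split.cpp relies on. A minimal sketch (enum values inferred from that usage, not quoted from a header):

```cpp
// Negative offsets from the innermost (width) dimension.
enum DimIndex : int {
  kWidth4D = -1,
  kHeight4D = -2,
  kChannel4D = -3,
  kBatch4D = -4,
};

// Maps a DimIndex onto GPU axes x = 0, y = 1, z = 2, w = 3.
constexpr int gpu_axis(DimIndex dim_index) {
  return -(dim_index + 1);
}

static_assert(gpu_axis(kWidth4D) == 0);
static_assert(gpu_axis(kHeight4D) == 1);
static_assert(gpu_axis(kChannel4D) == 2);
static_assert(gpu_axis(kBatch4D) == 3);
```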
- vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + const auto packed_dim = t_in->packed_dim(); + const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim); - src_offset[0] += range[0]; - } - } else if (dim_index == kHeight4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); + // Index of the dimension to be split in the (w, h, c * b) coordinate system + const auto dim_xyz_index = std::min(2, -dim_index - 1); - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - src_offset[1] += range[1]; - } - } else if (dim_index == kBatch4D) { - utils::ivec3 src_offset = utils::make_ivec3({0, 0, 0}, false); - utils::ivec3 dst_offset = utils::make_ivec3({0, 0, 0}, false); + const bool is_splitting_channel = (dim_index == kChannel4D); - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); + // if splitting channels + if (is_splitting_channel) { + // set source offset w as channel size of the input tensor + src_offset[3] = dim_at(t_in->sizes(), kChannel4D); + } - src_offset[2] += range[2]; - } - } else if (dim_index == kChannel4D) { - int32_t src_offset = 0; - int32_t dst_offset = 0; - - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - int32_t range = dim_at<kChannel4D>(t_out->sizes()); - add_copy_channel_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset += range; + for (ValueRef out_ref : *out_list) { + // Doesn't need to use split_size since we have already verified that the + // output tensor's size matches with the split_size. + vTensorPtr t_out = graph.get_tensor(out_ref); + const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D); + utils::ivec3 range = t_out->logical_limits(); + + if (dim_index == packed_dim_index) { + // if splitting channels, use the add_copy_channel_offset_node function, + // since add_copy_packed_dim_offset_node does not support channel packing + if (is_splitting_channel) { + add_copy_channel_offset_node( + graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); + src_offset[dim_xyz_index] += out_channel_size; + } else { + // dst_offset[3] is not used now but will be used in the future once + // add_copy_packed_dim_offset_node supports channel packing + // + // set destination offset w as channel size of the output tensor if + // splitting channels + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; + add_copy_packed_dim_offset_node( + graph, in, range, src_offset, dst_offset, out_ref); + src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index); + } + } else { + // set destination offset w as channel size of the output tensor if + // splitting channels + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; + add_copy_offset_node( + graph, in, range, src_offset, dst_offset, out_ref, false, true); + src_offset[dim_xyz_index] += + is_splitting_channel ?
out_channel_size : range[dim_xyz_index]; } - - } else { - VK_THROW("not ipmlemented"); } } diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index ad4434cf5af..c084a563544 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -479,5 +479,49 @@ inline int64_t multiply_integers(Iter begin, Iter end) { begin, end, static_cast<int64_t>(1), std::multiplies<>()); } +class WorkgroupSize final { + uint32_t val; + + public: + explicit WorkgroupSize() : val(0) {} + explicit WorkgroupSize(const uint32_t x, const uint32_t y, const uint32_t z) { + // Shift each axis by a multiple of 11 bits: a local workgroup axis can be + // at most 1024 (0x400), which fits in 11 bits. Only the z axis cannot + // store 1024, because 1024 << 22 would overflow the uint32_t storage. + if (z == 1024) { + throw std::runtime_error( + "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); + } + val = x | (y << 11) | (z << 22); + } + + explicit WorkgroupSize(const uvec3& vec) { + // Shift each axis by a multiple of 11 bits: a local workgroup axis can be + // at most 1024 (0x400), which fits in 11 bits. Only the z axis cannot + // store 1024, because 1024 << 22 would overflow the uint32_t storage. + if (vec[2u] == 1024) { + throw std::runtime_error( + "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); + } + val = vec[0u] | (vec[1u] << 11) | (vec[2u] << 22); + } + + explicit inline operator uvec3() const { + return { + val & 0x7ffu, + (val >> 11) & 0x7ffu, + (val >> 22), + }; + } + + explicit inline operator uint32_t() const { + return val; + } + + inline constexpr uint32_t operator[](const int idx) const { + return (val >> (11 * idx)) & 0x7ffu; + } +}; + } // namespace utils } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index 4102417436c..db6fdc2909a 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -274,6 +274,7 @@ std::string Adapter::stringize() const { PRINT_PROP(limits, maxImageDimension1D); PRINT_PROP(limits, maxImageDimension2D); PRINT_PROP(limits, maxImageDimension3D); + PRINT_PROP(limits, maxStorageBufferRange); PRINT_PROP(limits, maxTexelBufferElements); PRINT_PROP(limits, maxPushConstantsSize); PRINT_PROP(limits, maxMemoryAllocationCount); diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 3be790b53cf..3a5041f9500 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -81,7 +81,7 @@ void CommandBuffer::end() { void CommandBuffer::bind_pipeline( VkPipeline pipeline, VkPipelineLayout pipeline_layout, - const utils::uvec3 local_workgroup_size) { + const utils::WorkgroupSize local_workgroup_size) { VK_CHECK_COND( state_ == CommandBuffer::State::RECORDING, "Vulkan CommandBuffer: called bind_pipeline() on a command buffer whose state " diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h index 99cd5d17c99..ff1e5934a5c 100644 --- a/backends/vulkan/runtime/vk_api/Command.h +++ b/backends/vulkan/runtime/vk_api/Command.h @@ -51,7 +51,7 @@ class CommandBuffer final { struct Bound { VkPipeline pipeline; VkPipelineLayout pipeline_layout; - utils::uvec3 local_workgroup_size; + utils::WorkgroupSize local_workgroup_size; VkDescriptorSet descriptors; explicit Bound() @@ -63,7 +63,7 @@ class CommandBuffer final { inline void reset() {
pipeline = VK_NULL_HANDLE; pipeline_layout = VK_NULL_HANDLE; - local_workgroup_size = {0u, 0u, 0u}; + local_workgroup_size = utils::WorkgroupSize{0u, 0u, 0u}; descriptors = VK_NULL_HANDLE; } }; @@ -87,7 +87,7 @@ class CommandBuffer final { void begin(); void end(); - void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::uvec3); + void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::WorkgroupSize); void bind_descriptors(VkDescriptorSet); void set_push_constants(VkPipelineLayout, const void*, uint32_t); diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 0c66a085ad9..5dcb00168b2 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -174,6 +174,14 @@ void SpecVarList::append(const SpecVarList& other) { vars.insert(vars.end(), other.vars.begin(), other.vars.end()); } +void SpecVarList::reserve(const size_t size) { + vars.reserve(size); +} + +void SpecVarList::append(const SpecVar& other) { + vars.push_back(other); +} + std::vector<VkSpecializationMapEntry> SpecVarList::generate_map_entries() const { std::vector<VkSpecializationMapEntry> map_entries; @@ -267,12 +275,11 @@ ComputePipeline::ComputePipeline( const ComputePipeline::Descriptor& descriptor, VkPipelineCache pipeline_cache) : device_(device), handle_{VK_NULL_HANDLE} { - std::vector<VkSpecializationMapEntry> map_entries = - descriptor.specialization_constants.generate_map_entries(); + map_entries_ = descriptor.specialization_constants.generate_map_entries(); const VkSpecializationInfo specialization_info{ descriptor.specialization_constants.size(), // mapEntryCount - map_entries.data(), // pMapEntries + map_entries_.data(), // pMapEntries descriptor.specialization_constants.data_nbytes(), // dataSize descriptor.specialization_constants.data(), // pData }; @@ -313,7 +320,9 @@ ComputePipeline::ComputePipeline( } ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept - : device_(other.device_), handle_(other.handle_) { + : device_(other.device_), + handle_(other.handle_), + map_entries_(std::move(other.map_entries_)) { other.handle_ = VK_NULL_HANDLE; } diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h index 5460a0acba7..1e0fc1e28aa 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ b/backends/vulkan/runtime/vk_api/Pipeline.h @@ -82,6 +82,10 @@ class SpecVarList final { void append(const SpecVarList& other); + void reserve(const size_t size); + + void append(const SpecVar& other); + std::vector<VkSpecializationMapEntry> generate_map_entries() const; friend bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); @@ -170,6 +174,7 @@ class ComputePipeline final { private: VkDevice device_; VkPipeline handle_; + std::vector<VkSpecializationMapEntry> map_entries_; public: inline VkPipeline handle() const { diff --git a/backends/vulkan/runtime/vk_api/QueryPool.cpp b/backends/vulkan/runtime/vk_api/QueryPool.cpp index b029cea7081..2f6d433b887 100644 --- a/backends/vulkan/runtime/vk_api/QueryPool.cpp +++ b/backends/vulkan/runtime/vk_api/QueryPool.cpp @@ -185,19 +185,20 @@ std::vector<ShaderResult> QueryPool::get_shader_timestamp_data() { std::vector<ShaderResult> shader_result; for (ShaderDuration& entry : shader_durations_) { shader_result.push_back(ShaderResult{ - .kernel_name = entry.kernel_name, - .dispatch_id = entry.dispatch_id, - .start_time_ns = entry.start_time_ns, - .end_time_ns = entry.end_time_ns, - .metadata = ShaderMetadata{ - .global_workgroup_size = - {entry.global_workgroup_size.width, - entry.global_workgroup_size.height, - entry.global_workgroup_size.depth}, - .local_workgroup_size
= - {entry.local_workgroup_size.width, - entry.local_workgroup_size.height, - entry.local_workgroup_size.depth}, + /* .kernel_name = */ entry.kernel_name, + /* .dispatch_id = */ entry.dispatch_id, + /* .start_time_ns = */ entry.start_time_ns, + /* .end_time_ns = */ entry.end_time_ns, + /* .metadata = */ + ShaderMetadata{ + /* .global_workgroup_size = */ + {entry.global_workgroup_size.width, + entry.global_workgroup_size.height, + entry.global_workgroup_size.depth}, + /* .local_workgroup_size = */ + {entry.local_workgroup_size.width, + entry.local_workgroup_size.height, + entry.local_workgroup_size.depth}, }}); } return shader_result; diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index d9fec65febc..7d0fa7b7476 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -61,7 +61,7 @@ struct ShaderInfo final { ShaderLayout::Signature kernel_layout{}; // Shader Metadata - utils::uvec3 out_tile_size{1u, 1u, 1u}; + utils::WorkgroupSize out_tile_size{1u, 1u, 1u}; bool requires_shader_int16 = false; bool requires_16bit_storage = false; bool requires_8bit_storage = false; diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 81447472123..d01c8b53b35 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -267,7 +267,6 @@ def get_or_create_value_for(self, arg: _Argument): elif isinstance(arg, list) and isinstance(arg[0], Node): return self.create_value_list_value(arg) elif isinstance(arg, torch.fx.immutable_collections.immutable_list): - # pyre-ignore[6] return self.create_value_list_value(arg) elif isinstance(arg, str): return self.create_string_value(arg) diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 150ae32dfce..d2314d138bf 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -5,7 +5,13 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "CXX", "FBCODE" def get_vulkan_compiler_flags(): - return ["-Wno-missing-prototypes", "-Wno-global-constructors"] + return select({ + "DEFAULT": [ + "-Wno-global-constructors", + "-Wno-missing-prototypes", + ], + "ovr_config//os:windows": [], + }) def get_labels(no_volk): if no_volk: diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt index 93e69e2d08a..4559077ccf8 100644 --- a/backends/vulkan/test/CMakeLists.txt +++ b/backends/vulkan/test/CMakeLists.txt @@ -37,7 +37,7 @@ if(LIB_VULKAN_BACKEND) # to provide access to target_link_options_shared_lib which allows libraries # to be linked with the --whole-archive flag. This is required for libraries # that perform dynamic registration via static initialization. - include(${EXECUTORCH_ROOT}/build/Utils.cmake) + include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(../cmake/ShaderLibrary.cmake) diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 38d87240b80..41d8edf1f25 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. 
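For reference, the utils::WorkgroupSize type that Command.h and Shader.h now store (defined in VecUtils.h above) packs all three axes into one uint32_t at 11 bits apiece. A standalone sketch of the round trip:

```cpp
#include <cstdint>

// 11 bits per axis: a Vulkan local workgroup axis is at most 1024 (0x400),
// which fits in 11 bits. Only z is capped below 1024, since 1024 << 22
// would need a 33rd bit.
constexpr uint32_t pack_wg(uint32_t x, uint32_t y, uint32_t z) {
  return x | (y << 11) | (z << 22);
}

constexpr uint32_t wg_axis(uint32_t packed, int idx) {
  return (packed >> (11 * idx)) & 0x7ffu;
}

static_assert(wg_axis(pack_wg(64, 8, 2), 0) == 64);
static_assert(wg_axis(pack_wg(64, 8, 2), 1) == 8);
static_assert(wg_axis(pack_wg(64, 8, 2), 2) == 2);
static_assert(wg_axis(pack_wg(1024, 1024, 1), 0) == 1024); // x and y may hold 1024
```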
+import itertools + from collections import namedtuple from typing import Callable @@ -56,6 +58,7 @@ def get_binary_elementwise_inputs(): "utils::kWidthPacked", "utils::kChannelsPacked", ] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] return test_suite @@ -456,26 +459,20 @@ def get_select_int_inputs(): @register_test_suite(["aten.permute.default", "aten.permute_copy.default"]) def get_permute_inputs(): - test_suite = VkTestSuite( - [ - ((9, 2, 9, 4), [0, 1, 2, 3]), - ((9, 2, 9, 4), [0, 1, 3, 2]), - ((9, 2, 9, 4), [0, 2, 1, 3]), - ((9, 2, 9, 4), [0, 2, 3, 1]), - ((9, 2, 9, 4), [0, 3, 1, 2]), - ((9, 2, 9, 4), [0, 3, 2, 1]), - ((9, 2, 9, 4), [3, 0, 1, 2]), - ((9, 2, 9, 4), [3, 2, 0, 1]), - ((9, 2, 9, 4), [2, 3, 0, 1]), - ((9, 2, 9, 4), [2, 0, 3, 1]), - ((9, 2, 9), [2, 0, 1]), - ((9, 2, 9), [1, 2, 0]), - ((9, 2), [0, 1]), - ((9, 2), [1, 0]), - ] - ) + batch_tests = [ + ((9, 2, 5, 7), out_axis) for out_axis in itertools.permutations([0, 1, 2, 3]) + ] + channel_tests = [ + ((9, 2, 5), out_axis) for out_axis in itertools.permutations([0, 1, 2]) + ] + wh_tests = [((9, 2), out_axis) for out_axis in itertools.permutations([0, 1])] + test_suite = VkTestSuite(batch_tests + channel_tests + wh_tests) - test_suite.layouts = ["utils::kChannelsPacked"] + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", + "utils::kChannelsPacked", + ] return test_suite @@ -585,7 +582,11 @@ def get_slice_out_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) test_suite.dtypes = ["at::kFloat", "at::kHalf"] - test_suite.layouts = ["utils::kChannelsPacked"] + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", + "utils::kChannelsPacked", + ] test_suite.data_gen = "make_seq_tensor" return test_suite @@ -849,8 +850,11 @@ def get_cat_inputs(): test_suite = VkTestSuite( [ # Cat on Height + ([(M, M, 3, 5), (M, M, 0, 5)], 2), ([(S1, S1, 3, 5), (S1, S1, 0, 5)], 2), + ([(M, M, 3, 5), (M, M, 4, 5)], 2), ([(S1, S1, 3, 5), (S1, S1, 4, 5)], 2), + ([(M2, 3, 5), (M2, 4, 5)], 1), ([(S1, 3, 5), (S1, 4, 5)], 1), ([(3, 5), (4, 5)], 0), ([(3, 5), (4, 5), (1, 5)], 0), @@ -859,7 +863,9 @@ def get_cat_inputs(): 0, ), # Cat on Width + ([(M, M, 5, 3), (M, M, 5, 4)], 3), ([(S1, S1, 5, 3), (S1, S1, 5, 4)], 3), + ([(M, 5, 3), (M, 5, 4)], 2), ([(S1, 5, 3), (S1, 5, 4)], 2), ([(5, 0), (5, 4)], 1), ([(5, 3), (5, 4)], 1), @@ -870,7 +876,9 @@ def get_cat_inputs(): ), ([(5,), (6,)], 0), # Cat on Batch + ([(M, S1, 5, 4), (M1, S1, 5, 4)], 0), ([(S, S1, 5, 4), (S1, S1, 5, 4)], 0), + ([(S, M, 5, 4), (S1, M, 5, 4)], 0), ([(S, XS, 5, 4), (S1, XS, 5, 4)], 0), ([(S, S2, 5, 4), (S1, S2, 5, 4)], 0), ( @@ -882,7 +890,9 @@ def get_cat_inputs(): 0, ), # Cat on Channel + ([(M, 5, 4), (0, 5, 4), (M1, 5, 4)], 0), ([(S, 5, 4), (0, 5, 4), (S2, 5, 4)], 0), + ([(M, 5, 4), (M1, 5, 4), (M2, 5, 4)], 0), ([(S, 5, 4), (S1, 5, 4), (S2, 5, 4)], 0), ([(XS, 5, 4), (XS, 5, 4), (S2, 5, 4)], 0), ([(XS, S, 5, 4), (XS, S1, 5, 4), (XS, S2, 5, 4)], 1), @@ -898,6 +908,8 @@ def get_cat_inputs(): ] ) test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" @@ -910,14 +922,20 @@ def get_split_with_sizes_inputs(): Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"]) test_cases = [ # Split on Width + Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3), + Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 
10, 11), sizes=[3, 8], dim=2), Test(self=(7, 10, 10), sizes=[1, 9], dim=2), Test(self=(10, 10), sizes=[1, 9], dim=1), Test(self=(10,), sizes=[1, 9], dim=0), # Split on Height + Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1), + Test(self=(7, 11, 11), sizes=[3, 8], dim=1), Test(self=(7, 10, 10), sizes=[10], dim=1), Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1), Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0), @@ -925,8 +943,11 @@ def get_split_with_sizes_inputs(): Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0), Test(self=(10, 7, 10, 10), sizes=[10], dim=0), # Split on Channel + Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1), + Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1), + Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0), Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0), Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0), Test(self=(13, 4, 8), sizes=[13], dim=0), @@ -934,6 +955,8 @@ test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" @@ -985,6 +1008,8 @@ def get_split_tensor_inputs(): ) test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index c6b444e5def..3cfcac13a8d 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -26,7 +26,7 @@ insert_prepack_nodes, RemoveLocalScalarDenseOpsTransform, RemoveRedundantOpsTransform, - SqueezeInt4LinearInputs, + SqueezeUnsqueezeInputs, TagMemoryMetaPass, ) @@ -153,7 +153,7 @@ def preprocess( # noqa: C901 RemoveRedundantOpsTransform(), AddmmToLinearTransform(), FuseDequantLinearPass(), - SqueezeInt4LinearInputs(), + SqueezeUnsqueezeInputs(), FuseViewCopyTransform(), ViewCopyToSqueezeUnsqueezePass(), FuseBatchNormWithConvPass(program), diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a703d67c1b2..a1a110cc10a 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -10,7 +10,7 @@ # cmake-format -i CMakeLists.txt # ~~~ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -18,29 +18,38 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances -# Keeping this OFF by default to maintain existing behavior, to be revisited. +# NB: Enabling this will serialize execution of delegate instances. Keeping this +# OFF by default to maintain existing behavior, to be revisited.
option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE - "Enable workspace sharing across different delegate instances" ON) -# Keeping this OFF by default due to regressions in decode -# and model load with kleidi kernels -option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI - "Enable Arm Kleidi kernels" OFF) + "Enable workspace sharing across different delegate instances" ON +) +# Keeping this OFF by default due to regressions in decode and model load with +# kleidi kernels +option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF) + +# Turning this on caches weights between partitions and methods. If weights +# are shared across methods/partitions, this can reduce load time and +# memory usage. + +# Keeping this off maintains existing behavior. Turning this on serializes +# execution and initialization of delegates, to be revisited. +option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE + "Enable weights cache to cache and manage all packed weights" OFF) + +if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE) + add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE) +endif() if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() @@ -73,6 +82,12 @@ foreach(fbs_file ${_xnnpack_schema__srcs}) ) endforeach() +if(WIN32) + set(MV_COMMAND powershell -Command "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs}") +else() + set(MV_COMMAND mv ${_xnnpack_flatbuffer__outputs} ${_xnnpack_schema__outputs}) +endif() + # Generate the headers from the .fbs files. add_custom_command( OUTPUT ${_xnnpack_schema__outputs} COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o "${_xnnpack_schema__include_dir}/executorch/backends/xnnpack/serialization" ${_xnnpack_schema__srcs} - COMMAND mv ${_xnnpack_flatbuffer__outputs} ${_xnnpack_schema__outputs} + COMMAND ${MV_COMMAND} WORKING_DIRECTORY ${EXECUTORCH_ROOT} + DEPENDS flatc COMMENT "Generating xnnpack_schema headers" VERBATIM ) +unset(MV_COMMAND) + add_library(xnnpack_schema INTERFACE ${_xnnpack_schema__outputs}) set_target_properties(xnnpack_schema PROPERTIES LINKER_LANGUAGE CXX) target_include_directories( @@ -93,15 +111,14 @@ target_include_directories( ${EXECUTORCH_ROOT}/third-party/flatbuffers/include ) -set(xnnpack_third_party pthreadpool cpuinfo) +set(xnnpack_third_party pthreadpool extension_threadpool cpuinfo) include(cmake/Dependencies.cmake) list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs}) target_link_libraries( - xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core - xnnpack_schema + xnnpack_backend PUBLIC ${xnnpack_third_party} executorch_core xnnpack_schema extension_threadpool ) target_include_directories( @@ -119,6 +136,20 @@ target_include_directories( target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options}) target_link_options_shared_lib(xnnpack_backend) +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND xnn_executor_runner_libs optimized_native_cpu_ops_lib) +else() + list(APPEND xnn_executor_runner_libs portable_ops_lib) +endif() + +if(EXECUTORCH_BUILD_KERNELS_CUSTOM) + list(APPEND xnn_executor_runner_libs $) +endif() + +if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + list(APPEND xnn_executor_runner_libs quantized_ops_lib) +endif() + list(APPEND xnn_executor_runner_libs xnnpack_backend executorch) # ios can only build library but not binary @@ -134,14 +165,19 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
if(EXECUTORCH_BUILD_DEVTOOLS) list(APPEND xnn_executor_runner_libs etdump) else() - message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.") + message( + SEND_ERROR + "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled." + ) endif() endif() - target_link_libraries( - xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs} - ) + target_link_libraries(xnn_executor_runner gflags ${xnn_executor_runner_libs}) target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options}) + if(EXECUTORCH_BUILD_PTHREADPOOL) + target_link_libraries(xnn_executor_runner extension_threadpool pthreadpool) + target_compile_definitions(xnn_executor_runner PRIVATE ET_USE_THREADPOOL) + endif() endif() install( diff --git a/backends/xnnpack/_passes/TARGETS b/backends/xnnpack/_passes/TARGETS index a199e1aab01..972980570ec 100644 --- a/backends/xnnpack/_passes/TARGETS +++ b/backends/xnnpack/_passes/TARGETS @@ -19,5 +19,6 @@ python_library( "//executorch/exir/passes:const_prop_pass", "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", + "//executorch/backends/transforms:utils", ], ) diff --git a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py index b0f4779eb4c..6f31fe698ba 100644 --- a/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py +++ b/backends/xnnpack/_passes/fuse_batch_norm_with_conv.py @@ -7,13 +7,22 @@ import operator import torch +from executorch.backends.transforms.utils import ( + create_constant_placeholder, + delete_constant_placeholder, +) from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node +from executorch.backends.xnnpack.utils.utils import ( + get_param_tensor, + get_tensor_name, + is_param_node, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult +from torch.export.graph_signature import InputKind from torch.nn.utils.fusion import fuse_conv_bn_weights @@ -28,7 +37,7 @@ class FuseBatchNormWithConvPass(XNNPACKPass): def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph - counter = 0 + constant_placeholders_to_delete = set() for conv in graph.nodes: # We want to discover a chain of conv -> batch_norm. # Only proceed if the current node is a conv node, and has a single @@ -55,9 +64,11 @@ def call(self, graph_module: torch.fx.GraphModule): assert len(conv.args) == 9 conv_weight = get_param_tensor(self.exported_program, conv.args[1]) + conv_weight_name = get_tensor_name(self.exported_program, conv.args[1]) assert conv_weight is not None conv_bias = get_param_tensor(self.exported_program, conv.args[2]) + conv_bias_name = get_tensor_name(self.exported_program, conv.args[2]) # Get the parameters from the batchnorm op assert ( @@ -95,23 +106,43 @@ def call(self, graph_module: torch.fx.GraphModule): bn_bias, is_transpose, ) + fused_weight_name = (conv_weight_name + "_fused_bn").replace(".", "_") + if conv_bias_name == "": + fused_bias_name = (conv_weight_name + "_bias_fused_bn").replace( + ".", "_" + ) + else: + fused_bias_name = (conv_bias_name + "_fused_bn").replace(".", "_") # Modify the graph by updating the weight and bias of conv op # with the fused weight and bias params, and replacing all the users # of getitem(batchnorm) with the conv op. 
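For context on what fuse_conv_bn_weights computes in the pass above: with per-output-channel batch-norm statistics $\mu_k, \sigma_k^2$, affine parameters $\gamma_k, \beta_k$, and stabilizer $\varepsilon$, the standard folding identities are

$$ W'_k = W_k \cdot \frac{\gamma_k}{\sqrt{\sigma_k^2 + \varepsilon}}, \qquad b'_k = (b_k - \mu_k) \cdot \frac{\gamma_k}{\sqrt{\sigma_k^2 + \varepsilon}} + \beta_k, $$

with $b_k = 0$ when the conv has no bias; the fused weight and bias placeholders created below carry exactly these tensors.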
- with graph.inserting_before(conv): - fused_weight_name = f"_fused_with_bn_weight_{counter}" - graph_module.register_parameter(fused_weight_name, fused_weight) - fused_weight_node = graph.get_attr(fused_weight_name) - fused_bias_name = f"_fused_with_bn_bias_{counter}" - graph_module.register_parameter(fused_bias_name, fused_bias) - fused_bias_node = graph.get_attr(fused_bias_name) - - # Update the weight and bias of conv op - conv_args = list(conv.args) + ([None] if len(conv.args) == 2 else []) - conv_args[1] = fused_weight_node - conv_args[2] = fused_bias_node - conv.args = tuple(conv_args) + with graph.inserting_before(conv.args[1]): + fused_conv_weight_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_weight_name, + data=fused_weight, + ) + if fused_bias is not None: + fused_conv_bias_node = create_constant_placeholder( + exp_program=self.exported_program, + graph=graph_module.graph, + kind=InputKind.PARAMETER, + name=fused_bias_name, + data=fused_bias, + ) + else: + fused_conv_bias_node = None + + conv.args = ( + conv.args[0], + fused_conv_weight_node, + fused_conv_bias_node, + *conv.args[3:], + ) + # Remove any use of batchnorm from the graph for user in bn.users.copy(): assert user.target == operator.getitem @@ -119,8 +150,13 @@ def call(self, graph_module: torch.fx.GraphModule): graph.erase_node(user) graph.erase_node(bn) + constant_placeholders_to_delete.update(conv.args[1:3] + bn.args[1:5]) - counter += 1 + if len(constant_placeholders_to_delete) > 0: + graph_module.graph.eliminate_dead_code() + for node in constant_placeholders_to_delete: + if (node is not None) and (len(node.users) == 0): + delete_constant_placeholder(self.exported_program, node) graph_module.recompile() # To Regenerate meta data and shape information, retrace module diff --git a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py index e199c95b0f0..f056ad8b086 100644 --- a/backends/xnnpack/operators/__init__.py +++ b/backends/xnnpack/operators/__init__.py @@ -15,7 +15,6 @@ op_ceiling, op_clamp, op_conv2d, - op_dequantize_per_tensor, op_div, op_dynamic_dequantize_ops, op_dynamic_quantize_ops, @@ -35,7 +34,7 @@ op_negate, op_permute, op_prelu, - op_quantize_per_tensor, + op_quant_dequant, op_relu, op_rsqrt, op_sdpa, diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 0a825a94bef..0185f18d249 100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import ctypes +import hashlib from typing import cast, Dict, List, Optional, Tuple @@ -38,7 +39,11 @@ PERM_NCHW_TO_NHWC, ) -from executorch.backends.xnnpack.utils.xnnpack_constants import XNN_INVALID_VALUE_ID +from executorch.backends.xnnpack.utils.xnnpack_constants import ( + UINT64_MAX, + XNN_INVALID_VALUE_ID, +) +from executorch.exir._serialize._named_data_store import NamedDataStore from torch.export import ExportedProgram XNN_TYPE_MAP = { @@ -46,8 +51,6 @@ } from executorch.backends.xnnpack.serialization.xnnpack_graph_serialize import ( - _aligned_size, - _pad_to, CONSTANT_TENSOR_ALIGNMENT, ) @@ -86,11 +89,11 @@ def __init__( self, exported_program: ExportedProgram, external_ids: Dict, - constant_data_bytes: bytearray, + named_data_store: NamedDataStore, ) -> None: self._external_ids = external_ids or {} self._exported_program = exported_program or None - self._constant_data_bytes = constant_data_bytes + self._named_data_store = named_data_store @property def external_ids(self) -> Dict: @@ -573,17 +576,26 @@ def get_serialized_buffer_index( if quant_params is not None and quant_params.is_qc4w: const_val = self.convert_to_qc4w(const_val) - array_type = ctypes.c_char * const_val.untyped_storage().nbytes() + size = const_val.untyped_storage().nbytes() + array_type = ctypes.c_char * size array = ctypes.cast( const_val.untyped_storage().data_ptr(), ctypes.POINTER(array_type), ).contents - offset = len(self._constant_data_bytes) + check_or_raise( + size > 0, + f"Serializing constant data node {tensor} but tensor value has no bytes", + ) + sha256_hash = hashlib.sha256(bytes(array)) + named_key = sha256_hash.hexdigest() + size = const_val.untyped_storage().nbytes() - xnn_graph.constant_data.append(ConstantDataOffset(offset=offset, size=size)) - self._constant_data_bytes.extend( - _pad_to(bytes(array), _aligned_size(size, CONSTANT_TENSOR_ALIGNMENT)) + xnn_graph.constant_data.append( + ConstantDataOffset(offset=UINT64_MAX, size=size, named_key=named_key) + ) + self._named_data_store.add_named_data( + named_key, bytes(array), alignment=CONSTANT_TENSOR_ALIGNMENT ) return buffer_idx diff --git a/backends/xnnpack/operators/op_dequantize_per_tensor.py b/backends/xnnpack/operators/op_dequantize_per_tensor.py deleted file mode 100644 index cea76b31057..00000000000 --- a/backends/xnnpack/operators/op_dequantize_per_tensor.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import Dict - -import torch -from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( - TagImplicitQDqPass, -) -from executorch.backends.xnnpack.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( - XNNConvert, - XNNGraph, - XNode, -) -from executorch.backends.xnnpack.utils.utils import get_input_node - - -@register_node_visitor -class OpDeQuantizePerTensor(NodeVisitor): - """ - Dequantize Per Tensor Node visitor - """ - - target = "quantized_decomposed.dequantize_per_tensor.default" - - def __init__(self, *args) -> None: - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - xnn_graph: XNNGraph, - vals_to_ids: Dict[torch.fx.Node, int], - debug_handle: int, - ) -> None: - """ - We only define a node if it is not an implict dq node - """ - if not TagImplicitQDqPass.is_tagged_as_implicit_q_dq(node): - dq_input = get_input_node(node, 0) - input_quant_params = QuantParams.from_q_dq_node(node) - # fp32 output - self.define_tensor(node, xnn_graph, vals_to_ids) - output_id = vals_to_ids[node] - - # qint8 input - input_quant_params.is_output = False - self.define_tensor( - dq_input, xnn_graph, vals_to_ids, quant_params=input_quant_params - ) - input_id = vals_to_ids[dq_input] - - ser_node = XNode( - xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), - debug_handle=debug_handle, - ) - xnn_graph.xnodes.append(ser_node) - else: - # If this node was ignored, then its id is the same as its parent - dq_input = get_input_node(node, 0) - if dq_input in vals_to_ids: - vals_to_ids[node] = vals_to_ids[dq_input] diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index f8f0c54ee68..82a35236294 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -13,6 +13,7 @@ ) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph from executorch.backends.xnnpack.utils.quant_utils import ( + is_dynamic_qdq, is_per_channel_group, is_per_token, ) @@ -92,7 +93,8 @@ def define_node( """ We always define dequantize affine nodes because they are always explicit """ - if is_per_channel_group(node): + is_dynamic = is_dynamic_qdq(node) + if is_per_channel_group(node) and not is_dynamic: check_or_raise( is_param_node(self._exported_program, node.all_input_nodes[0]), f"Expected quantize affine node with per-token semantics to be used " @@ -103,7 +105,7 @@ def define_node( return check_or_raise( - is_per_token(node), + is_per_token(node) and is_dynamic, "Expecting Affine Dequantized Op to have per-token semantics", ) # This must be a per-token affine dequantized node, so let us serialize as such diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index 23047e731f7..9369f025216 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -18,6 +18,7 @@ XNode, ) from executorch.backends.xnnpack.utils.quant_utils import ( + is_dynamic_qdq, is_per_channel_group, is_per_token, ) @@ -138,13 +139,14 @@ def define_node( """ We always define quantize affine nodes because they are always explicit """ - if is_per_channel_group(node): + is_dynamic = is_dynamic_qdq(node) + if 
is_per_channel_group(node) and not is_dynamic: # Affine quantized was recognized as per channel group which means that it should # be skipped as this means it is used in front of a weight node return check_or_raise( - is_per_token(node), + is_per_token(node) and is_dynamic, "Encountered affine quantized op which does not have per-token semantics", ) # Treat this node as dynamic per-token quantization diff --git a/backends/xnnpack/operators/op_quant_dequant.py b/backends/xnnpack/operators/op_quant_dequant.py new file mode 100644 index 00000000000..521a8b6475a --- /dev/null +++ b/backends/xnnpack/operators/op_quant_dequant.py @@ -0,0 +1,198 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( + TagImplicitQDqPass, +) +from executorch.backends.xnnpack.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.xnnpack.operators.quant_params import QuantParams +from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( + XNNConvert, + XNNGraph, + XNode, +) +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + validate_quant_scales, + validate_quant_zeropoints, +) +from executorch.backends.xnnpack.utils.utils import get_input_node, get_param_tensor + + +class OpStaticQDQNode(NodeVisitor): + def check_scales_zeropoints(self, node) -> None: + scales = node.args[1] + zero_points = node.args[2] + is_groupwise = is_per_channel_group(node) + dtype = node.args[-1] + if is_groupwise: + dtype = node.args[-3] + + if isinstance(scales, torch.fx.Node): + scales = get_param_tensor(self.exported_program, scales) + + if isinstance(zero_points, torch.fx.Node): + zero_points = get_param_tensor(self.exported_program, zero_points) + + try: + validate_quant_scales(scales) + validate_quant_zeropoints(zero_points, dtype, is_groupwise) + except ValueError as e: + raise ValueError( + f"Invalid quantization scale or zero point for {node}: {e}" + ) from e + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + # check scales and zp are valid + self.check_scales_zeropoints(node) + + +@register_node_visitor +class OpDeQuantizePerTensor(OpStaticQDQNode): + """ + Dequantize Per Tensor Node visitor + """ + + target = "quantized_decomposed.dequantize_per_tensor.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We only define a node if it is not an implicit dq node + """ + # check scales and zp are valid + super().define_node(node, xnn_graph, vals_to_ids, debug_handle) + + if not TagImplicitQDqPass.is_tagged_as_implicit_q_dq(node): + dq_input = get_input_node(node, 0) + input_quant_params = QuantParams.from_q_dq_node(node) + # fp32 output + self.define_tensor(node, xnn_graph, vals_to_ids) + output_id = vals_to_ids[node] + + # qint8 input + input_quant_params.is_output = False + self.define_tensor( + dq_input, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + input_id = vals_to_ids[dq_input] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0),
+ debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) + else: + # If this node was ignored, then its id is the same as its parent + dq_input = get_input_node(node, 0) + if dq_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[dq_input] + + +@register_node_visitor +class OpQuantizePerTensor(OpStaticQDQNode): + """ + Quantize Per Tensor Node visitor + """ + + target = "quantized_decomposed.quantize_per_tensor.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We only define a node if it is not an implicit q node + """ + # check scales and zp are valid + super().define_node(node, xnn_graph, vals_to_ids, debug_handle) + + q_input = get_input_node(node, 0) + if not TagImplicitQDqPass.is_tagged_as_implicit_q_dq(node): + input_quant_params = QuantParams.from_q_dq_node(node) + # fp32 input + self.define_tensor(q_input, xnn_graph, vals_to_ids) + input_id = vals_to_ids[q_input] + + # qint8 output + input_quant_params.q_input = node + input_quant_params.is_input = False + self.define_tensor( + node, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) + else: + # If this node was ignored, then its id is the same as its parent + if q_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[q_input] + + +@register_node_visitor +class OpDequantizePerChannelDefault(OpStaticQDQNode): + """ + do nothing if node is dequantize_per_channel.default + """ + + target = "quantized_decomposed.dequantize_per_channel.default" + + +@register_node_visitor +class OpQuantizePerChannelDefault(OpStaticQDQNode): + """ + do nothing if node is quantize_per_channel.default + """ + + target = "quantized_decomposed.quantize_per_channel.default" + + +@register_node_visitor +class OpQuantizePerChannelGroupDefault(OpStaticQDQNode): + """ + do nothing if node is quantize_per_channel_group.default + """ + + target = "quantized_decomposed.quantize_per_channel_group.default" + + +@register_node_visitor +class OpDequantizePerChannelGroupDefault(OpStaticQDQNode): + """ + do nothing if node is dequantize_per_channel_group.default + """ + + target = "quantized_decomposed.dequantize_per_channel_group.default" diff --git a/backends/xnnpack/operators/op_quantize_per_tensor.py b/backends/xnnpack/operators/op_quantize_per_tensor.py deleted file mode 100644 index da15559410e..00000000000 --- a/backends/xnnpack/operators/op_quantize_per_tensor.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree.
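Reviewer note: op_quant_dequant.py above funnels every static Q/DQ node through check_scales_zeropoints before serialization. A rough Python approximation of the checks implied by the tests later in this diff; the real validate_quant_scales / validate_quant_zeropoints live in quant_utils and may differ in detail.

    import math

    FLT_MIN = 2.0 ** -126  # smallest normal float32; anything smaller is subnormal

    def validate_scales(scales) -> None:
        # Scales must be finite and normal: NaN, +/-inf, zero, and float32
        # subnormals (e.g. 1e-39, below ~1.1755e-38) are all rejected.
        for i, s in enumerate(scales):
            if not math.isfinite(s) or abs(s) < FLT_MIN:
                raise ValueError(
                    f"Scales must be finite and normal, however found scale value: {s} in scale tensor at index: ({i},)"
                )

    def validate_zeropoints(zps, qmin: int = -128, qmax: int = 127) -> None:
        # Zero points must fit the quantized dtype's range (int8 shown here).
        for i, zp in enumerate(zps):
            if zp < qmin or zp > qmax:
                raise ValueError(
                    f"Found invalid zeropoint {zp} in zero point tensor at index: ({i},)"
                )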
- -from typing import Dict - -import torch -from executorch.backends.xnnpack._passes.tag_implicit_q_dq_pass import ( - TagImplicitQDqPass, -) -from executorch.backends.xnnpack.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.xnnpack.operators.quant_params import QuantParams -from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( - XNNConvert, - XNNGraph, - XNode, -) -from executorch.backends.xnnpack.utils.utils import get_input_node - - -@register_node_visitor -class OpQuantizePerTensor(NodeVisitor): - """ - Quantize Per Tensor Node visitor - """ - - target = "quantized_decomposed.quantize_per_tensor.default" - - def __init__(self, *args) -> None: - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - xnn_graph: XNNGraph, - vals_to_ids: Dict[torch.fx.Node, int], - debug_handle: int, - ) -> None: - """ - We only define a node if it is not an implict q node - """ - q_input = get_input_node(node, 0) - if not TagImplicitQDqPass.is_tagged_as_implicit_q_dq(node): - input_quant_params = QuantParams.from_q_dq_node(node) - # fp32 input - self.define_tensor(q_input, xnn_graph, vals_to_ids) - input_id = vals_to_ids[q_input] - - # qint8 output - input_quant_params.q_input = node - input_quant_params.is_input = False - self.define_tensor( - node, xnn_graph, vals_to_ids, quant_params=input_quant_params - ) - output_id = vals_to_ids[node] - - ser_node = XNode( - xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), - debug_handle=debug_handle, - ) - xnn_graph.xnodes.append(ser_node) - else: - # If this node was ignored, then its id is the same as its parents - if q_input in vals_to_ids: - vals_to_ids[node] = vals_to_ids[q_input] diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index 6597c0568e3..face7342d8f 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -41,15 +41,6 @@ class OpChooseQparamsTensor(OpSkipOps): target = "quantized_decomposed.choose_qparams.tensor" -@register_node_visitor -class OpDequantizePerChannelDefault(OpSkipOps): - """ - do nothing if node is dequantize_per_channel.default - """ - - target = "quantized_decomposed.dequantize_per_channel.default" - - @register_node_visitor class OpGetItem(OpSkipOps): """ @@ -59,15 +50,6 @@ class OpGetItem(OpSkipOps): target = "getitem" -@register_node_visitor -class OpQuantizePerChannelDefault(OpSkipOps): - """ - do nothing if node is quantize_per_channel.default - """ - - target = "quantized_decomposed.quantize_per_channel.default" - - @register_node_visitor class OpTCopyDefault(OpSkipOps): """ @@ -113,21 +95,3 @@ class OpChooseQparamsToken(OpSkipOps): """ target = "quantized_decomposed.choose_qparams_per_token_asymmetric.default" - - -@register_node_visitor -class OpQuantizePerChannelGroupDefault(OpSkipOps): - """ - do nothing if node is quantize_per_channel_group.default - """ - - target = "quantized_decomposed.quantize_per_channel_group.default" - - -@register_node_visitor -class OpDequantizePerChannelGroupDefault(OpSkipOps): - """ - do nothing if node is dequantize_per_channel_group.default - """ - - target = "quantized_decomposed.dequantize_per_channel_group.default" diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index a2d26772ecc..e695b151560 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -102,6 +102,16 @@ def 
__init__( assert group_size > 0, "Group size must be greater than 0" self.is_per_channel_group = self.per_channel and self.group_size > 0 + if per_channel and not self.is_per_channel_group: + tensor = q_input.meta["val"] + assert ( + tensor.shape[self.axis] == cast(torch.Tensor, self.scale).shape[0] + ), f"Invalid size of per channel quantization scales, axis: {self.axis}, scale size: {self.scale.shape}, tensor shape: {tensor.shape}" + + assert ( + tensor.shape[self.axis] == cast(torch.Tensor, self.zp).shape[0] + ), f"Invalid size of per channel quantization zero-points, axis: {self.axis}, zp size: {self.zp.shape}, tensor shape: {tensor.shape}" + def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor: # Do nothing if already quantized by the Quantizer if tensor.dtype == self.dtype: diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index bf16855afc1..8712c2709ac 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -21,6 +21,7 @@ is_dynamic_qdq, is_per_channel, is_per_channel_group, + is_per_tensor, is_qparam, is_quant, ) @@ -66,8 +67,6 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_valid, _ = self.get_deps(node, ep) - if not is_valid: - why(node, "Failed to get valid dependent nodes.") return is_valid def get_node_and_deps( @@ -97,9 +96,9 @@ def _detect_precision(self, node: torch.fx.Node) -> ConfigPrecisionType: def _overwrite_precision(self, node: torch.fx.Node): precision = self._detect_precision(node) if precision not in self.enabled_precision_types: - # detected precision is not enabled, lets try to partition it as fp32 + # detected precision is not enabled, try to partition it as fp32 if self.enabled_precision_types == [ConfigPrecisionType.FP32]: - # if only fp32 is enabled, then we can still partition fp32 gemms + # when only fp32 is enabled, then we can still partition fp32 gemms # even within a quantized graph if precision in [ ConfigPrecisionType.STATIC_QUANT, @@ -108,6 +107,7 @@ def _overwrite_precision(self, node: torch.fx.Node): precision = ConfigPrecisionType.FP32 logging.info(f"Overwriting precision, partitioning {node} as FP32") return True, precision + return False, precision def get_deps( @@ -123,6 +123,7 @@ def get_deps( precision = self._detect_precision(node) if precision not in self.supported_precision_types(): # detected precision but it is either disabled or not supported + why(node, f"Unsupported precision type {precision}") return (False, []) _, precision = self._overwrite_precision(node) valid_bias, bias_deps = self._get_bias_deps(node, ep, precision) @@ -143,7 +144,8 @@ def _get_weight_deps( # First find the weight weight_node = get_input_node(node, self.weight_idx) if not is_param_node(ep, weight_node): - return (False, []) # weight must be a static param + why(node, "Expected weight to be a static param") + return (False, []) gemm_deps.append(weight_node) return (True, gemm_deps) @@ -151,19 +153,33 @@ def _get_weight_deps( # Quantized Weight deps dequant_node = get_input_node(node, self.weight_idx) if not is_dequant(dequant_node): + why(node, "Expected weight to have a dequantized node") return False, [] gemm_deps.append(dequant_node) weight = get_input_node(dequant_node, 0) if not is_param_node(ep, weight): + why(node, "Expected weight to be a static param") return False, [] gemm_deps.append(weight) + if ( + is_per_tensor(dequant_node) + and precision ==
ConfigPrecisionType.DYNAMIC_QUANT + ): + why( + node, + "XNNPACK does not support per tensor quantized weights for dynamic quantization of activations", + ) + return False, [] + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): if len(dequant_node.all_input_nodes) < 2: # Expected channel quantized to have scale/zp nodes + why(node, "Expected channel quantized to have scale/zp nodes") return False, [] gemm_deps.extend(dequant_node.all_input_nodes[1:3]) + return (True, gemm_deps) def _get_output_deps( @@ -174,7 +190,7 @@ def _get_output_deps( # Look for fused activations and tail end quant node node_users = list(node.users.keys()) if len(node_users) != 1: - # Expect quantized node to have a single output (fused act or dequant) + why(node, "Expected quantized node to have a single output") return False, [] # Check if the quantized pattern has a fused activation @@ -190,6 +206,7 @@ def _get_output_deps( if not is_quant(n_output): # Expected gemm_node --> fused_act (optional) --> dequant + why(node, "Expected output node to have a dequantized node") return (False, []) gemm_deps.append(n_output) elif precision == ConfigPrecisionType.FP32: @@ -210,11 +227,20 @@ def _get_bias_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: gemm_deps = [] + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is enabled, then we + # do not partition the weight node + return (True, gemm_deps) + if len(node.all_input_nodes) > 2 and self.bias_idx is not None: bias_node = get_input_node(node, self.bias_idx) if bias_node: if not is_param_node(ep, bias_node): - return (False, []) # bias node must be a static param + why(node, "Expected bias to be a static param") + return (False, []) gemm_deps.append(bias_node) return (True, gemm_deps) @@ -228,7 +254,7 @@ def _get_act_deps( else: dq_input = get_input_node(node, self.act_idx) if not is_dequant(dq_input): - # Expected static quant input to be dequant node + why(node, "Expected act input to be dequant node") return False, [] gemm_deps.append(dq_input) if precision == ConfigPrecisionType.STATIC_QUANT: @@ -238,6 +264,7 @@ def _get_act_deps( # q input node q_input = get_input_node(dq_input, 0) if not is_quant(q_input): + why(node, "Expected dequant input to be quant node") return (False, []) gemm_deps.append(q_input) @@ -245,20 +272,20 @@ def _get_act_deps( if is_affine_qdq(q_input): q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): - # expected to find getitem node from choose qparam + why(node, "expected to find getitem node from choose qparam") return (False, []) getitem1 = q_input_args[1] getitem2 = q_input_args[2] if not (is_getitem(getitem1) and is_getitem(getitem2)): - # expected getitem node from choose qparam + why(node, "expected getitem node from choose qparam") return (False, []) gemm_deps.extend([getitem1, getitem2]) choose_qparam = get_input_node(getitem1, 0) if not is_qparam(choose_qparam): - # expected to find choose_qparam node + why(node, "expected to find choose_qparam node") return (False, []) gemm_deps.append(choose_qparam) return (True, gemm_deps) @@ -282,8 +309,11 @@ def get_original_aten(self) -> Optional[torch._ops.OpOverload]: def _get_weight_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - if precision 
== ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is enabled, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is enabled, then we # do not partition the weight node return (True, []) @@ -389,9 +419,11 @@ def __init__(self, **kwargs): def _get_weight_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - # TODO(maxren, T210537195): - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is on and we detected this as fp32, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is on and we detected this as fp32, then we # do not partition the weight node return (True, []) @@ -466,6 +498,7 @@ def find_partition_args(input_node): # there can only be a single output node in partition or len(src_partition.output_nodes) != 1 ): + why(node, "Invalid source partition") return (False, []) # map addmm's args to the source partition linear's inputs and users @@ -477,7 +510,15 @@ def find_partition_args(input_node): node.args = old_args node.users = old_users - return valid_deps, list(set(deps) | set(src_partition.nodes)) + # When using force_non_static_weights_for_f32_linear, we want get_deps to overwrite the source partition nodes. + # Otherwise we want to be greedy. + ret_deps = ( + list(set(deps) & set(src_partition.nodes)) + if self.force_non_static_weights_for_f32_linear + else list(set(deps) | set(src_partition.nodes)) + ) + + return valid_deps, ret_deps def supported_precision_types(self): return [ @@ -499,8 +540,11 @@ def __init__(self, **kwargs): def _get_weight_deps( self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType ) -> Tuple[bool, List[torch.fx.Node]]: - if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: - # if force fp32_dynamic_linear is on and we detected this as fp32, then we + if ( + precision == ConfigPrecisionType.FP32 + and self.force_non_static_weights_for_f32_linear + ): + # if force_non_static_weights_for_f32_linear is on and we detected this as fp32, then we # do not partition the weight node return (True, []) diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index d261416a76f..20018610fce 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -41,7 +41,9 @@ def __init__(self, **kwargs): super().__init__() self.enabled_precision_types = self.supported_precision_types() # Flag used in GEMMConfig() - self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) + self.force_non_static_weights_for_f32_linear = kwargs.get( + "force_non_static_weights_for_f32_linear", False + ) def get_partition( self, node: torch.fx.Node, ep: ExportedProgram diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index 358b3085c80..e5532e17f36 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -115,7 +115,7 @@ def generate_per_op_partitions(self, ep: ExportedProgram) -> List[Partition]: class XnnpackDynamicallyQuantizedPartitioner(XnnpackPartitioner): def __init__(self):
super().__init__( - config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=True + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, ) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 8d8e9a13152..c0204831c07 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -10,8 +10,10 @@ #include #include #include -#include +#include +#include #include +#include #pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wglobal-constructors" @@ -22,7 +24,9 @@ namespace xnnpack { namespace delegate { using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; /* @@ -48,6 +52,7 @@ class CompileAllocator { using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; using GraphPtr = const fb_xnnpack::XNNGraph*; +using ConstantDataOffsetPtr = const fb_xnnpack::ConstantDataOffset*; using DataType = fb_xnnpack::XNNDatatype; // Type for define node function. This is the function signature @@ -162,7 +167,10 @@ data associated with the tensor value, then returns nullptr. const uint8_t* getConstantDataPtr( const fb_xnnpack::XNNTensorValue* tensor_value, GraphPtr flatbuffer_graph, - const uint8_t* constant_data_ptr) { + const uint8_t* constant_data_ptr, + const NamedDataMap* named_data_map, + std::vector<FreeableBuffer>& freeable_buffers, + XNNWeightsCache* weights_cache) { auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { @@ -171,10 +179,41 @@ const uint8_t* getConstantDataPtr( const auto& constant_buffer = *flatbuffer_graph->constant_buffer(); return constant_buffer[buffer_idx]->storage()->data(); } else { - const auto& constant_data_offsets = *flatbuffer_graph->constant_data(); - uint64_t constant_data_offset = - constant_data_offsets[buffer_idx]->offset(); - return constant_data_ptr + constant_data_offset; + ConstantDataOffsetPtr constant_data_offset = + flatbuffer_graph->constant_data()->Get(buffer_idx); + uint64_t offset = constant_data_offset->offset(); + + bool has_named_key = flatbuffers::IsFieldPresent( + constant_data_offset, fb_xnnpack::ConstantDataOffset::VT_NAMED_KEY); + // If there is no tensor name + if (!has_named_key) { + return constant_data_ptr + offset; + } else { + const std::string& data_name = constant_data_offset->named_key()->str(); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + Result<const uint8_t*> data_ptr = + weights_cache->load_unpacked_data(data_name); + if (!data_ptr.ok()) { + ET_LOG(Error, "Failed to load weights from cache"); + return nullptr; + } + return data_ptr.get(); +#else + Result<FreeableBuffer> buffer = + named_data_map->get_data(data_name.c_str()); + if (!buffer.ok()) { + ET_LOG( + Error, + "Failed to get constant data for key %s", + data_name.c_str()); + return nullptr; + } + const uint8_t* data_ptr = + static_cast<const uint8_t*>(buffer.get().data()); + freeable_buffers.push_back(std::move(buffer.get())); + return data_ptr; +#endif + } } } @@ -194,7 +233,10 @@ Error defineTensor( const uint8_t* constant_data_ptr, std::vector<uint32_t>& input_ids, std::vector<uint32_t>& output_ids, - CompileAllocator& allocator) { + CompileAllocator& allocator, + const NamedDataMap* named_data_map, + std::vector<FreeableBuffer>& freeable_buffers, + XNNWeightsCache* weights_cache) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -231,8 +273,13 @@ Error
defineTensor( // Get Pointer to constant data from flatbuffer, if it's non-constant // it is a nullptr - const uint8_t* buffer_ptr = - getConstantDataPtr(tensor_value, flatbuffer_graph, constant_data_ptr); + const uint8_t* buffer_ptr = getConstantDataPtr( + tensor_value, + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache); xnn_status status; // The type we might have to convert to @@ -1967,8 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel( const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - MemoryAllocator* runtime_allocator, - xnn_workspace_t workspace) { + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map) { Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; @@ -2032,6 +2080,10 @@ ET_NODISCARD Error XNNCompiler::compileModel( // Invalid ids do not need to be remapped remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID); + // If the weights cache is not on, we hold onto all the unpacked buffers + // and we free them at the end + std::vector<FreeableBuffer> unpacked_buffers; + // External Ids for inputs and outputs std::vector<uint32_t> input_ids; std::vector<uint32_t> output_ids; @@ -2045,7 +2097,10 @@ ET_NODISCARD Error XNNCompiler::compileModel( constant_data, input_ids, output_ids, - compile_allocator); + compile_allocator, + named_data_map, + unpacked_buffers, + weights_cache); if (err != Error::Ok) { return err; @@ -2067,12 +2122,26 @@ ET_NODISCARD Error XNNCompiler::compileModel( xnn_runtime_t runtime_ptr = nullptr; + // XNNWeightsCache: if the weights cache is not enabled, then XNNWeightsCache + // just manages the unpacked weights until the runtime is created. +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + ET_CHECK_OR_RETURN_ERROR( + unpacked_buffers.size() == 0, + Internal, + "Weight Cache is enabled, which means unpacked buffers should be owned by the cache"); + xnn_weights_cache_t weights_cache_ptr = + weights_cache->get_num_unpacked_data() > 0 ?
weights_cache->get() + : nullptr; +#else + xnn_weights_cache_t weights_cache_ptr = nullptr; +#endif + #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE ET_CHECK_OR_RETURN_ERROR( workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); status = xnn_create_runtime_v4( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, workspace, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, @@ -2080,7 +2149,7 @@ #else status = xnn_create_runtime_v3( subgraph.get(), - /*weight_cache=*/nullptr, // TODO - support weight cache + weights_cache_ptr, ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); @@ -2092,10 +2161,25 @@ "XNN Runtime creation failed with code: %s", xnn_status_to_string(status)); +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + auto packed_weights_names = weights_cache->finalize_for_runtime(); + ET_CHECK_OR_RETURN_ERROR( + packed_weights_names.ok(), + Internal, + "Failed to finalize weights cache after creating the xnn runtime"); +#else + for (auto& buffer : unpacked_buffers) { + buffer.Free(); + } + Result<std::vector<std::string>> packed_weights_names = + std::vector<std::string>(); +#endif + err = executor->initialize( // NOLINT: runtime_ptr is non-null runtime_ptr, std::move(input_ids), - std::move(output_ids)); + std::move(output_ids), + std::move(packed_weights_names.get())); return err; }; diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h index e66cb791ecb..bcc87351d7d 100644 --- a/backends/xnnpack/runtime/XNNCompiler.h +++ b/backends/xnnpack/runtime/XNNCompiler.h @@ -9,11 +9,9 @@ #pragma once #include +#include #include - #include -#include -#include namespace executorch { namespace backends { @@ -29,8 +27,9 @@ class XNNCompiler { const void* buffer_pointer, size_t num_bytes, XNNExecutor* executor, - executorch::runtime::MemoryAllocator* runtime_allocator, - xnn_workspace_t workspace); + XNNWeightsCache* weights_cache, + xnn_workspace_t workspace, + const NamedDataMap* named_data_map); }; } // namespace delegate diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 1ba549bb8d7..f18e319ac33 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit; ET_NODISCARD Error XNNExecutor::initialize( xnn_runtime_t runtime, std::vector<uint32_t>&& input_ids, - std::vector<uint32_t>&& output_ids) { + std::vector<uint32_t>&& output_ids, + std::vector<std::string>&& packed_data_names) { runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>( runtime, xnn_delete_runtime); @@ -51,6 +52,7 @@ std::sort(output_ids_.begin(), output_ids_.end()); externals_.resize(input_ids_.size() + output_ids_.size()); + packed_data_names_ = std::move(packed_data_names); return Error::Ok; } @@ -68,6 +70,11 @@ * delegate->execute() */ ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) { + ET_CHECK_OR_RETURN_ERROR( + runtime_ != nullptr, + Internal, + "XNNPACK Delegate did not compile correctly"); + // Create xnn_externals_value from evalue args xnn_status status; for (uint32_t i = 0; i < externals_.size(); ++i) { diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index 68ee18609e3..b98c902f44f 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -34,6 +34,7 @@
class XNNExecutor { std::vector<uint32_t> input_ids_; std::vector<uint32_t> output_ids_; std::vector<xnn_external_value> externals_; + std::vector<std::string> packed_data_names_; public: XNNExecutor() = default; @@ -46,6 +47,10 @@ class XNNExecutor { return output_ids_.size(); } + inline std::vector<std::string> get_packed_data_names() { + return packed_data_names_; + } + /** * Initialize the XNNExecutor with a given runtime and input/output ids. * The input/output ids are expected to be sorted in order of their @@ -54,7 +59,8 @@ ET_NODISCARD executorch::runtime::Error initialize( xnn_runtime_t runtime, std::vector<uint32_t>&& input_ids, - std::vector<uint32_t>&& output_ids); + std::vector<uint32_t>&& output_ids, + std::vector<std::string>&& packed_data_names); /** * Prepares the arguments for runtime graph execution. diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index d0e140ce48a..1e2f07bd905 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -7,10 +7,11 @@ */ #include +#include #include #include #include -#include +#include #include #include @@ -20,6 +21,7 @@ namespace executorch { namespace backends { +using executorch::backends::xnnpack::delegate::XNNWeightsCache; using executorch::runtime::ArrayRef; using executorch::runtime::Backend; using executorch::runtime::BackendExecutionContext; @@ -29,6 +31,7 @@ using executorch::runtime::DelegateHandle; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; +using executorch::runtime::NamedDataMap; using executorch::runtime::Result; class XnnpackBackend final : public ::executorch::runtime::BackendInterface { @@ -73,16 +76,25 @@ BackendInitContext& context, FreeableBuffer* processed, ArrayRef<CompileSpec> compile_specs) const override { - auto executor = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( - context.get_runtime_allocator(), xnnpack::delegate::XNNExecutor); + auto executor = context.get_runtime_allocator() + ->allocateInstance<xnnpack::delegate::XNNExecutor>(); + if (executor == nullptr) { + return Error::MemoryAllocationFailed; + } -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - // This is needed to serialize access to xnn_create_runtime which is not - // thread safe. This can heppen when multiple threads call init() on - // the same backend instance. + const NamedDataMap* named_data_map = context.get_named_data_map(); + +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + // This is needed to serialize access to xnn_create_runtime, which is not + // thread safe. This can happen when multiple threads call init() on + // the same backend instance. const std::lock_guard lock(workspace_mutex_); #endif +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weight_cache(weights_cache_mutex_); + weights_cache_->initialize_for_runtime( + context.get_runtime_allocator(), named_data_map); +#endif + // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the @@ -92,8 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface { processed->data(), processed->size(), executor, - context.get_runtime_allocator(), - workspace_.get()); + weights_cache_.get(), + workspace_.get(), + named_data_map); // This backend does not need its processed data after compiling the model.
processed->Free(); @@ -119,6 +132,10 @@ const std::lock_guard lock(workspace_mutex_); #endif +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache(weights_cache_mutex_); +#endif + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -139,16 +156,24 @@ void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE // This is needed to serialize access to xnn_delete_runtime which is not // thread safe. This can heppen when multiple threads call destroy() on // the same backend instance. +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE const std::lock_guard lock(workspace_mutex_); #endif + auto executor = static_cast<XNNExecutor*>(handle); + #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); #endif + +#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE + const std::lock_guard lock_weights_cache( + weights_cache_mutex_); + weights_cache_->delete_packed_data(executor->get_packed_data_names()); +#endif // XNNExecutor is not trivially destructible. Since this was constructed // manually in init(), we must destroy it manually here. executor->~XNNExecutor(); @@ -161,6 +186,15 @@ std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{ nullptr, &xnn_release_workspace}; + + // Weights cache is global to all delegate instances. + mutable std::mutex weights_cache_mutex_; + std::unique_ptr<XNNWeightsCache> weights_cache_ = + std::make_unique<XNNWeightsCache>(); + + // Lock Hierarchy for Mutexes: + // workspace_mutex_ + // weights_cache_mutex_ }; namespace { diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp new file mode 100644 index 00000000000..f2842851d3a --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
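Reviewer note: before the implementation below, a toy Python model of the ref-counting scheme XNNWeightsCache uses, with dicts standing in for the real data structures. It shows only the ownership protocol (finalize after create, delete on destroy), not the XNNPACK cache-provider callbacks.

    class PackedCacheModel:
        def __init__(self):
            self.packed = {}      # name -> packed bytes
            self.ref_count = {}   # name -> number of live runtimes using it

        def finalize_for_runtime(self, names_touched):
            # Called once after xnn_create_runtime: every packed blob the new
            # runtime looked up or inserted gains one reference.
            for name in names_touched:
                self.ref_count[name] = self.ref_count.get(name, 0) + 1
            return list(names_touched)

        def delete_packed_data(self, names):
            # Called from destroy(): a blob is freed only when the last
            # runtime that referenced it goes away.
            for name in names:
                self.ref_count[name] -= 1
                if self.ref_count[name] == 0:
                    del self.ref_count[name]
                    del self.packed[name]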
+ */ + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; + +XNNWeightsCache::XNNWeightsCache() { + weights_cache_.context = this; + weights_cache_.look_up = (size_t(*)( + void*, const xnn_weights_cache_look_up_key*))XNNWeightsCache::look_up; + weights_cache_.reserve_space = + (void* (*)(void*, size_t))XNNWeightsCache::reserve_space; + weights_cache_.look_up_or_insert = + (size_t(*)(void*, const xnn_weights_cache_look_up_key*, void*, size_t)) + XNNWeightsCache::look_up_or_insert; + weights_cache_.is_finalized = (bool (*)(void*))XNNWeightsCache::is_finalized; + weights_cache_.offset_to_addr = + (void* (*)(void*, size_t))XNNWeightsCache::offset_to_addr; + weights_cache_.delete_cache = + (enum xnn_status(*)(void*))XNNWeightsCache::delete_cache; +} + +Error XNNWeightsCache::initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map) { + runtime_allocator_ = runtime_allocator; + named_data_map_ = named_data_map; + is_finalized_ = false; + + return Error::Ok; +} + +Result<std::vector<std::string>> XNNWeightsCache::finalize_for_runtime() { + is_finalized_ = true; + + // All data has been packed by create_runtime + // so we clear the unpacked data as it is no longer needed + for (FreeableBuffer& buffer : unpacked_data_) { + buffer.Free(); + } + unpacked_data_.clear(); + unpacked_data_to_name_.clear(); + + std::vector<std::string> packed_data_names; + // update the reference count of all the packed data + // used by this runtime + for (auto& entry : name_to_packed_data_metadata_) { + if (entry.second.in_current_runtime) { + entry.second.ref_count++; + entry.second.in_current_runtime = false; + packed_data_names.push_back(entry.first); + } + } + + return packed_data_names; +} + +Result<const uint8_t*> XNNWeightsCache::load_unpacked_data( + const std::string& name) { + Result<FreeableBuffer> named_data = named_data_map_->get_data(name.c_str()); + if (!named_data.ok()) { + ET_LOG(Error, "Failed to load constant data for key %s", name.c_str()); + return Error::InvalidExternalData; + } + const uint8_t* data_pointer = + static_cast<const uint8_t*>(named_data.get().data()); + unpacked_data_.push_back(std::move(named_data.get())); + unpacked_data_to_name_[data_pointer] = name; + + return data_pointer; +} + +Error XNNWeightsCache::delete_packed_data( + const std::vector<std::string>& packed_data_names) { + if (!is_finalized_) { + ET_LOG( + Error, + "Error, attempted to delete packed data from the cache but the cache is not finalized"); + return Error::InvalidArgument; + } + for (const std::string& name : packed_data_names) { + auto entry = name_to_packed_data_metadata_.find(name); + if (entry == name_to_packed_data_metadata_.end()) { + ET_LOG( + Error, + "Error, attempted to delete packed data: %s, from the cache but it wasn't found", + name.c_str()); + return Error::InvalidArgument; + } else { + entry->second.ref_count--; + if (entry->second.ref_count == 0) { + void* packed_data_ptr = packed_data_ptrs_[entry->second.offset]; + // Erasing the key/value from the map frees the pointer holding the packed + // data + packed_pointer_to_container_.erase(packed_data_ptr); + // remove the pointer from the packed_data_ptrs_ + packed_data_ptrs_[entry->second.offset] = nullptr; + // Erase the name to packed metadata entry + name_to_packed_data_metadata_.erase(entry->first); + } + } + } + + return Error::Ok; +} + +size_t XNNWeightsCache::look_up( + XNNWeightsCache* context, + const
xnn_weights_cache_look_up_key* cache_key) { + const void* unpacked_weights_ptr = cache_key->kernel; + const void* unpacked_bias_ptr = cache_key->bias; + auto entry = context->unpacked_data_to_name_.find(unpacked_weights_ptr); + + // Check if weight_pointer has been cached + if (entry == context->unpacked_data_to_name_.end()) { + return SIZE_MAX; + } + + std::string weight_bias_name = entry->second; + + // Check if bias_pointer has been cached + if (unpacked_bias_ptr != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(unpacked_bias_ptr); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + + // check if weight_bias_name has been packed already + auto packed_weight_entry = + context->name_to_packed_data_metadata_.find(weight_bias_name); + if (packed_weight_entry == context->name_to_packed_data_metadata_.end()) { + return SIZE_MAX; + } + packed_weight_entry->second.in_current_runtime = true; + + return packed_weight_entry->second.offset; +} + +void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) { + // MemoryAllocator* allocator = context->runtime_allocator_; + // void* reserved_pointer = allocator->allocate(n, + // context->kPackedAllocationAlignment); + + // return reserved_pointer; + std::string data_container; + data_container.resize(n + context->kPackedAllocationAlignment); + void* maybe_aligned_space = data_container.data(); + void* aligned_space = (void*)((intptr_t)maybe_aligned_space + 64 - + (intptr_t)maybe_aligned_space % 64); + + context->packed_pointer_to_container_[aligned_space] = + std::move(data_container); + return aligned_space; +} + +size_t XNNWeightsCache::look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size) { + size_t offset = context->look_up(context, cache_key); + + if (offset != SIZE_MAX) { + void* saved_ptr = context->offset_to_addr(context, offset); + if (0 == memcmp(ptr, saved_ptr, size)) { + return offset; + } + // Failure, cache is out of date + return SIZE_MAX; + } + + // Add to Cache if it is not finalized + size_t next_offset = context->packed_data_ptrs_.size(); + auto entry = context->unpacked_data_to_name_.find(cache_key->kernel); + + // Check if weight_pointer has been cached + if (entry != context->unpacked_data_to_name_.end()) { + std::string weight_bias_name = entry->second; + if (cache_key->bias != nullptr) { + auto bias_entry = context->unpacked_data_to_name_.find(cache_key->bias); + if (bias_entry != context->unpacked_data_to_name_.end()) { + weight_bias_name.append(bias_entry->second); + } + } + PackedDataMeta packed_data_metadata = { + .offset = next_offset, + .ref_count = + 0, // ref_count is only incremented after finalizing for runtime + .in_current_runtime = true}; + context->name_to_packed_data_metadata_[weight_bias_name] = + packed_data_metadata; + } else { + ET_LOG( + Info, + "Warning: Unpacked weight and bias were not registered with names, " + "this will add new cache entries for packed data and may affect performance."); + } + context->packed_data_ptrs_.push_back(ptr); + + return next_offset; +} + +bool XNNWeightsCache::is_finalized(XNNWeightsCache* context) { + return context->is_finalized_; +} + +void* XNNWeightsCache::offset_to_addr(XNNWeightsCache* context, size_t offset) { + return context->packed_data_ptrs_[offset]; +} + +enum xnn_status XNNWeightsCache::delete_cache(XNNWeightsCache* context) { + return xnn_status_success; +} + +} // namespace delegate +} // 
namespace xnnpack } // namespace backends } // namespace executorch diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h new file mode 100644 index 00000000000..bc00ac15fd0 --- /dev/null +++ b/backends/xnnpack/runtime/XNNWeightsCache.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace xnnpack { +namespace delegate { + +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; + +struct PackedDataMeta { + size_t offset; + // Count number of xnn_runtime_t this packed data is used in + size_t ref_count; + // true if this packed data was inserted or looked up for the + // current runtime being created + bool in_current_runtime; +}; + +class XNNWeightsCache { + public: + XNNWeightsCache(); + + /** + * Initializes the XNNWeightsCache for the next xnn_create_runtime + */ + Error initialize_for_runtime( + MemoryAllocator* runtime_allocator, + const NamedDataMap* named_data_map); + + /** + * Finalizes the weights cache after the weights have been packed + * in xnn_create_runtime. + * + * This should only be called after creating the runtime. Returns + * the names of all the packed weights used by this runtime + */ + Result<std::vector<std::string>> finalize_for_runtime(); + + // Taken from XNN_ALLOCATION_ALIGNMENT in xnnpack/common.h + static const size_t kPackedAllocationAlignment = 64; + + /** + * Returns XNNPACK's underlying weights_cache pointer + */ + inline xnn_weights_cache_t get() { + return (xnn_weights_cache_t)&weights_cache_; + } + + /** + * Returns the number of unpacked data + */ + inline size_t get_num_unpacked_data() { + return unpacked_data_.size(); + }; + + /** + * Returns the names of all unpacked data + */ + inline std::vector<std::string> get_unpacked_data_names() { + std::vector<std::string> names; + for (const auto& pair : unpacked_data_to_name_) { + names.push_back(pair.second); + } + return names; + }; + + /** + * Returns the packed data names + */ + inline std::vector<std::string> get_packed_data_names() { + std::vector<std::string> names; + for (const auto& pair : name_to_packed_data_metadata_) { + names.push_back(pair.first); + } + return names; + }; + + /** + * Loads unpacked named data from the NamedDataMap into this XNNWeightsCache + * and returns a pointer to the unpacked data. This unpacked data is given + * to XNNPACK's define_tensor APIs, and used as the cache key for + * look_up_or_insert. + * @param[in] name The name of the data to load + * @return Pointer to the unpacked data that was loaded + */ + Result<const uint8_t*> load_unpacked_data(const std::string& name); + + /** + * Deletes the packed data associated with the names given.
+ * Decrements the ref_count for each name; the packed data is freed + * once its ref_count reaches zero. + * + */ + Error delete_packed_data(const std::vector<std::string>& packed_names); + + private: + // Runtime Allocator used to reserve memory for packed weights + MemoryAllocator* runtime_allocator_; + + // Named Data Map used to load named data + const NamedDataMap* named_data_map_; + + // Map of unpacked pointers to the data name + std::unordered_map<const void*, std::string> unpacked_data_to_name_; + // Map of data names to offset into the packed data + std::unordered_map<std::string, PackedDataMeta> name_to_packed_data_metadata_; + // Vector holding list of pointers to the packed data + std::vector<void*> packed_data_ptrs_; + // Map of packed data pointers to the strings that own their storage + std::unordered_map<void*, std::string> packed_pointer_to_container_; + // Vector holding list of unpacked freeable buffers + std::vector<FreeableBuffer> unpacked_data_; + // xnnpack's weight cache provider + xnn_weights_cache_provider weights_cache_; + // whether or not the weight cache is finalized + bool is_finalized_; + + // Function pointers to override XNNPACK's default xnn_weights_cache_provider + // functions. + static size_t look_up( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key); + + static void* reserve_space(XNNWeightsCache* context, size_t n); + + static size_t look_up_or_insert( + XNNWeightsCache* context, + const xnn_weights_cache_look_up_key* cache_key, + void* ptr, + size_t size); + + static bool is_finalized(XNNWeightsCache* context); + + static void* offset_to_addr(XNNWeightsCache* context, size_t offset); + + static enum xnn_status delete_cache(XNNWeightsCache* context); +}; + +} // namespace delegate +} // namespace xnnpack +} // namespace backends +} // namespace executorch diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 11cb48430ed..75074107c55 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -320,11 +320,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // is valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // Unique string id used to query the data from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // is valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 5a43481b98d..193656c30b1 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -316,11 +316,20 @@ table XNNLeakyReLU { table ConstantDataOffset { // Constant data offsets are relative to the constant data base offset provided // in the XNNPACKHeader. + // named_key and offset are mutually exclusive, meaning only one of these values + // is valid. If the named key is a non-empty string, then the offset must be UINT64_MAX.
+ // If the offset is not UINT64_MAX, then the named key must be an empty string offset: uint64; // The size in bytes of valid data starting at the offset. The constant data // may be followed by padding before the next piece of constant data size: uint64; + + // Unique string id used to query the data from the named data store. + // named_key and offset are mutually exclusive, meaning only one of these values + // is valid. If the named key is a non-empty string, then the offset must be UINT64_MAX. + // If the offset is not UINT64_MAX, then the named key must be an empty string + named_key: string; } table XNNGraph { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 3276dac7869..3cb572c66ef 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -470,6 +470,7 @@ class XValue: class ConstantDataOffset: offset: int size: int + named_key: str = "" @dataclass diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index b89a999bc3d..e97f1941ff7 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -6,11 +6,15 @@ def _get_preprocessor_flags(): Disable if someone explictly specified a config option, else Enable otherwise """ - if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0": - return [] + preprocessor_flags = [] + if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE") + + if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0": + preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE") # Enable if not disabled through config - return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"] + return preprocessor_flags def define_common_targets(): runtime.cxx_library( @@ -60,6 +64,7 @@ def define_common_targets(): "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", + "//executorch/runtime/executor:pte_data_map", ], # XnnpackBackend.cpp needs to compile with executor as whole # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 0874775774d..c73709817f2 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
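Reviewer note: both schema files above state that offset and named_key are mutually exclusive. A hypothetical checker making that contract concrete:

    UINT64_MAX = 2**64 - 1

    def validate_constant_data_offset(offset: int, named_key: str) -> None:
        # Exactly one addressing mode is live per entry.
        if named_key:
            # Named data: the offset field must carry the sentinel.
            assert offset == UINT64_MAX, "non-empty named_key requires offset == UINT64_MAX"
        else:
            # Inline data: a real offset, so the sentinel is disallowed.
            assert offset != UINT64_MAX, "empty named_key requires a real offset"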
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs runtime/test_xnnexecutor.cpp diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index b3143743b9c..9b2ce0a4e82 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -53,7 +53,7 @@ runtime.python_test( srcs = glob([ "models/*.py", ]), - tags = ["long_running"], + labels = ["long_running"], deps = [ "fbsource//third-party/pypi/timm:timm", "fbsource//third-party/pypi/torchsr:torchsr", # @manual diff --git a/backends/xnnpack/test/ops/test_check_quant_params.py b/backends/xnnpack/test/ops/test_check_quant_params.py new file mode 100644 index 00000000000..cd18568afba --- /dev/null +++ b/backends/xnnpack/test/ops/test_check_quant_params.py @@ -0,0 +1,104 @@ +import unittest + +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) +from executorch.backends.xnnpack.utils.utils import get_param_tensor +from executorch.exir import to_edge_transform_and_lower +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +from torch.export import export_for_training + + +class TestCheckQuantParams(unittest.TestCase): + def create_invalid_value_injector( + self, invalid_value, is_per_channel=False, is_zp=False + ): + def inject_invalid_scale_in_per_tensor(aten): + for node in aten.graph_module.graph.nodes: + target_to_find = ( + torch.ops.quantized_decomposed.quantize_per_tensor.default + if not is_per_channel + else torch.ops.quantized_decomposed.dequantize_per_channel.default + ) + if node.op == "call_function" and node.target == target_to_find: + if is_zp: + node_args = list(node.args) + node_args[2] = invalid_value + node.args = tuple(node_args) + break + else: + scale = node.args[1] + if is_per_channel: + self.assertTrue(isinstance(scale, torch.fx.Node)) + scale_tensor = get_param_tensor(aten, scale) + scale_tensor[2] = invalid_value + else: + self.assertTrue(isinstance(scale, float)) + node_args = list(node.args) + node_args[1] = invalid_value + node.args = tuple(node_args) + break + + return inject_invalid_scale_in_per_tensor + + def _test_check_quant_message(self, ep_modifier, expected_message): + mod = torch.nn.Linear(10, 10) + quantizer = XNNPACKQuantizer() + captured = export_for_training(mod, (torch.randn(1, 10),)).module() + quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True)) + prepared = prepare_pt2e(captured, quantizer) + + prepared(*(torch.randn(1, 10),)) + converted = convert_pt2e(prepared) + aten = torch.export.export(converted, (torch.randn(1, 10),)) + + ep_modifier(aten) + + with self.assertRaises(ValueError) as context: + to_edge_transform_and_lower(aten, partitioner=[XnnpackPartitioner()]) + + self.assertEquals(str(context.exception), expected_message) + + def test_in_per_tensor_quant(self): + + for invalid_scale in [ + float("nan"), + float("inf"), + -float("inf"), + 1.0000002153053333e-39, + ]: + self._test_check_quant_message( + self.create_invalid_value_injector(invalid_scale), + "Invalid quantization scale or zero point for quantized_decomposed_quantize_per_tensor_default: " + "Scales must be finite and normal, however found scale value: " + f"{invalid_scale} in scale tensor at index: (0,)", + ) + + def test_in_per_channel_quant(self): + for invalid_scale in [ + float("nan"), + float("inf"), + 
-float("inf"), + 1.0000002153053333e-39, + ]: + self._test_check_quant_message( + self.create_invalid_value_injector(invalid_scale, is_per_channel=True), + "Invalid quantization scale or zero point for quantized_decomposed_dequantize_per_channel_default: " + "Scales must be finite and normal, however found scale value: " + f"{invalid_scale} in scale tensor at index: (2,)", + ) + + def test_inject_invalid_zp(self): + for invalid_zp in [-129, 128]: + self._test_check_quant_message( + self.create_invalid_value_injector( + invalid_zp, is_zp=True, is_per_channel=False + ), + "Invalid quantization scale or zero point for quantized_decomposed_quantize_per_tensor_default: " + f"Found invalid zeropoint {invalid_zp} " + "in zero point tensor at index: (0,)", + ) diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index eccda406b80..cf9473180bb 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -31,6 +31,8 @@ ToEdgeTransformAndLower, ) +from torch.export.graph_signature import ExportGraphSignature, InputKind + try: from torchao.quantization.quant_api import ( int8_dynamic_activation_int4_weight, @@ -189,6 +191,21 @@ def forward(self, x, y): return a + b +class SharedDQChain(torch.nn.Module): + def __init__(self, input_size, output_size): + super().__init__() + self.linear1_weight = torch.nn.Parameter(torch.rand(output_size, input_size)) + self.linear1_bias = torch.nn.Parameter(torch.rand(output_size)) + + self.linear2_weight = torch.nn.Parameter(torch.rand(output_size, input_size)) + self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) + + def forward(self, x): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(x, self.linear2_weight, self.linear2_bias) + return a + b + + class TestLinear(unittest.TestCase): """ Test Class for XNNPACK Linear Operators. @@ -518,6 +535,23 @@ def get_qnode_checks(quant_node_checks, dialect): # qtol=bool(quant_config), atol=atol # ) + def test_qd8_f32_per_channel_shared_dq_chain(self): + for use_bias in (False, True): + module = SharedDQChain( + input_size=13, + output_size=17, + ) + inputs = (torch.randn(1, 2, 13),) + + self._test_dqlinear( + module, + inputs, + dynamic_shapes=None, + is_per_channel=True, + linear_count=2, + uses_bias=use_bias, + ) + def _test_qd8_per_channel_linear(self, dtype: torch.dtype = torch.float): for uses_bias in (False, True): module = BaseLinear( @@ -537,6 +571,66 @@ def _test_qd8_per_channel_linear(self, dtype: torch.dtype = torch.float): uses_bias=uses_bias, ) + def _test_qd8_linear_per_tensor_unsupported(self, dtype: torch.dtype = torch.float): + for uses_bias in (False, True): + module = BaseLinear( + in_size=8, + input_channels=13, + output_channels=17, + dtype=dtype, + use_bias=uses_bias, + ) + inputs = module.get_inputs() + dynamic_shapes = ({1: torch.export.Dim("batch", max=100)},) + + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=True, + ) + + for legacy_partitioner in (True, False): + for per_op_mode in (True, False): + # Every combination should fail to partition Linear or [add]mm. 
+ DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=per_op_mode, + ) + + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) + tester.export() + + if legacy_partitioner: + tester.to_edge() + tester.partition( + Partition(DynamicallyQuantizedPartitioner) + ).dump_artifact() + # should have [add]mm node + if uses_bias: + tester.check( + [ + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) + else: + tester.check( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + ] + ) + else: + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) + ).dump_artifact() + # should not have a delegate node + tester.check_not( + [ + "torch.ops.higher_order.executorch_call_delegate", + ] + ) + # No need to run the model, since it should fail to partition. + return + def _test_qd8_per_channel_4w_linear(self, dtype: torch.dtype = torch.float): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -583,31 +677,32 @@ def _test_qd8_per_token_weight_per_channel_group_int4( bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] - for use_bias in [True, False]: - for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): - lin_mod = BaseLinear( - in_size=M, - input_channels=K, - output_channels=N, - dtype=dtype, - use_bias=use_bias, - ) + for input_rank in range(2, 4): + for use_bias in [True, False]: + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( + in_size=M, + input_channels=K, + output_channels=N, + dtype=dtype, + use_bias=use_bias, + ) - inputs = lin_mod.get_inputs() - # Half requires slightly higher atol, but if you look at error it is not that bad: - # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. - # -- Model vs. Reference -- - # Numel: 4, 4 - # Median: -0.05023193359375, -0.0516357421875 - # Mean: 0.2373046875, 0.237060546875 - # Max: 1.0078125, 1.0078125 - # Min: -0.08465576171875, -0.08441162109375 - atol = ( - 1e-2 if dtype == torch.half else 5e-3 - ) # TODO(T212995726): Investigate right atol for rand[n] inputs - self._test_groupwise_dq_linear( - lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=atol - ) + inputs = lin_mod.get_inputs(rank=input_rank) + # Half requires slightly higher atol, but if you look at error it is not that bad: + # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. + # -- Model vs. Reference -- + # Numel: 4, 4 + # Median: -0.05023193359375, -0.0516357421875 + # Mean: 0.2373046875, 0.237060546875 + # Max: 1.0078125, 1.0078125 + # Min: -0.08465576171875, -0.08441162109375 + atol = ( + 1e-2 if dtype == torch.half else 5e-3 + ) # TODO(T212995726): Investigate right atol for rand[n] inputs + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=atol + ) def test_fp16_linear(self): for use_bias in (True, False): @@ -695,10 +790,24 @@ def test_qs8_linear(self): def test_qd8_f16_per_channel_linear(self): self._test_qd8_per_channel_linear(dtype=torch.half) + def test_qd8_f16_per_tensor_linear(self): + """ + XNNPACK doesn't support per_tensor quantized weights for dynamic quantized linear op. + This test is to verify that we can't lower per_tensor quantized weights to per_channel quantized weights. 
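+        Both the legacy to_edge + partition flow and the
+        to_edge_transform_and_lower flow are expected to leave the op
+        un-delegated here.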
+ """ + self._test_qd8_linear_per_tensor_unsupported(dtype=torch.half) + # Tests for q[dp]8-f32-qc8w def test_qd8_f32_per_channel_linear(self): self._test_qd8_per_channel_linear(dtype=torch.float) + def test_qd8_f32_per_tensor_linear(self): + """ + XNNPACK doesn't support per_tensor quantized weights for dynamic quantized linear op. + This test is to verify that we can't lower per_tensor quantized weights to per_channel quantized weights. + """ + self._test_qd8_linear_per_tensor_unsupported(dtype=torch.half) + # Tests for q[dp]8-f16-qc4w def test_linear_qd8_f16_per_channel_int4(self): self._test_qd8_per_channel_4w_linear(dtype=torch.half) @@ -871,3 +980,71 @@ def test_linear_qd8_as_fp32(self): "dequantize_per_channel.default": 1, # 1: weight }, ) + + def test_linear_with_force_non_static_weights_for_f32_linear(self): + def check_signature( + signature: ExportGraphSignature, + force_flag: bool, + use_bias: bool, + legacy_mode: bool, + ): + num_params = 0 + if force_flag: + num_params = 1 # weight_param + if use_bias: + num_params += 1 # bias_param + sign_params: int = 0 + input_specs = signature.input_specs + for input_spec in input_specs: + if input_spec.kind == InputKind.PARAMETER: + sign_params += 1 + assert ( + sign_params == num_params + ), f"Expected {num_params} params, got {sign_params} with force_flag={force_flag}, use_bias={use_bias}, legacy_mode={legacy_mode}" + + for force_flag in (True, False): + for use_bias in (True, False): + for legacy_mode in (True, False): + module = BaseLinear( + in_size=8, + input_channels=13, + output_channels=17, + use_bias=use_bias, + ) + inputs = module.get_inputs() + tester = Tester(module, inputs).export() + partitioner = XnnpackPartitioner( + force_non_static_weights_for_f32_linear=force_flag + ) + if legacy_mode: + tester.to_edge() + partitioner_stage = Partition(partitioner=partitioner) + tester.partition(partition_stage=partitioner_stage) + tester.check_not( + [ + ( + "executorch_exir_dialects_edge__ops_aten_mm_default" + if use_bias + else "executorch_exir_dialects_edge__ops_aten_addmm_default" + ) + ] + ) + else: + to_edge_and_transform_stage = ToEdgeTransformAndLower( + partitioners=[partitioner] + ) + tester.to_edge_transform_and_lower( + to_edge_and_transform_stage=to_edge_and_transform_stage + ) + tester.check_not( + ["executorch_exir_dialects_edge__ops_aten_linear_default"] + ) + + signature: ExportGraphSignature = ( + tester.get_artifact().exported_program().graph_signature + ) + check_signature(signature, force_flag, use_bias, legacy_mode) + + tester.to_executorch() + tester.serialize() + tester.run_method_and_compare_outputs() diff --git a/backends/xnnpack/test/ops/test_lstm.py b/backends/xnnpack/test/ops/test_lstm.py index bfc6113c417..6c174b16f33 100644 --- a/backends/xnnpack/test/ops/test_lstm.py +++ b/backends/xnnpack/test/ops/test_lstm.py @@ -43,20 +43,21 @@ def test_fp32_lstm(self): .run_method_and_compare_outputs() ) - def test_fp32_lstm_force_dynamic_linear(self): + def test_lstm_with_force_non_static_weights_for_f32_linear(self): ( Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) .export() .to_edge_transform_and_lower( ToEdgeTransformAndLower( - partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + partitioners=[ + XnnpackPartitioner(force_non_static_weights_for_f32_linear=True) + ] ) ) .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) # Weights are supplied as input to linears - .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) - # Biases are owned by delegates - 
.check_not(["p_lstm_bias"]) + # Biases are not owned by delegates when force_non_static_weights_for_f32_linear is set + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0", "p_lstm_bias"]) .to_executorch() .serialize() .run_method_and_compare_outputs() diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp new file mode 100644 index 00000000000..ca149a67b5e --- /dev/null +++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using executorch::backends::xnnpack::delegate::XNNWeightsCache; +using executorch::extension::FileDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; +using executorch::runtime::internal::PteDataMap; + +class XNNWeightsCacheTest : public ::testing::Test { + protected: + void SetUp() override { + // Creating a NamedDataMap from scratch is a little bit convoluted, so + // we copied a lot of setup from test_pte_data_map.cpp + + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + executorch::runtime::runtime_init(); + + // Create a sample Program with only named_data and segments. Technically + // not a valid Program; only used to test the PteDataMap. + // Create named data. + std::array, 2> + named_data_arr = { + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "weight", /*segment_index=*/0), + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "bias", /*segment_index=*/1), + }; + const auto named_data = + builder_.CreateVector(named_data_arr.data(), named_data_arr.size()); + + // Create segments. + std::array, 2> + segment_arr = {// @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, /*offset=*/0, /*size=*/kSegmentSizes[0]), + // @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, + /*offset=*/kSegmentAlignment * 2, + /*size=*/kSegmentSizes[1])}; + const auto segments = + builder_.CreateVector(segment_arr.data(), segment_arr.size()); + + // Create Program. + const auto program = executorch_flatbuffer::CreateProgram( + builder_, 0, 0, 0, 0, segments, 0, 0, named_data); + + builder_.Finish(program); + program_ = executorch_flatbuffer::GetProgram(builder_.GetBufferPointer()); + + // Create sample segment data. + for (int i = 0; i < kSegmentSizes[0]; i++) { + sample_data_[i] = 1; + } + for (int i = kSegmentOffsets[1]; i < kSegmentOffsets[1] + kSegmentSizes[1]; + i++) { + sample_data_[i] = 2; + } + TempFile tf(sample_data_.data(), sizeof(sample_data_)); + + // Wrap the sample data in a loader. 
+ Result loader = + FileDataLoader::from(tf.path().c_str(), kSegmentAlignment); + ASSERT_EQ(loader.error(), Error::Ok); + data_map_loader_ = + std::make_unique(std::move(loader.get())); + + Result data_map = PteDataMap::create( + data_map_loader_.get(), + 0, + program_->named_data(), + program_->segments()); + ASSERT_EQ(data_map.error(), Error::Ok); + data_map_ = std::make_unique(std::move(data_map.get())); + + memory_allocator_ = std::make_unique( + memory_allocator_data_.size(), memory_allocator_data_.data()); + + xnn_status status = xnn_initialize(nullptr); + ASSERT_EQ(status, xnn_status_success); + } + + void BuildAndRunGraphWithWeightsCache( + XNNWeightsCache& weight_cache, + const std::vector& batches, + size_t input_channels, + size_t output_channels, + float* input_data, + float* output_data) { + // Defining subgraph + xnn_subgraph_t subgraph_ptr = nullptr; + xnn_status status = xnn_create_subgraph( + /*external_value_ids=*/2, + /*flags=*/0, + &subgraph_ptr); + ASSERT_EQ(status, xnn_status_success); + std::unique_ptr subgraph( + subgraph_ptr, &xnn_delete_subgraph); + + // Define tensors + // Define input + uint32_t input_id; + std::vector input_dims(batches); + input_dims.push_back(input_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + input_dims.size(), + input_dims.data(), + nullptr, + 0, + XNN_VALUE_FLAG_EXTERNAL_INPUT, + &input_id); + + // Define weight + uint32_t weight_id; + Result weight_pointer = + weight_cache.load_unpacked_data("weight"); + ASSERT_TRUE(weight_pointer.ok()); + ASSERT_TRUE(weight_pointer.get() != nullptr); + std::vector weight_dims{output_channels, input_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + weight_dims.size(), + weight_dims.data(), + weight_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &weight_id); + ASSERT_EQ(status, xnn_status_success); + + // Define bias + uint32_t bias_id; + Result bias_pointer = + weight_cache.load_unpacked_data("bias"); + ASSERT_TRUE(bias_pointer.ok()); + std::vector bias_dims{output_channels}; + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + bias_dims.size(), + bias_dims.data(), + bias_pointer.get(), + XNN_INVALID_VALUE_ID, + 0, + &bias_id); + + // Define output tensor + uint32_t output_id; + std::vector output_dims(batches); + output_dims.push_back(output_channels); + status = xnn_define_tensor_value( + subgraph_ptr, + xnn_datatype_fp32, + output_dims.size(), + output_dims.data(), + nullptr, + 1, + XNN_VALUE_FLAG_EXTERNAL_OUTPUT, + &output_id); + + // create xecond fully connected + status = xnn_define_fully_connected( + subgraph_ptr, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + input_id, + weight_id, + bias_id, + output_id, + 0); + // Create and Pack Weights + xnn_runtime_t runtime_ptr = nullptr; + status = xnn_create_runtime_v3( + subgraph_ptr, weight_cache.get(), nullptr, 0, &runtime_ptr); + Result> packed_weights_added = + weight_cache.finalize_for_runtime(); + ASSERT_TRUE(packed_weights_added.ok()); + ASSERT_EQ(packed_weights_added.get().size(), 1); + ASSERT_EQ(packed_weights_added.get()[0], "weightbias"); + + auto runtime = std::unique_ptr( + runtime_ptr, xnn_delete_runtime); + + const std::array external = { + xnn_external_value{0, input_data}, + xnn_external_value{1, output_data}, + }; + + status = xnn_reshape_runtime(runtime.get()); + status = + xnn_setup_runtime_v2(runtime.get(), external.size(), external.data()); + + ASSERT_EQ(status, xnn_status_success); + status = 
xnn_invoke_runtime(runtime.get()); + ASSERT_EQ(status, xnn_status_success); + } + + // Program builder constants. + static constexpr int kSegmentAlignment = 16; + static constexpr std::array kSegmentSizes{384, 128}; + static constexpr std::array kSegmentOffsets{0, kSegmentAlignment * 2}; + std::array sample_data_; + + // Program builder. + flatbuffers::FlatBufferBuilder builder_; + const executorch_flatbuffer::Program* program_; + + // Data loader for the sample data. + std::unique_ptr data_map_loader_; + + // PteDataMap + std::unique_ptr data_map_; + + // MemoryAllocator + std::array memory_allocator_data_; + std::unique_ptr memory_allocator_; +}; + +TEST_F(XNNWeightsCacheTest, ReusePackedWeights) { + XNNWeightsCache weight_cache; + size_t padding = 32; + + std::vector batches{1, 2, 3}; + size_t num_batches = 1; + for (size_t batch_dim : batches) { + num_batches *= batch_dim; + } + size_t input_channels = 3; + size_t output_channels = 4; + std::vector input_tensor(num_batches * input_channels + padding, 1.0f); + std::vector output_tensor(num_batches * output_channels, 0.0f); + float* input_data = input_tensor.data(); + float* output_data = output_tensor.data(); + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + + weight_cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get()); + BuildAndRunGraphWithWeightsCache( + weight_cache, + batches, + input_channels, + output_channels, + input_data, + output_data); + ASSERT_EQ(weight_cache.get_num_unpacked_data(), 0); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + std::vector packed_data_names = + weight_cache.get_packed_data_names(); + // Packed Data Still exists because it has a ref count of 2 + ASSERT_EQ(packed_data_names.size(), 1); + weight_cache.delete_packed_data(weight_cache.get_packed_data_names()); + packed_data_names = weight_cache.get_packed_data_names(); + ASSERT_EQ(packed_data_names.size(), 0); +} diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index a5a26004b49..42d925c1253 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { }, { 1, - }), + }, + {}), Error::Ok); TensorFactory tf; auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42}); diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index 30ce970a842..58589b70607 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -30,3 +30,16 @@ def define_common_targets(): "//executorch/backends/xnnpack:xnnpack_backend", ], ) + + runtime.cxx_test( + name = "test_xnn_weights_cache", + srcs = ["runtime/test_xnn_weights_cache.cpp"], + deps = [ + third_party_dep("XNNPACK"), + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/runtime/executor:pte_data_map", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/testing_util:temp_file", + "//executorch/schema:program", + ], + ) diff --git a/backends/xnnpack/test/tester/TARGETS b/backends/xnnpack/test/tester/TARGETS index 0ba34cc0bfa..231de970d7b 100644 --- a/backends/xnnpack/test/tester/TARGETS +++ b/backends/xnnpack/test/tester/TARGETS @@ -26,5 +26,6 @@ runtime.python_library( "//executorch/exir/backend:partitioner", 
"//executorch/exir/passes:spec_prop_pass", "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/backends/transforms:duplicate_dynamic_quant_chain" ], ) diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index 79544256022..a82688cd52c 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -15,6 +15,9 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Type, Union import torch +from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( + DuplicateDynamicQuantChainPass, +) from executorch.backends.xnnpack._passes import XNNPACKPassManager from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config @@ -177,6 +180,8 @@ def run( prepared(*inputs) converted = convert_pt2e(prepared) + DuplicateDynamicQuantChainPass()(converted) + self.converted_graph = converted @property @@ -306,9 +311,8 @@ def __init__( self.edge_dialect_program = None def run(self, artifact: ExportedProgram, inputs=None) -> None: - artifact_to_run = copy.deepcopy(artifact) self.edge_dialect_program = to_edge_transform_and_lower( - artifact_to_run, + artifact, compile_config=self.edge_compile_conf, partitioner=self.partitioners, ) diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 8cb9affede3..f940a38025b 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -7,7 +7,7 @@ load( _XNNPACK_SRCS = "XNNPACK_SRCS", ) load("//backends/xnnpack/third-party/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch") -load("//third-party:glob_defs.bzl", "subdir_glob") +load("@fbsource//xplat/executorch/third-party:glob_defs.bzl", "subdir_glob") def define_xnnpack_build_src(xnnpack_build_src): return ["XNNPACK/{}".format(src) for src in xnnpack_build_src] diff --git a/backends/xnnpack/utils/gen_xnnpack_constants.sh b/backends/xnnpack/utils/gen_xnnpack_constants.sh index 6be9d4519f3..5fa92e5b038 100644 --- a/backends/xnnpack/utils/gen_xnnpack_constants.sh +++ b/backends/xnnpack/utils/gen_xnnpack_constants.sh @@ -26,5 +26,6 @@ } > xnnpack_constants.py echo UINT32_MAX = 4294967295 >> xnnpack_constants.py +echo UINT64_MAX = 18446744073709551615 >> xnnpack_constants.py awk '/^#define\s+XNN_/ { print $2,"=",$3} ' "$1"/include/xnnpack.h >> xnnpack_constants.py if ! 
grep -qc "^XNN_" xnnpack_constants.py; then false; fi diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index 7c035757a6f..db1914e3910 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -6,7 +6,7 @@ import operator from itertools import accumulate -from typing import cast +from typing import cast, Union import torch from executorch.exir.backend.canonical_partitioners.config_partitioner import ( @@ -47,12 +47,30 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool: - if node.op != "call_function": + # check has dynamic qdq name + if not (is_quant(node) or is_dequant(node)): + return False + + # check scales and zp are dynamically chosen + node_input_args = node.args + if is_affine_qdq(node): + node_input_args = extract_qdq_affine_op_args_for_decomposed_ops(node) + + scale = node_input_args[1] + zp = node_input_args[2] + if not (isinstance(scale, torch.fx.Node) and isinstance(zp, torch.fx.Node)): + return False + + if not (scale.target == operator.getitem and zp.target == operator.getitem): + return False + + scale_choose_qparam = scale.all_input_nodes[0] + zp_choose_qparam = zp.all_input_nodes[0] + + if not (is_qparam(scale_choose_qparam) and is_qparam(zp_choose_qparam)): return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node) - return node_name in _DYNAMIC_OPS or is_dynamic_affine + return True def is_qparam(node: torch.fx.Node) -> bool: @@ -89,6 +107,15 @@ def is_per_channel(node: torch.fx.Node) -> bool: return is_per_channel or is_affine_per_channel_group +def is_per_tensor(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + is_per_tensor = "per_tensor" in node.target.__name__ # pyre-ignore + + return is_per_tensor and not (is_per_channel(node)) + + def is_affine_qdq(node: torch.fx.Node) -> bool: if not (is_quant(node) or is_dequant(node)): return False @@ -186,3 +213,62 @@ def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node): args.append(node.args[-1]) return args + + +def is_tensor_subnormal(tensor: torch.Tensor): + finfo = torch.finfo(tensor.dtype) + return (tensor >= 0) & (torch.abs(tensor) < finfo.smallest_normal) + + +def validate_quant_scales(scales: Union[float, torch.Tensor]): + if isinstance(scales, float): + scales = torch.tensor([scales]) + + is_infinite = torch.isinf(scales) | torch.isnan(scales) + + is_subnormal = is_tensor_subnormal(scales) + + if is_infinite.nonzero().numel() != 0: + idx = torch.where(is_infinite) + idx = tuple(int(index[0]) for index in idx) + value = scales[idx] + raise ValueError( + f"Scales must be finite and normal, however found scale value: {value}" + f" in scale tensor at index: {idx}" + ) + + if is_subnormal.nonzero().numel() != 0: + idx = torch.where(is_subnormal) + idx = tuple(int(index[0]) for index in idx) + value = scales[idx] + raise ValueError( + f"Scales must be finite and normal, however found scale value: {value}" + f" in scale tensor at index: {tuple(idx)}" + ) + + +def validate_quant_zeropoints( + zp: Union[float, int, torch.Tensor], dtype: torch.dtype, is_4bit: bool +): + if not isinstance(zp, torch.Tensor): + zp = torch.tensor([zp]) + + if dtype == torch.int8 or dtype == torch.qint8: + if is_4bit: + invalid_zp = (zp < 0) | (zp > 15) + else: + invalid_zp = (zp < -128) | (zp > 127) + elif dtype == torch.uint8 or dtype == torch.quint8: + invalid_zp = (zp < 0) | (zp > 255) + elif dtype == 
torch.int32: + invalid_zp = zp != 0 + else: + raise ValueError("Unsupported dtype for quantization") + + if invalid_zp.nonzero().numel() != 0: + idx = torch.where(invalid_zp) + idx = tuple(int(index[0]) for index in idx) + value = zp[tuple(idx)] + raise ValueError( + f"Found invalid zeropoint {value}" f" in zero point tensor at index: {idx}" + ) diff --git a/backends/xnnpack/utils/utils.py b/backends/xnnpack/utils/utils.py index b802d73c16b..fab95618807 100644 --- a/backends/xnnpack/utils/utils.py +++ b/backends/xnnpack/utils/utils.py @@ -131,6 +131,22 @@ def get_param_tensor( raise RuntimeError(f"unsupported param type, {node.op}.") +def get_tensor_name(exp_prog: ExportedProgram, node: torch.fx.Node) -> str: + if node is None: + return "" + if is_param(exp_prog, node): + return exp_prog.graph_signature.inputs_to_parameters[node.name] + elif is_buffer(exp_prog, node): + return exp_prog.graph_signature.inputs_to_buffers[node.name] + elif is_lifted_tensor_constant(exp_prog, node): + return exp_prog.graph_signature.inputs_to_lifted_tensor_constants[node.name] + else: + assert isinstance(node.target, str) + return node.target + + return "" + + def get_source_fn(node: torch.fx.Node) -> Optional[torch.fx.Node]: """ Returns the source fn of the given node, return None if something goes wrong diff --git a/backends/xnnpack/utils/xnnpack_constants.py b/backends/xnnpack/utils/xnnpack_constants.py index 351cc8ad897..364819a2435 100644 --- a/backends/xnnpack/utils/xnnpack_constants.py +++ b/backends/xnnpack/utils/xnnpack_constants.py @@ -6,8 +6,11 @@ # Auto-generated by gen_xnnpack_constants.sh script. Do not modify UINT32_MAX = 4294967295 +UINT64_MAX = 18446744073709551615 +XNN_EXTRA_BYTES = 128 XNN_EXTRA_BYTES = 16 XNN_MAX_TENSOR_DIMS = 6 +XNN_INVALID_VALUE_ID = UINT32_MAX XNN_FLAG_HINT_SPARSE_INFERENCE = 0x00000001 XNN_FLAG_HINT_FP16_INFERENCE = 0x00000002 XNN_FLAG_FORCE_FP16_INFERENCE = 0x00000004 @@ -26,7 +29,8 @@ XNN_FLAG_YIELD_WORKERS = 0x00000010 XNN_FLAG_TRANSIENT_INDIRECTION_BUFFER = 0x00000020 XNN_FLAG_KEEP_DIMS = 0x00000040 -XNN_EXTRA_QUANTIZATION_PARAMS = 8 +XNN_EXTRA_QUANTIZATION_PARAMS = 10 +XNN_MIN_BLOCKSIZE = 32 XNN_VALUE_FLAG_EXTERNAL_INPUT = 0x00000001 XNN_VALUE_FLAG_EXTERNAL_OUTPUT = 0x00000002 XNN_VALUE_FLAG_PERSISTENT = 0x00000004 diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 4548de4940a..84cdfd69a48 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -31,6 +31,7 @@ XNN_VALUE_FLAG_EXTERNAL_INPUT, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, ) +from executorch.exir._serialize._named_data_store import NamedDataStore from executorch.exir.backend.backend_details import ( BackendDetails, @@ -103,7 +104,7 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: - + named_data_store = NamedDataStore() xnnpack_edge_compile_config = get_xnnpack_edge_compile_config() # Need to wrap EP here because xnnpack does addmm to linear @@ -162,7 +163,7 @@ def preprocess( ) constant_data_bytes = bytearray() - node_visitors = get_node_visitors(ep, node_to_external_map, constant_data_bytes) + node_visitors = get_node_visitors(ep, node_to_external_map, named_data_store) for node in graph_module.graph.nodes: if node.op == "call_function": @@ -191,4 +192,5 @@ def preprocess( xnnpack_graph, constant_data_bytes ), debug_handle_map={}, + data_store_output=named_data_store.get_named_data_store_output(), ) diff --git a/build/build_android_llm_demo.sh 
b/build/build_android_llm_demo.sh deleted file mode 100644 index b72968037c1..00000000000 --- a/build/build_android_llm_demo.sh +++ /dev/null @@ -1,204 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" -CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" - -build_jar() { - pushd extension/android - ./gradlew build - popd - mkdir -p "${BUILD_AAR_DIR}/libs" - cp extension/android/build/libs/executorch.jar "${BUILD_AAR_DIR}/libs/" -} - -build_android_native_library() { - ANDROID_ABI="$1" - ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" - CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - EXECUTORCH_CMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE:-Release}" - QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" - if [ -n "$QNN_SDK_ROOT" ]; then - EXECUTORCH_BUILD_QNN=ON - else - EXECUTORCH_BUILD_QNN=OFF - fi - - NEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB:-}" - NEURON_USDK_ADAPTER_LIB="${NEURON_USDK_ADAPTER_LIB:-}" - if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ]; then - EXECUTORCH_BUILD_NEURON=ON - else - EXECUTORCH_BUILD_NEURON=OFF - fi - - cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-26 \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_LOG_LEVEL=Info \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_NEURON="${EXECUTORCH_BUILD_NEURON}" \ - -DNEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB}" \ - -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ - -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ - -B"${CMAKE_OUT}" - - if [ "$(uname)" == "Darwin" ]; then - CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) - else - CMAKE_JOBS=$(( $(nproc) - 1 )) - fi - cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" - - cmake extension/android \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DANDROID_PLATFORM=android-26 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_LOG_LEVEL=Info \ - -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ - -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ - -B"${CMAKE_OUT}"/extension/android - - cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" - - # Copy artifacts to ABI specific directory - mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" - cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - - # Copy QNN related so library - if [ -n "$QNN_SDK_ROOT" ] 
&& [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" - fi - - # Copy MTK related so library - if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then - cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - fi -} - -build_aar() { - echo \ \ - \ \ - \ > "${BUILD_AAR_DIR}/AndroidManifest.xml" - pushd "${BUILD_AAR_DIR}" - # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency - # between Java and JNI - find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; - if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then - find jni -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; - fi - # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml - popd -} - -build_android_demo_apps() { - mkdir -p examples/demo-apps/android/LlamaDemo/app/libs - cp ${BUILD_AAR_DIR}/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs - pushd examples/demo-apps/android/LlamaDemo - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd - - mkdir -p extension/benchmark/android/benchmark/app/libs - cp ${BUILD_AAR_DIR}/executorch.aar extension/benchmark/android/benchmark/app/libs - pushd extension/benchmark/android/benchmark - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd - - pushd extension/android_test - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew testDebugUnitTest - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd -} - -collect_artifacts_to_be_uploaded() { - ARTIFACTS_DIR_NAME="$1" - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" - # The app directory is named using its build flavor as a suffix. 
- mkdir -p "${DEMO_APP_DIR}" - # Collect the app and its test suite - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk "${DEMO_APP_DIR}" - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk "${DEMO_APP_DIR}" - # Collect all ABI specific libraries - for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - mkdir -p "${DEMO_APP_DIR}/${ANDROID_ABI}" - cp cmake-out-android-${ANDROID_ABI}/lib/*.a "${DEMO_APP_DIR}/${ANDROID_ABI}/" - cp cmake-out-android-${ANDROID_ABI}/extension/android/*.so "${DEMO_APP_DIR}/${ANDROID_ABI}/" - done - # Collect JAR and AAR - cp extension/android/build/libs/executorch.jar "${DEMO_APP_DIR}" - find "${BUILD_AAR_DIR}/" -name 'executorch*.aar' -exec cp {} "${DEMO_APP_DIR}" \; - # Collect MiniBench APK - MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" - mkdir -p "${MINIBENCH_APP_DIR}" - cp extension/benchmark/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" - cp extension/benchmark/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" - # Collect Java library test - JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir" - mkdir -p "${JAVA_LIBRARY_TEST_DIR}" - cp extension/android_test/build/outputs/apk/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" - cp extension/android_test/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}" -} - -main() { - BUILD_AAR_DIR="$(mktemp -d)" - export BUILD_AAR_DIR - if [ -z "$ANDROID_ABIS" ]; then - ANDROID_ABIS=("arm64-v8a" "x86_64") - fi - export ANDROID_ABIS - - ARTIFACTS_DIR_NAME="$1" - - build_jar - for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - build_android_native_library ${ANDROID_ABI} - done - build_aar - build_android_demo_apps - if [ -n "$ARTIFACTS_DIR_NAME" ]; then - collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} - fi -} - -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - main "$@" -fi diff --git a/build/install_flatc.sh b/build/install_flatc.sh deleted file mode 100755 index 75b4e418836..00000000000 --- a/build/install_flatc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# This file builds the `flatc` commandline tool from the -# `third-party/flatbuffers` directory and help users install it correctly. - -set -o errexit -set -o nounset -set -o pipefail - -EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" -readonly EXECUTORCH_ROOT - -readonly FLATBUFFERS_PATH="${EXECUTORCH_ROOT}/third-party/flatbuffers" -readonly BUILD_DIR="${FLATBUFFERS_PATH}/cmake-out" -readonly BUILT_FLATC="${BUILD_DIR}/flatc" - -# Must use "echo -e" to expand these escape sequences. -readonly GREEN="\033[0;32m" # GREEN Color -readonly RED="\033[0;31m" # Red Color -readonly NC="\033[0m" # No Color - -# Prints the flatbuffers version of the git submodule. -print_flatbuffers_version(){ - local version_file="${FLATBUFFERS_PATH}/package.json" - local version - # Extract the version from the first line like `"version": "23.5.26",` - # First remove the final double quote, then remove everything - # before the now-final double quote. 
- version="$( - grep '"version"\s*:' "${version_file}" \ - | head -1 \ - | sed -e 's/"[^"]*$//' \ - | sed -e 's/.*"//' - )" - if [[ ${version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - echo "${version}" - else - echo "ERROR: Bad version '${version}'; could not find version in ${version_file}" >&2 - exit 1 - fi -} - -main() { - local flatbuffers_version - flatbuffers_version="$(print_flatbuffers_version)" - echo "Version of ${FLATBUFFERS_PATH} is ${flatbuffers_version}" - - local flatc_path - flatc_path="$(which flatc 2>/dev/null || echo '')" - if [[ -f "${flatc_path}" ]]; then - # A flatc is already on the PATH. - if { "${flatc_path}" --version | grep -q "${flatbuffers_version}"; }; then - echo -e "${GREEN}A compatible version of flatc is on the PATH" \ - "and ready to use.${NC}" - return 0 - else - echo -e "${RED}WARNING: An incompatible version of flatc" \ - "is on the PATH at ${flatc_path}." - echo -e " Required version: flatc version ${flatbuffers_version}" - echo -e " Actual version: $("${flatc_path}" --version)${NC}" - - if [[ "${flatc_path}" == *miniconda* ]]; then - echo -e "${RED}ERROR: ${flatc_path} appears to be installed" \ - "with conda, which can cause consistency problems." - echo -e "Please run the following command to remove it: " - echo -e " conda uninstall flatbuffers${NC}" - return 1 - fi - - # Continue to build a compatible version. - fi - fi - - if [[ -f "${BUILT_FLATC}" ]]; then - echo -e "${BUILT_FLATC} is already built." - else - # Build the tool if not already built. - echo "Building flatc under ${FLATBUFFERS_PATH}..." - # Generate cache. - (rm -rf "${BUILD_DIR}" && mkdir "${BUILD_DIR}" && cd "${BUILD_DIR}" && cmake -DCMAKE_BUILD_TYPE=Release ..) - # Build. - (cd "${FLATBUFFERS_PATH}" && cmake --build "${BUILD_DIR}" --target flatc -j9) - - echo -e "Finished building ${BUILT_FLATC}." - fi - - echo -e "" - echo -e "***** Run the following commands to add a compatible flatc"\ - "to the PATH and re-run this script:" - echo -e " ${RED}export PATH=\"${BUILD_DIR}:\${PATH}\"" - echo -e " bash ${EXECUTORCH_ROOT}/build/install_flatc.sh${NC}" -} - -main "$@" diff --git a/build/packaging/smoke_test.py b/build/packaging/smoke_test.py deleted file mode 100644 index 8f2bd08004f..00000000000 --- a/build/packaging/smoke_test.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -This script is run by CI after building the executorch wheel. Before running -this, the job will install the matching torch package as well as the newly-built -executorch package and its dependencies. -""" - -# Import this first. If it can't find the torch.so libraries, the dynamic load -# will fail and the process will exit. -from executorch.extension.pybindings import portable_lib # usort: skip - -# Import custom ops. This requires portable_lib to be loaded first. -from executorch.extension.llm.custom_ops import ( # noqa: F401, F403 - custom_ops, -) # usort: skip - -# Import quantized ops. This requires portable_lib to be loaded first. -from executorch.kernels import quantized # usort: skip # noqa: F401, F403 - -# Import this after importing the ExecuTorch pybindings. If the pybindings -# links against a different torch.so than this uses, there will be a set of -# symbol comflicts; the process will either exit now, or there will be issues -# later in the smoke test. 
-import torch # usort: skip - -# Import everything else later to help isolate the critical imports above. -import os -import tempfile -from typing import Tuple - -from executorch.exir import to_edge -from torch.export import export - - -class LinearModel(torch.nn.Module): - """Runs Linear on its input, which should have shape [4].""" - - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(4, 2) - - def forward(self, x: torch.Tensor): - """Expects a single tensor of shape [4].""" - return self.linear(x) - - -def linear_model_inputs() -> Tuple[torch.Tensor]: - """Returns some example inputs compatible with LinearModel.""" - # The model takes a single tensor of shape [4] as an input. - return (torch.ones(4),) - - -def export_linear_model() -> bytes: - """Exports LinearModel and returns the .pte data.""" - - # This helps the exporter understand the shapes of tensors used in the model. - # Since our model only takes one input, this is a one-tuple. - example_inputs = linear_model_inputs() - - # Export the pytorch model and process for ExecuTorch. - print("Exporting program...") - exported_program = export(LinearModel(), example_inputs, strict=True) - print("Lowering to edge...") - edge_program = to_edge(exported_program) - print("Creating ExecuTorch program...") - et_program = edge_program.to_executorch() - - return et_program.buffer - - -def main(): - """Tests the export and execution of a simple model.""" - - # If the pybindings loaded correctly, we should be able to ask for the set - # of operators. - ops = portable_lib._get_operator_names() - assert len(ops) > 0, "Empty operator list" - print(f"Found {len(ops)} operators; first element '{ops[0]}'") - - # Make sure custom ops are registered. - assert ( - "llama::sdpa_with_kv_cache.out" in ops - ), f"llama::sdpa_with_kv_cache.out not registered, Got ops: {ops}" - - # Make sure quantized ops are registered. - assert ( - "quantized_decomposed::add.out" in ops - ), f"quantized_decomposed::add.out not registered, Got ops: {ops}" - # Export LinearModel to .pte data. - pte_data: bytes = export_linear_model() - - # Try saving to and loading from a file. - with tempfile.TemporaryDirectory() as tempdir: - pte_file = os.path.join(tempdir, "linear.pte") - - # Save the .pte data to a file. - with open(pte_file, "wb") as file: - file.write(pte_data) - print(f"ExecuTorch program saved to {pte_file} ({len(pte_data)} bytes).") - - # Load the model from disk. - m = portable_lib._load_for_executorch(pte_file) - - # Run the model. - outputs = m.forward(linear_model_inputs()) - - # Should see a single output with shape [2]. - assert len(outputs) == 1, f"Unexpected output length {len(outputs)}: {outputs}" - assert outputs[0].shape == (2,), f"Unexpected output size {outputs[0].shape}" - - print("PASS") - - -if __name__ == "__main__": - main() diff --git a/build/test_android_ci.sh b/build/test_android_ci.sh deleted file mode 100755 index f6ab72cb3f1..00000000000 --- a/build/test_android_ci.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -set -ex - -# https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/ExecuTorchDemo -export_model() { - MODEL_NAME=dl3 - # Delegating DeepLab v3 to XNNPACK backend - python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate - - ASSETS_DIR=examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ - mkdir -p "${ASSETS_DIR}" - cp "${MODEL_NAME}_xnnpack_fp32.pte" "${ASSETS_DIR}" -} - -build_android_demo_app() { - mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/libs - cp executorch.aar examples/demo-apps/android/ExecuTorchDemo/app/libs - pushd examples/demo-apps/android/ExecuTorchDemo - ANDROID_HOME=/opt/android/sdk ./gradlew build - popd -} - -export_model -build_android_demo_app diff --git a/codegen/macros.h b/codegen/macros.h index c84b30c2317..ddba56b18d7 100644 --- a/codegen/macros.h +++ b/codegen/macros.h @@ -8,6 +8,6 @@ #pragma once // TODO(T157709949) remove this file -#ifndef USE_ATEN_LIB +#if !defined(USE_ATEN_LIB) && !defined(TORCH_API) #define TORCH_API #endif diff --git a/codegen/tools/targets.bzl b/codegen/tools/targets.bzl index 62037b92300..fb629adf047 100644 --- a/codegen/tools/targets.bzl +++ b/codegen/tools/targets.bzl @@ -28,6 +28,7 @@ def define_common_targets(is_fbcode = False): deps = [ ":gen_oplist_lib", ], + preload_deps = [] if runtime.is_oss else ["//executorch/codegen/tools/fb:selective_build"], # TODO(larryliu0820) :selective_build doesn't build in OSS yet package_style = "inplace", visibility = [ "//executorch/...", diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index eddb8b2a12c..d620b722a09 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -16,7 +16,7 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -24,13 +24,13 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) # Merge optimized and portable definitions, taking optimized where available. 
  merge_yaml(
-    FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml
+    FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized.yaml
     FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
     OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}
   )
@@ -47,12 +47,17 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   message("Generated files ${gen_command_sources}")
 
   # optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime
+  if(TARGET optimized_portable_kernels)
+    set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels)
+  else()
+    set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels)
+  endif()
   gen_operators_lib(
     LIB_NAME
     "optimized_native_cpu_ops_lib"
     KERNEL_LIBS
-    portable_kernels
     optimized_kernels
+    ${_optimized_native_cpu_ops_lib_portable_kernels_lib}
     DEPS
     executorch
   )
diff --git a/configurations/targets.bzl b/configurations/targets.bzl
index 6a5341c2904..5a39f7301ec 100644
--- a/configurations/targets.bzl
+++ b/configurations/targets.bzl
@@ -50,21 +50,3 @@ def define_common_targets():
             "@EXECUTORCH_CLIENTS",
         ],
     )
-
-    # TODO(T183193812): delete this target after optimized-oss.yaml is gone
-    executorch_generated_lib(
-        name = "optimized_native_cpu_ops_oss",
-        deps = [
-            "//executorch/kernels/optimized:optimized_operators",
-            "//executorch/kernels/optimized:optimized_oplist",
-            "//executorch/kernels/portable:executorch_aten_ops",
-            "//executorch/kernels/portable:operators",
-        ],
-        functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml",
-        fallback_yaml_target = "//executorch/kernels/portable:functions.yaml",
-        define_static_targets = True,
-        visibility = [
-            "//executorch/examples/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
diff --git a/data/bin/README.md b/data/bin/README.md
new file mode 100644
index 00000000000..ca81882e00d
--- /dev/null
+++ b/data/bin/README.md
@@ -0,0 +1,31 @@
+## PLEASE DO NOT REMOVE THIS DIRECTORY!
+
+This directory is used to host binaries installed at pip wheel build time.
+
+## How to add a binary into the pip wheel
+
+1. Update the `[project.scripts]` section of the `pyproject.toml` file. Add the new binary name and its corresponding module path, similar to:
+
+```
+flatc = "executorch.data.bin:flatc"
+```
+
+For example, `flatc` is built during wheel packaging: we first build `flatc` through CMake and copy the file to `/data/bin/flatc`, then ask `setuptools` to generate a command-line wrapper for `flatc` that routes to `/data/bin/flatc`.
+
+This way, after installing `executorch`, a user can call `flatc` directly on the command line, and it resolves to `/data/bin/flatc`.
+
+2. Update `setup.py` to build the new binary and copy it into this directory.
+
+```python
+BuiltFile(
+    src_dir="%CMAKE_CACHE_DIR%/third-party/flatbuffers/%BUILD_TYPE%/",
+    src_name="flatc",
+    dst="executorch/data/bin/",
+    is_executable=True,
+),
+```
+This entry finds `flatc` in `CMAKE_CACHE_DIR` and copies it to `/data/bin`. Note that this works both for pip wheel packaging and for editable-mode installs.
+
+## Why can't we create this directory at wheel build time?
+
+Without `data/bin/` present in the source tree, we can't tell `setuptools` to generate the module `executorch.data.bin` in editable mode, partly because we don't have a single top-level `executorch` module and instead have to enumerate all the second-level modules, including `executorch.data.bin`.
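+
+As an illustration, the generated console script just imports `executorch.data.bin` and calls the named function. A hypothetical sketch of that module follows (names and layout are illustrative, not the actual implementation):
+
+```python
+# Hypothetical sketch of executorch/data/bin/__init__.py: each entry point
+# execs the co-located binary that setup.py copied in at build time.
+import os
+import sys
+
+
+def flatc() -> None:
+    binary = os.path.join(os.path.dirname(__file__), "flatc")
+    # Replace the current process so exit codes and signals pass through.
+    os.execv(binary, [binary] + sys.argv[1:])
+```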
diff --git a/devtools/CMakeLists.txt b/devtools/CMakeLists.txt index 58043067a54..9dd38d3678e 100644 --- a/devtools/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -20,7 +20,11 @@ if(NOT CMAKE_CXX_STANDARD) endif() if(NOT FLATCC_EXECUTABLE) - set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc) + if(WIN32) + set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/${CMAKE_BUILD_TYPE}/flatcc) + else() + set(FLATCC_EXECUTABLE ${_flatcc_source_dir}/bin/flatcc) + endif() endif() # Source root directory for executorch. @@ -28,16 +32,12 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Paths to headers generated from the .fbs files. set(_etdump_schemas # etdump_schema_flatcc.fbs scalar_type.fbs) @@ -159,6 +159,12 @@ file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/bundled_program ) +if(WIN32) + set(RM_COMMAND rmdir /s /q) +else() + set(RM_COMMAND rm -rf) +endif() + add_custom_command( OUTPUT ${_etdump_schema__outputs} COMMAND @@ -168,14 +174,20 @@ add_custom_command( ${FLATCC_EXECUTABLE} -cwr -o ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} - COMMAND rm -rf ${_etdump_schema_cleanup_paths} + COMMAND ${RM_COMMAND} ${_etdump_schema_cleanup_paths} DEPENDS ${_etdump_schema_gen_dep} COMMENT "Generating etdump headers" ) +unset(RM_COMMAND) + add_library( etdump ${CMAKE_CURRENT_SOURCE_DIR}/etdump/etdump_flatcc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/etdump/emitter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/etdump/data_sinks/buffer_data_sink.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/etdump/data_sinks/buffer_data_sink.h + ${CMAKE_CURRENT_SOURCE_DIR}/etdump/data_sinks/file_data_sink.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/etdump/data_sinks/file_data_sink.h ) target_link_libraries( @@ -191,7 +203,7 @@ add_custom_command( "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" ${_bundled_program_schema__srcs} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools - DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} + DEPENDS flatc ${_bundled_program_schema__srcs} COMMENT "Generating bundled_program headers" VERBATIM ) diff --git a/devtools/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp index 1da42aa95de..f12262f7dd0 100644 --- a/devtools/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -361,6 +361,11 @@ ET_NODISCARD Error verify_method_outputs( auto bundled_expected_outputs = method_test.get()->test_cases()->Get(testset_idx)->expected_outputs(); + if (bundled_expected_outputs->size() == 0) { + // No bundled expected outputs, so we can't verify the method outputs. + return Error::NotSupported; + } + for (size_t output_idx = 0; output_idx < method.outputs_size(); output_idx++) { auto bundled_expected_output = diff --git a/devtools/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl index 7035b3b31f6..09e9aae11b1 100644 --- a/devtools/bundled_program/targets.bzl +++ b/devtools/bundled_program/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -7,7 +7,7 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
-    for aten_mode in (True, False):
+    for aten_mode in get_aten_mode_options():
         aten_suffix = ("_aten" if aten_mode else "")
         runtime.cxx_library(
             name = "runtime" + aten_suffix,
diff --git a/devtools/etdump/data_sinks/TARGETS b/devtools/etdump/data_sinks/TARGETS
new file mode 100644
index 00000000000..0a42614a385
--- /dev/null
+++ b/devtools/etdump/data_sinks/TARGETS
@@ -0,0 +1,5 @@
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/devtools/etdump/data_sinks/buffer_data_sink.cpp b/devtools/etdump/data_sinks/buffer_data_sink.cpp
new file mode 100644
index 00000000000..5678aefb181
--- /dev/null
+++ b/devtools/etdump/data_sinks/buffer_data_sink.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>
+#include
+
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+using ::executorch::runtime::Span;
+
+namespace executorch {
+namespace etdump {
+
+Result<BufferDataSink> BufferDataSink::create(
+    Span<uint8_t> buffer,
+    size_t alignment) noexcept {
+  // Check if alignment is a power of two and greater than 0
+  if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
+    return Error::InvalidArgument;
+  }
+
+  return BufferDataSink(buffer, alignment);
+}
+
+Result<BufferDataSink>
+BufferDataSink::create(void* ptr, size_t size, size_t alignment) noexcept {
+  return BufferDataSink::create({(uint8_t*)ptr, size}, alignment);
+}
+
+Result<size_t> BufferDataSink::write(const void* ptr, size_t length) {
+  if (length == 0) {
+    return offset_;
+  }
+
+  uint8_t* last_data_end = debug_buffer_.data() + offset_;
+
+  // The beginning of the next data blob must be aligned to the alignment
+  uint8_t* cur_data_begin = internal::align_pointer(last_data_end, alignment_);
+  uint8_t* cur_data_end = cur_data_begin + length;
+
+  if (cur_data_end > debug_buffer_.data() + debug_buffer_.size()) {
+    ET_LOG(Error, "Ran out of space to store intermediate outputs.");
+    return Error::OutOfResources;
+  }
+
+  // Zero out the padding between data blobs
+  memset(last_data_end, 0, cur_data_begin - last_data_end);
+  memcpy(cur_data_begin, ptr, length);
+  offset_ = (size_t)(cur_data_end - debug_buffer_.data());
+
+  return (size_t)(cur_data_begin - debug_buffer_.data());
+}
+
+Result<size_t> BufferDataSink::get_storage_size() const {
+  return debug_buffer_.size();
+}
+
+size_t BufferDataSink::get_used_bytes() const {
+  return offset_;
+}
+
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/etdump/data_sinks/buffer_data_sink.h b/devtools/etdump/data_sinks/buffer_data_sink.h
new file mode 100644
index 00000000000..685e62b2103
--- /dev/null
+++ b/devtools/etdump/data_sinks/buffer_data_sink.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/devtools/etdump/data_sinks/data_sink_base.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch {
+namespace etdump {
+
+/**
+ * BufferDataSink is a concrete implementation of the DataSinkBase class,
+ * designed to store debug data in a pre-allocated, user-owned buffer. This
+ * class provides methods to write raw data and tensor data into the buffer,
+ * ensuring proper alignment and managing padding as needed.
+ */
+class BufferDataSink : public DataSinkBase {
+ public:
+  /**
+   * Creates a BufferDataSink with a given span buffer.
+   *
+   * @param[in] buffer A Span object representing the buffer where data will be
+   * stored.
+   * @param[in] alignment The alignment requirement for the buffer. It must be
+   * a power of two and greater than zero. Default is 64.
+   * @return A Result object containing either:
+   *         - A BufferDataSink object on success, or
+   *         - An error code indicating the failure reason, if any issue
+   *           occurs during the creation process.
+   */
+  static ::executorch::runtime::Result<BufferDataSink> create(
+      ::executorch::runtime::Span<uint8_t> buffer,
+      size_t alignment = 64) noexcept;
+
+  /**
+   * Creates a BufferDataSink with a given data blob.
+   *
+   * @param[in] ptr A pointer to the data blob where data will be stored.
+   * @param[in] size The size of the data blob in bytes.
+   * @param[in] alignment The alignment requirement for the buffer. It must be
+   * a power of two and greater than zero. Default is 64.
+   * @return A Result object containing either:
+   *         - A BufferDataSink object on success, or
+   *         - An error code indicating the failure reason, if any issue
+   *           occurs during the creation process.
+   */
+  static ::executorch::runtime::Result<BufferDataSink>
+  create(void* ptr, size_t size, size_t alignment = 64) noexcept;
+
+  /**
+   * Creates an empty BufferDataSink.
+   */
+  BufferDataSink() = default;
+
+  // Uncopyable and unassignable to avoid double assignment and freeing of the
+  // internal buffer.
+  BufferDataSink(const BufferDataSink&) = delete;
+  BufferDataSink& operator=(const BufferDataSink&) = delete;
+
+  // Movable to be compatible with Result.
+  BufferDataSink(BufferDataSink&&) = default;
+  BufferDataSink& operator=(BufferDataSink&&) = default;
+
+  ~BufferDataSink() override = default;
+
+  /**
+   * Write data into the debug buffer and return the offset of the starting
+   * location of the data within the buffer.
+   *
+   * @param[in] ptr A pointer to the data to be written into the storage.
+   * @param[in] size The size of the data in bytes.
+   * @return A Result object containing either:
+   *         - The offset of the starting location of the data within the
+   *           debug buffer, or
+   *         - An error code indicating the failure reason, if any issue
+   *           occurs during the write process.
+   */
+  ::executorch::runtime::Result<size_t> write(const void* ptr, size_t size)
+      override;
+
+  /**
+   * Retrieves the total size of the buffer.
+   *
+   * @return A Result object containing the total size of the buffer in bytes.
+   */
+  ::executorch::runtime::Result<size_t> get_storage_size() const;
+
+  /**
+   * Retrieves the number of bytes currently used in the buffer.
+   *
+   * @return The amount of data currently stored in the buffer in bytes.
+   */
+  size_t get_used_bytes() const override;
+
+ private:
+  /**
+   * Constructs a BufferDataSink with a given buffer.
+   *
+   * @param[in] buffer A Span object representing the buffer where data will be
+   * stored.
+   * @param[in] alignment The alignment requirement for the buffer. It must be
+   * a power of two. Default is 64.
+   */
+  explicit BufferDataSink(
+      ::executorch::runtime::Span<uint8_t> buffer,
+      size_t alignment)
+      : debug_buffer_(buffer), offset_(0), alignment_(alignment) {}
+
+  // A Span object representing the buffer used for storing debug data.
+  ::executorch::runtime::Span<uint8_t> debug_buffer_;
+
+  // The offset of the next available location in the buffer.
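+  // It advances on every successful write(): the new blob is aligned to
+  // alignment_, the padding gap is zeroed, and offset_ ends up just past
+  // the written bytes.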
+  size_t offset_;
+
+  // The alignment of the buffer.
+  size_t alignment_;
+};
+
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/etdump/data_sinks/data_sink_base.h b/devtools/etdump/data_sinks/data_sink_base.h
new file mode 100644
index 00000000000..602c249ce9d
--- /dev/null
+++ b/devtools/etdump/data_sinks/data_sink_base.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/result.h>
+#include <cstddef>
+
+namespace executorch {
+namespace etdump {
+
+/**
+ * DataSinkBase is an abstract class that users can inherit and implement
+ * to customize the storage and management of debug data in ETDumpGen. This
+ * class provides a basic and essential interface for writing data blobs to
+ * user-defined storage, retrieving storage capacity, and tracking the amount
+ * of data stored.
+ */
+class DataSinkBase {
+ public:
+  /**
+   * Virtual destructor to ensure proper cleanup of derived classes.
+   */
+  virtual ~DataSinkBase() = default;
+
+  /**
+   * Write data into the debug storage. This method should be implemented
+   * by derived classes to handle the specifics of data storage.
+   *
+   * This function should return the offset of the starting location of the
+   * data within the debug storage if the write operation succeeds, or an
+   * Error code if any issue occurs during the write process.
+   *
+   * @param[in] ptr A pointer to the data to be written into the storage.
+   * @param[in] length The size of the data in bytes.
+   * @return A Result object containing either:
+   *         - The offset of the starting location of the data within the
+   *           debug storage, which will be recorded in the corresponding
+   *           metadata of ETDump, or
+   *         - An error code indicating the failure reason, if any issue
+   *           occurs during the write process.
+   */
+  virtual ::executorch::runtime::Result<size_t> write(
+      const void* ptr,
+      size_t length) = 0;
+
+  /**
+   * Get the number of bytes currently used in the debug storage.
+   *
+   * @return The amount of data currently stored in bytes.
+   */
+  virtual size_t get_used_bytes() const = 0;
+};
+
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/etdump/data_sinks/file_data_sink.cpp b/devtools/etdump/data_sinks/file_data_sink.cpp
new file mode 100644
index 00000000000..e9f9f44a899
--- /dev/null
+++ b/devtools/etdump/data_sinks/file_data_sink.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/devtools/etdump/data_sinks/file_data_sink.h>
+#include <cstdio> // For FILE operations
+
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace executorch {
+namespace etdump {
+
+FileDataSink::FileDataSink(FileDataSink&& other) noexcept
+    : file_(other.file_), total_written_bytes_(other.total_written_bytes_) {
+  other.file_ = nullptr;
+}
+
+Result<FileDataSink> FileDataSink::create(const char* file_path) {
+  // Open the file and get the file pointer
+  FILE* file = fopen(file_path, "wb");
+  if (!file) {
+    // Return an error if the file cannot be accessed or created
+    ET_LOG(Error, "File %s cannot be accessed or created.", file_path);
+    return Error::AccessFailed;
+  }
+
+  // Return the successfully created FileDataSink
+  return FileDataSink(file);
+}
+
+FileDataSink::~FileDataSink() {
+  // Close the file
+  close();
+}
+
+Result<size_t> FileDataSink::write(const void* ptr, size_t size) {
+  if (!file_) {
+    ET_LOG(Error, "File not open, unable to write.");
+    return Error::AccessFailed;
+  }
+
+  size_t offset = total_written_bytes_;
+
+  if (size == 0) {
+    // No data to write, return current offset
+    return offset;
+  }
+
+  size_t written = fwrite(ptr, 1, size, file_);
+  if (written != size) {
+    ET_LOG(Error, "Write failed: wrote %zu bytes of %zu", written, size);
+    return Error::Internal;
+  }
+
+  total_written_bytes_ += written;
+  return offset;
+}
+
+size_t FileDataSink::get_used_bytes() const {
+  return total_written_bytes_;
+}
+
+void FileDataSink::close() {
+  if (file_) {
+    fclose(file_);
+    file_ = nullptr;
+  }
+}
+
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/etdump/data_sinks/file_data_sink.h b/devtools/etdump/data_sinks/file_data_sink.h
new file mode 100644
index 00000000000..c2d1f93cebe
--- /dev/null
+++ b/devtools/etdump/data_sinks/file_data_sink.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/devtools/etdump/data_sinks/data_sink_base.h>
+#include <executorch/runtime/core/result.h>
+#include <cstdio> // For FILE operations
+
+namespace executorch {
+namespace etdump {
+
+/**
+ * FileDataSink is a concrete implementation of the DataSinkBase class,
+ * designed to facilitate the direct writing of data to a file. It is
+ * particularly useful for scenarios where immediate data storage is
+ * required, such as logging or streaming data to a file for real-time
+ * analysis. The class manages file operations, including opening, writing,
+ * and closing the file, while handling potential errors during these
+ * operations.
+ */
+class FileDataSink : public DataSinkBase {
+ public:
+  /**
+   * Creates a FileDataSink with a given file path.
+   *
+   * @param[in] file_path The path to the file for writing data.
+   * @return A Result object containing either:
+   *         - A FileDataSink object if creation succeeds, or
+   *         - An AccessFailed error when the file cannot be accessed or
+   *           created.
+   */
+  static ::executorch::runtime::Result<FileDataSink> create(
+      const char* file_path);
+
+  /**
+   * Destructor that closes the file.
+   */
+  ~FileDataSink() override;
+
+  // Delete copy constructor and copy assignment operator
+  FileDataSink(const FileDataSink&) = delete;
+  FileDataSink& operator=(const FileDataSink&) = delete;
+
+  FileDataSink(FileDataSink&& other) noexcept;
+  FileDataSink& operator=(FileDataSink&& other) = default;
+
+  /**
+   * Writes data directly to the file.
+   *
+   * This function does not perform any alignment, and will overwrite
+   * any existing data in the file.
+   *
+   * @param[in] ptr A pointer to the data to be written into the file.
+   * @param[in] size The size of the data in bytes.
+   * @return A Result object containing either:
+   *         - The offset of the starting location of the data within the
+   *           file, or
+   *         - An AccessFailed error if the file has been closed, or
+   *         - An Internal error if the OS write operation fails.
+   */
+  ::executorch::runtime::Result<size_t> write(const void* ptr, size_t size)
+      override;
+
+  /**
+   * Gets the number of bytes currently written to the file.
+   *
+   * @return The amount of data currently stored in bytes.
+   */
+  size_t get_used_bytes() const override;
+
+  /**
+   * Closes the file, if it is open.
+   */
+  void close();
+
+ private:
+  /**
+   * Constructs a FileDataSink with a given file pointer.
+   *
+   * @param[in] file A valid file pointer for writing data.
+   */
+  explicit FileDataSink(FILE* file) : file_(file), total_written_bytes_(0) {}
+
+  FILE* file_;
+  size_t total_written_bytes_;
+};
+
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/etdump/data_sinks/targets.bzl b/devtools/etdump/data_sinks/targets.bzl
new file mode 100644
index 00000000000..6cf86a17fb2
--- /dev/null
+++ b/devtools/etdump/data_sinks/targets.bzl
@@ -0,0 +1,50 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+
+def define_data_sink_target(data_sink_name, aten_suffix):
+    runtime.cxx_library(
+        name = data_sink_name + aten_suffix,
+        exported_headers = [
+            data_sink_name + ".h",
+        ],
+        srcs = [
+            data_sink_name + ".cpp",
+        ],
+        deps = [
+            "//executorch/devtools/etdump:utils",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+            ":data_sink_base" + aten_suffix,
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+    for aten_mode in (True, False):
+        aten_suffix = "_aten" if aten_mode else ""
+
+        runtime.cxx_library(
+            name = "data_sink_base" + aten_suffix,
+            exported_headers = [
+                "data_sink_base.h",
+            ],
+            exported_deps = [
+                "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix,
+            ],
+            visibility = [
+                "//executorch/...",
+                "@EXECUTORCH_CLIENTS",
+            ],
+        )
+
+        define_data_sink_target("buffer_data_sink", aten_suffix)
+        define_data_sink_target("file_data_sink", aten_suffix)
diff --git a/devtools/etdump/data_sinks/tests/TARGETS b/devtools/etdump/data_sinks/tests/TARGETS
new file mode 100644
index 00000000000..0a42614a385
--- /dev/null
+++ b/devtools/etdump/data_sinks/tests/TARGETS
@@ -0,0 +1,5 @@
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
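Before the tests that follow, a minimal usage sketch of the buffer sink defined above (not part of the change set itself; the caller-provided storage here is hypothetical):

```cpp
#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>

using executorch::etdump::BufferDataSink;
using executorch::runtime::Result;

// Writes two small blobs and observes the aligned offsets handed back.
void buffer_sink_sketch(void* storage, size_t storage_size) {
  // Alignment defaults to 64 and must be a nonzero power of two;
  // otherwise create() returns Error::InvalidArgument.
  Result<BufferDataSink> sink_ret =
      BufferDataSink::create(storage, storage_size);
  if (!sink_ret.ok()) {
    return;
  }
  BufferDataSink& sink = sink_ret.get();

  const char blob[] = "hello";
  Result<size_t> first = sink.write(blob, sizeof(blob)); // offset 0
  // The second blob starts at the next 64-byte boundary (offset 64 here),
  // with the padding in between zeroed out.
  Result<size_t> second = sink.write(blob, sizeof(blob));
  (void)first;
  (void)second;
}
```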
diff --git a/devtools/etdump/data_sinks/tests/buffer_data_sink_test.cpp b/devtools/etdump/data_sinks/tests/buffer_data_sink_test.cpp
new file mode 100644
index 00000000000..c4178c29a4b
--- /dev/null
+++ b/devtools/etdump/data_sinks/tests/buffer_data_sink_test.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+#include <memory>
+
+using namespace ::testing;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::etdump::BufferDataSink;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+using ::executorch::runtime::Span;
+using torch::executor::testing::TensorFactory;
+
+class BufferDataSinkTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    torch::executor::runtime_init();
+    // Allocate a small buffer for testing
+    buffer_size_ = 128; // Small size for testing
+    buffer_ptr_ = malloc(buffer_size_);
+    buffer_ = Span<uint8_t>(static_cast<uint8_t*>(buffer_ptr_), buffer_size_);
+    Result<BufferDataSink> buffer_data_sink_ret =
+        BufferDataSink::create(buffer_);
+    ASSERT_EQ(buffer_data_sink_ret.error(), Error::Ok);
+    buffer_data_sink_ = std::make_unique<BufferDataSink>(
+        std::move(buffer_data_sink_ret.get()));
+  }
+
+  void TearDown() override {
+    free(buffer_ptr_);
+  }
+
+  size_t buffer_size_;
+  void* buffer_ptr_;
+  Span<uint8_t> buffer_;
+  std::unique_ptr<BufferDataSink> buffer_data_sink_;
+};
+
+TEST_F(BufferDataSinkTest, StorageSizeCheck) {
+  Result<size_t> ret = buffer_data_sink_->get_storage_size();
+  ASSERT_EQ(ret.error(), Error::Ok);
+
+  size_t storage_size = ret.get();
+  EXPECT_EQ(storage_size, buffer_size_);
+}
+
+TEST_F(BufferDataSinkTest, WriteOneTensorAndCheckData) {
+  TensorFactory<ScalarType::Float> tf;
+  Tensor tensor = tf.make({1, 4}, {1.0, 2.0, 3.0, 4.0});
+
+  Result<size_t> ret =
+      buffer_data_sink_->write(tensor.const_data_ptr(), tensor.nbytes());
+  ASSERT_EQ(ret.error(), Error::Ok);
+
+  size_t offset = ret.get();
+
+  EXPECT_NE(offset, static_cast<size_t>(-1));
+
+  // Check that the data in the buffer matches the tensor data
+  const float* buffer_data =
+      reinterpret_cast<const float*>(buffer_.data() + offset);
+  for (size_t i = 0; i < tensor.numel(); ++i) {
+    EXPECT_EQ(buffer_data[i], tensor.const_data_ptr<float>()[i]);
+  }
+}
+
+TEST_F(BufferDataSinkTest, WriteMultiTensorsAndCheckData) {
+  TensorFactory<ScalarType::Float> tf;
+  std::vector<Tensor> tensors = {
+      tf.make({1, 4}, {1.0, 2.0, 3.0, 4.0}),
+      tf.make({1, 4}, {5.0, 6.0, 7.0, 8.0})};
+
+  for (const auto& tensor : tensors) {
+    Result<size_t> ret =
+        buffer_data_sink_->write(tensor.const_data_ptr(), tensor.nbytes());
+    ASSERT_EQ(ret.error(), Error::Ok);
+
+    size_t offset = ret.get();
+    EXPECT_NE(offset, static_cast<size_t>(-1));
+    // Check that the data in the buffer matches the tensor data
+    const float* buffer_data =
+        reinterpret_cast<const float*>(buffer_.data() + offset);
+    for (size_t i = 0; i < tensor.numel(); ++i) {
+      EXPECT_EQ(buffer_data[i], tensor.const_data_ptr<float>()[i]);
+    }
+  }
+}
+
+TEST_F(BufferDataSinkTest, PointerAlignmentCheck) {
+  TensorFactory<ScalarType::Float> tf;
+  Tensor tensor = tf.make({1, 4}, {1.0, 2.0, 3.0, 4.0});
+
+  Result<size_t> ret =
+      buffer_data_sink_->write(tensor.const_data_ptr(), tensor.nbytes());
+  ASSERT_EQ(ret.error(), Error::Ok);
+
+  size_t offset = ret.get();
+  EXPECT_NE(offset, static_cast<size_t>(-1));
+  // Check that the offset pointer is 64-byte aligned
+  const uint8_t* offset_ptr = buffer_.data() + offset;
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(offset_ptr) % 64, 0);
+}
+
+TEST_F(BufferDataSinkTest, WriteUntilOverflow) {
+  TensorFactory<ScalarType::Float> tf;
+  Tensor tensor = tf.zeros({1, 8}); // Large tensor to fill the buffer
+
+  // Write tensors until we run out of space
+  for (size_t i = 0; i < 2; i++) {
+    Result<size_t> ret =
+        buffer_data_sink_->write(tensor.const_data_ptr(), tensor.nbytes());
+    ASSERT_EQ(ret.error(), Error::Ok);
+  }
+
+  // Attempting to write another tensor should raise an error
+  Result<size_t> ret =
+      buffer_data_sink_->write(tensor.const_data_ptr(), tensor.nbytes());
+  ASSERT_EQ(ret.error(), Error::OutOfResources);
+}
+
+TEST_F(BufferDataSinkTest, illegalAlignment) {
+  // Create a buffer_data_sink_ with a legal alignment, i.e. a power of 2
+  // that is greater than 0
+  for (size_t i = 1; i <= 128; i <<= 1) {
+    Result<BufferDataSink> buffer_data_sink_ret =
+        BufferDataSink::create(buffer_, i);
+    ASSERT_EQ(buffer_data_sink_ret.error(), Error::Ok);
+  }
+
+  // Create a buffer_data_sink_ with an illegal alignment, i.e. one that is
+  // not a power of 2 or is not greater than 0
+  std::vector<size_t> illegal_alignments = {0, 3, 5, 7, 100, 127};
+
+  for (size_t i = 0; i < illegal_alignments.size(); i++) {
+    Result<BufferDataSink> buffer_data_sink_ret =
+        BufferDataSink::create(buffer_, illegal_alignments[i]);
+    ASSERT_EQ(buffer_data_sink_ret.error(), Error::InvalidArgument);
+  }
+}
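The file-backed sink exercised in the next test file is the streaming counterpart: offsets are plain byte positions in the file, with no alignment or padding. A small hypothetical sketch (not from the change set):

```cpp
#include <executorch/devtools/etdump/data_sinks/file_data_sink.h>

using executorch::etdump::FileDataSink;
using executorch::runtime::Result;

void file_sink_sketch(const char* path) {
  // create() returns Error::AccessFailed if the file cannot be opened.
  Result<FileDataSink> sink_ret = FileDataSink::create(path);
  if (!sink_ret.ok()) {
    return;
  }
  FileDataSink& sink = sink_ret.get();

  const char blob[] = "hello";
  Result<size_t> offset = sink.write(blob, sizeof(blob)); // 0 on first write
  (void)offset;

  // After close(), further writes fail with Error::AccessFailed.
  sink.close();
}
```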
diff --git a/devtools/etdump/data_sinks/tests/file_data_sink_test.cpp b/devtools/etdump/data_sinks/tests/file_data_sink_test.cpp
new file mode 100644
index 00000000000..33122d320aa
--- /dev/null
+++ b/devtools/etdump/data_sinks/tests/file_data_sink_test.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/devtools/etdump/data_sinks/file_data_sink.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+#include <cstdio> // tmpnam(), remove()
+#include <fstream>
+
+using namespace ::testing;
+using ::executorch::etdump::FileDataSink;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+class FileDataSinkTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Initialize the runtime environment
+    torch::executor::runtime_init();
+
+    // Define the file path for testing
+    std::array<char, L_tmpnam> buf;
+    const char* ret = std::tmpnam(buf.data());
+    ASSERT_NE(ret, nullptr) << "Could not generate temp file";
+    buf[L_tmpnam - 1] = '\0';
+    file_path_ = std::string(buf.data()) + "-executorch-testing";
+  }
+
+  void TearDown() override {
+    // Remove the test file
+    std::remove(file_path_.c_str());
+  }
+
+  std::string file_path_;
+};
+
+TEST_F(FileDataSinkTest, CreationExpectFail) {
+  // Create a FileDataSink instance with a valid file path
+  Result<FileDataSink> success = FileDataSink::create(file_path_.c_str());
+  ASSERT_TRUE(success.ok());
+
+  // Try to create another FileDataSink instance with an invalid file path
+  Result<FileDataSink> fail_with_invalid_file_path = FileDataSink::create("");
+  ASSERT_EQ(fail_with_invalid_file_path.error(), Error::AccessFailed);
+}
+
+TEST_F(FileDataSinkTest, WriteDataToFile) {
+  const char* data = "Hello, World!";
+  size_t data_size = strlen(data);
+
+  // Create a FileDataSink instance
+  Result<FileDataSink> result = FileDataSink::create(file_path_.c_str());
+  ASSERT_TRUE(result.ok());
+
+  FileDataSink* data_sink = &result.get();
+
+  // Write data to the file
+  Result<size_t> write_result = data_sink->write(data, data_size);
+  ASSERT_TRUE(write_result.ok());
+
+  size_t used_bytes = data_sink->get_used_bytes();
+  EXPECT_EQ(used_bytes, data_size);
+
+  data_sink->close();
+
+  // Expect failure when writing again after close
+  Result<size_t> write_result_after_close = data_sink->write(data, data_size);
+  ASSERT_EQ(write_result_after_close.error(), Error::AccessFailed);
+
+  // Verify the file contents
+  std::ifstream file(file_path_, std::ios::binary);
+  file.seekg(0, std::ios::end);
+  size_t file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+  EXPECT_EQ(file_size, used_bytes);
+
+  // Read the file content and verify it matches the original data
+  std::vector<char> file_content(file_size);
+  file.read(file_content.data(), file_size);
+  file.close();
+
+  EXPECT_EQ(std::memcmp(file_content.data(), data, data_size), 0);
+}
+
+TEST_F(FileDataSinkTest, WriteMultipleDataAndCheckOffsets) {
+  const char* data1 = "Accelerate";
+  const char* data2 = "Core";
+  const char* data3 = "Experience";
+  size_t data1_size = strlen(data1);
+  size_t data2_size = strlen(data2);
+  size_t data3_size = strlen(data3);
+
+  // Create a FileDataSink instance
+  Result<FileDataSink> result = FileDataSink::create(file_path_.c_str());
+  ASSERT_TRUE(result.ok());
+  FileDataSink* data_sink = &result.get();
+
+  // Write multiple data chunks and check offsets
+  Result<size_t> offset1 = data_sink->write(data1, data1_size);
+  ASSERT_TRUE(offset1.ok());
+  EXPECT_EQ(offset1.get(), 0);
+
+  Result<size_t> offset2 = data_sink->write(data2, data2_size);
+  ASSERT_TRUE(offset2.ok());
+  EXPECT_EQ(offset2.get(), data1_size);
+
+  Result<size_t> offset3 = data_sink->write(data3, data3_size);
+  ASSERT_TRUE(offset3.ok());
+  EXPECT_EQ(offset3.get(), data1_size + data2_size);
+  size_t used_bytes = data_sink->get_used_bytes();
+  EXPECT_EQ(used_bytes, data1_size + data2_size + data3_size);
+
+  data_sink->close();
+
+  // Verify the file contents
+  std::ifstream file(file_path_, std::ios::binary);
+  file.seekg(0, std::ios::end);
+  size_t file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+  EXPECT_EQ(file_size, used_bytes);
+
+  // Read the file content
+  std::vector<char> file_content(file_size);
+  file.read(file_content.data(), file_size);
+  file.close();
+
+  // Verify each data chunk in the file using offsets
+  EXPECT_EQ(
+      std::memcmp(file_content.data() + offset1.get(), data1, data1_size), 0);
+  EXPECT_EQ(
+      std::memcmp(file_content.data() + offset2.get(), data2, data2_size), 0);
+  EXPECT_EQ(
+      std::memcmp(file_content.data() + offset3.get(), data3, data3_size), 0);
+}
diff --git a/extension/parallel/test/targets.bzl b/devtools/etdump/data_sinks/tests/targets.bzl
similarity index 51%
rename from extension/parallel/test/targets.bzl
rename to devtools/etdump/data_sinks/tests/targets.bzl
index 791c0727471..1b6f67855df 100644
--- a/extension/parallel/test/targets.bzl
+++ b/devtools/etdump/data_sinks/tests/targets.bzl
@@ -1,5 +1,17 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
+def define_data_sink_test(data_sink_name):
+    runtime.cxx_test(
+        name = data_sink_name + "_test",
+        srcs = [
+            data_sink_name + "_test.cpp",
+        ],
+        deps = [
+            "//executorch/devtools/etdump/data_sinks:" + data_sink_name,
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
+
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
 
@@ -7,13 +19,5 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
-    runtime.cxx_test(
-        name = "thread_parallel_test",
-        srcs = [
-            "thread_parallel_test.cpp",
-        ],
-        deps = [
-            "//executorch/extension/parallel:thread_parallel",
-            "//executorch/runtime/platform:platform",
-        ],
-    )
+    define_data_sink_test("buffer_data_sink")
+    define_data_sink_test("file_data_sink")
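Since `DataSinkBase` only requires `write()` and `get_used_bytes()`, custom sinks stay small. A minimal hypothetical example (not part of the change set) that discards payloads and just tracks byte counts:

```cpp
#include <executorch/devtools/etdump/data_sinks/data_sink_base.h>

// The smallest possible sink: it never stores the payload, only measures
// how much space real storage would need.
class CountingDataSink : public executorch::etdump::DataSinkBase {
 public:
  ::executorch::runtime::Result<size_t> write(const void* ptr, size_t length)
      override {
    (void)ptr; // A real sink would persist the payload here.
    size_t offset = used_;
    used_ += length;
    return offset; // Offset where the blob would have been stored.
  }

  size_t get_used_bytes() const override {
    return used_;
  }

 private:
  size_t used_ = 0;
};
```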
""" - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) + define_data_sink_test("buffer_data_sink") + define_data_sink_test("file_data_sink") diff --git a/devtools/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp index a34b5188c53..ef6bde94403 100644 --- a/devtools/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -10,9 +10,11 @@ #include +#include #include #include #include +#include #include #include #include @@ -28,6 +30,7 @@ using ::executorch::runtime::DelegateDebugIdType; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerEntry; using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Result; using ::executorch::runtime::Span; using ::executorch::runtime::Tag; @@ -94,16 +97,6 @@ etdump_Tensor_ref_t add_tensor_entry( return etdump_Tensor_end(builder_); } -static uint8_t* alignPointer(void* ptr, size_t alignment) { - intptr_t addr = reinterpret_cast(ptr); - if ((addr & (alignment - 1)) == 0) { - // Already aligned. - return reinterpret_cast(ptr); - } - addr = (addr | (alignment - 1)) + 1; - return reinterpret_cast(addr); -} - } // namespace // Constructor implementation @@ -113,9 +106,10 @@ ETDumpGen::ETDumpGen(Span buffer) { // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); - uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); + builder_ = + (struct flatcc_builder*)internal::align_pointer(buffer.data(), 64); + uintptr_t buffer_with_builder = (uintptr_t)internal::align_pointer( + builder_ + sizeof(struct flatcc_builder), 64); size_t builder_size = (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); size_t min_buf_size = max_alloc_buf_size + builder_size; @@ -150,6 +144,7 @@ ETDumpGen::~ETDumpGen() { void ETDumpGen::reset() { state_ = State::Init; num_blocks_ = 0; + data_sink_ = nullptr; flatcc_builder_reset(builder_); flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); etdump_ETDump_start_as_root_with_size(builder_); @@ -312,39 +307,44 @@ void ETDumpGen::log_profiling_delegate( etdump_RunData_events_push_end(builder_); } -void ETDumpGen::log_intermediate_output_delegate( +Result ETDumpGen::log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const Tensor& output) { log_intermediate_output_delegate_helper(name, delegate_debug_index, output); + return true; } -void ETDumpGen::log_intermediate_output_delegate( +Result ETDumpGen::log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const ArrayRef output) { log_intermediate_output_delegate_helper(name, delegate_debug_index, output); + return true; } -void ETDumpGen::log_intermediate_output_delegate( +Result ETDumpGen::log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const int& output) { log_intermediate_output_delegate_helper(name, delegate_debug_index, output); + return true; } -void ETDumpGen::log_intermediate_output_delegate( +Result ETDumpGen::log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const bool& output) { log_intermediate_output_delegate_helper(name, delegate_debug_index, output); + return true; } -void ETDumpGen::log_intermediate_output_delegate( +Result 
ETDumpGen::log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const double& output) { log_intermediate_output_delegate_helper(name, delegate_debug_index, output); + return true; } template @@ -355,10 +355,6 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer_.empty()) { - ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); - return; - } check_ready_to_add_events(); int64_t string_id = name != nullptr ? create_string_entry(name) : -1; @@ -375,7 +371,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { - long offset = copy_tensor_to_debug_buffer(output); + long offset = write_tensor_or_raise_error(output); etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); etdump_Value_start(builder_); @@ -385,7 +381,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( } else if constexpr (std::is_same>::value) { etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { - long offset = copy_tensor_to_debug_buffer(output[i]); + long offset = write_tensor_or_raise_error(output[i]); etdump_Tensor_vec_push( builder_, add_tensor_entry(builder_, output[i], offset)); } @@ -505,28 +501,21 @@ ETDumpResult ETDumpGen::get_etdump_data() { } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer_ = buffer; + Result bds_ret = BufferDataSink::create(buffer); + ET_CHECK_MSG( + bds_ret.ok(), + "Failed to create data sink from debug buffer with error 0x%" PRIx32, + static_cast(bds_ret.error())); + + buffer_data_sink_ = std::move(bds_ret.get()); + data_sink_ = &buffer_data_sink_; } -size_t ETDumpGen::copy_tensor_to_debug_buffer(executorch::aten::Tensor tensor) { - if (tensor.nbytes() == 0) { - return static_cast(-1); - } - uint8_t* offset_ptr = - alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); - debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); - ET_CHECK_MSG( - debug_buffer_offset_ <= debug_buffer_.size(), - "Ran out of space to store intermediate outputs."); - memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer_.data()); +void ETDumpGen::set_data_sink(DataSinkBase* data_sink) { + data_sink_ = data_sink; } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer_.empty()) { - return; - } - check_ready_to_add_events(); etdump_DebugEvent_start(builder_); @@ -537,7 +526,7 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { switch (evalue.tag) { case Tag::Tensor: { executorch::aten::Tensor tensor = evalue.toTensor(); - long offset = copy_tensor_to_debug_buffer(tensor); + long offset = write_tensor_or_raise_error(tensor); etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, tensor, offset); @@ -559,7 +548,7 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { evalue.toTensorList(); etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { - long offset = copy_tensor_to_debug_buffer(tensors[i]); + long offset = write_tensor_or_raise_error(tensors[i]); etdump_Tensor_vec_push( builder_, add_tensor_entry(builder_, tensors[i], offset)); } @@ -643,8 +632,32 @@ bool 
ETDumpGen::is_static_etdump() { return alloc_.data != nullptr; } -size_t ETDumpGen::get_debug_buffer_size() const { - return debug_buffer_.size(); +DataSinkBase* ETDumpGen::get_data_sink() { + return data_sink_; +} + +long ETDumpGen::write_tensor_or_raise_error(Tensor tensor) { + // Previously, the function copy_tensor_to_debug_buffer returned 0xFF..F when + // given an empty tensor, which is an invalid offset for most buffers. In our + // data sink, we will return the current debug_buffer_offset for better + // clarity. We are isolating the empty tensor case here using the old logic to + // avoid any backward compatibility issues while introducing the data sink. + // Once the data sink is fully implemented, we can remove this check and apply + // the new logic to all cases. + // TODO(gasoonjia): remove this check after datasink is fully rolled out. + if (tensor.nbytes() == 0) { + return static_cast(-1); + } + + ET_CHECK_MSG( + data_sink_, "Must set data sink before writing tensor-like data"); + Result ret = + data_sink_->write(tensor.const_data_ptr(), tensor.nbytes()); + ET_CHECK_MSG( + ret.ok(), + "Failed to write tensor with error 0x%" PRIx32, + static_cast(ret.error())); + return static_cast(ret.get()); } } // namespace etdump diff --git a/devtools/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h index d7781066533..f2e5f37055a 100644 --- a/devtools/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -9,8 +9,12 @@ #pragma once #include +#include +#include +#include #include +#include #include #include @@ -21,6 +25,8 @@ struct flatcc_builder; namespace executorch { namespace etdump { +using ::executorch::runtime::Result; + namespace internal { struct ETDumpStaticAllocator { ETDumpStaticAllocator() = default; @@ -103,7 +109,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { /** * Log an intermediate tensor output from a delegate. */ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, ::executorch::runtime::DebugHandle delegate_debug_index, const executorch::aten::Tensor& output) override; @@ -111,7 +117,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { /** * Log an intermediate tensor array output from a delegate. */ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, ::executorch::runtime::DebugHandle delegate_debug_index, const ::executorch::runtime::ArrayRef output) @@ -120,7 +126,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { /** * Log an intermediate int output from a delegate. */ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; @@ -128,7 +134,7 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { /** * Log an intermediate bool output from a delegate. */ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; @@ -136,14 +142,15 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { /** * Log an intermediate double output from a delegate. 
*/ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; void set_debug_buffer(::executorch::runtime::Span buffer); + void set_data_sink(DataSinkBase* data_sink); ETDumpResult get_etdump_data(); - size_t get_debug_buffer_size() const; size_t get_num_blocks(); + DataSinkBase* get_data_sink(); bool is_static_etdump(); void reset(); @@ -158,7 +165,6 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { void check_ready_to_add_events(); int64_t create_string_entry(const char* name); - size_t copy_tensor_to_debug_buffer(executorch::aten::Tensor tensor); /** * Templated helper function used to log various types of intermediate output. @@ -170,10 +176,15 @@ class ETDumpGen : public ::executorch::runtime::EventTracer { ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + long write_tensor_or_raise_error(executorch::aten::Tensor tensor); + struct flatcc_builder* builder_; size_t num_blocks_ = 0; - ::executorch::runtime::Span debug_buffer_; - size_t debug_buffer_offset_ = 0; + DataSinkBase* data_sink_; + + // It is only for set_debug_buffer function. + BufferDataSink buffer_data_sink_; + int bundled_input_index_ = -1; State state_ = State::Init; struct internal::ETDumpStaticAllocator alloc_; diff --git a/devtools/etdump/targets.bzl b/devtools/etdump/targets.bzl index ddbb35eab74..dda68e1b6ac 100644 --- a/devtools/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") SCALAR_TYPE_STEM = "scalar_type" SCALAR_TYPE = SCALAR_TYPE_STEM + ".fbs" @@ -87,8 +87,20 @@ def define_common_targets(): exported_external_deps = ["flatccrt"], ) - for aten_mode in (True, False): + runtime.cxx_library( + name = "utils", + srcs = [], + exported_headers = [ + "utils.h", + ], + visibility = [ + "//executorch/devtools/etdump/...", + ], + ) + + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" + runtime.cxx_library( name = "etdump_flatcc" + aten_suffix, srcs = [ @@ -106,6 +118,9 @@ def define_common_targets(): ], exported_deps = [ ":etdump_schema_flatcc", + ":utils", + "//executorch/devtools/etdump/data_sinks:data_sink_base" + aten_suffix, + "//executorch/devtools/etdump/data_sinks:buffer_data_sink" + aten_suffix, "//executorch/runtime/core:event_tracer" + aten_suffix, "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, ], diff --git a/devtools/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt index f8f19ed8d6e..1443457932e 100644 --- a/devtools/etdump/tests/CMakeLists.txt +++ b/devtools/etdump/tests/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
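With `set_data_sink()` in place, wiring a generator to any sink is a one-liner; the only contract is that the sink outlives the logging calls, since `ETDumpGen` holds a raw pointer. A hypothetical sketch (not from the change set; the surrounding module setup is elided):

```cpp
#include <executorch/devtools/etdump/data_sinks/file_data_sink.h>
#include <executorch/devtools/etdump/etdump_flatcc.h>

using executorch::etdump::ETDumpGen;
using executorch::etdump::FileDataSink;

void log_through_sink(ETDumpGen& etdump_gen, FileDataSink& sink) {
  // Tensors and EValues logged through etdump_gen are now written via the
  // sink, and the ETDump metadata records the offsets the sink returns.
  etdump_gen.set_data_sink(&sink);

  // ... run the method with etdump_gen installed as the event tracer ...

  // Detach before the sink is destroyed, since only a raw pointer is kept.
  etdump_gen.set_data_sink(nullptr);
}
```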
diff --git a/devtools/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt
index f8f19ed8d6e..1443457932e 100644
--- a/devtools/etdump/tests/CMakeLists.txt
+++ b/devtools/etdump/tests/CMakeLists.txt
@@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 
-include(${EXECUTORCH_ROOT}/build/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs etdump_test.cpp)
 
diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp
index 664a5ee1a0d..8e93a547074 100644
--- a/devtools/etdump/tests/etdump_test.cpp
+++ b/devtools/etdump/tests/etdump_test.cpp
@@ -8,22 +8,26 @@
 
 #include
 #include
+#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>
+#include <executorch/devtools/etdump/data_sinks/file_data_sink.h>
+#include <executorch/extension/testing_util/temp_file.h>
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
-#include
 
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::etdump::ETDumpGen;
 using ::executorch::etdump::ETDumpResult;
+using ::executorch::extension::testing::TempFile;
 using ::executorch::runtime::AllocatorID;
 using ::executorch::runtime::ArrayRef;
 using ::executorch::runtime::BoxedEvalueList;
@@ -35,6 +39,9 @@ using ::executorch::runtime::Span;
 using ::executorch::runtime::Tag;
 using ::executorch::runtime::testing::TensorFactory;
 
+using ::executorch::etdump::BufferDataSink;
+using ::executorch::etdump::FileDataSink;
+
 class ProfilerETDumpTest : public ::testing::Test {
  protected:
   void SetUp() override {
@@ -43,6 +50,9 @@ class ProfilerETDumpTest : public ::testing::Test {
     const size_t buf_size = 512 * 1024;
     buf = (uint8_t*)malloc(buf_size * sizeof(uint8_t));
     etdump_gen[1] = new ETDumpGen(Span<uint8_t>(buf, buf_size));
+
+    temp_file = std::make_unique<TempFile>(std::string());
+    dump_file_path = temp_file->path();
   }
 
   void TearDown() override {
@@ -53,6 +63,8 @@ class ProfilerETDumpTest : public ::testing::Test {
 
   ETDumpGen* etdump_gen[2];
   uint8_t* buf = nullptr;
+  std::unique_ptr<TempFile> temp_file;
+  std::string dump_file_path;
 };
 
 TEST_F(ProfilerETDumpTest, SingleProfileEvent) {
@@ -175,54 +187,92 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) {
 
 TEST_F(ProfilerETDumpTest, DebugEvent) {
   for (size_t i = 0; i < 2; i++) {
-    TensorFactory<ScalarType::Float> tf;
-    EValue evalue(tf.ones({3, 2}));
+    for (size_t j = 0; j < 3; j++) {
+      etdump_gen[i]->create_event_block("test_block");
 
-    etdump_gen[i]->create_event_block("test_block");
+      void* ptr = malloc(2048);
+
+      EValue evalue_int((int64_t)5);
+      etdump_gen[i]->log_evalue(evalue_int);
 
-    void* ptr = malloc(2048);
-    Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+      EValue evalue_double((double)1.5);
+      etdump_gen[i]->log_evalue(evalue_double);
 
-    etdump_gen[i]->set_debug_buffer(buffer);
-    etdump_gen[i]->log_evalue(evalue);
-    etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput);
+      EValue evalue_bool(true);
+      etdump_gen[i]->log_evalue(evalue_bool);
 
-    EValue evalue_int((int64_t)5);
-    etdump_gen[i]->log_evalue(evalue_int);
+      etdump_gen[i]->log_evalue(evalue_bool);
 
-    EValue evalue_double((double)1.5);
-    etdump_gen[i]->log_evalue(evalue_double);
+      TensorFactory<ScalarType::Float> tf;
+      EValue evalue_tensor(tf.ones({3, 2}));
 
-    EValue evalue_bool(true);
-    etdump_gen[i]->log_evalue(evalue_bool);
+      // using span to record debug data
+      Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+      auto buffer_data_sink = BufferDataSink::create(ptr, 2048);
+      auto file_data_sink = FileDataSink::create(dump_file_path.c_str());
 
-    etdump_gen[i]->log_evalue(evalue_bool);
+      if (j == 0) {
+        ET_EXPECT_DEATH(
+            etdump_gen[i]->log_evalue(evalue_tensor),
+            "Must set data sink before writing tensor-like data");
+        etdump_gen[i]->set_debug_buffer(buffer);
+      }
+      // using buffer data sink to record debug data
+      else if (j == 1) {
+        etdump_gen[i]->set_data_sink(&buffer_data_sink.get());
+      }
+      // using file data sink to record debug data
+      else {
+        etdump_gen[i]->set_data_sink(&file_data_sink.get());
+      }
+
+      etdump_gen[i]->log_evalue(evalue_tensor);
+      etdump_gen[i]->log_evalue(
+          evalue_tensor, LoggedEValueType::kProgramOutput);
 
-    free(ptr);
+      free(ptr);
+    }
   }
 }
 
 TEST_F(ProfilerETDumpTest, DebugEventTensorList) {
   for (size_t i = 0; i < 2; i++) {
-    TensorFactory<ScalarType::Float> tf;
-    executorch::aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})};
-    EValue evalue_1(storage[0]);
-    EValue evalue_2(storage[1]);
-    EValue* values_p[2] = {&evalue_1, &evalue_2};
+    for (size_t j = 0; j < 3; j++) {
+      TensorFactory<ScalarType::Float> tf;
+      executorch::aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})};
+      EValue evalue_1(storage[0]);
+      EValue evalue_2(storage[1]);
+      EValue* values_p[2] = {&evalue_1, &evalue_2};
 
-    BoxedEvalueList<Tensor> a_box(values_p, storage, 2);
-    EValue evalue(a_box);
-    evalue.tag = Tag::ListTensor;
+      BoxedEvalueList<Tensor> a_box(values_p, storage, 2);
+      EValue evalue(a_box);
+      evalue.tag = Tag::ListTensor;
 
-    etdump_gen[i]->create_event_block("test_block");
+      etdump_gen[i]->create_event_block("test_block");
 
-    void* ptr = malloc(2048);
-    Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+      void* ptr = malloc(2048);
+      Span<uint8_t> buffer((uint8_t*)ptr, 2048);
 
-    etdump_gen[i]->set_debug_buffer(buffer);
-    etdump_gen[i]->log_evalue(evalue);
+      auto buffer_data_sink = BufferDataSink::create(ptr, 2048);
+      auto file_data_sink = FileDataSink::create(dump_file_path.c_str());
 
-    free(ptr);
+      // using span to record debug data
+      if (j == 0) {
+        etdump_gen[i]->set_debug_buffer(buffer);
+      }
+      // using buffer data sink to record debug data
+      else if (j == 1) {
+        etdump_gen[i]->set_data_sink(&buffer_data_sink.get());
+      }
+      // using file data sink to record debug data
+      else {
+        etdump_gen[i]->set_data_sink(&file_data_sink.get());
+      }
+
+      etdump_gen[i]->log_evalue(evalue);
+
+      free(ptr);
+    }
   }
 }
 
@@ -231,61 +281,78 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) {
   TensorFactory<ScalarType::Float> tf;
   EValue evalue(tf.ones({3, 2}));
 
   for (size_t i = 0; i < 2; i++) {
-    etdump_gen[i]->create_event_block("test_block");
+    for (size_t j = 0; j < 2; j++) {
+      etdump_gen[i]->create_event_block("test_block");
 
-    void* ptr = malloc(2048);
-    Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+      void* ptr = malloc(2048);
+      Span<uint8_t> buffer((uint8_t*)ptr, 2048);
 
-    etdump_gen[i]->set_debug_buffer(buffer);
-    etdump_gen[i]->log_evalue(evalue);
-    etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput);
+      auto buffer_data_sink = BufferDataSink::create(ptr, 2048);
+      auto file_data_sink = FileDataSink::create(dump_file_path.c_str());
 
-    ETDumpResult result = etdump_gen[i]->get_etdump_data();
-    ASSERT_TRUE(result.buf != nullptr);
-    ASSERT_TRUE(result.size != 0);
+      // using span to record debug data
+      if (j == 0) {
+        etdump_gen[i]->set_debug_buffer(buffer);
+      }
+      // using buffer data sink to record debug data
+      else if (j == 1) {
+        etdump_gen[i]->set_data_sink(&buffer_data_sink.get());
+      }
+      // using file data sink to record debug data
+      else {
+        etdump_gen[i]->set_data_sink(&file_data_sink.get());
+      }
 
-    size_t size = 0;
-    void* buf = flatbuffers_read_size_prefix(result.buf, &size);
-    etdump_ETDump_table_t etdump = etdump_ETDump_as_root_with_identifier(
-        buf, etdump_ETDump_file_identifier);
+      etdump_gen[i]->log_evalue(evalue);
+      etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput);
 
-    etdump_RunData_vec_t run_data_vec = etdump_ETDump_run_data(etdump);
-    ASSERT_EQ(etdump_RunData_vec_len(run_data_vec), 1);
-
-    etdump_Event_vec_t events =
-        etdump_RunData_events(etdump_RunData_vec_at(run_data_vec, 0));
-    ASSERT_EQ(etdump_Event_vec_len(events), 2);
-
-    etdump_Event_table_t event = etdump_Event_vec_at(events, 0);
-
-    etdump_DebugEvent_table_t single_debug_event =
-        etdump_Event_debug_event(event);
-    etdump_Value_table_t value =
-        etdump_DebugEvent_debug_entry(single_debug_event);
-    ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
-    ASSERT_EQ(etdump_Value_output_is_present(value), false);
-
-    etdump_Tensor_table_t tensor = etdump_Value_tensor(value);
-    executorch_flatbuffer_ScalarType_enum_t scalar_enum =
-        etdump_Tensor_scalar_type(tensor);
-    ASSERT_EQ(scalar_enum, executorch_flatbuffer_ScalarType_FLOAT);
-    flatbuffers_int64_vec_t sizes = etdump_Tensor_sizes(tensor);
-    ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 3);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 2);
-
-    event = etdump_Event_vec_at(events, 1);
-    single_debug_event = etdump_Event_debug_event(event);
-    value = etdump_DebugEvent_debug_entry(single_debug_event);
-    ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
-    ASSERT_EQ(etdump_Value_output_is_present(value), true);
-    etdump_Bool_table_t bool_val = etdump_Value_output_get(value);
-    bool bool_val_from_table = etdump_Bool_bool_val(bool_val);
-    ASSERT_EQ(bool_val_from_table, true);
-
-    free(ptr);
-    if (!etdump_gen[i]->is_static_etdump()) {
-      free(result.buf);
+      ETDumpResult result = etdump_gen[i]->get_etdump_data();
+      ASSERT_TRUE(result.buf != nullptr);
+      ASSERT_TRUE(result.size != 0);
+
+      size_t size = 0;
+      void* buf = flatbuffers_read_size_prefix(result.buf, &size);
+      etdump_ETDump_table_t etdump = etdump_ETDump_as_root_with_identifier(
+          buf, etdump_ETDump_file_identifier);
+
+      etdump_RunData_vec_t run_data_vec = etdump_ETDump_run_data(etdump);
+      ASSERT_EQ(etdump_RunData_vec_len(run_data_vec), 1);
+
+      etdump_Event_vec_t events =
+          etdump_RunData_events(etdump_RunData_vec_at(run_data_vec, 0));
+      ASSERT_EQ(etdump_Event_vec_len(events), 2);
+
+      etdump_Event_table_t event = etdump_Event_vec_at(events, 0);
+
+      etdump_DebugEvent_table_t single_debug_event =
+          etdump_Event_debug_event(event);
+      etdump_Value_table_t value =
+          etdump_DebugEvent_debug_entry(single_debug_event);
+      ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
+      ASSERT_EQ(etdump_Value_output_is_present(value), false);
+
+      etdump_Tensor_table_t tensor = etdump_Value_tensor(value);
+      executorch_flatbuffer_ScalarType_enum_t scalar_enum =
+          etdump_Tensor_scalar_type(tensor);
+      ASSERT_EQ(scalar_enum, executorch_flatbuffer_ScalarType_FLOAT);
+      flatbuffers_int64_vec_t sizes = etdump_Tensor_sizes(tensor);
+      ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 3);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 2);
+
+      event = etdump_Event_vec_at(events, 1);
+      single_debug_event = etdump_Event_debug_event(event);
+      value = etdump_DebugEvent_debug_entry(single_debug_event);
+      ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
+      ASSERT_EQ(etdump_Value_output_is_present(value), true);
+      etdump_Bool_table_t bool_val = etdump_Value_output_get(value);
+      bool bool_val_from_table = etdump_Bool_bool_val(bool_val);
+      ASSERT_EQ(bool_val_from_table, true);
+
+      free(ptr);
+      if (!etdump_gen[i]->is_static_etdump()) {
+        free(result.buf);
+      }
     }
   }
 }
 
@@ -432,58 +499,75 @@ TEST_F(ProfilerETDumpTest, VerifyData) {
 
 TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) {
   for (size_t i = 0; i < 2; i++) {
-    void* ptr = malloc(2048);
-    Span<uint8_t> buffer((uint8_t*)ptr, 2048);
-
-    etdump_gen[i]->create_event_block("test_block");
-    TensorFactory<ScalarType::Float> tf;
-
-    ET_EXPECT_DEATH(
-        etdump_gen[i]->log_intermediate_output_delegate(
-            "test_event_tensor",
-            static_cast<DebugHandle>(-1),
-            tf.ones({3, 2})),
-        "Must pre-set debug buffer with set_debug_buffer()");
-    etdump_gen[i]->set_debug_buffer(buffer);
-
-    // Log a tensor
-    etdump_gen[i]->log_intermediate_output_delegate(
-        "test_event_tensor",
-        static_cast<DebugHandle>(-1),
-        tf.ones({3, 2}));
+    for (size_t j = 0; j < 3; j++) {
+      void* ptr = malloc(2048);
+      Span<uint8_t> buffer((uint8_t*)ptr, 2048);
 
-    // Log a tensor list
-    std::vector<Tensor> tensors = {tf.ones({5, 4}), tf.ones({7, 6})};
-    etdump_gen[i]->log_intermediate_output_delegate(
-        "test_event_tensorlist",
-        static_cast<DebugHandle>(-1),
-        ArrayRef<Tensor>(tensors.data(), tensors.size()));
-
-    // Log an int
-    etdump_gen[i]->log_intermediate_output_delegate(
-        "test_event_tensorlist",
-        static_cast<DebugHandle>(-1),
-        10);
+      auto buffer_data_sink = BufferDataSink::create(ptr, 2048);
+      auto file_data_sink = FileDataSink::create(dump_file_path.c_str());
 
-    // Log a double
-    etdump_gen[i]->log_intermediate_output_delegate(
-        "test_event_tensorlist",
-        static_cast<DebugHandle>(-1),
-        20.75);
+      etdump_gen[i]->create_event_block("test_block");
+      TensorFactory<ScalarType::Float> tf;
+
+      // using span to record debug data
+      if (j == 0) {
+        // TODO(gasoonjia): add similar ET_EXPECT_DEATH on BufferDataSink branch
+        ET_EXPECT_DEATH(
+            etdump_gen[i]->log_intermediate_output_delegate(
+                "test_event_tensor",
+                static_cast<DebugHandle>(-1),
+                tf.ones({3, 2})),
+            "Must set data sink before writing tensor-like data");
+        etdump_gen[i]->set_debug_buffer(buffer);
+      }
+      // using buffer data sink to record debug data
+      else if (j == 1) {
+        etdump_gen[i]->set_data_sink(&buffer_data_sink.get());
+      }
+      // using file data sink to record debug data
+      else {
+        etdump_gen[i]->set_data_sink(&file_data_sink.get());
+      }
 
-    // Log a bool
-    etdump_gen[i]->log_intermediate_output_delegate(
-        "test_event_tensorlist",
-        static_cast<DebugHandle>(-1),
-        true);
+      // Log a tensor
+      etdump_gen[i]->log_intermediate_output_delegate(
+          "test_event_tensor",
+          static_cast<DebugHandle>(-1),
+          tf.ones({3, 2}));
+
+      // Log a tensor list
+      std::vector<Tensor> tensors = {tf.ones({5, 4}), tf.ones({7, 6})};
+      etdump_gen[i]->log_intermediate_output_delegate(
+          "test_event_tensorlist",
+          static_cast<DebugHandle>(-1),
+          ArrayRef<Tensor>(tensors.data(), tensors.size()));
+
+      // Log an int
+      etdump_gen[i]->log_intermediate_output_delegate(
+          "test_event_tensorlist",
+          static_cast<DebugHandle>(-1),
+          10);
+
+      // Log a double
+      etdump_gen[i]->log_intermediate_output_delegate(
+          "test_event_tensorlist",
+          static_cast<DebugHandle>(-1),
+          20.75);
+
+      // Log a bool
+      etdump_gen[i]->log_intermediate_output_delegate(
+          "test_event_tensorlist",
+          static_cast<DebugHandle>(-1),
+          true);
 
-    ETDumpResult result = etdump_gen[i]->get_etdump_data();
-    ASSERT_TRUE(result.buf != nullptr);
-    ASSERT_TRUE(result.size != 0);
+      ETDumpResult result = etdump_gen[i]->get_etdump_data();
+      ASSERT_TRUE(result.buf != nullptr);
+      ASSERT_TRUE(result.size != 0);
 
-    free(ptr);
-    if (!etdump_gen[i]->is_static_etdump()) {
-      free(result.buf);
+      free(ptr);
+      if (!etdump_gen[i]->is_static_etdump()) {
+        free(result.buf);
+      }
     }
   }
 }
 
@@ -493,81 +577,97 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) {
   TensorFactory<ScalarType::Float> tf;
   EValue evalue(tf.ones({3, 2}));
 
   for (size_t i = 0; i < 2; i++) {
-    etdump_gen[i]->create_event_block("test_block");
-
-    void* ptr = malloc(2048);
-    Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+    for (size_t j = 0; j < 3; j++) {
+      etdump_gen[i]->create_event_block("test_block");
 
-    etdump_gen[i]->set_debug_buffer(buffer);
+      void* ptr = malloc(2048);
+      Span<uint8_t> buffer((uint8_t*)ptr, 2048);
+      auto buffer_data_sink = BufferDataSink::create(ptr, 2048);
+      auto file_data_sink = FileDataSink::create(dump_file_path.c_str());
 
-    // Event 0
-    etdump_gen[i]->log_intermediate_output_delegate(
-        nullptr, 257, tf.ones({3, 4}));
-    // Event 1
-    etdump_gen[i]->log_intermediate_output_delegate(
-        nullptr, 258, tf.ones({5, 6}));
+      // using span to record debug data
+      if (j == 0) {
+        etdump_gen[i]->set_debug_buffer(buffer);
+      }
+      // using buffer data sink to record debug data
+      else if (j == 1) {
+        etdump_gen[i]->set_data_sink(&buffer_data_sink.get());
+      }
+      // using file data sink to record debug data
+      else {
+        etdump_gen[i]->set_data_sink(&file_data_sink.get());
+      }
 
-    ETDumpResult result = etdump_gen[i]->get_etdump_data();
-    ASSERT_TRUE(result.buf != nullptr);
-    ASSERT_TRUE(result.size != 0);
+      // Event 0
+      etdump_gen[i]->log_intermediate_output_delegate(
+          nullptr, 257, tf.ones({3, 4}));
+      // Event 1
+      etdump_gen[i]->log_intermediate_output_delegate(
+          nullptr, 258, tf.ones({5, 6}));
 
-    size_t size = 0;
-    void* buf = flatbuffers_read_size_prefix(result.buf, &size);
-    etdump_ETDump_table_t etdump = etdump_ETDump_as_root_with_identifier(
-        buf, etdump_ETDump_file_identifier);
+      ETDumpResult result = etdump_gen[i]->get_etdump_data();
+      ASSERT_TRUE(result.buf != nullptr);
+      ASSERT_TRUE(result.size != 0);
 
-    etdump_RunData_vec_t run_data_vec = etdump_ETDump_run_data(etdump);
-    ASSERT_EQ(etdump_RunData_vec_len(run_data_vec), 1);
-
-    etdump_Event_vec_t events =
-        etdump_RunData_events(etdump_RunData_vec_at(run_data_vec, 0));
-    ASSERT_EQ(etdump_Event_vec_len(events), 2);
-
-    // Verify Event 0
-    etdump_Event_table_t event_0 = etdump_Event_vec_at(events, 0);
-
-    etdump_DebugEvent_table_t single_debug_event =
-        etdump_Event_debug_event(event_0);
-    etdump_Value_table_t value =
-        etdump_DebugEvent_debug_entry(single_debug_event);
-    ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
-
-    etdump_Tensor_table_t tensor = etdump_Value_tensor(value);
-    executorch_flatbuffer_ScalarType_enum_t scalar_enum =
-        etdump_Tensor_scalar_type(tensor);
-    ASSERT_EQ(scalar_enum, executorch_flatbuffer_ScalarType_FLOAT);
-    flatbuffers_int64_vec_t sizes = etdump_Tensor_sizes(tensor);
-    ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 3);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 4);
-
-    // Verify Event 1
-    etdump_Event_table_t event_1 = etdump_Event_vec_at(events, 1);
-
-    single_debug_event = etdump_Event_debug_event(event_1);
-    value = etdump_DebugEvent_debug_entry(single_debug_event);
-
-    tensor = etdump_Value_tensor(value);
-    sizes = etdump_Tensor_sizes(tensor);
-    ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 5);
-    ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 6);
-
-    // Event 1 should have a empty delegate_debug_id_str
-    flatbuffers_string_t delegate_debug_id_name =
-        etdump_DebugEvent_delegate_debug_id_str(
-            etdump_Event_debug_event(event_1));
+      size_t size = 0;
+      void* buf = flatbuffers_read_size_prefix(result.buf, &size);
+      etdump_ETDump_table_t etdump = etdump_ETDump_as_root_with_identifier(
+          buf, etdump_ETDump_file_identifier);
 
-    EXPECT_EQ(delegate_debug_id_name, nullptr);
-    // Check for the correct delegate_debug_id_int
-    EXPECT_EQ(
-        etdump_DebugEvent_delegate_debug_id_int(
-            etdump_Event_debug_event(event_1)),
-        258);
+      etdump_RunData_vec_t run_data_vec = etdump_ETDump_run_data(etdump);
+      ASSERT_EQ(etdump_RunData_vec_len(run_data_vec), 1);
+
+      etdump_Event_vec_t events =
+          etdump_RunData_events(etdump_RunData_vec_at(run_data_vec, 0));
+      ASSERT_EQ(etdump_Event_vec_len(events), 2);
+
+      // Verify Event 0
+      etdump_Event_table_t event_0 = etdump_Event_vec_at(events, 0);
+
+      etdump_DebugEvent_table_t single_debug_event =
+          etdump_Event_debug_event(event_0);
+      etdump_Value_table_t value =
+          etdump_DebugEvent_debug_entry(single_debug_event);
+      ASSERT_EQ(etdump_Value_tensor_is_present(value), true);
+
+      etdump_Tensor_table_t tensor = etdump_Value_tensor(value);
+      executorch_flatbuffer_ScalarType_enum_t scalar_enum =
+          etdump_Tensor_scalar_type(tensor);
+      ASSERT_EQ(scalar_enum, executorch_flatbuffer_ScalarType_FLOAT);
+      flatbuffers_int64_vec_t sizes = etdump_Tensor_sizes(tensor);
+      ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 3);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 4);
+
+      // Verify Event 1
+      etdump_Event_table_t event_1 = etdump_Event_vec_at(events, 1);
+
+      single_debug_event = etdump_Event_debug_event(event_1);
+      value = etdump_DebugEvent_debug_entry(single_debug_event);
+
+      tensor = etdump_Value_tensor(value);
+      sizes = etdump_Tensor_sizes(tensor);
+      ASSERT_EQ(flatbuffers_int64_vec_len(sizes), 2);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 0), 5);
+      ASSERT_EQ(flatbuffers_int64_vec_at(sizes, 1), 6);
+
+      // Event 1 should have an empty delegate_debug_id_str
+      flatbuffers_string_t delegate_debug_id_name =
+          etdump_DebugEvent_delegate_debug_id_str(
+              etdump_Event_debug_event(event_1));
+
+      EXPECT_EQ(delegate_debug_id_name, nullptr);
+      // Check for the correct delegate_debug_id_int
+      EXPECT_EQ(
+          etdump_DebugEvent_delegate_debug_id_int(
+              etdump_Event_debug_event(event_1)),
+          258);
 
-    free(ptr);
-    if (!etdump_gen[i]->is_static_etdump()) {
-      free(result.buf);
+      free(ptr);
+      if (!etdump_gen[i]->is_static_etdump()) {
+        free(result.buf);
+      }
    }
  }
 }
diff --git a/devtools/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl
index 5299b7c1cb7..c97696c6c82 100644
--- a/devtools/etdump/tests/targets.bzl
+++ b/devtools/etdump/tests/targets.bzl
@@ -15,6 +15,8 @@ def define_common_targets():
         deps = [
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/devtools/etdump:etdump_schema_flatcc",
+            "//executorch/devtools/etdump/data_sinks:file_data_sink",
+            "//executorch/extension/testing_util:temp_file",
             "//executorch/runtime/platform:platform",
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
         ],
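The pointer-alignment helper is centralized in the new `utils.h` below. As a quick self-checking restatement of its bit trick (a sketch, not part of the change set):

```cpp
#include <cstdint>

// Same arithmetic as internal::align_pointer() below, on raw integers:
// round addr up to the next multiple of a power-of-two alignment.
constexpr uintptr_t align_up(uintptr_t addr, uintptr_t alignment) {
  return (addr & (alignment - 1)) == 0 ? addr : (addr | (alignment - 1)) + 1;
}

static_assert(align_up(0x1003, 64) == 0x1040, "rounds up to next boundary");
static_assert(align_up(0x1040, 64) == 0x1040, "aligned addresses unchanged");
```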
diff --git a/devtools/etdump/utils.h b/devtools/etdump/utils.h
new file mode 100644
index 00000000000..0d6b5cd8515
--- /dev/null
+++ b/devtools/etdump/utils.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#pragma once
+
+namespace executorch {
+namespace etdump {
+namespace internal {
+
+/**
+ * Aligns a pointer to the next multiple of `alignment`.
+ *
+ * @param[in] ptr Pointer to align.
+ * @param[in] alignment Alignment to align to. Must be a power of 2 and cannot
+ *     be 0.
+ *
+ * @returns A pointer aligned to `alignment`.
+ */
+inline uint8_t* align_pointer(void* ptr, size_t alignment) {
+  intptr_t addr = reinterpret_cast<intptr_t>(ptr);
+  if ((addr & (alignment - 1)) == 0) {
+    // Already aligned.
+    return reinterpret_cast<uint8_t*>(ptr);
+  }
+  addr = (addr | (alignment - 1)) + 1;
+  return reinterpret_cast<uint8_t*>(addr);
+}
+
+} // namespace internal
+} // namespace etdump
+} // namespace executorch
diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py
index 989663601f8..d0243d86255 100644
--- a/devtools/inspector/_inspector.py
+++ b/devtools/inspector/_inspector.py
@@ -1224,6 +1224,10 @@ def find_total_for_module(self, module_name: str) -> float:
         total = 0.0
         for block in self.event_blocks:
             for event in block.events:
+                # Skip OPERATOR_CALL events to avoid double-counting and exclude framework tax
+                if event.event_name == "OPERATOR_CALL":
+                    continue
+
                 module_hierarchy = event.module_hierarchy.values()
                 for hierarchy in module_hierarchy:
                     if not hierarchy:
diff --git a/docs/TARGETS b/docs/TARGETS
index 6e8210dbdfe..a0281b8b782 100644
--- a/docs/TARGETS
+++ b/docs/TARGETS
@@ -9,8 +9,9 @@ python_binary(
     par_style = "xar",
     deps = [
         "//caffe2:torch",
-        "//executorch/exir:lib",
+        "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
         "//executorch/devtools:lib",
+        "//executorch/exir:lib",
         "//executorch/exir/backend/test:backend_with_compiler_demo",
         "//executorch/exir/backend/test:op_partitioner_demo",
         "//executorch/devtools/bundled_program/serialize:lib",
diff --git a/docs/source/android-prebuilt-library.md b/docs/source/android-prebuilt-library.md
deleted file mode 100644
index 324c63376c5..00000000000
--- a/docs/source/android-prebuilt-library.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Using Android prebuilt libraries (AAR)
-
-We provide two prebuilt Android libraries (AAR), `executorch.aar` for generic use case (image/audio processing) and `executorch_llama.aar` for LLAMA use case.
-
-## Contents of libraries
-- `executorch.aar`
-  - [Java library](https://github.com/pytorch/executorch/tree/main/extension/android/src/main/java/org/pytorch/executorch)
-  - JNI contains the JNI binding for [NativePeer.java](https://github.com/pytorch/executorch/blob/main/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java) and ExecuTorch native library, including core ExecuTorch runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, and Quantized kernels.
-  - Comes with two ABI variants, arm64-v8a and x86_64.
-- `executorch_llama.aar`
-  - [Java library](https://github.com/pytorch/executorch/tree/main/extension/android/src/main/java/org/pytorch/executorch) (Note: it contains the same Java classes as the previous Java, but it does not contain the JNI binding for generic Module/NativePeer Java code).
-  - JNI contains the JNI binding for [LlamaModule.java](https://github.com/pytorch/executorch/blob/main/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java) and ExecuTorch native library, including core ExecuTorch runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, Quantized kernels, and LLAMA-specific Custom ops library.
-  - Comes with two ABI variants, arm64-v8a and x86_64.
-
-## Downloading AAR
-[executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar)
-[executorch.aar.sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar.sha256sums)
-
-## Using prebuilt libraries
-
-To add the Java library to your app, simply download the AAR, and add it to your gradle build rule.
- -In your app working directory, such as example executorch/examples/demo-apps/android/LlamaDemo, -``` -mkdir -p app/libs -curl https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar -o app/libs/executorch.aar -``` - -And include it in gradle: -``` -# app/build.grardle.kts -dependencies { - implementation(files("libs/executorch.aar")) -} -``` - -Now you can compile your app with the ExecuTorch Android library. diff --git a/docs/source/api-life-cycle.md b/docs/source/api-life-cycle.md index 1836ba77d71..0327f23a985 100644 --- a/docs/source/api-life-cycle.md +++ b/docs/source/api-life-cycle.md @@ -1,4 +1,4 @@ -# ExecuTorch API Life Cycle and Deprecation Policy +# API Life Cycle and Deprecation Policy ## API Life Cycle diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/backend-delegates-xnnpack-reference.md similarity index 99% rename from docs/source/native-delegates-executorch-xnnpack-delegate.md rename to docs/source/backend-delegates-xnnpack-reference.md index 6bfbfa6be36..52d208de219 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/backend-delegates-xnnpack-reference.md @@ -1,4 +1,4 @@ -# ExecuTorch XNNPACK delegate +# XNNPACK Delegate Internals This is a high-level overview of the ExecuTorch XNNPACK backend delegate. This high performance delegate is aimed to reduce CPU inference latency for ExecuTorch models. We will provide a brief introduction to the XNNPACK library and explore the delegate’s overall architecture and intended use cases. diff --git a/docs/source/backend-template.md b/docs/source/backend-template.md new file mode 100644 index 00000000000..5dc5f739671 --- /dev/null +++ b/docs/source/backend-template.md @@ -0,0 +1,42 @@ +# Backend Template + +Provide a brief overview/description of the backend. At a high level, what does it do? Consider linking to top-level vendor documentation for the target hardware family and/or framework (Core ML, XNNPACK, etc.). + +## Features + +List high-level features of the backend, such as general operator and hardware support. + +## Target Requirements + +What hardware and software is required to run the backend on a specific device? For example, does it require specific iOS or Android OS versions? If it's an NPU, what hardware models are supported? + +## Development Requirements + +What software and hardware is needed to create a .PTE file targeting this backend? Are there any additional dependencies that need to be installed that are not included with the ExecuTorch pip package? How does the user install them? + +## Using *Backend Name* + +This section describes the steps users need to take in order to generate a .PTE targeting this backend. Include a full code sample for exporting and lowering a model to this backend. Make sure relevant imports for the backend partitioner are included. + +### Partitioner API + +What options, if any, does the partitioner take? Are there any other export-time configurations that can be applied? Document each option. + +### Quantization + +What quantization schemes does this backend support? Consider including the following, as appropriate. +- What operators are supported? +- Number of bits? +- Static vs dynamic activations? +- Weight only vs activations + weights? +- Symmetric vs asymmetric weights? +- Per-tensor, per-channel, group/blockwise? + +Include a code snippet demonstrating how to perform quantization for this backend.
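For illustration, a generic PT2E-style sketch of the kind of snippet this section should contain might look like the following; `MyBackendQuantizer` is a hypothetical placeholder for the backend's own quantizer, and the stand-in model and calibration data are illustrative only:

```python
# A hedged, generic sketch of a PT2E quantization snippet for a backend page.
# `MyBackendQuantizer` is hypothetical; substitute the backend's own quantizer.
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = torch.nn.Linear(4, 4).eval()             # stand-in model
sample_inputs = (torch.randn(1, 4),)

quantizer = MyBackendQuantizer()                 # hypothetical backend quantizer
training_gm = torch.export.export_for_training(model, sample_inputs).module()
prepared = prepare_pt2e(training_gm, quantizer)  # insert observers
for sample in [torch.randn(1, 4)]:               # representative calibration data
    prepared(sample)
quantized = convert_pt2e(prepared)               # ready for export and lowering
```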
Document, or link to, a description of the parameters that the user can specify. + +## Runtime Integration + +This section is intended to tell the user all of the steps they'll need to take to be able to run a .PTE file on-device that is targeting the given backend. +- What CMake targets should they link to? +- How is this backend compiled from source? +- Is the backend bundled by default in iOS and/or Android pre-built libraries? diff --git a/docs/source/executorch-arm-delegate-tutorial.md b/docs/source/backends-arm-ethos-u.md similarity index 88% rename from docs/source/executorch-arm-delegate-tutorial.md rename to docs/source/backends-arm-ethos-u.md index eaea7fc23bf..a64ff2729e2 100644 --- a/docs/source/executorch-arm-delegate-tutorial.md +++ b/docs/source/backends-arm-ethos-u.md @@ -1,5 +1,5 @@ -# Building and Running ExecuTorch with ARM Ethos-U Backend +# ARM Ethos-U Backend ::::{grid} 2 @@ -7,8 +7,8 @@ :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites * [Introduction to ExecuTorch](./intro-how-it-works.md) -* [Setting up ExecuTorch](./getting-started-setup.md) -* [Building ExecuTorch with CMake](./runtime-build-and-cross-compilation.md) +* [Getting Started](./getting-started.md) +* [Building ExecuTorch with CMake](./using-executorch-building-from-source.md) ::: :::{grid-item-card} What you will learn in this tutorial: @@ -114,6 +114,25 @@ At the end of the setup, if everything goes well, your top level devlopement dir └── setup_path.sh ``` +### Notes: + +The `setup.sh` script has generated a `setup_path.sh` script that you need to source every time you restart your shell. + +e.g. run +`source executorch/examples/arm/ethos-u-scratch/setup_path.sh` + +As `setup.sh` downloads and sets up the needed Arm toolchain, make sure it is the one being used by calling + +`which arm-none-eabi-gcc` + +It should show `arm-none-eabi-gcc` inside the `executorch` project and not anything in `/usr/bin`; something like: + +`/examples/arm/ethos-u-scratch/arm-gnu-toolchain-13.3.rel1-aarch64-arm-none-eabi/bin/arm-none-eabi-gcc` or `/examples/arm/ethos-u-scratch/arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi/bin/arm-none-eabi-gcc` + +If not, you might need to uninstall `arm-none-eabi-gcc`, or make sure it is picked up after the one in the project in your $PATH environment variable. + ## Convert the PyTorch Model to the `.pte` File `.pte` is a binary file produced by ExecuTorch Ahead-of-Time (AoT) pipeline by taking in a PyTorch Model (a torch.nn.Module), exporting it, running a variety of passes, and finally serializing it to a `.pte` file format. This binary file is typically consumed by the ExecuTorch Runtime. This [document](https://github.com/pytorch/executorch/blob/main/docs/source/getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime.
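To make the AoT steps above concrete, here is a minimal sketch of the generic export-to-`.pte` flow; the module is illustrative, and `aot_arm_compiler` layers Arm/Ethos-U specific passes, quantization, and file naming on top of this basic pipeline:

```python
# A minimal sketch of the generic AoT flow described above (illustrative only;
# the real aot_arm_compiler adds Arm/Ethos-U specific handling).
import torch
from executorch.exir import to_edge

class SoftmaxModule(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.softmax(x, dim=-1)

# export -> Edge dialect passes -> ExecuTorch program -> serialized .pte
exported = torch.export.export(SoftmaxModule(), (torch.randn(2, 4),))
et_program = to_edge(exported).to_executorch()
with open("softmax.pte", "wb") as f:
    f.write(et_program.buffer)
```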
@@ -200,7 +219,7 @@ Following script will serve as a helper utility to help us generate the `.pte` f ```bash python3 -m examples.arm.aot_arm_compiler --model_name="softmax" -# This should produce ./softmax.pte +# This should produce ./softmax_arm_ethos-u55-128.pte ``` ### Delegated Workflow @@ -221,15 +240,15 @@ Similar to the non-delegate flow, the same script will server as a helper utilit ```bash python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate -# should produce ./add_arm_delegate.pte +# should produce ./add_arm_delegate_ethos-u55-128.pte ``` ### Delegated Quantized Workflow Before generating the `.pte` file for delegated quantized networks like MobileNetV2, we need to build the `quantized_ops_aot_lib` +You can run the `backends/arm/scripts/build_quantized_ops_aot_lib.sh` script to build this for you, or build it yourself like this: + ```bash -SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" -CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" cd mkdir -p cmake-out-aot-lib @@ -237,7 +256,6 @@ cmake -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -DPYTHON_EXECUTABLE=python3 \ -Bcmake-out-aot-lib \ "${et_root_dir}" @@ -248,7 +266,7 @@ cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib After the `quantized_ops_aot_lib` build, we can run the following script to generate the `.pte` file ```bash python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --delegate --quantize --so_library="$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.so)" -# should produce ./mv2_arm_delegate.pte.pte +# should produce ./mv2_arm_delegate_ethos-u55-128.pte ```
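For context on why `--so_library` is needed: the quantized out-variant kernels live in a shared library that must be loaded into the Python process before export so the pipeline can resolve those ops. Conceptually it boils down to something like the sketch below; the exact path is illustrative and depends on your build tree:

```python
import torch

# Loading the AoT library registers the quantized out-variant ops with
# PyTorch so that export can resolve them. The path is illustrative; use
# the .so found under your cmake-out-aot-lib build tree.
torch.ops.load_library(
    "cmake-out-aot-lib/kernels/quantized/libquantized_ops_aot_lib.so"
)
```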
@@ -265,6 +283,14 @@ Now let's try to run these `.pte` files on a Corstone-300 and Corstone-320 platf In this section, we will go over steps that you need to go through to build the runtime application. This then run on the target device. In the executorch repository we have a functioning script which does the exact same steps. It is located at `executorch/examples/arm/run.sh`. We will use that to build necessary pieces and finally run the previously generated PTE file on an FVP. +By default `run.sh` will use `arm_test/` as the build and output folder, and you will find the build artifacts under it. This can be controlled/overridden with the `--et_build_root` and `--output` flags if needed. + +e.g. running `examples/arm/run.sh --model_name=add --target=ethos-u85-128` will produce a `.pte` and an elf file like this: + +```bash +arm_test/add/add_arm_delegate_ethos-u85-128.pte +arm_test/add/cmake-out/arm_executor_runner +``` Also before we get started, make sure that you have completed ExecuTorch cmake build setup, and the instructions to setup the development environment described [earlier](#set-up-the-developer-environment). The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable. @@ -279,7 +305,7 @@ The `generate_pte_file` function in `run.sh` script produces the `.pte` files ba ExecuTorch's CMake build system produces a set of build pieces which are critical for us to include and run the ExecuTorch runtime with-in the bare-metal environment we have for Corstone FVPs from Ethos-U SDK. -[This](./runtime-build-and-cross-compilation.md) document provides a detailed overview of each individual build piece. For running either variant of the `.pte` file, we will need a core set of libraries. Here is a list, +[This](./using-executorch-building-from-source.md) document provides a detailed overview of each individual build piece. For running either variant of the `.pte` file, we will need a core set of libraries. Here is a list, - `libexecutorch.a` - `libportable_kernels.a` @@ -289,23 +315,19 @@ To run a `.pte` file with the Arm backend delegate call instructions, we will ne - `libexecutorch_delegate_ethos_u.a` -These libraries are generated in `build_executorch` and `build_quantization_aot_lib` function of the `run.sh` script. +These libraries are generated by the `backends/arm/scripts/build_executorch.sh`, `backends/arm/scripts/build_portable_kernels.sh` and `backends/arm/scripts/build_quantized_ops_aot_lib.sh` scripts called from the `run.sh` script. -In this function, `EXECUTORCH_SELECT_OPS_LIST` will decide the number of portable operators included in the build and are available at runtime. It must match with `.pte` file's requirements, otherwise you will get `Missing Operator` error at runtime. +The `--portable_kernels` flag can be used to set the build flag `EXECUTORCH_SELECT_OPS_LIST` when running `backends/arm/scripts/build_portable_kernels.sh`, which decides the set of portable operators included in the build and available at runtime. It must match the `.pte` file's requirements; otherwise you will get a `Missing Operator` error at runtime. For example, there in the command line above, to run SoftmaxModule, we only included the softmax CPU operator. Similarly, to run AddModule in a non-delegated manner you will need add op and so on.
As you might have already realized, for the delegated operators, which will be executed by the Arm backend delegate, we do not need to include those operators in this list. This is only for *non-delegated* operators. -```{tip} -The `run.sh` script takes in `--portable_kernels` option, which provides a way to supply a comma seperated list of portable kernels to be included. -``` - ### Building the executor_runner Bare-Metal Application The SDK dir is the same one prepared [earlier](#setup-the-arm-ethos-u-software-development). And, we will be passing the `.pte` file (any one of them) generated above. Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. This constraint is from the constrained bare-metal runtime environment we have for Corstone-300/Corstone-320 platforms. -This is performed by the `build_executorch_runner` function in `run.sh`. +This is performed by the `backends/arm/scripts/build_executorch_runner.sh` script run from `run.sh`. ```{tip} The `run.sh` script takes in `--target` option, which provides a way to provide a specific target, Corstone-300(ethos-u55-128) or Corstone-320(ethos-u85-128) @@ -313,7 +335,10 @@ The `run.sh` script takes in `--target` option, which provides a way to provide ## Running on Corstone FVP Platforms -Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf. The below command is used to run the [MV2Model](#mv2module) on Corstone-320 FVP +Once the elf is prepared, regardless of which `.pte` file variant was used to generate it, `run.sh` will run the FVP for you via the `backends/arm/scripts/run_fvp.sh` script, but you can also run it directly. + + +The below command is used to run the [MV2Model](#mv2module) on Corstone-320 FVP ```bash ethos_u_build_dir=examples/arm/executor_runner/ diff --git a/docs/source/build-run-xtensa.md b/docs/source/backends-cadence.md similarity index 96% rename from docs/source/build-run-xtensa.md rename to docs/source/backends-cadence.md index 6097c9095a6..6dfb097c805 100644 --- a/docs/source/build-run-xtensa.md +++ b/docs/source/backends-cadence.md @@ -1,4 +1,4 @@ -# Building and Running ExecuTorch on Xtensa HiFi4 DSP +# Cadence Xtensa Backend In this tutorial we will walk you through the process of getting setup to build ExecuTorch for an Xtensa HiFi4 DSP and running a simple model on it. @@ -17,9 +17,9 @@ On top of being able to run on the Xtensa HiFi4 DSP, another goal of this tutori ::: :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [Introduction to ExecuTorch](./intro-how-it-works.md) +* [Getting Started](./getting-started.md) +* [Building ExecuTorch with CMake](./using-executorch-building-from-source.md) ::: :::: @@ -174,8 +174,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=/backends/cadence/cadence.cmake -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ -DEXECUTORCH_BUILD_CPUINFO=OFF \ - -DEXECUTORCH_BUILD_FLATC=OFF \ - -DFLATC_EXECUTABLE="$(which flatc)" \ -Bcmake-out .
cmake --build cmake-out -j --target install --config Debug @@ -184,8 +182,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_TOOLCHAIN_FILE=/examples/backends/cadence.cmake \ -DCMAKE_PREFIX_PATH=/cmake-out \ -DMODEL_PATH= \ - -DNXP_SDK_ROOT_DIR= -DEXECUTORCH_BUILD_FLATC=0 \ - -DFLATC_EXECUTABLE="$(which flatc)" \ + -DNXP_SDK_ROOT_DIR= \ -DNN_LIB_BASE_DIR= \ -Bcmake-out/examples/cadence \ examples/cadence diff --git a/docs/source/backends-coreml.md b/docs/source/backends-coreml.md new file mode 100644 index 00000000000..a06820b2d08 --- /dev/null +++ b/docs/source/backends-coreml.md @@ -0,0 +1,200 @@ +# Core ML Backend + +The Core ML delegate is the ExecuTorch solution for taking advantage of Apple's [CoreML framework](https://developer.apple.com/documentation/coreml) for on-device ML. With CoreML, a model can run on the CPU, GPU, and the Apple Neural Engine (ANE). + +## Features + +- Dynamic dispatch to the CPU, GPU, and ANE. +- Supports fp32 and fp16 computation. + +## Target Requirements + +Below are the minimum OS requirements on various hardware for running a CoreML-delegated ExecuTorch model: +- [macOS](https://developer.apple.com/macos) >= 13.0 +- [iOS](https://developer.apple.com/ios/) >= 16.0 +- [iPadOS](https://developer.apple.com/ipados/) >= 16.0 +- [tvOS](https://developer.apple.com/tvos/) >= 16.0 + +## Development Requirements +To develop, you need: + +- [macOS](https://developer.apple.com/macos) >= 13.0. +- [Xcode](https://developer.apple.com/documentation/xcode) >= 14.1 + + +Before starting, make sure you install the Xcode Command Line Tools: + +```bash +xcode-select --install +``` + +Finally, you must install the CoreML backend by running the following script: +```bash +sh ./backends/apple/coreml/scripts/install_requirements.sh +``` + + +---- + +## Using the CoreML Backend + +To target the CoreML backend during the export and lowering process, pass an instance of the `CoreMLPartitioner` to `to_edge_transform_and_lower`. The example below demonstrates this process using the MobileNet V2 model from torchvision. + +```python +import torch +import torchvision.models as models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.exir import to_edge_transform_and_lower + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +et_program = to_edge_transform_and_lower( + torch.export.export(mobilenet_v2, sample_inputs), + partitioner=[CoreMLPartitioner()], +).to_executorch() + +with open("mv2_coreml.pte", "wb") as file: + et_program.write_to_file(file) +``` + +### Partitioner API + +The CoreML partitioner API allows for configuration of the model delegation to CoreML. Passing a `CoreMLPartitioner` instance with no additional parameters will run as much of the model as possible on the CoreML backend with default settings. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the [constructor](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/partition/coreml_partitioner.py#L60): + + - `skip_ops_for_coreml_delegation`: Allows you to skip ops for delegation by CoreML. By default, all ops that CoreML supports will be delegated.
See [here](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/test/test_coreml_partitioner.py#L42) for an example of skipping an op for delegation. +- `compile_specs`: A list of CompileSpec for the CoreML backend. These control low-level details of CoreML delegation, such as the compute unit (CPU, GPU, ANE), the iOS deployment target, and the compute precision (FP16, FP32). These are discussed more below. +- `take_over_mutable_buffer`: A boolean that indicates whether PyTorch mutable buffers in stateful models should be converted to [CoreML MLState](https://developer.apple.com/documentation/coreml/mlstate). If set to false, mutable buffers in the PyTorch graph are converted to graph inputs and outputs to the CoreML lowered module under the hood. Generally setting take_over_mutable_buffer to true will result in better performance, but using MLState requires iOS >= 18.0, macOS >= 15.0, and Xcode >= 16.0. + +#### CoreML CompileSpec + +A list of CompileSpec is constructed with [CoreMLBackend.generate_compile_specs](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/compiler/coreml_preprocess.py#L210). Below are the available options: +- `compute_unit`: this controls the compute units (CPU, GPU, ANE) that are used by CoreML. The default value is coremltools.ComputeUnit.ALL. The available options from coremltools are: + - coremltools.ComputeUnit.ALL (uses the CPU, GPU, and ANE) + - coremltools.ComputeUnit.CPU_ONLY (uses the CPU only) + - coremltools.ComputeUnit.CPU_AND_GPU (uses both the CPU and GPU, but not the ANE) + - coremltools.ComputeUnit.CPU_AND_NE (uses both the CPU and ANE, but not the GPU) +- `minimum_deployment_target`: The minimum iOS deployment target (e.g., coremltools.target.iOS18). The default value is coremltools.target.iOS15. +- `compute_precision`: The compute precision used by CoreML (coremltools.precision.FLOAT16, coremltools.precision.FLOAT32). The default value is coremltools.precision.FLOAT16. Note that the compute precision is applied no matter what dtype is specified in the exported PyTorch model. For example, an FP32 PyTorch model will be converted to FP16 when delegating to the CoreML backend by default. Also note that the ANE only supports FP16 precision. +- `model_type`: Whether the model should be compiled to the CoreML [mlmodelc format](https://developer.apple.com/documentation/coreml/downloading-and-compiling-a-model-on-the-user-s-device) during .pte creation ([CoreMLBackend.MODEL_TYPE.COMPILED_MODEL](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/compiler/coreml_preprocess.py#L71)), or whether it should be compiled to mlmodelc on device ([CoreMLBackend.MODEL_TYPE.MODEL](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/compiler/coreml_preprocess.py#L70)). Using CoreMLBackend.MODEL_TYPE.COMPILED_MODEL and doing compilation ahead of time should improve the first-time on-device model load time. + +### Testing the Model + +After generating the CoreML-delegated .pte, the model can be tested from Python using the ExecuTorch runtime python bindings. This can be used to sanity check the model and evaluate numerical accuracy. See [Testing the Model](using-executorch-export.md#testing-the-model) for more information. + +---- + +### Quantization + +To quantize a PyTorch model for the CoreML backend, use the `CoreMLQuantizer`.
`Quantizers` are backend specific, and the `CoreMLQuantizer` is configured to quantize models to leverage the available quantization for the CoreML backend. + +### 8-bit Quantization using the PT2E Flow + +To perform 8-bit quantization with the PT2E flow, perform the following steps: + +1) Define [coremltools.optimize.torch.quantization.LinearQuantizerConfig](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use it to create an instance of a `CoreMLQuantizer`. +2) Use `torch.export.export_for_training` to export a graph module that will be prepared for quantization. +3) Call `prepare_pt2e` to prepare the model for quantization. +4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges. +5) Call `convert_pt2e` to quantize the model. +6) Export and lower the model using the standard flow. + +The output of `convert_pt2e` is a PyTorch model which can be exported and lowered using the normal flow. As it is a regular PyTorch model, it can also be used to evaluate the accuracy of the quantized model using standard PyTorch techniques. + +```python +import torch +import coremltools as ct +import torchvision.models as models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer +from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +from executorch.exir import to_edge_transform_and_lower +from executorch.backends.apple.coreml.compiler import CoreMLBackend + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +# Step 1: Define a LinearQuantizerConfig and create an instance of a CoreMLQuantizer +quantization_config = ct.optimize.torch.quantization.LinearQuantizerConfig.from_dict( + { + "global_config": { + "quantization_scheme": ct.optimize.torch.quantization.QuantizationScheme.symmetric, + "milestones": [0, 0, 10, 10], + "activation_dtype": torch.quint8, + "weight_dtype": torch.qint8, + "weight_per_channel": True, + } + } +) +quantizer = CoreMLQuantizer(quantization_config) + +# Step 2: Export the model for training +training_gm = torch.export.export_for_training(mobilenet_v2, sample_inputs).module() + +# Step 3: Prepare the model for quantization +prepared_model = prepare_pt2e(training_gm, quantizer) + +# Step 4: Calibrate the model on representative data +# Replace with your own calibration data +for calibration_sample in [torch.randn(1, 3, 224, 224)]: + prepared_model(calibration_sample) + +# Step 5: Convert the calibrated model to a quantized model +quantized_model = convert_pt2e(prepared_model) + +# Step 6: Export the quantized model to CoreML +et_program = to_edge_transform_and_lower( + torch.export.export(quantized_model, sample_inputs), + partitioner=[ + CoreMLPartitioner( + # iOS17 is required for the quantized ops in this example + compile_specs=CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS17 + ) + ) + ], +).to_executorch() +``` + +See [PyTorch 2 Export Post Training Quantization](https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html) for more information. + +---- + +## Runtime Integration + +To run the model on-device, use the standard ExecuTorch runtime APIs.
See [Running on Device](getting-started.md#running-on-device) for more information, including building the iOS frameworks. + +When building from source, pass `-DEXECUTORCH_BUILD_COREML=ON` when configuring the CMake build to compile the CoreML backend. + +To link against the backend, add the `coremldelegate` CMake target as a build dependency. Due to the use of static registration, it may be necessary to link with whole-archive. This can typically be done by passing `"$<LINK_LIBRARY:WHOLE_ARCHIVE,coremldelegate>"` to `target_link_libraries`. + +``` +# CMakeLists.txt +add_subdirectory("executorch") +... +target_link_libraries( + my_target + PRIVATE executorch + executorch_module_static + executorch_tensor + optimized_native_cpu_ops_lib + coremldelegate) +``` + +No additional steps are necessary to use the backend beyond linking the target. A CoreML-delegated .pte file will automatically run on the registered backend. + +--- + +## Advanced + +### Extracting the mlpackage + +[CoreML *.mlpackage files](https://apple.github.io/coremltools/docs-guides/source/convert-to-ml-program.html#save-ml-programs-as-model-packages) can be extracted from a CoreML-delegated *.pte file. This can help with debugging and profiling for users who are more familiar with *.mlpackage files: +```bash +python examples/apple/coreml/scripts/extract_coreml_models.py -m /path/to/model.pte +``` + +Note that if the ExecuTorch model has graph breaks, there may be multiple extracted *.mlpackage files. diff --git a/docs/source/build-run-mediatek-backend.md b/docs/source/backends-mediatek.md similarity index 87% rename from docs/source/build-run-mediatek-backend.md rename to docs/source/backends-mediatek.md index eeaa2b8dc88..456a62aaabd 100644 --- a/docs/source/build-run-mediatek-backend.md +++ b/docs/source/backends-mediatek.md @@ -1,4 +1,4 @@ -# Building and Running ExecuTorch with MediaTek Backend +# MediaTek Backend MediaTek backend empowers ExecuTorch to speed up PyTorch models on edge devices that equips with MediaTek Neuron Processing Unit (NPU). This document offers a step-by-step guide to set up the build environment for the MediaTek ExecuTorch libraries. @@ -11,9 +11,9 @@ MediaTek backend empowers ExecuTorch to speed up PyTorch models on edge devices ::: :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [Introduction to ExecuTorch](./intro-how-it-works.md) +* [Getting Started](./getting-started.md) +* [Building ExecuTorch with CMake](./using-executorch-building-from-source.md) ::: :::: @@ -34,7 +34,7 @@ MediaTek backend empowers ExecuTorch to speed up PyTorch models on edge devices Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Getting Started](getting-started.md) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup MediaTek Backend Environment** - Install the dependent libs.
Ensure that you are inside `backends/mediatek/` directory @@ -91,4 +91,4 @@ cd executorch ```bash export LD_LIBRARY_PATH=::$LD_LIBRARY_PATH - ``` \ No newline at end of file + ``` diff --git a/docs/source/backends-mps.md b/docs/source/backends-mps.md new file mode 100644 index 00000000000..8053954bb3b --- /dev/null +++ b/docs/source/backends-mps.md @@ -0,0 +1,157 @@ +# MPS Backend + +In this tutorial we will walk you through the process of getting set up to build the MPS backend for ExecuTorch and running a simple model on it. + +The MPS backend maps machine learning computational graphs and primitives onto the [MPS Graph](https://developer.apple.com/documentation/metalperformanceshadersgraph/mpsgraph?language=objc) framework and tuned kernels provided by [MPS](https://developer.apple.com/documentation/metalperformanceshaders?language=objc). + +::::{grid} 2 +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-prerequisites +* In this tutorial you will learn how to export [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) model to the MPS delegate. +* You will also learn how to compile and deploy the ExecuTorch runtime with the MPS delegate on macOS and iOS. +::: +:::{grid-item-card} Tutorials we recommend you complete before this: +:class-card: card-prerequisites +* [Introduction to ExecuTorch](./intro-how-it-works.md) +* [Getting Started](./getting-started.md) +* [Building ExecuTorch with CMake](./using-executorch-building-from-source.md) +* [ExecuTorch iOS Demo App](demo-apps-ios.md) +* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md) +::: +:::: + + +## Prerequisites (Hardware and Software) + +In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components: + +### Hardware: + - A [mac](https://www.apple.com/mac/) for tracing the model + +### Software: + + - **Ahead of time** tracing: + - [macOS](https://www.apple.com/macos/) 12 + + - **Runtime**: + - [macOS](https://www.apple.com/macos/) >= 12.4 + - [iOS](https://www.apple.com/ios) >= 15.4 + - [Xcode](https://developer.apple.com/xcode/) >= 14.1 + +## Setting up Developer Environment + +***Step 1.*** Please finish the [Getting Started](getting-started.md) tutorial. + +***Step 2.*** Install dependencies needed to lower MPS delegate: + + ```bash + ./backends/apple/mps/install_requirements.sh + ``` + +## Build + +### AOT (Ahead-of-time) Components + +**Compiling model for MPS delegate**: +- In this step, you will generate a simple ExecuTorch program that lowers the MobileNetV3 model to the MPS delegate. You'll then pass this Program (the `.pte` file) during the runtime to run it using the MPS backend. + +```bash +cd executorch +# Note: the `mps_example` script uses the MPSPartitioner by default for ops that are not yet supported by the MPS delegate. To turn it off, pass `--no-use_partitioner`.
+python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled --use_fp16 + +# To see all options, run the following command: +python3 -m examples.apple.mps.scripts.mps_example --help +``` + +### Runtime + +**Building the MPS executor runner:** +```bash +# In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules: +cd executorch +./examples/apple/mps/scripts/build_mps_executor_runner.sh +``` + +## Run the mv3 generated model using the mps_executor_runner + +```bash +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program +``` + +- You should see the following results. Note that no output file will be generated in this example: +``` +I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded. +I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1 +I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward +I 00:00:00.003311 executorch:mps_executor_runner.mm:349] Setting up non-const buffer 1, size 606112. +I 00:00:00.003374 executorch:mps_executor_runner.mm:376] Setting up memory manager +I 00:00:00.003376 executorch:mps_executor_runner.mm:392] Loading method name from plan +I 00:00:00.018942 executorch:mps_executor_runner.mm:399] Method loaded. +I 00:00:00.018944 executorch:mps_executor_runner.mm:404] Loading bundled program... +I 00:00:00.018980 executorch:mps_executor_runner.mm:421] Inputs prepared. +I 00:00:00.118731 executorch:mps_executor_runner.mm:438] Model executed successfully. +I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successfully. +``` + +### [Optional] Run the generated model directly using pybind +1. Make sure `pybind` MPS support was installed: +```bash +./install_executorch.sh --pybind mps +``` +2. Run the `mps_example` script to trace the model and run it directly from Python: +```bash +cd executorch +# Check correctness between PyTorch eager forward pass and ExecuTorch MPS delegate forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --check_correctness +# You should see the following output: `Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for mv3_mps are matching!` + +# Check performance between PyTorch MPS forward pass and ExecuTorch MPS forward pass +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --bench_pytorch +``` + +### Profiling: +1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model. +```bash +cd executorch +python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b +``` +2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md). +``` +./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs +``` +3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1. +```bash +python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path etrecord.bin +``` + +## Deploying and Running on Device + +***Step 1***.
Create the ExecuTorch core and MPS delegate frameworks to link on iOS +```bash +cd executorch +./scripts/build_apple_frameworks.sh --mps +``` + +`mps_delegate.xcframework` will be in the `cmake-out` folder, along with `executorch.xcframework` and `portable_delegate.xcframework`: +```bash +cd cmake-out && ls +``` + +***Step 2***. Link the frameworks into your Xcode project: Go to project Target’s `Build Phases` - `Link Binaries With Libraries`, click the **+** sign and add the framework files located in the `Release` folder. +- `executorch.xcframework` +- `portable_delegate.xcframework` +- `mps_delegate.xcframework` + +From the same page, include the needed libraries for the MPS delegate: +- `MetalPerformanceShaders.framework` +- `MetalPerformanceShadersGraph.framework` +- `Metal.framework` + +In this tutorial, you have learned how to lower a model to the MPS delegate, build the mps_executor_runner and run a lowered model through the MPS delegate, or directly on device using the MPS delegate static library. + + +## Frequently encountered errors and resolution + +If you encounter any bugs or issues while following this tutorial, please file a bug/issue on the [ExecuTorch repository](https://github.com/pytorch/executorch/issues), with hashtag **#mps**. diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md new file mode 100644 index 00000000000..dd3aa0354bc --- /dev/null +++ b/docs/source/backends-overview.md @@ -0,0 +1,20 @@ +# Backend Overview + +ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each. + +The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requirements and levels of model support. See the documentation for each hardware backend for more details. + +As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example. + +### Available Backends + +Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information.
+ +- [XNNPACK (Mobile CPU)](backends-xnnpack.md) +- [Core ML (iOS)](backends-coreml.md) +- [Metal Performance Shaders (iOS GPU)](backends-mps.md) +- [Vulkan (Android GPU)](backends-vulkan.md) +- [Qualcomm NPU](backends-qualcomm.md) +- [MediaTek NPU](backends-mediatek.md) +- [Arm Ethos-U NPU](backends-arm-ethos-u.md) +- [Cadence DSP](backends-cadence.md) diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/backends-qualcomm.md similarity index 97% rename from docs/source/build-run-qualcomm-ai-engine-direct-backend.md rename to docs/source/backends-qualcomm.md index 55634459eff..2d2b017aca1 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/backends-qualcomm.md @@ -1,4 +1,4 @@ -# Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend +# Qualcomm AI Engine Backend In this tutorial we will walk you through the process of getting started to build ExecuTorch for Qualcomm AI Engine Direct and running a model on it. @@ -14,9 +14,9 @@ Qualcomm AI Engine Direct is also referred to as QNN in the source and documenta ::: :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [Introduction to ExecuTorch](./intro-how-it-works.md) +* [Getting Started](./getting-started.md) +* [Building ExecuTorch with CMake](./using-executorch-building-from-source.md) ::: :::: @@ -347,7 +347,7 @@ The model, inputs, and output location are passed to `qnn_executorch_runner` by ### Running a model via ExecuTorch's android demo-app An Android demo-app using Qualcomm AI Engine Direct Backend can be found in -`examples`. Please refer to android demo app [tutorial](https://pytorch.org/executorch/stable/demo-apps-android.html). +`examples`. Please refer to android demo app [tutorial](demo-apps-android.md). ## Supported model list diff --git a/docs/source/backends-vulkan.md b/docs/source/backends-vulkan.md new file mode 100644 index 00000000000..2cfff6a6eb6 --- /dev/null +++ b/docs/source/backends-vulkan.md @@ -0,0 +1,205 @@ +# Vulkan Backend + +The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is +built on top of the cross-platform Vulkan GPU API standard. It is primarily +designed to leverage the GPU to accelerate model inference on Android devices, +but can be used on any platform that supports an implementation of Vulkan: +laptops, servers, and edge devices. + +::::{note} +The Vulkan delegate is currently under active development, and its components +are subject to change. +:::: + +## What is Vulkan? + +Vulkan is a low-level GPU API specification developed as a successor to OpenGL. +It is designed to offer developers more explicit control over GPUs compared to +previous specifications in order to reduce overhead and maximize the +capabilities of the modern graphics hardware. + +Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both +desktop and mobile) in the market support Vulkan. Vulkan is also included in +Android from Android 7.0 onwards. + +**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it +provides a way to execute compute and graphics operations on a GPU, but does not +come with a built-in library of performant compute kernels. 
+ +## The Vulkan Compute Library + +The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to provide GPU implementations for PyTorch operators via GLSL compute shaders. + +The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). The core components of the PyTorch Vulkan backend were forked into ExecuTorch and adapted for an AOT graph-mode style of model inference (as opposed to PyTorch, which uses an eager execution style of model inference). + +The components of the Vulkan Compute Library are contained in the `executorch/backends/vulkan/runtime/` directory. The core components are listed and described below: + +``` +runtime/ +├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects +└── graph/ .................. ComputeGraph class which implements graph mode inference + └── ops/ ................ Base directory for operator implementations + ├── glsl/ ........... GLSL compute shaders + │ ├── *.glsl + │ └── conv2d.glsl + └── impl/ ........... C++ code to dispatch GPU compute shaders + ├── *.cpp + └── Conv2d.cpp +``` + +## Features + +The Vulkan delegate currently supports the following features: + +* **Memory Planning** + * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. +* **Capability Based Partitioning**: + * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs. +* **Support for upper-bound dynamic shapes**: + * Tensors can change shape between inferences as long as their current shapes are smaller than the bounds specified during lowering. + +In addition to increasing operator coverage, the following features are currently in development: + +* **Quantization Support** + * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. +* **Memory Layout Management** + * Memory layout is an important factor in optimizing performance. We plan to introduce graph passes that insert memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. +* **Selective Build** + * We plan to make it possible to control build size by selecting which operators/shaders you want to build with. + +## End to End Example + +To further understand the features of the Vulkan Delegate and how to use it, consider the following end to end example with a simple single operator model. + +### Compile and lower a model to the Vulkan Delegate + +Once ExecuTorch has been set up and installed, the following script can be used to generate a simple model and lower it to the Vulkan delegate. + +``` +# Note: this script is the same as the script from the "Setting up ExecuTorch" +# page, with one minor addition to lower to the Vulkan backend.
+import torch +from torch.export import export +from executorch.exir import to_edge + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner + +# Start with a PyTorch model that adds two input tensors (matrices) +class Add(torch.nn.Module): + def __init__(self): + super(Add, self).__init__() + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return x + y + +# 1. torch.export: Defines the program with the ATen operator set. +aten_dialect = export(Add(), (torch.ones(1), torch.ones(1))) + +# 2. to_edge: Make optimizations for Edge devices +edge_program = to_edge(aten_dialect) +# 2.1 Lower to the Vulkan backend +edge_program = edge_program.to_backend(VulkanPartitioner()) + +# 3. to_executorch: Convert the graph to an ExecuTorch program +executorch_program = edge_program.to_executorch() + +# 4. Save the compiled .pte program +with open("vk_add.pte", "wb") as file: + file.write(executorch_program.buffer) +``` + +Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate using the `to_backend()` API. The Vulkan Delegate implements the `VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph that are supported by the Vulkan delegate, and separates compatible sections of the model to be executed on the GPU. + +This means that a model can be lowered to the Vulkan delegate even if it contains some unsupported operators. This will just mean that only parts of the graph will be executed on the GPU. + + +::::{note} +The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/supported_ops.py) in the Vulkan partitioner code can be inspected to examine which ops are currently implemented in the Vulkan delegate. +:::: + +### Build Vulkan Delegate libraries + +The easiest way to build and test the Vulkan Delegate is to build for Android and test on a local Android device. Android devices have built in support for Vulkan, and the Android NDK ships with a GLSL compiler which is needed to compile the Vulkan Compute Library's GLSL compute shaders. + +The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` when building with CMake. + +First, make sure that you have the Android NDK installed; any NDK version past NDK r19c should work. Note that the examples in this doc have been validated with NDK r27b. The Android SDK should also be installed so that you have access to `adb`. + +The instructions on this page assume that the following environment variables are set. + +```shell +export ANDROID_NDK= +# Select the appropriate Android ABI for your device +export ANDROID_ABI=arm64-v8a +# All subsequent commands should be performed from ExecuTorch repo root +cd +# Make sure adb works +adb --version +``` + +To build and install ExecuTorch libraries (for Android) with the Vulkan Delegate: + +```shell +# From executorch root directory +(rm -rf cmake-android-out && \ + cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=$ANDROID_ABI \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-android-out && \ + cmake --build cmake-android-out -j16 --target install) +``` + +### Run the Vulkan model on device + +::::{note} +Since operator support is currently limited, only binary arithmetic operators will run on the GPU. Expect inference to be slow as the majority of operators are being executed via Portable operators.
+:::: + +Now, the partially delegated model can be executed on your device's GPU! + +```shell +# Build a model runner binary linked with the Vulkan delegate libs +cmake --build cmake-android-out --target vulkan_executor_runner -j32 + +# Push model to device +adb push vk_add.pte /data/local/tmp/vk_add.pte +# Push binary to device +adb push cmake-android-out/backends/vulkan/vulkan_executor_runner /data/local/tmp/runner_bin + +# Run the model +adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vk_add.pte +``` diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md new file mode 100644 index 00000000000..1fcc6d8c51b --- /dev/null +++ b/docs/source/backends-xnnpack.md @@ -0,0 +1,137 @@ +# XNNPACK Backend + +The XNNPACK delegate is the ExecuTorch solution for CPU execution on mobile devices. [XNNPACK](https://github.com/google/XNNPACK/tree/master) is a library that provides optimized kernels for machine learning operators on Arm and x86 CPUs. + +## Features + +- Wide operator support on Arm and x86 CPUs, available on any modern mobile phone. +- Support for a wide variety of quantization schemes and quantized operators. +- Supports fp32 and fp16 activations. +- Supports 8-bit quantization. + +## Target Requirements + +- ARM64 on Android, iOS, macOS, Linux, and Windows. +- ARMv7 (with NEON) on Android. +- ARMv6 (with VFPv2) on Linux. +- x86 and x86-64 (up to AVX512) on Windows, Linux, macOS, Android, and iOS simulator. + +## Development Requirements + +The XNNPACK delegate does not introduce any development system requirements beyond those required by the core ExecuTorch runtime. + +---- + +## Using the XNNPACK Backend + +To target the XNNPACK backend during the export and lowering process, pass an instance of the `XnnpackPartitioner` to `to_edge_transform_and_lower`. The example below demonstrates this process using the MobileNet V2 model from torchvision. + +```python +import torch +import torchvision.models as models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower + +mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +et_program = to_edge_transform_and_lower( + torch.export.export(mobilenet_v2, sample_inputs), + partitioner=[XnnpackPartitioner()], +).to_executorch() + +with open("mv2_xnnpack.pte", "wb") as file: + et_program.write_to_file(file) +``` + +### Partitioner API + +The XNNPACK partitioner API allows for configuration of the model delegation to XNNPACK. Passing an `XnnpackPartitioner` instance with no additional parameters will run as much of the model as possible on the XNNPACK backend. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the [constructor](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/xnnpack/partition/xnnpack_partitioner.py#L31): + + - `configs`: Control which operators are delegated to XNNPACK. By default, all available operators are delegated. See [../config/\_\_init\_\_.py](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/xnnpack/partition/config/__init__.py#L66) for an exhaustive list of available operator configs. + - `config_precisions`: Filter operators by data type. By default, delegate all precisions.
One or more of `ConfigPrecisionType.FP32`, `ConfigPrecisionType.STATIC_QUANT`, or `ConfigPrecisionType.DYNAMIC_QUANT`. See [ConfigPrecisionType](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/xnnpack/partition/config/xnnpack_config.py#L24). + - `per_op_mode`: If true, emit individual delegate calls for every operator. This is an advanced option intended to reduce memory overhead in some contexts at the cost of a small amount of runtime overhead. Defaults to false. + - `verbose`: If true, print additional information during lowering. + +### Testing the Model + +After generating the XNNPACK-delegated .pte, the model can be tested from Python using the ExecuTorch runtime python bindings. This can be used to sanity check the model and evaluate numerical accuracy. See [Testing the Model](using-executorch-export.md#testing-the-model) for more information. + +---- + +## Quantization + +The XNNPACK delegate can also be used as a backend to execute symmetrically quantized models. To quantize a PyTorch model for the XNNPACK backend, use the `XNNPACKQuantizer`. `Quantizers` are backend specific, which means the `XNNPACKQuantizer` is configured to quantize models to leverage the quantized operators offered by the XNNPACK Library. + +### Supported Quantization Schemes +The XNNPACK delegate supports the following quantization schemes: +- 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow). + - Supports both static and dynamic activations. + - Supports per-channel and per-tensor schemes. + - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators. + +Weight-only quantization is not currently supported on XNNPACK. + +### 8-bit Quantization using the PT2E Flow + +To perform 8-bit quantization with the PT2E flow, perform the following steps prior to exporting the model: + +1) Create an instance of the `XNNPACKQuantizer` class. Set quantization parameters. +2) Use `torch.export.export_for_training` to prepare for quantization. +3) Call `prepare_pt2e` to prepare the model for quantization. +4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges. +5) Call `convert_pt2e` to quantize the model. +6) Export and lower the model using the standard flow. + +The output of `convert_pt2e` is a PyTorch model which can be exported and lowered using the normal flow. As it is a regular PyTorch model, it can also be used to evaluate the accuracy of the quantized model using standard PyTorch techniques.
+ +```python +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import XNNPACKQuantizer, get_symmetric_quantization_config +from executorch.exir import to_edge_transform_and_lower +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + +qparams = get_symmetric_quantization_config(is_per_channel=True) # (1) +quantizer = XNNPACKQuantizer() +quantizer.set_global(qparams) + +training_ep = torch.export.export_for_training(model, sample_inputs).module() # (2) +prepared_model = prepare_pt2e(training_ep, quantizer) # (3) + +for cal_sample in [torch.randn(1, 3, 224, 224)]: # Replace with representative model inputs + prepared_model(cal_sample) # (4) Calibrate + +quantized_model = convert_pt2e(prepared_model) # (5) + +et_program = to_edge_transform_and_lower( # (6) + torch.export.export(quantized_model, sample_inputs), + partitioner=[XnnpackPartitioner()], +).to_executorch() +``` + +See [PyTorch 2 Export Post Training Quantization](https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html) for more information. + +---- + +## Runtime Integration + +To run the model on-device, use the standard ExecuTorch runtime APIs. See [Running on Device](getting-started.md#running-on-device) for more information. + +The XNNPACK delegate is included by default in the published Android, iOS, and pip packages. When building from source, pass `-DEXECUTORCH_BUILD_XNNPACK=ON` when configuring the CMake build to compile the XNNPACK backend. + +To link against the backend, add the `xnnpack_backend` CMake target as a build dependency, or link directly against `libxnnpack_backend`. Due to the use of static registration, it may be necessary to link with whole-archive. This can typically be done by passing `"$<LINK_LIBRARY:WHOLE_ARCHIVE,xnnpack_backend>"` to `target_link_libraries`. + +``` +# CMakeLists.txt +add_subdirectory("executorch") +... +target_link_libraries( + my_target + PRIVATE executorch + executorch_module_static + executorch_tensor + optimized_native_cpu_ops_lib + xnnpack_backend) +``` + +No additional steps are necessary to use the backend beyond linking the target. Any XNNPACK-delegated .pte file will automatically run on the registered backend. diff --git a/docs/source/build-run-coreml.md b/docs/source/build-run-coreml.md deleted file mode 100644 index 45a7ecafce4..00000000000 --- a/docs/source/build-run-coreml.md +++ /dev/null @@ -1,164 +0,0 @@ -# Building and Running ExecuTorch with Core ML Backend - -Core ML delegate uses Core ML APIs to enable running neural networks via Apple's hardware acceleration. For more about Core ML you can read [here](https://developer.apple.com/documentation/coreml). In this tutorial, we will walk through the steps of lowering a PyTorch model to Core ML delegate - - -::::{grid} 2 -:::{grid-item-card} What you will learn in this tutorial: -:class-card: card-prerequisites -* In this tutorial you will learn how to export [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) model so that it runs on Core ML backend. -* You will also learn how to deploy and run the exported model on a supported Apple device.
-::: -:::{grid-item-card} Tutorials we recommend you complete before this: -:class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) -* [ExecuTorch iOS Demo App](demo-apps-ios.md) -::: -:::: - - -## Prerequisites (Hardware and Software) - -In order to be able to successfully build and run the ExecuTorch's Core ML backend you'll need the following hardware and software components. - -### Hardware: -- A [mac](https://www.apple.com/mac/) system for building. -- A [mac](https://www.apple.com/mac/) or [iPhone](https://www.apple.com/iphone/) or [iPad](https://www.apple.com/ipad/) or [Apple TV](https://www.apple.com/tv-home/) device for running the model. - -### Software: - -- [Xcode](https://developer.apple.com/documentation/xcode) >= 14.1, [macOS](https://developer.apple.com/macos) >= 13.0 for building. -- [macOS](https://developer.apple.com/macos) >= 13.0, [iOS](https://developer.apple.com/ios/) >= 16.0, [iPadOS](https://developer.apple.com/ipados/) >= 16.0, and [tvOS](https://developer.apple.com/tvos/) >= 16.0 for running the model. - -## Setting up your developer environment - -1. Make sure that you have completed the ExecuTorch setup tutorials linked to at the top of this page and setup the environment. -2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend. - -```bash -cd executorch -./backends/apple/coreml/scripts/install_requirements.sh -``` -3. Install [Xcode](https://developer.apple.com/xcode/). -4. Install Xcode Command Line Tools. - -```bash -xcode-select --install -``` - -## Build - -### AOT (Ahead-of-time) components: - - -**Exporting a Core ML delegated Program**: -- In this step, you will lower the [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) model to the Core ML backend and export the ExecuTorch program. You'll then deploy and run the exported program on a supported Apple device using Core ML backend. -```bash -cd executorch - -# Generates ./mv3_coreml_all.pte file. -python3 -m examples.apple.coreml.scripts.export --model_name mv3 -``` - -- Core ML backend uses [coremltools](https://apple.github.io/coremltools/docs-guides/source/overview-coremltools.html) to lower [Edge dialect](ir-exir.md#edge-dialect) to Core ML format and then bundles it in the `.pte` file. - - -### Runtime: - -**Running a Core ML delegated Program**: -1. Build the runner. -```bash -cd executorch - -# Builds `coreml_executor_runner`. -./examples/apple/coreml/scripts/build_executor_runner.sh -``` -2. Run the CoreML delegated program. -```bash -cd executorch - -# Runs the exported mv3 model using the Core ML backend. -./coreml_executor_runner --model_path mv3_coreml_all.pte -``` - -**Profiling a Core ML delegated Program**: - -Note that profiling is supported on [macOS](https://developer.apple.com/macos) >= 14.4. - -1. [Optional] Generate an [ETRecord](./etrecord.rst) when exporting your model. -```bash -cd executorch - -# Generates `mv3_coreml_all.pte` and `mv3_coreml_etrecord.bin` files. -python3 -m examples.apple.coreml.scripts.export --model_name mv3 --generate_etrecord -``` - -2. Build the runner. -```bash -# Builds `coreml_executor_runner`. -./examples/apple/coreml/scripts/build_executor_runner.sh -``` -3. Run and generate an [ETDump](./etdump.md). -```bash -cd executorch - -# Generate the ETDump file. 
-./coreml_executor_runner --model_path mv3_coreml_all.pte --profile_model --etdump_path etdump.etdp -``` - -4. Create an instance of the [Inspector API](./model-inspector.rst) by passing in the [ETDump](./etdump.md) you have sourced from the runtime along with the optionally generated [ETRecord](./etrecord.rst) from step 1 or execute the following command in your terminal to display the profiling data table. -```bash -python examples/apple/coreml/scripts/inspector_cli.py --etdump_path etdump.etdp --etrecord_path mv3_coreml.bin -``` - - -## Deploying and running on a device - -**Running the Core ML delegated Program in the Demo iOS App**: -1. Please follow the [Export Model](demo-apps-ios.md#models-and-labels) step of the tutorial to bundle the exported [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) program. You only need to do the Core ML part. - -2. Complete the [Build Runtime and Backends](demo-apps-ios.md#build-runtime-and-backends) section of the tutorial. When building the frameworks you only need the `coreml` option. - -3. Complete the [Final Steps](demo-apps-ios.md#final-steps) section of the tutorial to build and run the demo app. - -
**Running the Core ML delegated Program in your App** -1. Build frameworks, running the following will create a `executorch.xcframework` and `coreml_backend.xcframework` in the `cmake-out` directory. -```bash -cd executorch -./build/build_apple_frameworks.sh --coreml -``` -2. Create a new [Xcode project](https://developer.apple.com/documentation/xcode/creating-an-xcode-project-for-an-app#) or open an existing project. - -3. Drag the `executorch.xcframework` and `coreml_backend.xcframework` generated from Step 2 to Frameworks. - -4. Go to the project's [Build Phases](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target) - Link Binaries With Libraries, click the + sign, and add the following frameworks: -``` -executorch.xcframework -coreml_backend.xcframework -Accelerate.framework -CoreML.framework -libsqlite3.tbd -``` -5. Add the exported program to the [Copy Bundle Phase](https://developer.apple.com/documentation/xcode/customizing-the-build-phases-of-a-target#Copy-files-to-the-finished-product) of your Xcode target. - -6. Please follow the [Runtime APIs Tutorial](extension-module.md) to integrate the code for loading an ExecuTorch program. - -7. Update the code to load the program from the Application's bundle. -``` objective-c -NSURL *model_url = [NBundle.mainBundle URLForResource:@"mv3_coreml_all" extension:@"pte"]; - -Result loader = - executorch::extension::FileDataLoader::from(model_url.path.UTF8String); -``` - -8. Use [Xcode](https://developer.apple.com/documentation/xcode/building-and-running-an-app#Build-run-and-debug-your-app) to deploy the application on the device. - -9. The application can now run the [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) model on the Core ML backend. - -
In this tutorial, you have learned how to lower the [MobileNet V3](https://pytorch.org/vision/main/models/mobilenetv3.html) model to the Core ML backend, deploy, and run it on an Apple device. - -## Frequently encountered errors and resolution. - -If you encountered any bugs or issues following this tutorial please file a bug/issue [here](https://github.com/pytorch/executorch/issues) with tag #coreml. diff --git a/docs/source/build-run-mps.md b/docs/source/build-run-mps.md deleted file mode 100644 index f9af4e9d3d5..00000000000 --- a/docs/source/build-run-mps.md +++ /dev/null @@ -1 +0,0 @@ -```{include} ../../backends/apple/mps/setup.md diff --git a/docs/source/build-run-vulkan.md b/docs/source/build-run-vulkan.md deleted file mode 100644 index 736859b86f6..00000000000 --- a/docs/source/build-run-vulkan.md +++ /dev/null @@ -1 +0,0 @@ -```{include} ../../backends/vulkan/docs/android_demo.md diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index 21a2f4dd392..c6808a11383 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -1,4 +1,4 @@ -# Backend and Delegate +# Backends and Delegates Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 289ecda6d85..4cef25c606e 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -1,4 +1,4 @@ -# ExecuTorch Concepts +# Concepts This page provides an overview of key concepts and terms used throughout the ExecuTorch documentation. It is intended to help readers understand the terminology and concepts used in PyTorch Edge and ExecuTorch. ## Concepts Map diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md index 68914aaed90..86dddd75868 100644 --- a/docs/source/debug-backend-delegate.md +++ b/docs/source/debug-backend-delegate.md @@ -1,4 +1,4 @@ -# Debug Backend Delegate +# Debugging Delegation We provide a list of util functions to give users insights on what happened to the graph modules during the `to_backend()` stage. diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst index 5bec597987a..2b4239271c1 100644 --- a/docs/source/executorch-runtime-api-reference.rst +++ b/docs/source/executorch-runtime-api-reference.rst @@ -1,4 +1,4 @@ -ExecuTorch Runtime API Reference +Runtime API Reference ================================ The ExecuTorch C++ API provides an on-device execution framework for exported PyTorch models. diff --git a/docs/source/export-to-executorch-api-reference.rst b/docs/source/export-to-executorch-api-reference.rst index 1ae563d842d..e4aeae9cb6c 100644 --- a/docs/source/export-to-executorch-api-reference.rst +++ b/docs/source/export-to-executorch-api-reference.rst @@ -1,4 +1,4 @@ -Export to ExecuTorch API Reference +Export API Reference ---------------------------------- For detailed information on how APIs evolve and the deprecation process, please refer to the `ExecuTorch API Life Cycle and Deprecation Policy `__. 
diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index 937b5b389f5..2472b3547fe 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -1,4 +1,4 @@ -# High-level Architecture and Components of ExecuTorch +# Architecture and Components This page describes the technical architecture of ExecuTorch and its individual components. This document is targeted towards engineers who are deploying PyTorch model onto edge devices. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md deleted file mode 100644 index f4782312790..00000000000 --- a/docs/source/getting-started-setup.md +++ /dev/null @@ -1,263 +0,0 @@ - -

-
-
1
- -
-
-
2
- -
-
-
3
- -
-
-
4
- -
-
- - -# Setting Up ExecuTorch -In this section, we'll learn how to -* Set up an environment to work on ExecuTorch -* Generate a sample ExecuTorch program -* Build and run a program with the ExecuTorch runtime - -## System Requirements -### Operating System - -We've tested these instructions on the following systems, although they should -also work in similar environments. - - -Linux (x86_64) -- CentOS 8+ -- Ubuntu 20.04.6 LTS+ -- RHEL 8+ - -macOS (x86_64/M1/M2) -- Big Sur (11.0)+ - -Windows (x86_64) -- Windows Subsystem for Linux (WSL) with any of the Linux options - -### Software -* `conda` or another virtual environment manager - - We recommend `conda` as it provides cross-language - support and integrates smoothly with `pip` (Python's built-in package manager) - - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 7 or higher, `clang++` version 5 or higher, or another - C++17-compatible toolchain. - -Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for -portability details. - -## Quick Setup: Colab/Jupyter Notebook Prototype - -To utilize ExecuTorch to its fullest extent, please follow the setup instructions provided below to install from source. - -Alternatively, if you would like to experiment with ExecuTorch quickly and easily, we recommend using the following [colab notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) for prototyping purposes. You can install directly via `pip` for basic functionality. - ```bash - pip install executorch - ``` - - -## Environment Setup - -### Create a Virtual Environment - -[Install conda on your machine](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then, create a virtual environment to manage our dependencies. - ```bash - # Create and activate a conda environment named "executorch" - conda create -yn executorch python=3.10.0 - conda activate executorch - ``` - -### Clone and install ExecuTorch requirements - - ```bash - # Clone the ExecuTorch repo from GitHub - # 'main' branch is the primary development branch where you see the latest changes. - # 'viable/strict' contains all of the commits on main that pass all of the necessary CI checks. - git clone --branch viable/strict https://github.com/pytorch/executorch.git - cd executorch - - # Update and pull submodules - git submodule sync - git submodule update --init - - # Install ExecuTorch pip package and its dependencies, as well as - # development tools like CMake. - # If developing on a Mac, make sure to install the Xcode Command Line Tools first. - ./install_executorch.sh - ``` - - Use the [`--pybind` flag](https://github.com/pytorch/executorch/blob/main/install_executorch.sh#L26-L29) to install with pybindings and dependencies for other backends. - ```bash - ./install_executorch.sh --pybind - - # Example: pybindings with CoreML *only* - ./install_executorch.sh --pybind coreml - - # Example: pybinds with CoreML *and* XNNPACK - ./install_executorch.sh --pybind coreml xnnpack - ``` - - By default, `./install_executorch.sh` command installs pybindings for XNNPACK. To disable any pybindings altogether: - ```bash - ./install_executorch.sh --pybind off - ``` - -After setting up your environment, you are ready to convert your PyTorch programs -to ExecuTorch. 
- -> **_NOTE:_** Cleaning the build system -> -> When fetching a new version of the upstream repo (via `git fetch` or `git -> pull`) it is a good idea to clean the old build artifacts. The build system -> does not currently adapt well to changes in build dependencies. -> -> You should also update and pull the submodules again, in case their versions -> have changed. -> -> ```bash -> # From the root of the executorch repo: -> ./install_executorch.sh --clean -> git submodule sync -> git submodule update --init -> ``` - -## Create an ExecuTorch program - -After setting up your environment, you are ready to convert your PyTorch programs -to ExecuTorch. - -### Export a Program -ExecuTorch provides APIs to compile a PyTorch [`nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) to a `.pte` binary consumed by the ExecuTorch runtime. -1. [`torch.export`](https://pytorch.org/docs/stable/export.html) -1. [`exir.to_edge`](https://pytorch.org/executorch/stable/export-to-executorch-api-reference.html#exir.to_edge) -1. [`exir.to_executorch`](ir-exir.md) -1. Save the result as a [`.pte` binary](pte-file-format.md) to be consumed by the ExecuTorch runtime. - - -Let's try this using with a simple PyTorch model that adds its inputs. - -Create `export_add.py` in a new directory outside of the ExecuTorch repo. - -**Note: It's important that this file does does not live in the directory that's a parent of the `executorch` directory. We need python to import from site-packages, not from the repo itself.** - -``` -mkdir -p ../example_files -cd ../example_files -touch export_add.py -``` - -Add the following code to `export_add.py`: -```python -import torch -from torch.export import export -from executorch.exir import to_edge - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export(Add(), (torch.ones(1), torch.ones(1))) - -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. Save the compiled .pte program -with open("add.pte", "wb") as file: - file.write(executorch_program.buffer) - -``` - -Then, execute it from your terminal. -```bash -python3 export_add.py -``` - -If it worked you'll see `add.pte` in that directory - -See the [ExecuTorch export tutorial](tutorials_source/export-to-executorch-tutorial.py) to learn more about the export process. - - -## Build & Run - -After creating a program go back to the executorch directory to execute it using the ExecuTorch runtime. -``` -cd ../executorch -``` - -For now, let's use [`executor_runner`](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp), an example that runs the `forward` method on your program using the ExecuTorch runtime. - -### Build Tooling Setup -The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it to build the `executor_runner` tool to run it on our desktop OS. - ```bash - # Clean and configure the CMake build system. Compiled programs will - # appear in the executorch/cmake-out directory we create here. - ./install_executorch.sh --clean - (mkdir cmake-out && cd cmake-out && cmake ..) - - # Go to work directory. - cd .. 
- - # Build the executor_runner target - cmake --build cmake-out --target executor_runner -j9 - ``` - -> **_NOTE:_** Cleaning the build system -> -> When fetching a new version of the upstream repo (via `git fetch` or `git -> pull`) it is a good idea to clean the old build artifacts. The build system -> does not currently adapt well to changes in build dependencies. -> -> You should also update and pull the submodules again, in case their versions -> have changed. -> -> ```bash -> # From the root of the executorch repo: -> ./install_executorch.sh --clean -> git submodule sync -> git submodule update --init -> ``` - -### Run Your Program - -Now that we've exported a program and built the runtime, let's execute it! - - ```bash - ./cmake-out/executor_runner --model_path ../example_files/add.pte - ``` -Our output is a `torch.Tensor` with a size of 1. The `executor_runner` sets all input values to a [`torch.ones`](https://pytorch.org/docs/stable/generated/torch.ones.html) tensor, so when `x=[1]` and `y=[1]`, we get `[1]+[1]=[2]` - :::{dropdown} Sample Output - - ``` -Output 0: tensor(sizes=[1], [2.]) - ``` - ::: - -To learn how to build a similar program, visit the [Runtime APIs Tutorial](extension-module.md). - -## Next Steps - -Congratulations! You have successfully exported, built, and run your first -ExecuTorch program. Now that you have a basic understanding of ExecuTorch, -explore its advanced features and capabilities below. - -* Build an [Android](demo-apps-android.md) or [iOS](demo-apps-ios.md) demo app -* Learn more about the [export process](export-overview.md) -* Dive deeper into the [Export Intermediate Representation (EXIR)](ir-exir.md) for complex export workflows -* Refer to [advanced examples in executorch/examples](https://github.com/pytorch/executorch/tree/main/examples) diff --git a/docs/source/getting-started-setup.rst b/docs/source/getting-started-setup.rst new file mode 100644 index 00000000000..d29e5835f15 --- /dev/null +++ b/docs/source/getting-started-setup.rst @@ -0,0 +1,13 @@ +Setting Up ExecuTorch +===================== + +This page is re-organized into the following pages: + +* `Getting Started with ExecuTorch <getting-started.html>`_ +* `Building from Source <using-executorch-building-from-source.html>`_ + +It will redirect in 3 seconds. + +.. raw:: html + + \ No newline at end of file diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md new file mode 100644 index 00000000000..4d8e3f0189d --- /dev/null +++ b/docs/source/getting-started.md @@ -0,0 +1,217 @@ +# Getting Started with ExecuTorch +This section describes the steps required to take a PyTorch model and run it using ExecuTorch. To use the framework, you will typically need to take the following steps: +- Install the ExecuTorch python package and runtime libraries. +- Export the PyTorch model for the target hardware configuration. +- Run the model using the ExecuTorch runtime APIs on your development platform. +- Deploy the model to the target platform using the ExecuTorch runtime. + +## System Requirements +The following are required to install the ExecuTorch host libraries, which are needed to export models and run them from Python. Requirements for target end-user devices are backend-dependent. See the appropriate backend documentation for more information. + +- Python 3.10 - 3.12 +- g++ version 7 or higher, clang++ version 5 or higher, or another C++17-compatible toolchain. +- Linux or macOS operating system (Arm or x86). + - Windows is supported via WSL.
+ +## Installation +To use ExecuTorch, you will need to install both the Python package and the appropriate platform-specific runtime libraries. Pip is the recommended way to install the ExecuTorch python package. + +This package includes the dependencies needed to export a PyTorch model, as well as Python runtime bindings for model testing and evaluation. Consider installing ExecuTorch within a virtual environment, such as one provided by [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html#creating-environments) or [venv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-and-use-virtual-environments). + +``` +pip install executorch +``` + +To build the framework from source, see [Building From Source](using-executorch-building-from-source.md). Backend delegates may require additional dependencies. See the appropriate backend documentation for more information. + + +
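+After installation, a quick sanity check is to import the package and create the runtime singleton from Python. This is a minimal sketch, using only the `Runtime` API shown later on this page, and assumes the `pip` install above succeeded:
+
+```python
+# Verify that the ExecuTorch Python bindings import and initialize.
+from executorch.runtime import Runtime
+
+runtime = Runtime.get()
+print("ExecuTorch runtime bindings loaded:", runtime)
+```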
+ +## Preparing the Model +Exporting is the process of taking a PyTorch model and converting it to the .pte file format used by the ExecuTorch runtime. This is done using Python APIs. PTE files for common models, such as Llama 3.2, can be found on HuggingFace under [ExecuTorch Community](https://huggingface.co/executorch-community). These models have been exported and lowered for ExecuTorch, and can be directly deployed without needing to go through the lowering process. + +A complete example of exporting, lowering, and verifying MobileNet V2 is available as a [Colab notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing). + +### Requirements +- A PyTorch model. +- Example model inputs, typically as PyTorch tensors. You should be able to successfully run the PyTorch model with these inputs. +- One or more target hardware backends. + +### Selecting a Backend +ExecuTorch provides hardware acceleration for a wide variety of platforms. The most commonly used backends are XNNPACK (for Arm and x86 CPUs), Core ML (for iOS), Vulkan (for Android GPUs), and Qualcomm (for Qualcomm-powered Android phones). + +For mobile use cases, consider using XNNPACK for Android and Core ML or XNNPACK for iOS as a first step. See [Hardware Backends](backends-overview.md) for more information. + +### Exporting +Exporting is done using Python APIs. ExecuTorch provides a high degree of customization during the export process, but the typical flow is as follows. This example uses the MobileNet V2 image classification model implementation in torchvision, but the process supports any [export-compliant](https://pytorch.org/docs/stable/export.html) PyTorch model. + +```python +import torch +import torchvision.models as models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower + +model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +et_program = to_edge_transform_and_lower( + torch.export.export(model, sample_inputs), + partitioner=[XnnpackPartitioner()] +).to_executorch() + +with open("model.pte", "wb") as f: + f.write(et_program.buffer) +``` + +If the model requires varying input sizes, you will need to specify the varying dimensions and bounds as part of the `export` call. See [Model Export and Lowering](using-executorch-export.md) for more information. + +The hardware backend to target is controlled by the `partitioner` parameter to `to_edge_transform_and_lower`. In this example, the `XnnpackPartitioner` is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend. + +Quantization can also be done at this stage to reduce model size and inference time. Quantization is backend-specific. See the documentation for the target backend for a full description of supported quantization schemes. + +### Testing the Model + +After successfully generating a .pte file, it is common to use the Python runtime APIs to validate the model on the development platform. This can be used to evaluate model accuracy before running on-device. + +For the MobileNet V2 model from torchvision used in this example, image inputs are expected as a normalized, float32 tensor with dimensions of (batch, channels, height, width).
The output is a 1000-element tensor of class logits. See [torchvision.models.mobilenet_v2](https://pytorch.org/vision/main/models/generated/torchvision.models.mobilenet_v2.html) for more information on the input and output tensor format for this model. + +```python +import torch +from executorch.runtime import Runtime +from typing import List + +runtime = Runtime.get() + +input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224) +program = runtime.load_program("model.pte") +method = program.load_method("forward") +outputs: List[torch.Tensor] = method.execute([input_tensor]) +``` + +
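+To evaluate numerical accuracy, the ExecuTorch output can be compared against the eager PyTorch model on the same input. A minimal sketch, reusing `model` from the export example and the `input_tensor`/`outputs` values from the block above; the tolerance is an arbitrary illustrative choice:
+
+```python
+# Compare ExecuTorch results against the eager PyTorch model.
+eager_output = model(input_tensor)
+et_output = outputs[0]
+
+if torch.allclose(eager_output, et_output, atol=1e-3):
+    print("Outputs match within tolerance.")
+else:
+    print("Max absolute difference:", (eager_output - et_output).abs().max().item())
+```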
+ +## Running on Device +ExecuTorch provides runtime APIs in Java, Objective-C, and C++. + +Quick Links: +- [Android](#android) +- [iOS](#ios) +- [C++](#c) + +### Android + +#### Installation +ExecuTorch provides Java bindings for Android usage, which can be consumed from both Java and Kotlin. +To add the library to your app, download the AAR and add it to the gradle build rule. + +``` +mkdir -p app/libs +curl https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar -o app/libs/executorch.aar +``` +And in gradle, +``` +# app/build.gradle.kts +dependencies { + implementation(files("libs/executorch.aar")) + implementation("com.facebook.soloader:soloader:0.10.5") + implementation("com.facebook.fbjni:fbjni:0.5.1") +} +``` + +#### Runtime APIs +Models can be loaded and run using the `Module` class: +```java +import org.pytorch.executorch.EValue; +import org.pytorch.executorch.Module; +import org.pytorch.executorch.Tensor; + +// … + +Module model = Module.load("/path/to/model.pte"); + +Tensor input_tensor = Tensor.fromBlob(float_data, new long[] { 1, 3, height, width }); +EValue input_evalue = EValue.from(input_tensor); +EValue[] output = model.forward(input_evalue); +float[] scores = output[0].toTensor().getDataAsFloatArray(); +``` + +For a full example of running a model on Android, see the [ExecuTorch Android Demo App](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java). For more information on Android development, including building from source, a full description of the Java APIs, and information on using ExecuTorch from Android native code, see [Using ExecuTorch on Android](using-executorch-android.md). + +### iOS + +#### Installation +ExecuTorch supports both iOS and macOS via C++, as well as hardware backends for CoreML, MPS, and CPU. The iOS runtime library is provided as a collection of .xcframework targets and is made available as a Swift PM package. + +To get started with Xcode, go to File > Add Package Dependencies. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version in the format “swiftpm-<version>” (e.g. “swiftpm-0.5.0”). The ExecuTorch dependency can also be added to the package file manually. See [Using ExecuTorch on iOS](using-executorch-ios.md) for more information. + +#### Runtime APIs +Models can be loaded and run from Objective-C using the C++ APIs. + +For more information on iOS integration, including an API reference, logging setup, and building from source, see [Using ExecuTorch on iOS](using-executorch-ios.md). + +### C++ +ExecuTorch provides C++ APIs, which can be used to target embedded or mobile devices. The C++ APIs provide a greater level of control compared to other language bindings, allowing for advanced memory management, data loading, and platform integration. + +#### Installation +CMake is the preferred build system for the ExecuTorch C++ runtime. To use with CMake, clone the ExecuTorch repository as a subdirectory of your project, and use CMake's `add_subdirectory("executorch")` to include the dependency. The `executorch` target, as well as kernel and backend targets, will be made available to link against. The runtime can also be built standalone to support diverse toolchains. See [Using ExecuTorch with C++](using-executorch-cpp.md) for a detailed description of build integration, targets, and cross compilation.
+ +``` +git clone -b release/0.5 https://github.com/pytorch/executorch.git +``` +``` +# CMakeLists.txt +add_subdirectory("executorch") +... +target_link_libraries( + my_target + PRIVATE executorch + extension_module_static + extension_tensor + optimized_native_cpu_ops_lib + xnnpack_backend) +``` + +#### Runtime APIs +Both high-level and low-level C++ APIs are provided. The low-level APIs are platform-independent, do not dynamically allocate memory, and are most suitable for resource-constrained embedded systems. The high-level APIs act as a convenience wrapper around the lower-level APIs, and make use of dynamic memory allocation and standard library constructs to reduce verbosity. + +ExecuTorch uses CMake for native builds. Integration is typically done by cloning the ExecuTorch repository and using CMake's `add_subdirectory` to add the dependency. + +Loading and running a model using the high-level API can be done as follows: +```cpp +#include <executorch/extension/module/module.h> +#include <executorch/extension/tensor/tensor.h> + +using namespace ::executorch::extension; + +// Load the model. +Module module("/path/to/model.pte"); + +// Create an input tensor. +float input[1 * 3 * 256 * 256]; +auto tensor = from_blob(input, {1, 3, 256, 256}); + +// Perform an inference. +const auto result = module.forward(tensor); + +if (result.ok()) { + // Retrieve the output data. + const auto output = result->at(0).toTensor().const_data_ptr<float>(); +} +``` + +For more information on the C++ APIs, see [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md) and [Managing Tensor Memory in C++](extension-tensor.md). + +
+ +## Next Steps +ExecuTorch provides a high degree of customizability to support diverse hardware targets. Depending on your use cases, consider exploring one or more of the following pages: + +- [Export and Lowering](using-executorch-export.md) for advanced model conversion options. +- [Backend Overview](backends-overview.md) for available backends and configuration options. +- [Using ExecuTorch on Android](using-executorch-android.md) and [Using ExecuTorch on iOS](using-executorch-ios.md) for mobile runtime integration. +- [Using ExecuTorch with C++](using-executorch-cpp.md) for embedded and mobile native development. +- [Profiling and Debugging](using-executorch-troubleshooting.md) for developer tooling and debugging. +- [API Reference](export-to-executorch-api-reference.md) for a full description of available APIs. +- [Examples](https://github.com/pytorch/executorch/tree/main/examples) for demo apps and example code. diff --git a/docs/source/index.rst b/docs/source/index.rst index d0cff5fa570..b27d53f51c7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -56,7 +56,7 @@ Topics in this section will help you get started with ExecuTorch. .. grid-item-card:: :octicon:`file-code;1em` Getting started with ExecuTorch :img-top: _static/img/card-background.svg - :link: getting-started-setup.html + :link: getting-started.html :link-type: url A step-by-step tutorial on how to get started with @@ -84,83 +84,66 @@ Topics in this section will help you get started with ExecuTorch. .. toctree:: :glob: :maxdepth: 1 - :caption: Getting Started + :caption: Usage :hidden: - getting-started-setup - export-overview - runtime-build-and-cross-compilation - getting-started-faqs + getting-started + using-executorch-export + using-executorch-android + using-executorch-ios + using-executorch-cpp + using-executorch-runtime-integration + using-executorch-troubleshooting + using-executorch-building-from-source + using-executorch-faqs .. toctree:: :glob: :maxdepth: 1 - :caption: Tutorials - :hidden: - - tutorials/export-to-executorch-tutorial - running-a-model-cpp-tutorial - extension-module - extension-tensor - tutorials/devtools-integration-tutorial - apple-runtime - demo-apps-ios - demo-apps-android - examples-end-to-end-to-lower-model-to-delegate - tutorial-xnnpack-delegate-lowering - build-run-vulkan - .. Alphabetical by backend name. Be sure to keep the same order in the customcarditem entries below. executorch-arm-delegate-tutorial - build-run-coreml - build-run-mediatek-backend - build-run-mps - build-run-qualcomm-ai-engine-direct-backend - build-run-xtensa - -.. toctree:: :glob: :maxdepth: 2 - :caption: Working with LLMs + :caption: Examples :hidden: - Llama - Llama on Android - Llama on iOS - Llama on Android via Qualcomm backend - Intro to LLMs in Executorch + demo-apps-android.md + demo-apps-ios.md .. toctree:: :glob: :maxdepth: 1 - :caption: API Reference + :caption: Backends :hidden: - export-to-executorch-api-reference - executorch-runtime-api-reference - runtime-python-api-reference - api-life-cycle + backends-overview + backends-xnnpack + backends-coreml + backends-mps + backends-vulkan + backends-arm-ethos-u + backends-qualcomm + backends-mediatek + backends-cadence .. toctree:: :glob: :maxdepth: 1 - :caption: IR Specification + :caption: Tutorials :hidden: - ir-exir - ir-ops-set-definition - ..
toctree:: :glob: :maxdepth: 1 - :caption: Compiler Entry Points + :caption: Developer Tools :hidden: - compiler-delegate-and-partitioner - compiler-backend-dialect - compiler-custom-compiler-passes - compiler-memory-planning + devtools-overview + bundled-io + etrecord + etdump + runtime-profiling + model-debugging + model-inspector + memory-planning-inspection + delegate-debugging + devtools-tutorial .. toctree:: :glob: @@ -169,11 +152,26 @@ Topics in this section will help you get started with ExecuTorch. :hidden: runtime-overview + extension-module + extension-tensor + running-a-model-cpp-tutorial runtime-backend-delegate-implementation-and-linking runtime-platform-abstraction-layer portable-cpp-programming pte-file-format +.. toctree:: + :glob: + :maxdepth: 1 + :caption: API Reference + :hidden: + + export-to-executorch-api-reference + executorch-runtime-api-reference + runtime-python-api-reference + api-life-cycle + Javadoc + .. toctree:: :glob: :maxdepth: 1 @@ -192,34 +190,48 @@ Topics in this section will help you get started with ExecuTorch. kernel-library-custom-aten-kernel kernel-library-selective-build +.. toctree:: + :glob: + :maxdepth: 2 + :caption: Working with LLMs + :hidden: + + Llama + Llama on Android + Llama on iOS + Llama on Android via Qualcomm backend + Intro to LLMs in Executorch + .. toctree:: :glob: :maxdepth: 1 - :caption: Backend Delegates + :caption: Backend Development :hidden: - native-delegates-executorch-xnnpack-delegate - native-delegates-executorch-vulkan-delegate backend-delegates-integration + backend-delegates-xnnpack-reference backend-delegates-dependencies + compiler-delegate-and-partitioner debug-backend-delegate .. toctree:: :glob: :maxdepth: 1 - :caption: Developer Tools + :caption: IR Specification :hidden: - devtools-overview - bundled-io - etrecord - etdump - runtime-profiling - model-debugging - model-inspector - memory-planning-inspection - delegate-debugging - devtools-tutorial + ir-exir + ir-ops-set-definition + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Compiler Entry Points + :hidden: + + compiler-backend-dialect + compiler-custom-compiler-passes + compiler-memory-planning .. toctree:: :glob: @@ -314,7 +326,7 @@ ExecuTorch tutorials. :header: Building and Running ExecuTorch with Vulkan Backend :card_description: A tutorial that walks you through the process of building ExecuTorch with Vulkan Backend :image: _static/img/generic-pytorch-logo.png - :link: build-run-vulkan.html + :link: backends-vulkan.html :tags: Export,Backend,Delegation,Vulkan .. @@ -332,35 +344,35 @@ ExecuTorch tutorials. :header: Building and Running ExecuTorch with CoreML Backend :card_description: A tutorial that walks you through the process of building ExecuTorch with CoreML Backend :image: _static/img/generic-pytorch-logo.png - :link: build-run-coreml.html + :link: backends-coreml.html :tags: Export,Backend,Delegation,CoreML .. customcarditem:: :header: Building and Running ExecuTorch with MediaTek Backend :card_description: A tutorial that walks you through the process of building ExecuTorch with MediaTek Backend :image: _static/img/generic-pytorch-logo.png - :link: build-run-mediatek-backend.html + :link: backends-mediatek.html :tags: Export,Backend,Delegation,MediaTek ..
customcarditem:: :header: Building and Running ExecuTorch with MPS Backend :card_description: A tutorial that walks you through the process of building ExecuTorch with MPSGraph Backend :image: _static/img/generic-pytorch-logo.png - :link: build-run-mps.html + :link: backends-mps.html :tags: Export,Backend,Delegation,MPS,MPSGraph .. customcarditem:: :header: Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend :card_description: A tutorial that walks you through the process of building ExecuTorch with Qualcomm AI Engine Direct Backend :image: _static/img/generic-pytorch-logo.png - :link: build-run-qualcomm-ai-engine-direct-backend.html + :link: backends-qualcomm.html :tags: Export,Backend,Delegation,QNN .. customcarditem:: :header: Building and Running ExecuTorch on Xtensa HiFi4 DSP :card_description: A tutorial that walks you through the process of building ExecuTorch for an Xtensa Hifi4 DSP using custom operators :image: _static/img/generic-pytorch-logo.png - :link: build-run-xtensa.html + :link: backends-cadence.html :tags: Export,Custom-Operators,DSP,Xtensa .. customcardend:: diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 0f060d1c5e5..838b0f69d3b 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -267,7 +267,7 @@ Here's an example to do it: ```cmake # For target_link_options_shared_lib -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) # Add a custom op library add_library(custom_op_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/custom_op.cpp) diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 1a7562942e0..4bbbb8e7f36 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -36,7 +36,7 @@ The basic flow looks like this: ## APIs -We expose a CMake macro `[gen_selected_ops](https://github.com/pytorch/executorch/blob/main/build/Codegen.cmake#L12)`, to allow users specifying op info: +We expose a CMake macro `[gen_selected_ops](https://github.com/pytorch/executorch/blob/main/tools/cmake/Codegen.cmake#L12)`, to allow users to specify op info: ``` gen_selected_ops( diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index 7ed768baf23..c02701a839c 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -5,7 +5,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng ## Prerequisites - Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. -- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. +- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../backends-qualcomm.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on a Qualcomm device.
- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch. - A Qualcomm device with 16GB RAM - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 8ea34269ff0..066bb3f3d1c 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -588,9 +588,9 @@ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a ver The delegated model should be noticeably faster compared to the non-delegated model. -For more information regarding backend delegateion, see the ExecuTorch guides -for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md), [Core ML -Backend](../build-run-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). +For more information regarding backend delegation, see the ExecuTorch guides +for the [XNNPACK Backend](../backends-xnnpack.md), [Core ML +Backend](../backends-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). ## Quantization diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 1a421fdcc0a..911d0c142e8 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -157,7 +157,7 @@ For more details about the ExecuTorch runtime, please see: * [Detailed Runtime APIs Tutorial](running-a-model-cpp-tutorial.md) * [Simplified Runtime APIs Tutorial](extension-module.md) -* [Runtime Build and Cross Compilation](runtime-build-and-cross-compilation.md) +* [Building from Source](using-executorch-building-from-source.md) * [Runtime Platform Abstraction Layer](runtime-platform-abstraction-layer.md) * [Runtime Profiling](runtime-profiling.md) * [Backends and Delegates](compiler-delegate-and-partitioner.md) diff --git a/docs/source/runtime-python-api-reference.rst b/docs/source/runtime-python-api-reference.rst index 64c135de8c8..270cab9b61e 100644 --- a/docs/source/runtime-python-api-reference.rst +++ b/docs/source/runtime-python-api-reference.rst @@ -1,4 +1,4 @@ -ExecuTorch Runtime Python API Reference +Runtime Python API Reference ---------------------------------- The Python ``executorch.runtime`` module wraps the C++ ExecuTorch runtime. It can load and execute serialized ``.pte`` program files: see the `Export to ExecuTorch Tutorial `__ for how to convert a PyTorch ``nn.Module`` to an ExecuTorch ``.pte`` program file. Execution accepts and returns ``torch.Tensor`` values, making it a quick way to validate the correctness of the program. 
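+For example, a minimal sketch of loading and executing a ``.pte`` file (the model path and input shape are illustrative placeholders):
+
+.. code-block:: python
+
+   import torch
+   from executorch.runtime import Runtime
+
+   runtime = Runtime.get()
+   program = runtime.load_program("model.pte")
+   method = program.load_method("forward")
+   outputs = method.execute([torch.randn(1, 3, 224, 224)])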
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index d1148511c5f..a469edebd54 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -12,7 +12,7 @@ In this tutorial, you will learn how to export an XNNPACK lowered Model and run :class-card: card-prerequisites * [Setting up ExecuTorch](./getting-started-setup.md) * [Model Lowering Tutorial](./tutorials/export-to-executorch-tutorial) -* [ExecuTorch XNNPACK Delegate](./native-delegates-executorch-xnnpack-delegate.md) +* [ExecuTorch XNNPACK Delegate](./backends-xnnpack.md) ::: :::: diff --git a/docs/source/tutorials_source/bundled_program.bp b/docs/source/tutorials_source/bundled_program.bp new file mode 100644 index 00000000000..2587278e47f Binary files /dev/null and b/docs/source/tutorials_source/bundled_program.bp differ diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md new file mode 100644 index 00000000000..62d1f3ee75a --- /dev/null +++ b/docs/source/using-executorch-android.md @@ -0,0 +1,185 @@ +# Using ExecuTorch on Android + +For Android, ExecuTorch provides Java/Kotlin API bindings and Android platform integration, available as an AAR file. + +Note: This page covers Android app integration through the AAR library. The ExecuTorch C++ APIs can also be used from Android native code, and the documentation can be found on [this page about cross compilation](https://pytorch.org/executorch/main/using-executorch-building-from-source.html#cross-compilation). + +## Installation + +All ExecuTorch Android libraries are packaged into an [Android library (AAR)](https://developer.android.com/studio/projects/android-library), `executorch.aar`, for both generic (image/audio processing) and LLM (LLaMA) use cases. In each release, prebuilt AAR artifacts are uploaded to [Maven](https://repo.maven.apache.org/maven2/org/pytorch/executorch-android/) and S3. Users can also build the AAR from source. + +### Contents of library + +The AAR artifact contains the Java library for users to integrate with their Java/Kotlin application code, as well as the corresponding JNI library (.so file), which is loaded by the Java code during initialization. + +- [Java library](https://github.com/pytorch/executorch/tree/main/extension/android/src/main/java/org/pytorch/executorch) +- The JNI library contains the JNI bindings for the corresponding Java code, and the ExecuTorch native library, including + - core ExecuTorch runtime libraries + - XNNPACK backend + - Portable kernels + - Optimized kernels + - Quantized kernels + - LLaMa-specific Custom ops library. +- Comes with two ABI variants, arm64-v8a and x86\_64. + +## Using AAR from Maven Central + +ExecuTorch is available on [Maven Central](https://mvnrepository.com/artifact/org.pytorch/executorch-android). + +Simply add the target [`org.pytorch:executorch-android:0.5.1`](https://repo.maven.apache.org/maven2/org/pytorch/executorch-android/0.5.1/) to your Android app dependency (build.gradle), and build your app. + +For example: +``` +# app/build.gradle.kts +dependencies { + implementation("org.pytorch:executorch-android:0.5.1") +} +``` + +Note: `org.pytorch:executorch-android:0.5.1` corresponds to executorch v0.5.0. + +## Using AAR file directly + +You can also directly specify an AAR file in the app. We upload pre-built AARs to S3 during each release, or as a snapshot.
+ +### Released versions (recommended) + +| Version | AAR | SHASUMS | +| ------- | --- | ------- | +| [v0.5.0](https://github.com/pytorch/executorch/releases/tag/v0.5.0) | [executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar) | [executorch.aar.sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar.sha256sums) | + +### Snapshots from main branch + +| Date | AAR | SHASUMS | +| ------- | --- | ------- | +| 2025-02-27 | [executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20250227/executorch.aar) | [executorch.aar.sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-20250227/executorch.aar.sha256sums) | + +## Using AAR file + +To add the AAR file to your app: +1. Download the AAR. +2. Add it to your gradle build rule as a file path. + +An AAR file itself does not contain dependency info, unlike the Maven artifact, which is bundled with a pom.xml. The Java package requires `fbjni` and `soloader`, and currently requires users to explicitly declare the dependency. Therefore, two more `dependencies` entries are required in the gradle rule: +``` +implementation("com.facebook.soloader:soloader:0.10.5") +implementation("com.facebook.fbjni:fbjni:0.5.1") +``` + +### Example usage + +In your app working directory, such as executorch/examples/demo-apps/android/LlamaDemo, +``` +mkdir -p app/libs +curl https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar -o app/libs/executorch.aar +``` + +And include it in gradle: +``` +# app/build.gradle.kts +dependencies { + implementation(files("libs/executorch.aar")) + implementation("com.facebook.soloader:soloader:0.10.5") + implementation("com.facebook.fbjni:fbjni:0.5.1") +} +``` + +Now you can compile your app with the ExecuTorch Android library. + +## Building from Source + +`scripts/build_android_library.sh` is a helper script to build the Java library (into .jar), native library (into .so), and the packaged AAR file. It can also build demo apps to showcase that the AAR integrates into a user app correctly. + +You need the Android [SDK](https://developer.android.com/studio) and [NDK](https://developer.android.com/ndk/downloads) to use it. + +Current NDK version used in ExecuTorch CI: r27b. + +You need to set `ANDROID_HOME` to the Android SDK home and `ANDROID_NDK` to the correct NDK root (the directory containing the NOTICE file). + +``` +export ANDROID_HOME=/path/to/sdk +export ANDROID_NDK=/path/to/ndk +sh scripts/build_android_library.sh +``` + +### Optional environment variables + +Optionally, set these environment variables before running `build_android_library.sh`. + +#### ANDROID_ABIS +Set the environment variable `ANDROID_ABIS` to either `arm64-v8a` or `x86_64` if you only need to build the native library for a single ABI. +``` +export ANDROID_ABIS=arm64-v8a +# or +# export ANDROID_ABIS=x86_64 +sh scripts/build_android_library.sh +``` + +#### EXECUTORCH_CMAKE_BUILD_TYPE +Set the environment variable `EXECUTORCH_CMAKE_BUILD_TYPE` to `Release` or `Debug` based on your needs. + +#### Using MediaTek backend + +To use the [MediaTek backend](https://pytorch.org/executorch/main/backends-mediatek.html), after installing and setting up the SDK, set `NEURON_BUFFER_ALLOCATOR_LIB` and `NEURON_USDK_ADAPTER_LIB` to the corresponding path.
+ +#### Using Qualcomm AI Engine Backend + +To use the [Qualcomm AI Engine Backend](https://pytorch.org/executorch/main/backends-qualcomm.html#qualcomm-ai-engine-backend), after installing and setting up the SDK, set `QNN_SDK_ROOT` to the corresponding path. + +#### Using Vulkan Backend + +To use the [Vulkan Backend](https://pytorch.org/executorch/main/backends-vulkan.html#vulkan-backend), set `EXECUTORCH_BUILD_VULKAN` to `ON`. + +## Android Backends + +The following backends are available for Android: + +| Backend | Type | Doc | +| ------- | -------- | --- | +| [XNNPACK](https://github.com/google/XNNPACK) | CPU | [Doc](./backends-xnnpack.md) | +| [MediaTek NeuroPilot](https://neuropilot.mediatek.com/) | NPU | [Doc](./backends-mediatek.md) | +| [Qualcomm AI Engine](https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk) | NPU | [Doc](./backends-qualcomm.md) | +| [Vulkan](https://www.vulkan.org/) | GPU | [Doc](./backends-vulkan.md) | + + +## Runtime Integration + +Here is a Java code sample that demonstrates how to integrate ExecuTorch into an Android app: + +```java +import android.app.Activity; +import android.os.Bundle; +import android.view.View; + +import org.pytorch.executorch.EValue; +import org.pytorch.executorch.Module; +import org.pytorch.executorch.Tensor; + +public class MainActivity extends Activity { + private Module module; + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + // Load the ExecuTorch module + module = Module.load("/path/to/module.pte"); + } + public void runInference(View view) { + // Prepare input data (getInputData(), height, and width are placeholders) + Tensor input = Tensor.fromBlob(getInputData(), new long[] { 1, 3, height, width }); + // Run inference + Tensor output = module.forward(EValue.from(input))[0].toTensor(); + // Process output data + processOutput(output); + } +} +``` +This example loads an ExecuTorch module, prepares input data, runs inference, and processes the output data. + +Please use [ExecuTorchDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/ExecuTorchDemo) and [LlamaDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) for code examples using the ExecuTorch AAR package. + +## Java API reference + +Please see [Java API reference](https://pytorch.org/executorch/main/javadoc/). diff --git a/docs/source/runtime-build-and-cross-compilation.md b/docs/source/using-executorch-building-from-source.md similarity index 55% rename from docs/source/runtime-build-and-cross-compilation.md rename to docs/source/using-executorch-building-from-source.md index 3574b76b6df..345f0324d50 100644 --- a/docs/source/runtime-build-and-cross-compilation.md +++ b/docs/source/using-executorch-building-from-source.md @@ -1,10 +1,114 @@ -# Building with CMake +# Building from Source -ExecuTorch uses [CMake](https://cmake.org/) as its primary build system. +ExecuTorch uses [CMake](https://cmake.org/) as the primary build system. Even if you don't use CMake directly, CMake can emit build scripts for other formats like Make, Ninja, or Xcode. For information, see [cmake-generators(7)](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). -## Targets Built by the CMake Build System +## System Requirements +### Operating System + +We've tested these instructions on the following systems, although they should +also work in similar environments.
+ +Linux (x86_64) +- CentOS 8+ +- Ubuntu 20.04.6 LTS+ +- RHEL 8+ + +macOS (x86_64/M1/M2) +- Big Sur (11.0)+ + +Windows (x86_64) +- Windows Subsystem for Linux (WSL) with any of the Linux options + +### Software +* `conda` or another virtual environment manager + - We recommend `conda` as it provides cross-language + support and integrates smoothly with `pip` (Python's built-in package manager) + - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. + +Note that the cross-compilable core runtime code supports a wider range of +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for +portability details. + +## Environment Setup + +### Create a Virtual Environment + +[Install conda on your machine](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then, create a virtual environment to manage our dependencies. + ```bash + # Create and activate a conda environment named "executorch" + conda create -yn executorch python=3.10.0 + conda activate executorch + ``` + +### Clone ExecuTorch + + ```bash + # Clone the ExecuTorch repo from GitHub + # 'main' branch is the primary development branch where you see the latest changes. + # 'viable/strict' contains all of the commits on main that pass all of the necessary CI checks. + git clone --branch viable/strict https://github.com/pytorch/executorch.git + cd executorch + + # Update and pull submodules + git submodule sync + git submodule update --init + ``` + +## Install ExecuTorch pip package from Source + ```bash + # Install ExecuTorch pip package and its dependencies, as well as + # development tools like CMake. + # If developing on a Mac, make sure to install the Xcode Command Line Tools first. + ./install_executorch.sh + ``` + + Use the [`--pybind` flag](https://github.com/pytorch/executorch/blob/main/install_executorch.sh#L26-L29) to install with pybindings and dependencies for other backends. + ```bash + ./install_executorch.sh --pybind + + # Example: pybindings with CoreML *only* + ./install_executorch.sh --pybind coreml + + # Example: pybindings with CoreML *and* XNNPACK + ./install_executorch.sh --pybind coreml xnnpack + ``` + + By default, the `./install_executorch.sh` command installs pybindings for XNNPACK. To disable any pybindings altogether: + ```bash + ./install_executorch.sh --pybind off + ``` + + For development mode, run the command with `--editable`, which allows us to modify Python source code and see changes reflected immediately. + ```bash + ./install_executorch.sh --editable [--pybind xnnpack] + + # Or you can directly do the following if dependencies are already installed + # either via a previous invocation of `./install_executorch.sh` or by explicitly installing requirements via `./install_requirements.sh` first. + pip install -e . + ``` + +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed.
+>
+> ```bash
+> # From the root of the executorch repo:
+> ./install_executorch.sh --clean
+> git submodule sync
+> git submodule update --init
+> ```
+
+## Build ExecuTorch C++ runtime from source
 
 ExecuTorch's CMake build system covers the pieces of the runtime that are
 likely to be useful to embedded systems users.
@@ -24,17 +128,8 @@ likely to be useful to embedded systems users.
 `libportable_kernels.a`, so the program may use any of the operators it
 implements.
 
-## One-time setup to prepare for CMake Build
-Follow the steps below to have the tools ready before using CMake to build on your machine.
-
-1. If your system's version of python3 is older than 3.11:
-   - Run `pip install tomli`
-3. Install CMake version 3.19 or later:
-   - Run `conda install cmake` or `pip install cmake`.
-
-
-## Configure the CMake Build
+### Configure the CMake build
 
 Follow these steps after cloning or pulling the upstream repo, since the build
 dependencies may have changed.
@@ -51,7 +146,7 @@ cd executorch
 
 Once this is done, you don't need to do it again until you pull from the
 upstream repo again, or if you modify any CMake-related files.
 
-### CMake Build Options
+### CMake build options
 
 The release build offers optimizations intended to improve performance and reduce binary size. It disables program verification and executorch logging, and adds optimization flags.
 ```bash
@@ -66,7 +161,7 @@ To further optimize the release build for size, use both:
 
 See [CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt)
 
-## Build the runtime components
+### Build the runtime components
 
 Build all targets with
 
@@ -83,19 +178,19 @@ cd executorch
 cmake --build cmake-out -j9
 ```
 
-## Use an example app `executor_runner` to execute a .pte file
+## Use an example binary `executor_runner` to execute a .pte file
 
 First, generate an `add.pte` or other ExecuTorch program file using the
 instructions as described in
-[Setting up ExecuTorch](getting-started-setup.md#building-a-runtime).
+[Preparing a Model](getting-started.md#preparing-the-model).
 
 Then, pass it to the command line tool:
 
 ```bash
-./cmake-out/executor_runner --model_path path/to/add.pte
+./cmake-out/executor_runner --model_path path/to/model.pte
 ```
 
-If it worked, you should see the message "Model executed successfully" followed
+You should see the message "Model executed successfully" followed
 by the output values.
 
 ```
@@ -126,7 +221,7 @@ Assuming Android NDK is available, run:
 mkdir cmake-android-out && cd cmake-android-out
 
 # point -DCMAKE_TOOLCHAIN_FILE to the location where ndk is installed
-cmake -DCMAKE_TOOLCHAIN_FILE=/Users/{user_name}/Library/Android/sdk/ndk/25.2.9519653/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a ..
+cmake -DCMAKE_TOOLCHAIN_FILE=/Users/{user_name}/Library/Android/sdk/ndk/27.2.12479018/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a ..
 
 cd ..
 cmake --build cmake-android-out -j9
@@ -155,11 +250,11 @@ xcode-select --install
 2. Build the frameworks:
 
 ```bash
-./build/build_apple_frameworks.sh
+./scripts/build_apple_frameworks.sh
 ```
 
 Run the above command with `--help` flag to learn more on how to build additional backends
-(like [Core ML](build-run-coreml.md), [MPS](build-run-mps.md) or XNNPACK), etc.
+(like [Core ML](backends-coreml.md), [MPS](backends-mps.md) or XNNPACK), etc.
 Note, some backends may require additional dependencies and certain versions of Xcode and iOS.
 
 3.
Copy over the generated `.xcframework` bundles to your Xcode project, link them against
@@ -172,6 +267,6 @@ Check out the [iOS Demo App](demo-apps-ios.md) tutorial for more info.
 
 You have successfully cross-compiled `executor_runner` binary to iOS and Android platforms. You can start exploring advanced features and capabilities. Here is a list of sections you might want to read next:
 
-* [Selective build](./kernel-library-selective_build) to build the runtime that links to only kernels used by the program, which can provide significant binary size savings.
+* [Selective build](kernel-library-selective-build.md) to build the runtime that links to only kernels used by the program, which can provide significant binary size savings.
 * Tutorials on building [Android](./demo-apps-android.md) and [iOS](./demo-apps-ios.md) demo apps.
-* Tutorials on deploying applications to embedded devices such as [ARM Cortex-M/Ethos-U](./executorch-arm-delegate-tutorial.md) and [XTensa HiFi DSP](./build-run-xtensa.md).
+* Tutorials on deploying applications to embedded devices such as [ARM Cortex-M/Ethos-U](backends-arm-ethos-u.md) and [XTensa HiFi DSP](./backends-cadence.md).
diff --git a/docs/source/using-executorch-cpp.md b/docs/source/using-executorch-cpp.md
new file mode 100644
index 00000000000..4f8a83830e0
--- /dev/null
+++ b/docs/source/using-executorch-cpp.md
@@ -0,0 +1,75 @@
+# Using ExecuTorch with C++
+
+In order to support a wide variety of devices, from high-end mobile phones down to tiny embedded systems, ExecuTorch provides an API surface with a high degree of customizability. The C++ APIs expose advanced configuration options, such as controlling memory allocation, placement, and data loading. To meet the needs of both application and embedded programming, ExecuTorch provides a low-level, highly-customizable core set of APIs, and a set of high-level extensions, which abstract away many of the low-level details that are not relevant for mobile application programming.
+
+## High-Level APIs
+
+The C++ `Module` class provides the high-level interface to load and execute a model from C++. It is responsible for loading the .pte file, configuring memory allocation and placement, and running the model. The Module constructor takes a file path and provides a simplified `forward()` method to run the model.
+
+In addition to the Module class, the tensor extension provides an encapsulated interface to define and manage tensor memory. It provides the `TensorPtr` class, which is a "fat" smart pointer. It provides ownership over the tensor data and metadata, such as size and strides. The `make_tensor_ptr` and `from_blob` methods, defined in `tensor.h`, provide owning and non-owning tensor creation APIs, respectively.
+
+```cpp
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+using namespace ::executorch::extension;
+
+// Load the model.
+Module module("/path/to/model.pte");
+
+// Create an input tensor.
+float input[1 * 3 * 256 * 256];
+auto tensor = from_blob(input, {1, 3, 256, 256});
+
+// Perform an inference.
+const auto result = module.forward(tensor);
+
+if (result.ok()) {
+  // Retrieve the output data.
+  const auto output = result->at(0).toTensor().const_data_ptr<float>();
+}
+```
+
+For more information on the Module class, see [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md). For information on high-level tensor APIs, see [Managing Tensor Memory in C++](extension-tensor.md).
+
+## Low-Level APIs
+
+Running a model using the low-level runtime APIs allows for a high degree of control over memory allocation, placement, and loading. This allows for advanced use cases, such as placing allocations in specific memory banks or loading a model without a file system. For an end-to-end example using the low-level runtime APIs, see [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md).
+
+## Building with CMake
+
+ExecuTorch uses CMake as the primary build system. Inclusion of the module and tensor APIs is controlled by the `EXECUTORCH_BUILD_EXTENSION_MODULE` and `EXECUTORCH_BUILD_EXTENSION_TENSOR` CMake options. As these APIs may not be supported on embedded systems, they are disabled by default when building from source. The low-level API surface is always included. To link, add the `executorch` target as a CMake dependency, along with `executorch_module_static` and `executorch_tensor`, if desired.
+
+```cmake
+# CMakeLists.txt
+add_subdirectory("executorch")
+...
+target_link_libraries(
+    my_target
+    PRIVATE executorch
+            executorch_module_static
+            executorch_tensor
+            optimized_native_cpu_ops_lib
+            xnnpack_backend)
+```
+
+See [Building from Source](using-executorch-building-from-source.md) for more information on the CMake build process.
+
+## Reference Runners
+
+The ExecuTorch repository includes several reference runners, which are simple programs that load and execute a .pte file, typically with random inputs. These can be used to sanity check model execution on a development platform and as a code reference for runtime integration.
+
+The `executor_runner` target is built by default when building with CMake. It can be invoked as follows:
+```bash
+./cmake-out/executor_runner --model_path path/to/model.pte
+```
+
+The runner source code can be found in the ExecuTorch repo under [examples/portable/executor_runner/executor_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp). Some backends, such as CoreML, have dedicated runners to showcase backend and platform-specific functionality. See [examples/apple/coreml](https://github.com/pytorch/executorch/tree/main/examples/apple/coreml) and the [examples](https://github.com/pytorch/executorch/tree/main/examples) directory for more information.
+
+## Next Steps
+
+- [Runtime API Reference](executorch-runtime-api-reference.md) for documentation on the available C++ runtime APIs.
+- [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md) for information on the high-level Module API.
+- [Managing Tensor Memory in C++](extension-tensor.md) for information on high-level tensor APIs.
+- [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md) for information on the low-level runtime APIs.
+- [Building from Source](using-executorch-building-from-source.md) for information on CMake build integration.
diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md
new file mode 100644
index 00000000000..fb51ff1bd40
--- /dev/null
+++ b/docs/source/using-executorch-export.md
@@ -0,0 +1,180 @@
+# Model Export and Lowering
+
+This section describes the process of taking a PyTorch model and converting it to the runtime format used by ExecuTorch. This process is commonly known as "exporting", as it uses the PyTorch export functionality to convert a PyTorch model into a format suitable for on-device execution.
This process yields a .pte file which is optimized for on-device execution using a particular backend.
+
+## Prerequisites
+
+Exporting requires the ExecuTorch Python libraries to be installed, typically by running `pip install executorch`. See [Installation](getting-started.md#installation) for more information. This process assumes you have a PyTorch model, can instantiate it from Python, and can provide example input tensors to run the model.
+
+## The Export and Lowering Process
+
+The process to export and lower a model to the .pte format typically involves the following steps:
+
+1) Select a backend to target.
+2) Prepare the PyTorch model, including inputs and shape specification.
+3) Export the model using torch.export.export.
+4) Optimize the model for the target backend using to_edge_transform_and_lower.
+5) Create the .pte file by calling to_executorch and serializing the output.
+
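+In code, these steps map onto a short pipeline. Below is a condensed sketch using a stand-in one-layer model (the full worked example in the Export and Lowering section below walks through a realistic model):
+
+```python
+import torch
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# 2) Prepare the model and example inputs. A single linear layer stands in
+#    for a real model here.
+model = torch.nn.Linear(16, 10).eval()
+inputs = (torch.randn(1, 16),)
+
+# 3) Export the model with torch.export.
+exported_program = torch.export.export(model, inputs)
+
+# 1) + 4) Choose a backend (XNNPACK here) and optimize the model for it.
+executorch_program = to_edge_transform_and_lower(
+    exported_program,
+    partitioner=[XnnpackPartitioner()],
+).to_executorch()
+
+# 5) Serialize the resulting program to a .pte file.
+with open("model.pte", "wb") as f:
+    f.write(executorch_program.buffer)
+```
+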
+
+Quantization - the process of using reduced precision to reduce inference time and memory footprint - is also commonly done at this stage. See [Quantization Overview](quantization-overview.md) for more information.
+
+## Hardware Backends
+
+ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each.
+
+The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requirements and levels of model support. See the documentation for each hardware backend for more details.
+
+As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported by the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example.
+
+### Available Backends
+
+Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information.
+
+- [XNNPACK (Mobile CPU)](backends-xnnpack.md)
+- [Core ML (iOS)](backends-coreml.md)
+- [Metal Performance Shaders (iOS GPU)](backends-mps.md)
+- [Vulkan (Android GPU)](backends-vulkan.md)
+- [Qualcomm NPU](backends-qualcomm.md)
+- [MediaTek NPU](backends-mediatek.md)
+- [Arm Ethos-U NPU](backends-arm-ethos-u.md)
+- [Cadence DSP](backends-cadence.md)
+
+## Model Preparation
+
+The export process takes in a standard PyTorch model, typically a `torch.nn.Module`. This can be a custom model definition, or a model from an existing source, such as TorchVision or HuggingFace. See [Getting Started with ExecuTorch](getting-started.md) for an example of lowering a TorchVision model.
+
+Model export is done from Python. This is commonly done through a Python script or from an interactive Python notebook, such as Jupyter or Colab. The example below shows instantiation and inputs for a simple PyTorch model. The inputs are prepared as a tuple of torch.Tensors, and the model can run with these inputs.
+
+```python
+import torch
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.seq = torch.nn.Sequential(
+            torch.nn.Conv2d(1, 8, 3),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(8, 16, 3),
+            torch.nn.ReLU(),
+            torch.nn.AdaptiveAvgPool2d((1,1))
+        )
+        self.linear = torch.nn.Linear(16, 10)
+
+    def forward(self, x):
+        y = self.seq(x)
+        y = torch.flatten(y, 1)
+        y = self.linear(y)
+        return y
+
+model = Model().eval()
+inputs = (torch.randn(1,1,16,16),)
+outputs = model(*inputs)
+print(f"Model output: {outputs}")
+```
+
+Note that the model is set to evaluation mode using `.eval()`. Models should always be exported in evaluation mode unless performing on-device training. Evaluation mode configures operations with training-specific behavior, such as batch norm and dropout, to use their inference-mode configuration.
+
+## Export and Lowering
+
+To actually export and lower the model, call `export`, `to_edge_transform_and_lower`, and `to_executorch` in sequence. This yields an ExecuTorch program which can be serialized to a file. Putting it all together, lowering the example model above using the XNNPACK delegate for mobile CPU performance can be done as follows:
+
+```python
+import torch
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.exir import to_edge_transform_and_lower
+from torch.export import Dim, export
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.seq = torch.nn.Sequential(
+            torch.nn.Conv2d(1, 8, 3),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(8, 16, 3),
+            torch.nn.ReLU(),
+            torch.nn.AdaptiveAvgPool2d((1,1))
+        )
+        self.linear = torch.nn.Linear(16, 10)
+
+    def forward(self, x):
+        y = self.seq(x)
+        y = torch.flatten(y, 1)
+        y = self.linear(y)
+        return y
+
+model = Model().eval()
+inputs = (torch.randn(1,1,16,16),)
+dynamic_shapes = {
+    "x": {
+        2: Dim("h", min=16, max=1024),
+        3: Dim("w", min=16, max=1024),
+    }
+}
+
+exported_program = export(model, inputs, dynamic_shapes=dynamic_shapes)
+executorch_program = to_edge_transform_and_lower(
+    exported_program,
+    partitioner = [XnnpackPartitioner()]
+).to_executorch()
+
+with open("model.pte", "wb") as file:
+    file.write(executorch_program.buffer)
+```
+
+This yields a `model.pte` file which can be run on mobile devices.
+
+### Supporting Varying Input Sizes (Dynamic Shapes)
+
+The PyTorch export process uses the example inputs provided to trace through the model and reason about the size and type of tensors at each step. Unless told otherwise, export will assume a fixed input size equal to the example inputs and will use this information to optimize the model.
+
+Many models require support for varying input sizes. To support this, export takes a `dynamic_shapes` parameter, which informs the compiler of which dimensions can vary and their bounds. This takes the form of a nested dictionary, where keys correspond to input names and values specify the bounds for each input.
+
+In the example model, inputs are provided as 4-dimensional tensors following the standard convention of batch, channels, height, and width (NCHW). An input with the shape `[1, 1, 16, 16]` indicates a batch size of 1, 1 channel, and a height and width of 16.
+
+Suppose your model supports images with sizes between 16x16 and 1024x1024.
The shape bounds can be specified as follows:
+
+```python
+dynamic_shapes = {
+    "x": {
+        2: Dim("h", min=16, max=1024),
+        3: Dim("w", min=16, max=1024),
+    }
+}
+
+ep = torch.export.export(model, inputs, dynamic_shapes=dynamic_shapes)
+```
+
+In the above example, `"x"` corresponds to the parameter name in `Model.forward`. The 2 and 3 keys correspond to dimensions 2 and 3, which are height and width. As there are no specifications for batch and channel dimensions, these values are fixed according to the example inputs.
+
+ExecuTorch uses the shape bounds both to optimize the model and to plan memory for model execution. For this reason, it is advised to set the dimension upper bounds to no higher than needed, as higher bounds increase memory consumption.
+
+For more complex use cases, dynamic shape specification allows for mathematical relationships between dimensions. For more information on dynamic shape specification, see [Expressing Dynamism](https://pytorch.org/docs/stable/export.html#expressing-dynamism).
+
+## Testing the Model
+
+Before integrating the runtime code, it is common to test the exported model from Python. This can be used to evaluate model accuracy and sanity check behavior before moving to the target device. Note that not all hardware backends are available from Python, as they may require specialized hardware to function. See the specific backend documentation for more information on hardware requirements and the availability of simulators. The XNNPACK delegate used in this example is always available on host machines.
+
+```python
+import torch
+from executorch.runtime import Runtime
+
+runtime = Runtime.get()
+
+input_tensor = torch.randn(1, 1, 32, 32)
+program = runtime.load_program("model.pte")
+method = program.load_method("forward")
+outputs = method.execute([input_tensor])
+```
+
+For more information, see [Runtime API Reference](executorch-runtime-api-reference.md).
+
+## Next Steps
+
+The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.md) for more information.
+
+For advanced use cases, see the following:
+- [Quantization Overview](quantization-overview.md) for information on quantizing models to reduce inference time and memory footprint.
+- [Memory Planning](compiler-memory-planning.md) for information on controlling memory placement and planning.
+- [Custom Compiler Passes](compiler-custom-compiler-passes.md) for information on writing custom compiler passes.
+- [Export IR Specification](ir-exir.md) for information on the intermediate representation generated by export.
diff --git a/docs/source/getting-started-faqs.md b/docs/source/using-executorch-faqs.md
similarity index 99%
rename from docs/source/getting-started-faqs.md
rename to docs/source/using-executorch-faqs.md
index c7c03488de1..56384c8015e 100644
--- a/docs/source/getting-started-faqs.md
+++ b/docs/source/using-executorch-faqs.md
@@ -1,4 +1,4 @@
-# FAQs and Common Issues
+# Frequently Asked Questions
 
 This page summarizes frequently asked questions and provides guidance on issues that commonly occur when adopting ExecuTorch.
diff --git a/docs/source/apple-runtime.md b/docs/source/using-executorch-ios.md
similarity index 84%
rename from docs/source/apple-runtime.md
rename to docs/source/using-executorch-ios.md
index 4114b780607..e975cb9ef22 100644
--- a/docs/source/apple-runtime.md
+++ b/docs/source/using-executorch-ios.md
@@ -1,6 +1,8 @@
-# Integrating and Running ExecuTorch on Apple Platforms
+# Using ExecuTorch on iOS
 
-**Author:** [Anthony Shoumikhin](https://github.com/shoumikhin)
+ExecuTorch supports both iOS and macOS via Objective-C, Swift, and C++. ExecuTorch also provides backends to leverage Core ML and Metal Performance Shaders (MPS) for hardware-accelerated execution on Apple platforms.
+
+## Integration
 
 The ExecuTorch Runtime for iOS and macOS is distributed as a collection of prebuilt [.xcframework](https://developer.apple.com/documentation/xcode/creating-a-multi-platform-binary-framework-bundle) binary targets. These targets are compatible with both iOS and macOS devices and simulators and are available in both release and debug modes:
@@ -17,15 +19,13 @@
 
 Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model.
 
 **Note:** To access logs, link against the Debug build of the ExecuTorch runtime, i.e., the `executorch_debug` framework. For optimal performance, always link against the Release version of the deliverables (those without the `_debug` suffix), which have all logging overhead removed.
 
-## Integration
-
 ### Swift Package Manager
 
 The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift PM](https://www.swift.org/documentation/package-manager/) package.
 
 #### Xcode
 
-In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version in format "swiftpm-<version>", (e.g. "swiftpm-0.5.0"), or a branch name in format "swiftpm-<version>.<date>" (e.g. "swiftpm-0.5.0-20250130") for a nightly build on a specific date.
+In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version in format "swiftpm-<version>", (e.g. "swiftpm-0.5.0"), or a branch name in format "swiftpm-<version>.<date>" (e.g. "swiftpm-0.5.0-20250228") for a nightly build on a specific date.
 
 ![](_static/img/swiftpm_xcode1.png)
 
@@ -84,9 +84,9 @@
 swift package resolve
 swift build
 ```
 
-### Local Build
+### Building from Source
 
-Another way to integrate the ExecuTorch runtime is to build the necessary components from sources locally and link against them. This route is more involved but certainly doable.
+Another way to integrate the ExecuTorch runtime is to build the necessary components from sources locally and link against them. This is useful when customizing the runtime.
 
 1. Install [Xcode](https://developer.apple.com/xcode/resources/) 15+ and Command Line Tools:
 
@@ -106,7 +106,7 @@ git clone https://github.com/pytorch/executorch.git --depth 1 --recurse-submodules
 
 python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip
 
-4. Install the required dependencies, including those needed for the backends like [Core ML](build-run-coreml.md) or [MPS](build-run-mps.md), if you plan to build them as well:
+4.
Install the required dependencies, including those needed for the backends like [Core ML](backends-coreml.md) or [MPS](backends-mps.md), if you plan to build them as well: ```bash ./install_executorch.sh --pybind coreml mps xnnpack @@ -129,17 +129,15 @@ sudo /Applications/CMake.app/Contents/bin/cmake-gui --install 6. Use the provided script to build .xcframeworks: ```bash -./build/build_apple_frameworks.sh --help +./scripts/build_apple_frameworks.sh --help ``` -For example, the following invocation will build the ExecuTorch Runtime and all currently available kernels and backends for the Apple platform: +For example, the following command will build the ExecuTorch Runtime along with all available kernels and backends for the Apple platform in both Release and Debug modes: ```bash -./build/build_apple_frameworks.sh --coreml --mps --xnnpack --custom --optimized --portable --quantized +./scripts/build_apple_frameworks.sh --Release --Debug --coreml --mps --xnnpack --custom --optimized --portable --quantized ``` -Append a `--Debug` flag to the above command to build the binaries with debug symbols if needed. - After the build finishes successfully, the resulting frameworks can be found in the `cmake-out` directory. Copy them to your project and link them against your targets. @@ -155,15 +153,15 @@ ET_PLATFORM[sdk=iphoneos*] = ios ET_PLATFORM[sdk=macos*] = macos OTHER_LDFLAGS = $(inherited) \ - -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-$(ET_PLATFORM)-release.a + -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_$(ET_PLATFORM).a ``` -For a Debug build configuration, replace `release` with `debug` in the library file names. Remember to link against the ExecuTorch runtime (`libexecutorch`) in Debug mode even if other components are built for Release to preserve logs if needed. +**Note:** In the example above, we link against the Debug version of the ExecuTorch runtime (`libexecutorch_debug`) to preserve the logs. Normally, that does not impact the performance too much. Nevertheless, remember to link against the release version of the runtime (`libexecutorch`) for the best performance and no logs. You can assign such a config file to your target in Xcode: @@ -195,7 +193,7 @@ import ExecuTorch ### Logging -We provide extra APIs for logging in Objective-C and Swift as a lightweight wrapper of the internal ExecuTorch machinery. To use it, just import the main framework header in Objective-C. Then use the `ExecuTorchLog` interface (or the `Log` class in Swift) to subscribe your own implementation of the `ExecuTorchLogSink` protocol (or `LogSink` in Swift) to listen to log events. +ExecuTorch provides extra APIs for logging in Objective-C and Swift as a lightweight wrapper of the internal ExecuTorch machinery. 
To use it, just import the main framework header in Objective-C. Then use the `ExecuTorchLog` interface (or the `Log` class in Swift) to subscribe your own implementation of the `ExecuTorchLogSink` protocol (or `LogSink` in Swift) to listen to log events.
 
 ```objectivec
 #import <ExecuTorch/ExecuTorch.h>
diff --git a/docs/source/using-executorch-runtime-integration.md b/docs/source/using-executorch-runtime-integration.md
new file mode 100644
index 00000000000..08e071e59ab
--- /dev/null
+++ b/docs/source/using-executorch-runtime-integration.md
@@ -0,0 +1,53 @@
+# Runtime Integration
+
+This section describes options for configuring and customizing the ExecuTorch runtime. While the pre-built packages are designed to provide an "out-of-box" experience, it is common to require additional configuration when shipping into production. ExecuTorch provides the ability to gate features such as logging at compile time, customize system integration, and include only the operators needed to run specific models (selective build).
+
+## Logging
+
+ExecuTorch runtime code includes logging statements at various levels to aid with integration and debugging. Logging inclusion is controlled at build time by the `EXECUTORCH_ENABLE_LOGGING` and `EXECUTORCH_LOG_LEVEL` CMake options. Having these exposed as compile-time configuration allows all logging-related code to be excluded when not used, which is critical for resource-constrained systems.
+
+Logging is sent to STDOUT and STDERR by default on host platforms, and is redirected to OS-specific logging on Android and iOS. See [Platform Abstraction Layer](#platform-abstraction-layer-pal) below for more information on log routing.
+
+To configure the log level when building from source, specify `EXECUTORCH_ENABLE_LOGGING` as on or off and `EXECUTORCH_LOG_LEVEL` as one of debug, info, error, or fatal. Logging is enabled by default in debug builds and disabled in release builds. The log level defaults to info.
+
+See [Building from Source](using-executorch-building-from-source.md) for more information.
+
+```
+cmake -B cmake-out -DEXECUTORCH_ENABLE_LOGGING=ON -DEXECUTORCH_LOG_LEVEL=DEBUG ...
+```
+
+## Platform Abstraction Layer (PAL)
+
+The ExecuTorch Platform Abstraction Layer, or PAL, is a glue layer responsible for providing integration with a particular host system. This includes log routing, timestamps, and abort handling. ExecuTorch provides a default implementation for POSIX-compliant targets, as well as Android- and iOS-specific implementations under the appropriate extensions.
+
+For non-POSIX-compliant systems, a minimal no-op PAL implementation is provided. It is expected that users override the relevant PAL methods in order to enable logging, timestamps, and aborts. The minimal PAL can be selected by building with `-DEXECUTORCH_PAL_DEFAULT=minimal`.
+
+### Overriding the PAL
+
+Overriding the default PAL implementation is commonly done to route logs to a user-specified destination or to provide PAL functionality on embedded systems. To override one or more PAL methods, take the following steps:
+
+- Include
+  [`executorch/runtime/platform/platform.h`](https://github.com/pytorch/executorch/blob/main/runtime/platform/platform.h)
+  in one of your application's `.c` or `.cpp` files.
+- Define an implementation of one or more of the `et_pal_*()` functions.
+
+The default PAL functions are weak symbols, so providing your own strong-symbol
+definition can override them at link time.
To ensure that your definitions take
+precedence, you may need to place the strong definitions before the weak
+definitions in the link order.
+
+See [runtime/platform/platform.h](https://github.com/pytorch/executorch/blob/main/runtime/platform/platform.h) for the PAL function signatures and [runtime/platform/default/posix.cpp](https://github.com/pytorch/executorch/blob/main/runtime/platform/default/posix.cpp) for the reference POSIX implementation.
+
+## Kernel Libraries
+
+During export, a model is broken down into a list of operators, each providing some fundamental computation. Adding two tensors is an operator, as is convolution. Each operator requires a corresponding operator kernel to perform the computation on the target hardware. ExecuTorch backends are the preferred way to do this, but not all operators are supported on all backends.
+
+To handle this, ExecuTorch provides two implementations - the *portable* and *optimized* kernel libraries. The portable kernel library provides full support for all operators in a platform-independent manner. The optimized library carries additional system requirements, but is able to leverage multithreading and vectorized code to achieve greater performance. Operators can be drawn from both in a single build, allowing the optimized library to be used where available with the portable library as a fallback.
+
+The choice of kernel library is transparent to the user when using mobile pre-built packages. However, it is important when building from source, especially on embedded systems. On mobile, the optimized operators are preferred where available. See [Overview of ExecuTorch's Kernel Libraries](kernel-library-overview.md) for more information.
+
+## Selective Build
+
+By default, ExecuTorch ships with all supported operator kernels, allowing it to run any supported model at any precision. This comes with a binary size of several megabytes, which may be undesirable for production use cases or resource constrained systems. To minimize binary size, ExecuTorch provides selective build functionality, in order to include only the operators needed to run specific models.
+
+Note that selective build only applies to the portable and optimized kernel libraries. Delegates do not participate in selective build and can be included or excluded by linking individually. See [Kernel Library Selective Build](kernel-library-selective-build.md) for more information.
diff --git a/docs/source/using-executorch-troubleshooting.md b/docs/source/using-executorch-troubleshooting.md
new file mode 100644
index 00000000000..16006802611
--- /dev/null
+++ b/docs/source/using-executorch-troubleshooting.md
@@ -0,0 +1,20 @@
+# Profiling and Debugging
+
+To facilitate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch.
+
+## General Troubleshooting Steps
+
+- To troubleshoot failure of runtime API calls, such as loading or running a model, ensure that ExecuTorch framework logging is enabled. See [Logging](using-executorch-runtime-integration.md#logging) for more information.
+- As a preliminary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information.
+- Check [Frequently Asked Questions](using-executorch-faqs.md) for common issues and questions encountered during install, model export, and runtime integration. + +## Developer Tools + +The ExecuTorch developer tools, or devtools, are a collection of tooling for troubleshooting model performance, numerics, and resource utilization. See [Introduction to the ExecuTorch Developer Tools](devtools-overview.md) for an overview of the available developer tools and usage. + +## Next Steps + +- [Frequently Asked Questions](using-executorch-faqs.md) for solutions to commonly encountered questions and issues. +- [Introduction to the ExecuTorch Developer Tools](runtime-profiling.md) for a high-level introduction to available developer tooling. +- [Using the ExecuTorch Developer Tools to Profile a Model](tutorials/devtools-integration-tutorial.md) for information on runtime performance profiling. +- [Inspector APIs](runtime-profiling.md) for reference material on trace inspector APIs. diff --git a/examples/apple/coreml/llama/TARGETS b/examples/apple/coreml/llama/TARGETS new file mode 100644 index 00000000000..87ad47fbf6d --- /dev/null +++ b/examples/apple/coreml/llama/TARGETS @@ -0,0 +1,66 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +runtime.python_library( + name = "llama_transformer", + srcs = [ + "llama_transformer.py", + ], + _is_external_target = True, + base_module = "executorch.examples.apple.coreml.llama", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama:llama_transformer", + ], +) + +runtime.python_library( + name = "utils", + srcs = [ + "utils.py", + ], + _is_external_target = True, + base_module = "executorch.examples.apple.coreml.llama", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//caffe2:torch", + ], +) + +runtime.python_binary( + name = "export", + srcs = [ + "export.py", + ], + main_function = "executorch.examples.apple.coreml.llama.export.main", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/pypi/coremltools:coremltools", + ":llama_transformer", + ":utils", + "//caffe2:torch", + "//executorch/backends/apple/coreml:backend", + "//executorch/backends/apple/coreml:partitioner", + "//executorch/examples/models/llama:source_transformation", + "//executorch/exir/backend:utils", + "//executorch/exir/capture:config", + "//executorch/exir/passes:lib", + "//executorch/exir/passes:quant_fusion_pass", + "//executorch/exir/passes:sym_shape_eval_pass", + "//executorch/exir/program:program", + "//executorch/extension/export_util:export_util", + "//executorch/extension/llm/export:export_lib", + ], +) diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py new file mode 100644 index 00000000000..9aa232fa691 --- /dev/null +++ b/examples/apple/coreml/llama/export.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
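+#
+# Exports a Llama-style checkpoint to an ExecuTorch .pte file delegated to the
+# Core ML backend, with optional embedding and linear-weight quantization.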
+
+import argparse
+
+import coremltools as ct
+import torch
+from executorch.backends.apple.coreml.compiler import CoreMLBackend  # pyre-ignore
+from executorch.backends.apple.coreml.partition import CoreMLPartitioner  # pyre-ignore
+
+from executorch.examples.apple.coreml.llama.llama_transformer import (
+    InputManager,
+    load_model,
+)
+from executorch.examples.apple.coreml.llama.utils import (
+    replace_linear_with_split_linear,
+)
+from executorch.examples.models.llama.source_transformation.quantize import (
+    EmbeddingQuantHandler,
+)
+
+from executorch.exir.backend.utils import format_delegated_graph
+from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
+from executorch.exir.program._program import to_edge_with_preserved_ops
+from executorch.extension.export_util.utils import save_pte_program
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-n",
+        "--output_name",
+        default="model.pte",
+        help="Override the output filename of the saved pte model file.",
+    )
+    parser.add_argument(
+        "-p",
+        "--params",
+        help="config.json",
+    )
+    parser.add_argument(
+        "-c",
+        "--checkpoint",
+        help="checkpoint path",
+    )
+    parser.add_argument(
+        "--seq_length",
+        type=int,
+        default=1,
+        help="sequence length to evaluate",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=128,
+        help="maximum sequence length to evaluate",
+    )
+    parser.add_argument(
+        "--cache_size",
+        type=int,
+        default=None,
+        help="Cache size. Old items are evicted from cache",
+    )
+    parser.add_argument(
+        "-E",
+        "--embedding-quantize",
+        default=None,
+        type=str,
+        help="type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '8,1024'.",
+    )
+    parser.add_argument(
+        "--coreml-quantize",
+        default=None,
+        choices=["b4w", "c4w"],
+        help="This option is only for coreml: Use coreml quantization, e.g.
b4w (for blockwise 4 bit weight), c4w (for channelwise 4 bit weight)", + ) + parser.add_argument( + "--use_cache_list", + action="store_true", + help="Use cache list to speed up model computation (does not work in pybindings)", + ) + parser.add_argument( + "--target_split_size", + type=int, + default=None, + help="Split linear layers into smaller chunks of target_split_size.", + ) + parser.add_argument( + "--max_splits", + type=int, + default=8, + help="Maximum number of splits to divide linear layers", + ) + parser.add_argument( + "--dtype", + type=str, + default="fp16", + ) + + export_args = parser.parse_args() + model = load_model( + export_args.checkpoint, + export_args.params, + max_seq_length=export_args.max_seq_length, + use_cache_list=export_args.use_cache_list, + ) + + float_dtype = {"fp16": torch.float16, "fp32": torch.float32}[ + export_args.dtype + ] # dtype for model/inputs + + if export_args.embedding_quantize: + bitwidth, group_size = export_args.embedding_quantize.split(",") + if group_size == "none" or group_size == "None" or group_size == "0": + group_size = None + else: + group_size = int(group_size) + bitwidth = int(bitwidth) + model = EmbeddingQuantHandler( + model, + bitwidth=bitwidth, + group_size=group_size, + packed=(bitwidth in [2, 4]), + ).quantized_model() + + if export_args.target_split_size is not None: + replace_linear_with_split_linear( + model, + out_target_split_size=export_args.target_split_size, + out_max_splits=export_args.max_splits, + # I have not found splitting on in_features to be beneficial, + # and it often leads to OOM so I set in_max_splits to 1 + in_target_split_size=1, + in_max_splits=1, + ) + + model.eval() + model.to(float_dtype) + + op_linear_quantizer_config = None + if export_args.coreml_quantize == "b4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_block", + "block_size": 32, + "weight_threshold": 512, + } + elif export_args.coreml_quantize == "c4w": + op_linear_quantizer_config = { + "mode": "linear_symmetric", + "dtype": "int4", + "granularity": "per_channel", + } + + compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16] + minimum_deployment_target=ct.target.iOS18, + compute_precision={ + torch.float16: ct.precision.FLOAT16, + torch.float32: ct.precision.FLOAT32, + }[float_dtype], + compute_unit=ct.ComputeUnit.CPU_AND_NE, + model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16] + op_linear_quantizer_config=op_linear_quantizer_config, + ) + partitioner = CoreMLPartitioner( # pyre-fixme[16] + compile_specs=compile_specs, + take_over_mutable_buffer=False, + skip_ops_for_coreml_delegation=[ + "quantized_decomposed.embedding_4bit.dtype", + "aten.embedding.default", + ], + ) + + input_manager = InputManager( + n_layers=model.params.n_layers, + max_batch_size=model.params.max_batch_size, + n_kv_heads=model.params.n_kv_heads, + max_seq_length=model.params.max_seq_len, + head_dim=model.params.head_dim, + use_cache_list=export_args.use_cache_list, + seq_length=export_args.seq_length, + dtype=float_dtype, + minus_infinity=-30000, + cache_size=export_args.cache_size, + ) + example_inputs = input_manager.get_inputs(tokens=[0]) + + ep = torch.export.export(model, example_inputs, strict=True) + print("Exported program") + print(ep) + + edge_manager = to_edge_with_preserved_ops( + ep, + preserve_ops=[ + torch.ops.aten.scaled_dot_product_attention.default, + # preserve norm op for numerical stability + torch.ops.aten.linalg_vector_norm.default, + 
torch.ops.aten.reciprocal.default, + ], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + _skip_type_promotion=(float_dtype == torch.float16), + _skip_dim_order=True, + ), + ) + print("Edge program") + print(edge_manager.exported_program()) + + for node in edge_manager.exported_program().graph_module.graph.nodes: + print(node.name, node.target, node.args, node.kwargs) + + edge_manager = edge_manager.to_backend(partitioner) + + print("Delegated program") + + print(format_delegated_graph(edge_manager.exported_program().graph_module)) + + executorch_program = edge_manager.to_executorch( + ExecutorchBackendConfig( + extract_delegate_segments=True, + passes=[ + QuantFusionPass(), + ], + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), + sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), + ) + ) + + filename = save_pte_program(executorch_program, export_args.output_name) + print(f"Saved Executorch program to local {filename}") + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/examples/apple/coreml/llama/llama_transformer.py b/examples/apple/coreml/llama/llama_transformer.py new file mode 100644 index 00000000000..324f4aa1f2e --- /dev/null +++ b/examples/apple/coreml/llama/llama_transformer.py @@ -0,0 +1,636 @@ +# @lint-ignore-every LICENSELINT +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. + +# Please refer to README.md in the same folder for more information. + +from dataclasses import dataclass +from functools import partial +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F + +from executorch.examples.models.llama.rope import ( + hf_apply_rotary_emb, + hf_precompute_freqs_cis, + precompute_freqs_cis, + RotaryEmbedding, +) + +from torch import nn + + +def find_multiple(n: int, k: int) -> int: + if n % k == 0: + return n + return n + k - (n % k) + + +@dataclass +class ModelArgs: + dim: int = 2048 + n_layers: int = 16 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + hidden_dim: Optional[int] = None + head_dim: Optional[int] = None # Optional customized head_dim + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 1 + max_seq_len: int = 128 + max_context_len: int = 2048 + moe: bool = False # True to enable the MoE (Mixture of Experts) + num_experts: int = 8 # Number of experts + num_activated_experts: int = 2 # Number of experts to activate + + # Generate logits for all inputs. When it's True, it would take big memory usage + # at runtime. Enable it only necessary (e.g., use perplexity tools that requires + # logits for all input tokens.) + generate_full_logits: bool = False + # A dictionary mapping from pruned token-id to original token-id + input_prune_map: Optional[Dict[int, int]] = None + # A dictionary mapping from pruned token-id to original token-id + output_prune_map: Optional[Dict[int, int]] = None + use_hf_rope: bool = False # Use HuggingFace's RoPE implementation + rope_theta: Optional[float] = ( + None # The official name to override self.rope_freq_base. + ) + rope_freq_base: float = 10000.0 # The base frequency for RoPE. Keep it for BC. + use_scaled_rope: bool = True # Use scaled RoPE, introduced in llama3.1. 
+ # Additional Model Metadata needed at runtime + rope_scale_factor: int = 8 + bos_idx: int = 1 + eos_idx: int = 3 + bos_count: int = -1 # i.e., a single EOS is used as BOS + eos_count: int = 2 + + quantization_args: Optional[dict] = None + lora_args: Optional[dict] = None + + use_cache_list: bool = True + + def __post_init__(self): + if self.n_kv_heads is None: + self.n_kv_heads = self.n_heads + + # rope_theta overrides rope_freq_base since it's the official name. + if self.rope_theta is not None: + self.rope_freq_base = self.rope_theta + + if self.hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = self.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + if self.ffn_dim_multiplier is not None: + hidden_dim = int(self.ffn_dim_multiplier * hidden_dim) + self.hidden_dim = find_multiple(hidden_dim, multiple_of) + + if self.head_dim is None: + self.head_dim = self.dim // self.n_heads + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.dim = dim + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + # CoreML ignores casts to FP32, so existing implementation of RMSNorm was not stable + # We instead use (x * sqrt(n)) / norm(x, dim=-1) + # Using torch.norm and preserving this op in CoreML improves stability + # Note, we ignore eps, but could add it by using torch.norm(torch.concat(x, sqrt(n*eps))) in the denominator + # In future, we want to add CoreML support for the functional RMSNorm op + # We have yet to do large scale evaluations on the numeric stability of this solution, but note that + # it appears better than what exists currently (removing FP32 casts and using FP16) + rms_norm_eps0 = ( + x + * torch.sqrt(torch.tensor(self.dim, dtype=x.dtype)) + * torch.reciprocal(torch.linalg.vector_norm(x, dim=-1, keepdim=True)) + ) + return rms_norm_eps0 + + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + + """ + output = self._norm(x) + return output * self.weight + + +class Rope(torch.nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + if self.params.use_hf_rope: + self.precompute_freqs_cis = hf_precompute_freqs_cis + else: + self.precompute_freqs_cis = partial( + precompute_freqs_cis, use_scaled=self.params.use_scaled_rope + ) + freqs_cos, freqs_sin = self.precompute_freqs_cis( + self.params.head_dim, + ( + self.params.max_context_len # Normal llama2. + if self.params.ffn_dim_multiplier is None + else self.params.max_context_len * 2 # Sharded checkpoint. 
+ ), + self.params.rope_freq_base, + scale_factor=8, + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + if self.params.use_hf_rope: + self.apply_rotary_emb = hf_apply_rotary_emb + else: + self.apply_rotary_emb = RotaryEmbedding() + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + """ + Get the precomputed frequencies for the given input position and sequence length. + + Args: + input_pos (torch.Tensor): The input position tensor. + seq_len (int): The sequence length. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length. + """ + assert ( + input_pos is not None + ), "input_pos must be provided when use_kv_cache is True" + input_pos_item = input_pos[-1].item() + + # CoreML partitioner is not picking up _check_is_size + # So instead use _check as workaround. Should be easy fix for partitioner + # torch._check_is_size(input_pos_item) + torch._check(input_pos_item >= 0) + torch._check(input_pos_item + seq_len <= self.params.max_seq_len) + # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor + freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len) + # pyre-ignore: Incompatible parameter type [6] + freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len) + + return freqs_cos, freqs_sin + + +class FeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + assert args.hidden_dim is not None + hidden_dim: int = args.hidden_dim + self.w1 = nn.Linear(args.dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, args.dim, bias=False) + self.w3 = nn.Linear(args.dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class ConditionalFeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.dim = args.dim + hidden_dim = args.hidden_dim + if hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = args.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w2 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w3 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.num_experts = args.num_experts + + def forward(self, x: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor: + w1_weights = self.w1[expert_indices].transpose(-1, -2) # [T, A, D, D] + w3_weights = self.w3[expert_indices].transpose(-1, -2) # [T, A, D, D] + w2_weights = self.w2[expert_indices] # [T, A, D, D] + x1 = F.silu(torch.einsum("ti,taio -> tao", x, w1_weights)) + x3 = torch.einsum("ti, taio -> tao", x, w3_weights) + expert_outs = torch.einsum("tao, taoi -> tai", (x1 * x3), w2_weights) + return expert_outs + + +class MOEFeedForward(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.gate = nn.Linear(config.dim, config.num_experts, bias=False) + self.cond_ffn = ConditionalFeedForward(config) + self.dim = config.dim + + def forward(self, x: 
torch.Tensor) -> torch.Tensor: + x = x.view(-1, self.dim) + # T = num_tokens, E = num_experts, D = hidden dim, A = activated experts + # x: [T, D] + scores = self.gate(x) # [T, E] + expert_weights, expert_indices = torch.topk(scores, 2, dim=-1) # [T, A], [T, A] + expert_weights = expert_weights.softmax(dim=-1) # [T, A] + expert_outs = self.cond_ffn(x, expert_indices) + return torch.einsum("tai,ta -> ti", expert_outs, expert_weights) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): + super().__init__() + self.n_heads = args.n_heads + self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads + + assert self.n_heads % self.n_kv_heads == 0 + model_parallel_size = 1 + self.n_local_heads = self.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.head_dim + self.max_batch_size = args.max_batch_size + self.max_seq_len = args.max_seq_len + self.dim = args.dim + self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) + + self.layer_id = layer_id + + self.rope = rope + + def forward( + self, + x: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + attn_mask: torch.Tensor, + ): + bsz, seqlen, _ = x.shape + # QKV + q, k, v = self.wq(x), self.wk(x), self.wv(x) + # We need view_copy elimination + q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim) + k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + + # RoPE relative positional embeddings + q, k = self.rope.forward(q, k, freqs_cos, freqs_sin) + + q = q.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + new_k = k + new_v = v + + k = torch.concat([k_cache, k], dim=2) + v = torch.concat([v_cache, v], dim=2) + + # grouped multiquery attention: expand out keys and values + if self.n_rep > 1: + k = k.repeat_interleave(self.n_rep, dim=1) + v = v.repeat_interleave(self.n_rep, dim=1) + + output = torch.ops.aten.scaled_dot_product_attention.default( + q, k, v, attn_mask=attn_mask + ) + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + output = self.wo(output) + return output, new_k, new_v + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs, rope: Rope): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.head_dim + self.attention = Attention(args, layer_id, rope) + if args.moe: + self.block_sparse_moe = MOEFeedForward(args) + else: + self.feed_forward = FeedForward(args) + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward( + self, + x, + freqs_cos, + freqs_sin, + k_cache, + v_cache, + attn_mask, + ): # x: 1xN + norm_emb = self.attention_norm(x) + h, new_k, new_v = self.attention.forward( + norm_emb, freqs_cos, freqs_sin, k_cache, v_cache, attn_mask + ) + + h = x + h + out = h + self.feed_forward(self.ffn_norm(h)) + return out, new_k, new_v + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + 
self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.rope = Rope(params) + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params, self.rope)) + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + self.generate_full_logits = params.generate_full_logits + self.max_seq_len = params.max_seq_len + self.input_prune_map = params.input_prune_map + self.output_prune_map = params.output_prune_map + self.use_cache_list = params.use_cache_list + + def forward( + self, + tokens: torch.LongTensor, # tokens + input_pos: torch.LongTensor, + input_length: torch.LongTensor, # input_length + k_caches: List[torch.FloatTensor], + v_caches: List[torch.FloatTensor], + attn_mask: torch.LongTensor, + h: Optional[torch.FloatTensor] = None, # embeddings + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if (tokens is None) ^ (h is not None): + raise ValueError( + "You cannot specify both tokens and h at the same time, and must specify either one" + ) + if tokens is not None and h is None: + h = self.tok_embeddings(tokens) + seqlen = h.shape[1] + freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen) + + k_out = [] + v_out = [] + for i, layer in enumerate(self.layers): + h, new_k, new_v = layer( + h, + freqs_cos, + freqs_sin, + k_caches[i] if self.use_cache_list else k_caches[i, :, :, :, :], + v_caches[i] if self.use_cache_list else v_caches[i, :, :, :, :], + attn_mask, + ) + k_out.append(new_k) + v_out.append(new_v) + + if not self.generate_full_logits: + # Only the last logit is used for the new generated token + h = h[:, input_length - 1, :].squeeze(1) + + h = self.norm(h) + + logits = self.output(h) + + if not self.use_cache_list: + k_out = torch.stack(k_out, dim=0) + v_out = torch.stack(v_out, dim=0) + return logits, k_out, v_out # pyre-ignore[7] + + +def load_model(checkpoint_path, params_path, max_seq_length, use_cache_list): + import json + + with open(params_path, "r") as f: + params = json.loads(f.read()) + + args = ModelArgs( + max_seq_len=max_seq_length, + generate_full_logits=False, + use_cache_list=use_cache_list, + **params, + ) + + with torch.device("meta"): + model = Transformer(args) + + checkpoint = torch.load( + checkpoint_path, map_location="cpu", mmap=True, weights_only=True + ) + if "model" in checkpoint: + checkpoint = checkpoint["model"] + + missing, unexpected = model.load_state_dict( + checkpoint, + strict=False, + assign=True, + ) + print("Missing keys: ", missing) + print("Unexpected keys: ", unexpected) + + return model + + +class InputManager: + def __init__( + self, + n_layers: int, + max_batch_size: int, + n_kv_heads: int, + max_seq_length: int, + head_dim: int, + use_cache_list: bool, + seq_length: int, + dtype=torch.float16, + minus_infinity=-torch.inf, + cache_size=None, + ): + if cache_size is None: + cache_size = max_seq_length - seq_length + self.cache_size = cache_size + assert self.cache_size + seq_length <= max_seq_length + + self.n_layers = n_layers + self.max_batch_size = max_batch_size + self.n_kv_heads = n_kv_heads + self.head_dim = head_dim + + self.seq_length = seq_length + self.use_cache_list = use_cache_list + + if self.use_cache_list: + self.k_caches = [ + torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + for _ in range(self.n_layers) + ] + self.v_caches = [ + 
torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + for _ in range(self.n_layers) + ] + else: + self.k_caches = torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + self.v_caches = torch.zeros(self.get_cache_shape(self.cache_size)).to(dtype) + + attn_cache = minus_infinity * torch.ones( + seq_length, self.cache_size + ) # attn for past tokens + attn_seq = torch.triu( + minus_infinity * torch.ones(self.seq_length, self.seq_length), diagonal=1 + ) # attn for current tokens + self.attn_mask = torch.concat([attn_cache, attn_seq], dim=-1).to(dtype) + assert self.attn_mask.shape == ( + self.seq_length, + self.cache_size + self.seq_length, + ) + + self.input_pos = 0 + self.cache_pos = 0 + + def get_cache_shape(self, length): + if self.use_cache_list: + return ( + self.max_batch_size, + self.n_kv_heads, + length, + self.head_dim, + ) + return ( + self.n_layers, + self.max_batch_size, + self.n_kv_heads, + length, + self.head_dim, + ) + + def _update_cache(self, start, length, new_k_caches, new_v_caches): + """ + Copies new cache data from start to start + length to cache + """ + assert self.cache_pos + length <= self.cache_size + assert start + length <= self.seq_length + + if self.use_cache_list: + for i in range(self.n_layers): + assert new_k_caches[i].shape == self.get_cache_shape(self.seq_length) + assert new_v_caches[i].shape == self.get_cache_shape(self.seq_length) + + self.k_caches[i][ + :, :, (self.cache_pos) : (self.cache_pos + length), : + ] = new_k_caches[i][:, :, start : (start + length), :] + self.v_caches[i][ + :, :, (self.cache_pos) : (self.cache_pos + length), : + ] = new_v_caches[i][:, :, start : (start + length), :] + else: + assert new_k_caches.shape == self.get_cache_shape(self.seq_length) + assert new_v_caches.shape == self.get_cache_shape(self.seq_length) + self.k_caches[:, :, :, (self.cache_pos) : (self.cache_pos + length), :] = ( + new_k_caches[:, :, :, start : (start + length), :] + ) + self.v_caches[:, :, :, (self.cache_pos) : (self.cache_pos + length), :] = ( + new_v_caches[:, :, :, start : (start + length), :] + ) + + self.cache_pos += length + if self.cache_pos == self.cache_size: + self.cache_pos = 0 + + def update(self, input_length, new_k_caches, new_v_caches): + # Copy as much new cache data into cache as possible without wrapping + amount_to_copy = min(input_length, self.cache_size - self.cache_pos) + self._update_cache(0, amount_to_copy, new_k_caches, new_v_caches) + if self.input_pos <= self.cache_size: + self.attn_mask[:, (self.input_pos) : (self.input_pos + amount_to_copy)] = ( + 0.0 + ) + + # Copy remainder (cache is now wrapped around and has more room) + # Attention mask needs no further updates. 
Attention is paid to the whole cache
+        remaining_to_copy = min(
+            input_length - amount_to_copy, self.cache_size - self.cache_pos
+        )
+        if remaining_to_copy > 0:
+            self._update_cache(
+                amount_to_copy, remaining_to_copy, new_k_caches, new_v_caches
+            )
+
+        self.input_pos += input_length
+
+    def get_inputs(self, tokens: List[int]):
+        input_length = len(tokens)
+        assert input_length <= self.seq_length
+
+        return (
+            # tokens
+            torch.concat(
+                [
+                    torch.tensor(tokens, dtype=torch.int64),
+                    torch.zeros(self.seq_length - input_length, dtype=torch.int64),
+                ],
+                dim=-1,
+            ).reshape(1, -1),
+            # input_pos
+            torch.tensor([self.input_pos], dtype=torch.long),
+            # input_length
+            torch.tensor([input_length], dtype=torch.long),
+            # k_cache
+            self.k_caches,
+            # v_cache
+            self.v_caches,
+            # attn_mask
+            self.attn_mask,
+        )
+
+    def get_inputs_and_remaining_tokens(self, tokens: List[int]):
+        processed_tokens = min(self.seq_length, len(tokens))
+        return (
+            self.get_inputs(tokens[0:processed_tokens]),
+            tokens[processed_tokens:],
+        )
diff --git a/examples/apple/coreml/llama/readme.md b/examples/apple/coreml/llama/readme.md
new file mode 100644
index 00000000000..14dff0c8580
--- /dev/null
+++ b/examples/apple/coreml/llama/readme.md
@@ -0,0 +1,46 @@
+# ANE-friendly Llama models
+
+This directory contains ANE-friendly Llama models.
+
+Export model with:
+```
+python export.py -n /path/to/output/model.pte -p /path/to/params.json -c /path/to/model.pth --seq_length 64 --max_seq_length 1024 --coreml-quantize c4w --dtype fp16
+```
+
+(Note the script should be run from the executorch/examples/apple/coreml/llama directory.)
+
+The runner is written in Python and is only intended to serve as an example of how the model inputs should be processed; it is not performant.
+
+
+Run model with:
+```
+python run.py -m /path/to/model.pte -t /path/to/tokenizer.model --prompt "Once upon a time,"
+```
+
+The runner can also be used to run an eager model to compare with CoreML numerics (--use_eager).  In this case, you must specify:
+* --checkpoint
+* --dtype
+* --max_seq_length
+* --seq_length
+
+(Note the script should be run from the executorch/examples/apple/coreml/llama directory.)
+
+
+## Export args
+* seq_length: the number of tokens processed by the model in one call. Sequences shorter than seq_length must be padded, and sequences longer than it must be chunked (see the sketch after this list).
+* max_seq_length: the maximum number of context tokens that can be processed.
+* cache_size: the size of the KV cache sequences. This parameter is optional, and defaults to max_seq_length - seq_length. If a smaller cache_size is used, older tokens are evicted from the cache and no longer play a role in attention. For example, if max_seq_length=1024 but cache_size is 512, the model can generate up to 1024 tokens, but only the current tokens and the previous 512 will participate in attention. In terms of computation, cache_size plays a similar role to max_seq_length in models without cache eviction.
+* use_cache_list: boolean option that controls whether KV caches are passed as a list of 4D tensors, one per layer, or as one 5D tensor. (Note that use_cache_list does not work with ExecuTorch pybindings.)
+* target_split_size: this option splits linear layers into chunks of the target size. For example, if target_split_size is 1024, a linear layer with (in_features=512, out_features=8192) will be split into 8 linear layers with (in_features=512, out_features=1024) and the results concatenated. If not specified, the default is no splitting.
+* max_splits: this controls the maximum number of splits for linear layers. It is only relevant if target_split_size is passed, and defaults to 8.
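+
+To make the interaction between seq_length, max_seq_length, and cache_size concrete, here is a minimal sketch of the chunking and padding described above (the numbers are hypothetical; the real logic lives in InputManager in llama_transformer.py):
+```
+seq_length = 64
+max_seq_length = 1024
+cache_size = max_seq_length - seq_length  # the default when cache_size is not set
+
+# A prompt longer than seq_length is split into seq_length-sized chunks...
+prompt = list(range(150))
+chunks = [prompt[i : i + seq_length] for i in range(0, len(prompt), seq_length)]
+# ...and the final, shorter chunk is zero-padded up to seq_length.
+chunks[-1] = chunks[-1] + [0] * (seq_length - len(chunks[-1]))
+assert all(len(c) == seq_length for c in chunks)
+```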
+
+## Llama1B on iPhone 15
+
+We are actively experimenting with different settings, but here are ones that we've found work well for Llama1B on iPhone 15 Pro:
+
+* Set use_cache_list.
+* Use seq_length = 32, which offers a good balance between prefill and decode performance.
+* Split out_features in linear layers with target_split_size=1024, max_splits=8.
+* For ANE, set dtype = fp16 and coreml-quantize = c4w.  This requires doing QAT on Llama1B for good accuracy.
+* Set embedding-quantize to "4,32".
+* Set max_seq_length to 128, 256, 512, 1024, or 2048, depending on the needed context.  Note that performance drops as max_seq_length grows.  More specifically, performance drops with cache_size, and the best experience may require a good cache eviction policy.  The Python runner in run.py uses a first-in-first-out eviction policy (the oldest tokens are evicted first) when cache_size is specified.
diff --git a/examples/apple/coreml/llama/run.py b/examples/apple/coreml/llama/run.py
new file mode 100644
index 00000000000..e68471a1d29
--- /dev/null
+++ b/examples/apple/coreml/llama/run.py
@@ -0,0 +1,206 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+import sentencepiece as spm
+
+import torch
+from executorch.examples.apple.coreml.llama.llama_transformer import (
+    InputManager,
+    load_model,
+)
+
+from executorch.examples.models.llama.runner.generation import next_token
+from executorch.examples.models.llama.tokenizer import tiktoken
+
+from executorch.runtime import Runtime
+
+
+class Tokenizer:
+    def __init__(self, model_path: str):
+        # Try SentencePiece first; fall back to tiktoken if loading fails
+        try:
+            print("Trying to load sentencepiece")
+            sp = spm.SentencePieceProcessor()
+            sp.load(model_path)
+            self.tokenizer = sp
+        except Exception:
+            print("Trying to load tiktoken")
+            self.tokenizer = tiktoken.Tokenizer(model_path)
+
+    def encode(self, text, bos, eos):
+        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
+            bos_string = "<s>" if bos else ""
+            eos_string = "</s>" if eos else ""
+            return self.tokenizer.encode(f"{bos_string}{text}{eos_string}")
+        return self.tokenizer.encode(text, bos=bos, eos=eos)
+
+    def decode_token(self, token):
+        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
+            return f"{self.tokenizer.decode(token)} "
+        return self.tokenizer.decode_token(token)
+
+    def stop_tokens(self):
+        if isinstance(self.tokenizer, spm.SentencePieceProcessor):
+            return [self.tokenizer.eos_id()]
+        return self.tokenizer.stop_tokens
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--model",
+        help="model.pte path",
+    )
+    parser.add_argument(
+        "-t",
+        "--tokenizer",
+        help="tokenizer.model path",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="Once upon a time,",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.6,
+    )
+    parser.add_argument(
+        "--top_p",
+        type=float,
+        default=0.9,
+    )
+    parser.add_argument(
+        "--use_eager",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-p",
+        "--params",
+        type=str,
+        default=None,
+    )
+    parser.add_argument(
+        "-c",
+        "--checkpoint",
+        type=str,
+        default=None,
+    )
+    parser.add_argument("--dtype", type=str, choices=["fp16", "fp32"], default=None)
+    parser.add_argument(
+        "--seq_length",
+        type=int,
+        default=None,
+    )
+    parser.add_argument(
"--max_seq_length", + type=int, + default=None, + ) + parser.add_argument( + "--cache_size", + type=int, + default=None, + ) + + args = parser.parse_args() + + tokenizer = Tokenizer(args.tokenizer) + + runtime = Runtime.get() + if args.use_eager: + assert args.params is not None + assert args.checkpoint is not None + assert args.dtype is not None + assert args.max_seq_length is not None + assert args.seq_length is not None + + max_seq_length = args.max_seq_length + seq_length = args.seq_length + model = load_model( + args.checkpoint, + args.params, + max_seq_length=max_seq_length, + use_cache_list=False, + ) + n_layers = model.params.n_layers + max_batch_size = model.params.max_batch_size + n_kv_heads = model.params.n_kv_heads + head_dim = model.params.head_dim + cache_size = args.cache_size + + float_dtype = {"fp16": torch.float16, "fp32": torch.float32}[ + args.dtype + ] # dtype for model/inputs + model.eval() + model.to(float_dtype) + else: + program = runtime.load_program(args.model) + method = program.load_method("forward") + + metadata = method.metadata + print("Method metadata: ", metadata, "\n\n") + + assert ( + metadata.num_inputs() == 6 + ), "Do not export with --use_cache_list for use in pybindings" + # k_cache input + n_layers, max_batch_size, n_kv_heads, cache_size, head_dim = ( + metadata.input_tensor_meta(3).sizes() + ) + float_dtype = {5: torch.float16, 6: torch.float32}[ + metadata.input_tensor_meta(3).dtype() + ] + + # mask input + seq_length, max_seq_length = metadata.input_tensor_meta(5).sizes() + + input_manager = InputManager( + n_layers=n_layers, + max_batch_size=max_batch_size, + n_kv_heads=n_kv_heads, + max_seq_length=max_seq_length, + head_dim=head_dim, + use_cache_list=False, + seq_length=seq_length, + dtype=float_dtype, + minus_infinity=-30000.0, + cache_size=cache_size, + ) + + print(args.prompt, end="") + tokens = tokenizer.encode(args.prompt, bos=True, eos=False) + while input_manager.input_pos + seq_length < max_seq_length: + while len(tokens) > 0 and ( + input_manager.input_pos + seq_length < max_seq_length + ): + inputs, remaining_tokens = input_manager.get_inputs_and_remaining_tokens( + tokens + ) + processed_tokens = len(tokens) - len(remaining_tokens) + if args.use_eager: + logits, k, v = model(*inputs) + else: + logits, k, v = method.execute(inputs) + + input_manager.update( + input_length=processed_tokens, new_k_caches=k, new_v_caches=v + ) + tokens = remaining_tokens + + tokens = [next_token(logits, args.temperature, args.top_p)] + + if tokens[-1] in tokenizer.stop_tokens(): + break + print(tokenizer.decode_token(tokens[-1]), end="", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/apple/coreml/llama/test.py b/examples/apple/coreml/llama/test.py new file mode 100644 index 00000000000..895cf2e1cce --- /dev/null +++ b/examples/apple/coreml/llama/test.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import sys + +sys.path.insert(0, ".") +import copy + +import torch +from utils import replace_linear_with_split_linear + + +def get_split_model( + model, + out_target_split_size=1, + out_max_splits=1, + in_target_split_size=1, + in_max_splits=1, +): + model_copy = copy.deepcopy(model) + replace_linear_with_split_linear( + model_copy, + out_target_split_size, + out_max_splits, + in_target_split_size, + in_max_splits, + ) + return model_copy + + +def test_split_model(): + inputs = torch.randn(10, 5, 1, 512) + + model = torch.nn.Sequential(*[torch.nn.Linear(512, 1024, bias=False)]) + model1 = get_split_model(model, 64, 2, 64, 1000) + model2 = get_split_model(model, 64, 2, 64, 1) + model3 = get_split_model(model, 64, 1, 64, 1000) + + assert torch.allclose(model(inputs), model1(inputs), atol=1e-5) + assert torch.allclose(model(inputs), model2(inputs), atol=1e-5) + assert torch.allclose(model(inputs), model3(inputs), atol=1e-5) + + +if __name__ == "__main__": + test_split_model() diff --git a/examples/apple/coreml/llama/utils.py b/examples/apple/coreml/llama/utils.py new file mode 100644 index 00000000000..1e5a842fed5 --- /dev/null +++ b/examples/apple/coreml/llama/utils.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch + + +class SplitLinearModule(torch.nn.Module): + def __init__( + self, + in_features, + out_features, + out_target_split_size=1, + out_max_splits=1, + in_target_split_size=1, + in_max_splits=1, + ): + super(SplitLinearModule, self).__init__() + self.out_split_sizes = self._get_split_sizes( + out_features, out_target_split_size, out_max_splits + ) + self.in_split_sizes = self._get_split_sizes( + in_features, in_target_split_size, in_max_splits + ) + print( + f"Splitting out_features={out_features} into {len(self.out_split_sizes)} of size {self.out_split_sizes[0]}." + ) + print( + f"Splitting in_features={in_features} into {len(self.in_split_sizes)} of size {self.in_split_sizes[0]}." + ) + + # self.ops contains a list of linear ops for different pieces of the output matrix + # The index of an op at (in_idx, out_idx) is given by self.op_index(in_idx, out_idx) + self.ops = torch.nn.ModuleList() + for idx_out, s_out in enumerate(self.out_split_sizes): + for idx_in, s_in in enumerate(self.in_split_sizes): + assert len(self.ops) == self.op_index(idx_in, idx_out) + self.ops.append(torch.nn.Linear(s_in, s_out, bias=False)) + + def op_index(self, in_index, out_index): + idx = out_index * len(self.in_split_sizes) + in_index + return idx + + def _get_split_sizes(self, n_features, target_split_size, max_splits): + num_splits = max(n_features // target_split_size, 1) + if num_splits > max_splits: + num_splits = max_splits + + split_size = n_features // num_splits + split_remainder = n_features % num_splits + if split_remainder > 0: + raise ValueError( + f"Cannot split {n_features} with target_split_size={target_split_size} and max_splits={max_splits} because it leaves a remainder of {split_remainder}." 
+ ) + + ret = [split_size for _ in range(num_splits)] + return ret + + def set_params(self, weight): + split_weights = [] + for w_out in weight.split(self.out_split_sizes, dim=0): + for w in w_out.split(self.in_split_sizes, dim=1): + split_weights.append(w) + + for i, split in enumerate(self.ops): + split.weight = torch.nn.Parameter(split_weights[i]) + + def forward(self, x): + if len(self.in_split_sizes) == 1: + out_chunks = [op(x) for op in self.ops] + else: + x_splits = x.split(self.in_split_sizes, dim=-1) + out_chunks = [ + torch.sum( + torch.stack( + [ + self.ops[self.op_index(in_idx, out_idx)].forward( + x_splits[in_idx] + ) + for in_idx in range(len(self.in_split_sizes)) + ], + ), + dim=0, + ) + for out_idx in range(len(self.out_split_sizes)) + ] + + return torch.concat(out_chunks, dim=-1) + + +def replace_linear_with_split_linear( + model, out_target_split_size, out_max_splits, in_target_split_size, in_max_splits=1 +): + for name, module in model.named_children(): + if isinstance(module, torch.nn.Linear): + assert module.bias is None, "SplitLinearModule does not support bias" + new_module = SplitLinearModule( + module.in_features, + module.out_features, + out_target_split_size, + out_max_splits, + in_target_split_size, + in_max_splits, + ) + new_module.set_params(module.weight) + setattr(model, name, new_module) + else: + replace_linear_with_split_linear( + module, + out_target_split_size, + out_max_splits, + in_target_split_size, + in_max_splits, + ) diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index 9d20f289bf6..8ee9608a5ec 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -33,7 +33,6 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DCMAKE_TOOLCHAIN_FILE="$IOS_TOOLCHAIN_PATH" \ -DPLATFORM=MAC_UNIVERSAL \ -DDEPLOYMENT_TARGET=13.0 \ --DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_DEVTOOLS=ON \ @@ -41,7 +40,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ -DCOREML_BUILD_EXECUTOR_RUNNER=ON \ --DCMAKE_MACOSX_BUNDLE=OFF \ +-DCMAKE_MACOSX_BUNDLE=OFF cmake --build "$CMAKE_BUILD_DIR_PATH" -j9 -t coremldelegate cmake --build "$CMAKE_BUILD_DIR_PATH" -j9 -t etdump -t flatccrt @@ -78,4 +77,11 @@ XCODE_WORKSPACE_DIR_PATH="$EXAMPLES_COREML_DIR_PATH/executor_runner" XCODE_BUILD_DIR_PATH="$EXAMPLES_COREML_DIR_PATH/xcode-build" xcodebuild build -workspace "$XCODE_WORKSPACE_DIR_PATH/coreml_executor_runner.xcworkspace" -scheme coreml_executor_runner BUILD_DIR="$XCODE_BUILD_DIR_PATH" -cp -f "$XCODE_BUILD_DIR_PATH/DEBUG/coreml_executor_runner" "$PWD" + +if [[ -z "${COREML_EXECUTOR_RUNNER_OUT_DIR:-}" ]]; then + COREML_EXECUTOR_RUNNER_OUT_DIR=$(pwd) +elif [[ ! 
-d "${COREML_EXECUTOR_RUNNER_OUT_DIR}" ]]; then + mkdir -p "${COREML_EXECUTOR_RUNNER_OUT_DIR}" +fi +cp -f "$XCODE_BUILD_DIR_PATH/DEBUG/coreml_executor_runner" "${COREML_EXECUTOR_RUNNER_OUT_DIR}" +echo "created ${COREML_EXECUTOR_RUNNER_OUT_DIR}/coreml_executor_runner" diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index a4ceaee05da..b9acc3b8fb9 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -76,6 +76,12 @@ def parse_args() -> argparse.ArgumentParser: parser.add_argument("--use_partitioner", action=argparse.BooleanOptionalAction) parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) + parser.add_argument( + "--dynamic_shapes", + action=argparse.BooleanOptionalAction, + required=False, + default=False, + ) args = parser.parse_args() # pyre-fixme[7]: Expected `ArgumentParser` but got `Namespace`. @@ -164,16 +170,20 @@ def main(): f"Valid compute units are {valid_compute_units}." ) - model, example_inputs, _, _ = EagerModelFactory.create_model( + model, example_inputs, _, dynamic_shapes = EagerModelFactory.create_model( *MODEL_NAME_TO_MODEL[args.model_name] ) + if not args.dynamic_shapes: + dynamic_shapes = None compile_specs = generate_compile_specs_from_args(args) lowered_module = None if args.use_partitioner: model.eval() - exir_program_aten = torch.export.export(model, example_inputs, strict=True) + exir_program_aten = torch.export.export( + model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True + ) edge_program_manager = exir.to_edge(exir_program_aten) edge_copy = copy.deepcopy(edge_program_manager) diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index 319d8159ced..66583592844 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -18,16 +18,12 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -35,7 +31,7 @@ endif() add_compile_options("-Wall" "-Werror") -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_common_compile_options -Wno-deprecated-declarations -fPIC -DET_EVENT_TRACER_ENABLED @@ -62,8 +58,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") # # portable_ops_lib - include(${EXECUTORCH_ROOT}/build/Utils.cmake) - include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) gen_selected_ops(LIB_NAME "mps_portable_ops_lib" INCLUDE_ALL_OPS "ON") generate_bindings_for_kernels( LIB_NAME "mps_portable_ops_lib" FUNCTIONS_YAML diff --git a/examples/arm/CMakeLists.txt b/examples/arm/CMakeLists.txt index 0c754beaaaf..4bae20d2c1f 100644 --- a/examples/arm/CMakeLists.txt +++ b/examples/arm/CMakeLists.txt @@ -21,7 +21,7 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif()

-include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

 if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
@@ -35,8 +35,8 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 find_package(executorch CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX})
 target_include_directories(executorch INTERFACE ${_common_include_directories})

-include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

 # Generate C++ bindings to register kernels into both PyTorch (for AOT) and
 # Executorch (for runtime). Here select all ops in functions.yaml
diff --git a/examples/arm/README.md b/examples/arm/README.md
index bb68ef537be..c74eda7ae2b 100644
--- a/examples/arm/README.md
+++ b/examples/arm/README.md
@@ -21,14 +21,19 @@ build artifacts. If supplied, the same argument must be supplied to both the scr
 To run these scripts. On a Linux system, in a terminal, with a working internet connection,
 ```
 # Step [1] - setup necessary tools
-$ ./setup.sh --i-agree-to-the-contained-eula [optional-scratch-dir]
+$ cd
+$ executorch/examples/arm/setup.sh --i-agree-to-the-contained-eula [optional-scratch-dir]

-# Step [2] - build + run ExecuTorch and executor_runner baremetal application
+# Step [2] - Set up the path to the tools. The `setup.sh` script has generated a script that you need to source every time you restart your shell.
+$ source executorch/examples/arm/ethos-u-scratch/setup_path.sh
+
+# Step [3] - build + run ExecuTorch and executor_runner baremetal application
 # suited for Corstone FVP's to run a simple PyTorch model.
-$ ./run.sh [--scratch-dir=same-optional-scratch-dir-as-before]
+$ executorch/examples/arm/run.sh --model_name=mv2 --target=ethos-u85-128 [--scratch-dir=same-optional-scratch-dir-as-before]
 ```
+
 ### Online Tutorial
 We also have a [tutorial](https://pytorch.org/executorch/stable/executorch-arm-delegate-tutorial.html)
 explaining the steps performed in these
-scripts, expected results, and more. It is a step-by-step guide
+scripts, expected results, possible problems and more. It is a step-by-step guide
 you can follow to better understand this delegate.
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 33d8bc5ebf2..5fb12342a2d 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -13,9 +13,10 @@ import os

 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import torch
+from examples.devtools.scripts.export_bundled_program import save_bundled_program
 from executorch.backends.arm.arm_backend import (
     ArmCompileSpecBuilder,
     get_tosa_spec,
@@ -36,6 +37,8 @@
     MobileNetV2Evaluator,
 )
 from executorch.devtools.backend_debug import get_delegation_info
+from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
+
 from executorch.exir import (
     EdgeCompileConfig,
     ExecutorchBackendConfig,
@@ -56,27 +59,50 @@
 logging.basicConfig(level=logging.WARNING, format=FORMAT)

-def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, Any]:
+def get_model_and_inputs_from_name(
+    model_name: str, model_input: str | None
+) -> Tuple[torch.nn.Module, Any]:
     """Given the name of an example pytorch model, return it and example inputs.

     Raises RuntimeError if there is no example model corresponding to the given name.
     """
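+    # The model sources below are tried in order:
+    #   Case 1: a model defined in this file,
+    #   Case 2: a model from examples/models,
+    #   Case 3: a .py file defining ModelUnderTest and ModelInputs,
+    #   Case 4: a .pth/.pt file saved with torch.save().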
""" + example_inputs = None + if model_input is not None: + logging.info(f"Load model input from {model_input}") + if model_input.endswith(".pt"): + example_inputs = torch.load(model_input, weights_only=False) + else: + raise RuntimeError( + f"Model input data '{model_input}' is not a valid name. Use --model_input .pt e.g. saved with torch.save()" + ) + # Case 1: Model is defined in this file if model_name in models.keys(): + logging.info(f"Internal model {model_name}") model = models[model_name]() - example_inputs = models[model_name].example_input + if example_inputs is None: + example_inputs = models[model_name].example_input # Case 2: Model is defined in examples/models/ elif model_name in MODEL_NAME_TO_MODEL.keys(): logging.warning( "Using a model from examples/models not all of these are currently supported" ) - model, example_inputs, _, _ = EagerModelFactory.create_model( + logging.info( + f"Load {model_name} -> {MODEL_NAME_TO_MODEL[model_name]} from examples/models" + ) + + model, tmp_example_inputs, _, _ = EagerModelFactory.create_model( *MODEL_NAME_TO_MODEL[model_name] ) + if example_inputs is None: + example_inputs = tmp_example_inputs # Case 3: Model is in an external python file loaded as a module. # ModelUnderTest should be a torch.nn.module instance # ModelInputs should be a tuple of inputs to the forward function elif model_name.endswith(".py"): + logging.info( + f"Load model file {model_name} Variable ModelUnderTest= ModelInputs=" + ) import importlib.util # load model's module and add it @@ -84,13 +110,22 @@ def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, An module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) model = module.ModelUnderTest - example_inputs = module.ModelInputs - + if example_inputs is None: + example_inputs = module.ModelInputs + # Case 4: Model is in an saved model file torch.save(model) + elif model_name.endswith(".pth") or model_name.endswith(".pt"): + logging.info(f"Load model file {model_name}") + model = torch.load(model_name, weights_only=False) + if example_inputs is None: + raise RuntimeError( + f"Model '{model_name}' requires input data specify --model_input .pt" + ) else: raise RuntimeError( f"Model '{model_name}' is not a valid name. Use --help for a list of available models." ) - + logging.debug(f"Loaded model: {model}") + logging.debug(f"Loaded input: {example_inputs}") return model, example_inputs @@ -107,7 +142,7 @@ def quantize( logging.debug(f"Original model: {model}") quantizer = None if is_ethosu(compile_specs): - quantizer = EthosUQuantizer(compile_spec) + quantizer = EthosUQuantizer(compile_specs) elif is_tosa(compile_specs): quantizer = TOSAQuantizer(get_tosa_spec(compile_specs)) else: @@ -185,7 +220,7 @@ def forward(self, x): return z example_input = (torch.ones(2, 2),) - can_delegate = False + can_delegate = True class MultipleOutputsModule(torch.nn.Module): @@ -365,13 +400,19 @@ def dump_delegation_info(edge, intermediate_files_folder: Optional[str] = None): file.write(delegation_info_string) -def get_args(): # noqa C901 +def get_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", "--model_name", required=True, - help=f"Provide model name. Valid ones: {set(list(models.keys())+list(MODEL_NAME_TO_MODEL.keys()))}", + help=f"Model file .py/.pth/.pt, builtin model or a model from examples/models. 
+    parser.add_argument(
+        "--model_input",
+        required=False,
+        default=None,
+        help="Provide model input as a .pt file, e.g. one saved with torch.save()",
+    )
     parser.add_argument(
         "-d",
@@ -381,6 +422,13 @@
         default=False,
         help="Flag for producing ArmBackend delegated model",
     )
+    parser.add_argument(
+        "--bundleio",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Flag for producing a BundleIO bpte file with input/output test/ref data.",
+    )
     parser.add_argument(
         "-t",
         "--target",
@@ -436,7 +484,7 @@
         "--output",
         action="store",
         required=False,
-        help="Location for outputs, if not the default of cwd.",
+        help="Filename (if it ends in .pte or .bpte) or a folder for outputs; if not specified, files are placed in the cwd.",
     )
     parser.add_argument(
         "--system_config",
@@ -468,10 +516,6 @@
             + "This is required for running quantized models with unquantized input."
         )

-    if args.quantize and not args.delegate:
-        logging.error("--delegate must be set when using --quanitze flag.")
-        exit(1)
-
     # if we have custom ops, register them before processing the model
     if args.so_library is not None:
         logging.info(f"Loading custom ops from {args.so_library}")
@@ -484,15 +528,15 @@
     ):
         raise RuntimeError(f"Model {args.model_name} cannot be delegated.")

-    if args.system_config is None:
+    if "ethos-u" in args.target and args.system_config is None:
         if "u55" in args.target:
             args.system_config = "Ethos_U55_High_End_Embedded"
         elif "u85" in args.target:
-            args.system_confg = "Ethos_U85_SYS_DRAM_Mid"
+            args.system_config = "Ethos_U85_SYS_DRAM_Mid"
         else:
             raise RuntimeError(f"Invalid target name {args.target}")

-    if args.memory_mode is None:
+    if "ethos-u" in args.target and args.memory_mode is None:
         if "u55" in args.target:
             args.memory_mode = "Shared_Sram"
         elif "u85" in args.target:
@@ -503,12 +547,137 @@
     return args


-if __name__ == "__main__":
+def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: str):
+    # Construct a MethodTestSuite for each method
+
+    # Generate test suites
+    method_names = [
+        method.name for method in exec_prog.executorch_program.execution_plan
+    ]
+
+    program_inputs = {m_name: [example_inputs] for m_name in method_names}
+
+    method_test_suites: List[MethodTestSuite] = []
+    for m_name in method_names:
+        method_inputs = program_inputs[m_name]
+
+        # To create a bundled program, we first create every test case from the inputs. We leverage the eager model
+        # to generate the expected output for each test input, and use MethodTestCase to hold the information of
+        # each test case. We gather all MethodTestCases for the same method into one MethodTestSuite, and generate
+        # the bundled program from all MethodTestSuites.
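+        # Each MethodTestCase below pairs one set of inputs with the reference
+        # outputs obtained by running the original eager model on those inputs.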
+        method_test_cases: List[MethodTestCase] = []
+
+        if args.intermediates:
+            # Save model.pth
+            intermediates_path = Path(args.intermediates)
+            model_path = os.path.join(intermediates_path, "model.pth")
+            try:
+                torch.save(original_model, model_path)
+            except Exception:
+                logging.warning(f"Could not torch.save(model, {model_path})")
+        method_index = 0
+        for method_input in method_inputs:
+            output_ref = original_model(*method_input)
+
+            logging.debug(f"input_{method_index}: {method_input}")
+            logging.debug(f"output_ref_{method_index}: {output_ref}")
+
+            if args.intermediates:
+                # Save model input and reference output
+                input_path = os.path.join(
+                    intermediates_path, f"input_{method_index}.pt"
+                )
+                try:
+                    torch.save(method_input, input_path)
+                except Exception:
+                    logging.warning(
+                        f"Could not torch.save(input_{method_index}, {input_path})"
+                    )
+                refoutput_path = os.path.join(
+                    intermediates_path, f"output_ref_{method_index}.pt"
+                )
+                try:
+                    torch.save(output_ref, refoutput_path)
+                except Exception:
+                    logging.warning(
+                        f"Could not torch.save(output_ref_{method_index}, {refoutput_path})"
+                    )
+
+            method_test_cases.append(
+                MethodTestCase(
+                    inputs=method_input,
+                    expected_outputs=output_ref,
+                )
+            )
+
+            method_index += 1
+
+        method_test_suites.append(
+            MethodTestSuite(
+                method_name=m_name,
+                test_cases=method_test_cases,
+            )
+        )
+
+    # Generate the BundledProgram
+    save_bundled_program(exec_prog, method_test_suites, output_name)
+
+
+def to_edge_TOSA_delegate(
+    exported_program,
+    args,
+    model: torch.nn.Module,
+):
+    model_int8 = None
+    # As we can target multiple output encodings, one must
+    # be specified.
+    compile_spec = get_compile_spec(
+        args.target,
+        args.intermediates,
+        args.system_config,
+        args.memory_mode,
+    )
+    if args.quantize:
+        model = quantize(
+            model,
+            args.model_name,
+            compile_spec,
+            example_inputs,
+            args.evaluate,
+            args.evaluate_config,
+        )
+        model_int8 = model
+        # Wrap quantized model back into an exported_program
+        exported_program = torch.export.export_for_training(model, example_inputs)
+
+    if args.intermediates:
+        os.makedirs(args.intermediates, exist_ok=True)
+
+    if is_ethosu(compile_spec):
+        partitioner = EthosUPartitioner(compile_spec)
+    elif is_tosa(compile_spec):
+        partitioner = TOSAPartitioner(compile_spec)
+    else:
+        raise RuntimeError(f"Unhandled compile spec: {compile_spec}")
+
+    edge = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=[partitioner],
+        compile_config=EdgeCompileConfig(
+            _check_ir_validity=False,
+        ),
+    )
+    return model_int8, edge
+
+
+if __name__ == "__main__":  # noqa: C901
     args = get_args()

     # Pick model from one of the supported lists
-    model, example_inputs = get_model_and_inputs_from_name(args.model_name)
-    model = model.eval()
+    original_model, example_inputs = get_model_and_inputs_from_name(
+        args.model_name, args.model_input
+    )
+    model = original_model.eval()

     # export_for_training under the assumption we quantize, the exported form also works
     # in to_edge if we don't quantize
@@ -519,44 +688,7 @@
     # Quantize if required
     model_int8 = None
     if args.delegate:
-        # As we can target multiple output encodings, one must
-        # be specified.
- compile_spec = get_compile_spec( - args.target, - args.intermediates, - args.system_config, - args.memory_mode, - ) - if args.quantize: - model = quantize( - model, - args.model_name, - compile_spec, - example_inputs, - args.evaluate, - args.evaluate_config, - ) - model_int8 = model - # Wrap quantized model back into an exported_program - exported_program = torch.export.export_for_training(model, example_inputs) - - if args.intermediates: - os.makedirs(args.intermediates, exist_ok=True) - - if is_ethosu(compile_spec): - partitioner = EthosUPartitioner(compile_spec) - elif is_tosa(compile_spec): - partitioner = TOSAPartitioner(compile_spec) - else: - raise RuntimeError(f"Unhandled compile spec: {compile_spec}") - - edge = to_edge_transform_and_lower( - exported_program, - partitioner=[partitioner], - compile_config=EdgeCompileConfig( - _check_ir_validity=False, - ), - ) + model_int8, edge = to_edge_TOSA_delegate(exported_program, args, model) else: edge = to_edge_transform_and_lower( exported_program, @@ -587,10 +719,33 @@ def get_args(): # noqa C901 else f"_arm_{args.target}" ) + if args.bundleio: + output_name = f"{output_name}.bpte" + else: + output_name = f"{output_name}.pte" + if args.output is not None: - output_name = os.path.join(args.output, output_name) + if args.output.endswith(".pte") or args.output.endswith(".bpte"): + # --output is a pte or bundle pte filename use it as output name + if args.bundleio and not args.output.endswith(".bpte"): + raise RuntimeError( + f"--bundleio expects a .bpte file ending to --output and not .pte {args.output}" + ) + if not args.bundleio and not args.output.endswith(".pte"): + raise RuntimeError( + f"When not using --bundleio a .bpte file should not be use as --output {args.output}" + ) + output_name = args.output + else: + # --output is a folder + output_name = os.path.join(args.output, output_name) - save_pte_program(exec_prog, output_name) + if args.bundleio: + save_bpte_program(exec_prog, original_model, output_name) + print(f"Bundle PTE file saved as {output_name}") + else: + save_pte_program(exec_prog, output_name) + print(f"PTE file saved as {output_name}") if args.evaluate: evaluate_model( diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index d43a7047080..25aef08f375 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,11 +9,14 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) option(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" OFF) option(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE "Set ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE to specify temp alloction pool size" OFF) +option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) +option(ET_ATOL "Set atol to use for BundleIO testing" OFF) +option(ET_RTOL "Set rtol to use for BundleIO testing" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) message( FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "ET_PTE_FILE_PATH must specify a model .pte or .bpte, for bare metal systems the " "model is built into the binary." ) endif() @@ -80,6 +83,10 @@ execute_process(COMMAND bash -c "pwd && source backends/arm/scripts/utils.sh && # Selects timing adapter values matching system_config. # Default is Ethos_U55_High_End_Embedded, simulating optimal hardware for the Corestone-300. 
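+# MEMORY_MODE selects where the model and scratch data live. The branches
+# below support Shared_Sram and Sram_Only on Ethos-U55 system configs, and
+# Dedicated_Sram and Sram_Only on Ethos-U85 system configs.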
set(SYSTEM_CONFIG "Ethos_U55_High_End_Embedded" CACHE STRING "System config") +set(MEMORY_MODE "Shared_Sram" CACHE STRING "Vela memory mode") + +message(STATUS "SYSTEM_CONFIG is ${SYSTEM_CONFIG}") +message(STATUS "MEMORY_MODE is ${MEMORY_MODE}") get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) @@ -89,6 +96,8 @@ if(NOT ${SEMIHOSTING}) get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) endif() + + # Dependencies from the Ethos-U Core This is the platform target of # Corstone-300, that includes ethosu_core_driver and bare-metal bringup # libraries. We link against ethosu_target_init which includes all of these @@ -96,11 +105,44 @@ endif() if(SYSTEM_CONFIG STREQUAL "Ethos_U55_High_End_Embedded") add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-300 target) set(TARGET_BOARD "corstone-300") - target_compile_definitions(ethosu_target_common INTERFACE - # ETHOSU_MODEL=0 place pte file/data in SRAM area - # ETHOSU_MODEL=1 place pte file/data in DDR area - ETHOSU_MODEL=1 - # Configure NPU architecture timing adapters + if(MEMORY_MODE STREQUAL "Shared_Sram") + target_compile_definitions(ethosu_target_common INTERFACE + # ETHOSU_MODEL=0 place pte file/data in SRAM area + # ETHOSU_MODEL=1 place pte file/data in DDR area + ETHOSU_MODEL=1 + # Configure NPU architecture timing adapters + # This is just example numbers and you should make this match your hardware + # SRAM + ETHOSU_TA_MAXR_0=8 + ETHOSU_TA_MAXW_0=8 + ETHOSU_TA_MAXRW_0=0 + ETHOSU_TA_RLATENCY_0=32 + ETHOSU_TA_WLATENCY_0=32 + ETHOSU_TA_PULSE_ON_0=3999 + ETHOSU_TA_PULSE_OFF_0=1 + ETHOSU_TA_BWCAP_0=4000 + ETHOSU_TA_PERFCTRL_0=0 + ETHOSU_TA_PERFCNT_0=0 + ETHOSU_TA_MODE_0=1 + ETHOSU_TA_HISTBIN_0=0 + ETHOSU_TA_HISTCNT_0=0 + # Flash + ETHOSU_TA_MAXR_1=2 + ETHOSU_TA_MAXW_1=0 + ETHOSU_TA_MAXRW_1=0 + ETHOSU_TA_RLATENCY_1=64 + ETHOSU_TA_WLATENCY_1=0 + ETHOSU_TA_PULSE_ON_1=320 + ETHOSU_TA_PULSE_OFF_1=80 + ETHOSU_TA_BWCAP_1=50 + ETHOSU_TA_PERFCTRL_1=0 + ETHOSU_TA_PERFCNT_1=0 + ETHOSU_TA_MODE_1=1 + ETHOSU_TA_HISTBIN_1=0 + ETHOSU_TA_HISTCNT_1=0 + ) + elseif(MEMORY_MODE STREQUAL "Sram_Only") + target_compile_definitions(ethosu_target_common INTERFACE # This is just example numbers and you should make this match your hardware # SRAM ETHOSU_TA_MAXR_0=8 @@ -116,28 +158,66 @@ if(SYSTEM_CONFIG STREQUAL "Ethos_U55_High_End_Embedded") ETHOSU_TA_MODE_0=1 ETHOSU_TA_HISTBIN_0=0 ETHOSU_TA_HISTCNT_0=0 - # Flash - ETHOSU_TA_MAXR_1=2 - ETHOSU_TA_MAXW_1=0 + # Set the second Timing Adapter to SRAM latency & bandwidth + ETHOSU_TA_MAXR_1=8 + ETHOSU_TA_MAXW_1=8 ETHOSU_TA_MAXRW_1=0 - ETHOSU_TA_RLATENCY_1=64 - ETHOSU_TA_WLATENCY_1=0 - ETHOSU_TA_PULSE_ON_1=320 - ETHOSU_TA_PULSE_OFF_1=80 - ETHOSU_TA_BWCAP_1=50 + ETHOSU_TA_RLATENCY_1=32 + ETHOSU_TA_WLATENCY_1=32 + ETHOSU_TA_PULSE_ON_1=3999 + ETHOSU_TA_PULSE_OFF_1=1 + ETHOSU_TA_BWCAP_1=4000 ETHOSU_TA_PERFCTRL_1=0 ETHOSU_TA_PERFCNT_1=0 ETHOSU_TA_MODE_1=1 ETHOSU_TA_HISTBIN_1=0 ETHOSU_TA_HISTCNT_1=0 ) + + else() + message(FATAL_ERROR "Unsupported memory_mode ${MEMORY_MODE} for the Ethos-U55. 
The Ethos-U55 supports only Shared_Sram and Sram_Only.") + endif() elseif(SYSTEM_CONFIG STREQUAL "Ethos_U55_Deep_Embedded") add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-300 target) set(TARGET_BOARD "corstone-300") - target_compile_definitions(ethosu_target_common INTERFACE - # ETHOSU_MODEL=0 place pte file/data in SRAM area - # ETHOSU_MODEL=1 place pte file/data in DDR area - ETHOSU_MODEL=1 + if(MEMORY_MODE STREQUAL "Shared_Sram") + target_compile_definitions(ethosu_target_common INTERFACE + # ETHOSU_MODEL=0 place pte file/data in SRAM area + # ETHOSU_MODEL=1 place pte file/data in DDR area + ETHOSU_MODEL=1 + # Configure NPU architecture timing adapters + # This is just example numbers and you should make this match your hardware + # SRAM + ETHOSU_TA_MAXR_0=4 + ETHOSU_TA_MAXW_0=4 + ETHOSU_TA_MAXRW_0=0 + ETHOSU_TA_RLATENCY_0=8 + ETHOSU_TA_WLATENCY_0=8 + ETHOSU_TA_PULSE_ON_0=3999 + ETHOSU_TA_PULSE_OFF_0=1 + ETHOSU_TA_BWCAP_0=4000 + ETHOSU_TA_PERFCTRL_0=0 + ETHOSU_TA_PERFCNT_0=0 + ETHOSU_TA_MODE_0=1 + ETHOSU_TA_HISTBIN_0=0 + ETHOSU_TA_HISTCNT_0=0 + # Flash + ETHOSU_TA_MAXR_1=2 + ETHOSU_TA_MAXW_1=0 + ETHOSU_TA_MAXRW_1=0 + ETHOSU_TA_RLATENCY_1=32 + ETHOSU_TA_WLATENCY_1=0 + ETHOSU_TA_PULSE_ON_1=360 + ETHOSU_TA_PULSE_OFF_1=40 + ETHOSU_TA_BWCAP_1=25 + ETHOSU_TA_PERFCTRL_1=0 + ETHOSU_TA_PERFCNT_1=0 + ETHOSU_TA_MODE_1=1 + ETHOSU_TA_HISTBIN_1=0 + ETHOSU_TA_HISTCNT_1=0 + ) + elseif(MEMORY_MODE STREQUAL "Sram_Only") + target_compile_definitions(ethosu_target_common INTERFACE # Configure NPU architecture timing adapters # This is just example numbers and you should make this match your hardware # SRAM @@ -154,25 +234,65 @@ elseif(SYSTEM_CONFIG STREQUAL "Ethos_U55_Deep_Embedded") ETHOSU_TA_MODE_0=1 ETHOSU_TA_HISTBIN_0=0 ETHOSU_TA_HISTCNT_0=0 - # Flash - ETHOSU_TA_MAXR_1=2 - ETHOSU_TA_MAXW_1=0 + # Set the second Timing Adapter to SRAM latency & bandwidth + ETHOSU_TA_MAXR_1=4 + ETHOSU_TA_MAXW_1=4 ETHOSU_TA_MAXRW_1=0 - ETHOSU_TA_RLATENCY_1=32 - ETHOSU_TA_WLATENCY_1=0 - ETHOSU_TA_PULSE_ON_1=360 - ETHOSU_TA_PULSE_OFF_1=40 - ETHOSU_TA_BWCAP_1=25 + ETHOSU_TA_RLATENCY_1=8 + ETHOSU_TA_WLATENCY_1=8 + ETHOSU_TA_PULSE_ON_1=3999 + ETHOSU_TA_PULSE_OFF_1=1 + ETHOSU_TA_BWCAP_1=4000 ETHOSU_TA_PERFCTRL_1=0 ETHOSU_TA_PERFCNT_1=0 ETHOSU_TA_MODE_1=1 ETHOSU_TA_HISTBIN_1=0 ETHOSU_TA_HISTCNT_1=0 ) + else() + message(FATAL_ERROR "Unsupported memory_mode ${MEMORY_MODE} for the Ethos-U55. 
The Ethos-U55 supports only Shared_Sram and Sram_Only.") + endif() elseif(SYSTEM_CONFIG STREQUAL "Ethos_U85_SYS_DRAM_Low") add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-320 target) set(TARGET_BOARD "corstone-320") - target_compile_definitions(ethosu_target_common INTERFACE + if(MEMORY_MODE STREQUAL "Dedicated_Sram") + target_compile_definitions(ethosu_target_common INTERFACE + # ETHOSU_MODEL=0 place pte file/data in SRAM area + # ETHOSU_MODEL=1 place pte file/data in DDR area + ETHOSU_MODEL=1 + # Configure NPU architecture timing adapters + # This is just example numbers and you should make this match your hardware + # SRAM + ETHOSU_TA_MAXR_0=8 + ETHOSU_TA_MAXW_0=8 + ETHOSU_TA_MAXRW_0=0 + ETHOSU_TA_RLATENCY_0=16 + ETHOSU_TA_WLATENCY_0=16 + ETHOSU_TA_PULSE_ON_0=3999 + ETHOSU_TA_PULSE_OFF_0=1 + ETHOSU_TA_BWCAP_0=4000 + ETHOSU_TA_PERFCTRL_0=0 + ETHOSU_TA_PERFCNT_0=0 + ETHOSU_TA_MODE_0=1 + ETHOSU_TA_HISTBIN_0=0 + ETHOSU_TA_HISTCNT_0=0 + # DRAM + ETHOSU_TA_MAXR_1=24 + ETHOSU_TA_MAXW_1=12 + ETHOSU_TA_MAXRW_1=0 + ETHOSU_TA_RLATENCY_1=250 + ETHOSU_TA_WLATENCY_1=125 + ETHOSU_TA_PULSE_ON_1=4000 + ETHOSU_TA_PULSE_OFF_1=1000 + ETHOSU_TA_BWCAP_1=2344 + ETHOSU_TA_PERFCTRL_1=0 + ETHOSU_TA_PERFCNT_1=0 + ETHOSU_TA_MODE_1=1 + ETHOSU_TA_HISTBIN_1=0 + ETHOSU_TA_HISTCNT_1=0 + ) + elseif(MEMORY_MODE STREQUAL "Sram_Only") + target_compile_definitions(ethosu_target_common INTERFACE # ETHOSU_MODEL=0 place pte file/data in SRAM area # ETHOSU_MODEL=1 place pte file/data in DDR area ETHOSU_MODEL=1 @@ -192,59 +312,98 @@ elseif(SYSTEM_CONFIG STREQUAL "Ethos_U85_SYS_DRAM_Low") ETHOSU_TA_MODE_0=1 ETHOSU_TA_HISTBIN_0=0 ETHOSU_TA_HISTCNT_0=0 - # DRAM - ETHOSU_TA_MAXR_1=24 - ETHOSU_TA_MAXW_1=12 + # Set the second Timing Adapter to SRAM latency & bandwidth + ETHOSU_TA_MAXR_1=8 + ETHOSU_TA_MAXW_1=8 ETHOSU_TA_MAXRW_1=0 - ETHOSU_TA_RLATENCY_1=250 - ETHOSU_TA_WLATENCY_1=125 - ETHOSU_TA_PULSE_ON_1=4000 - ETHOSU_TA_PULSE_OFF_1=1000 - ETHOSU_TA_BWCAP_1=2344 + ETHOSU_TA_RLATENCY_1=16 + ETHOSU_TA_WLATENCY_1=16 + ETHOSU_TA_PULSE_ON_1=3999 + ETHOSU_TA_PULSE_OFF_1=1 + ETHOSU_TA_BWCAP_1=4000 ETHOSU_TA_PERFCTRL_1=0 ETHOSU_TA_PERFCNT_1=0 ETHOSU_TA_MODE_1=1 ETHOSU_TA_HISTBIN_1=0 ETHOSU_TA_HISTCNT_1=0 ) + endif() elseif(SYSTEM_CONFIG STREQUAL "Ethos_U85_SYS_DRAM_Mid" OR SYSTEM_CONFIG STREQUAL "Ethos_U85_SYS_DRAM_High") add_subdirectory(${ETHOS_SDK_PATH}/core_platform/targets/corstone-320 target) set(TARGET_BOARD "corstone-320") - target_compile_definitions(ethosu_target_common INTERFACE - # ETHOSU_MODEL=0 place pte file/data in SRAM area - # ETHOSU_MODEL=1 place pte file/data in DDR area - ETHOSU_MODEL=1 - # Configure NPU architecture timing adapters - # This is just example numbers and you should make this match your hardware - # SRAM - ETHOSU_TA_MAXR_0=8 - ETHOSU_TA_MAXW_0=8 - ETHOSU_TA_MAXRW_0=0 - ETHOSU_TA_RLATENCY_0=32 - ETHOSU_TA_WLATENCY_0=32 - ETHOSU_TA_PULSE_ON_0=3999 - ETHOSU_TA_PULSE_OFF_0=1 - ETHOSU_TA_BWCAP_0=4000 - ETHOSU_TA_PERFCTRL_0=0 - ETHOSU_TA_PERFCNT_0=0 - ETHOSU_TA_MODE_0=1 - ETHOSU_TA_HISTBIN_0=0 - ETHOSU_TA_HISTCNT_0=0 - # DRAM - ETHOSU_TA_MAXR_1=64 - ETHOSU_TA_MAXW_1=32 - ETHOSU_TA_MAXRW_1=0 - ETHOSU_TA_RLATENCY_1=500 - ETHOSU_TA_WLATENCY_1=250 - ETHOSU_TA_PULSE_ON_1=4000 - ETHOSU_TA_PULSE_OFF_1=1000 - ETHOSU_TA_BWCAP_1=3750 - ETHOSU_TA_PERFCTRL_1=0 - ETHOSU_TA_PERFCNT_1=0 - ETHOSU_TA_MODE_1=1 - ETHOSU_TA_HISTBIN_1=0 - ETHOSU_TA_HISTCNT_1=0 - ) + if(MEMORY_MODE STREQUAL "Dedicated_Sram") + target_compile_definitions(ethosu_target_common INTERFACE + # ETHOSU_MODEL=0 place pte file/data in SRAM area 
+ # ETHOSU_MODEL=1 place pte file/data in DDR area + ETHOSU_MODEL=1 + # Configure NPU architecture timing adapters + # This is just example numbers and you should make this match your hardware + # SRAM + ETHOSU_TA_MAXR_0=8 + ETHOSU_TA_MAXW_0=8 + ETHOSU_TA_MAXRW_0=0 + ETHOSU_TA_RLATENCY_0=32 + ETHOSU_TA_WLATENCY_0=32 + ETHOSU_TA_PULSE_ON_0=3999 + ETHOSU_TA_PULSE_OFF_0=1 + ETHOSU_TA_BWCAP_0=4000 + ETHOSU_TA_PERFCTRL_0=0 + ETHOSU_TA_PERFCNT_0=0 + ETHOSU_TA_MODE_0=1 + ETHOSU_TA_HISTBIN_0=0 + ETHOSU_TA_HISTCNT_0=0 + # DRAM + ETHOSU_TA_MAXR_1=64 + ETHOSU_TA_MAXW_1=32 + ETHOSU_TA_MAXRW_1=0 + ETHOSU_TA_RLATENCY_1=500 + ETHOSU_TA_WLATENCY_1=250 + ETHOSU_TA_PULSE_ON_1=4000 + ETHOSU_TA_PULSE_OFF_1=1000 + ETHOSU_TA_BWCAP_1=3750 + ETHOSU_TA_PERFCTRL_1=0 + ETHOSU_TA_PERFCNT_1=0 + ETHOSU_TA_MODE_1=1 + ETHOSU_TA_HISTBIN_1=0 + ETHOSU_TA_HISTCNT_1=0 + ) + elseif(MEMORY_MODE STREQUAL "Sram_Only") + target_compile_definitions(ethosu_target_common INTERFACE + # ETHOSU_MODEL=0 place pte file/data in SRAM area + # ETHOSU_MODEL=1 place pte file/data in DDR area + ETHOSU_MODEL=1 + # Configure NPU architecture timing adapters + # This is just example numbers and you should make this match your hardware + # SRAM + ETHOSU_TA_MAXR_0=8 + ETHOSU_TA_MAXW_0=8 + ETHOSU_TA_MAXRW_0=0 + ETHOSU_TA_RLATENCY_0=32 + ETHOSU_TA_WLATENCY_0=32 + ETHOSU_TA_PULSE_ON_0=3999 + ETHOSU_TA_PULSE_OFF_0=1 + ETHOSU_TA_BWCAP_0=4000 + ETHOSU_TA_PERFCTRL_0=0 + ETHOSU_TA_PERFCNT_0=0 + ETHOSU_TA_MODE_0=1 + ETHOSU_TA_HISTBIN_0=0 + ETHOSU_TA_HISTCNT_0=0 + # Set the second Timing Adapter to SRAM latency & bandwidth + ETHOSU_TA_MAXR_1=8 + ETHOSU_TA_MAXW_1=8 + ETHOSU_TA_MAXRW_1=0 + ETHOSU_TA_RLATENCY_1=32 + ETHOSU_TA_WLATENCY_1=32 + ETHOSU_TA_PULSE_ON_1=3999 + ETHOSU_TA_PULSE_OFF_1=1 + ETHOSU_TA_BWCAP_1=4000 + ETHOSU_TA_PERFCTRL_1=0 + ETHOSU_TA_PERFCNT_1=0 + ETHOSU_TA_MODE_1=1 + ETHOSU_TA_HISTBIN_1=0 + ETHOSU_TA_HISTCNT_1=0 + ) + endif() else() message(FATAL_ERROR "Unsupported SYSTEM_CONFIG: ${SYSTEM_CONFIG}") endif() @@ -373,6 +532,18 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER) ) endif() +if(ET_BUNDLE_IO) + add_library(bundled_program STATIC IMPORTED) + set_property( + TARGET bundled_program + PROPERTY IMPORTED_LOCATION + "${ET_BUILD_DIR_PATH}/lib/libbundled_program.a" + ) + list(APPEND arm_executor_runner_link + bundled_program + ) +endif() + # Need whole-archive to ensure C++ ctor's are called - this may be wasteful for # bin size as we link in a number of other symbols target_link_libraries( @@ -402,6 +573,18 @@ if(ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE) target_compile_definitions(arm_executor_runner PUBLIC ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE}) endif() +if(ET_BUNDLE_IO) + target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO) +endif() + +if(ET_ATOL) + target_compile_definitions(arm_executor_runner PUBLIC ET_ATOL=${ET_ATOL}) +endif() + +if(ET_RTOL) + target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL}) +endif() + # Fixup compilation of retarget.c if(SEMIHOSTING) # Remove this when MLBEDSW-8910 is closed. diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 2d08f733eba..48237acdf22 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -1,17 +1,12 @@ /* Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. - * Copyright 2023-2024 Arm Limited and/or its affiliates. 
+ * Copyright 2023-2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include -#include -#include -#include -#include - #include #include #include @@ -19,8 +14,17 @@ #include #include #include +#include +#include +#include +#include #include "arm_perf_monitor.h" + +#if defined(ET_BUNDLE_IO) +#include +#endif + #if defined(ET_EVENT_TRACER_ENABLED) #include #if !defined(SEMIHOSTING) @@ -102,6 +106,24 @@ unsigned char __attribute__(( section("input_data_sec"), aligned(16))) method_allocation_pool[method_allocation_pool_size]; +#if defined(ET_BUNDLE_IO) + +const size_t testset_idx = 0; // BundleIO test indexes to test if used + +#if defined(ET_ATOL) +const float et_atol = ET_ATOL; +#else +const float et_atol = 0.01; +#endif + +#if defined(ET_RTOL) +const float et_rtol = ET_RTOL; +#else +const float et_rtol = 0.01; +#endif + +#endif + /** * The temp_allocation_pool is used for allocating temporary data during kernel * or delegate execution. This will be reset after each kernel or delegate call. @@ -409,15 +431,41 @@ int main(int argc, const char* argv[]) { } } #endif - ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]); - auto loader = BufferDataLoader(model_pte, pte_size); - ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size); + ET_LOG( + Info, "PTE in %p %c Size: %lu bytes", model_pte, model_pte[0], pte_size); + + // Find the offset to the embedded Program. + const void* program_data = model_pte; + size_t program_data_len = pte_size; + +#if defined(ET_BUNDLE_IO) + bool bundle_io = executorch::bundled_program::is_bundled_program( + reinterpret_cast(model_pte), pte_size); + if (bundle_io) { + // BundleIO bpte is provided, dig out the actual model from the data area + Error status = executorch::bundled_program::get_program_data( + reinterpret_cast(model_pte), + pte_size, + &program_data, + &program_data_len); + + ET_CHECK_MSG( + status == Error::Ok, + "get_program_data() from bundle PTE failed: 0x%x", + (unsigned int)status); + } +#endif + auto loader = BufferDataLoader(program_data, program_data_len); + ET_LOG(Info, "PTE Model data loaded. Size: %lu bytes.", program_data_len); + + // Parse the program file. This is immutable, and can also be reused + // between multiple execution invocations across multiple threads. 
 Result<Program> program = Program::load(&loader);
 if (!program.ok()) {
   ET_LOG(
       Info,
       "Program loading failed @ 0x%p: 0x%" PRIx32,
-      model_pte,
+      program_data,
       program.error());
 }
@@ -483,6 +531,7 @@
   executorch::runtime::EventTracer* event_tracer_ptr = nullptr;
 #if defined(ET_EVENT_TRACER_ENABLED)
+  ET_LOG(Info, "Setting up ETDump");
   torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
   event_tracer_ptr = &etdump_gen;
 #endif
@@ -499,21 +548,75 @@
   }
   size_t method_loaded_memsize =
       method_allocator.used_size() - method_loaded_membase;
-  ET_LOG(Info, "Method loaded.");
+  ET_LOG(Info, "Method '%s' loaded.", method_name);

   ET_LOG(Info, "Preparing inputs...");
   size_t input_membase = method_allocator.used_size();

-  auto inputs =
-      ::prepare_input_tensors(*method, method_allocator, input_buffers);
-
-  if (!inputs.ok()) {
-    ET_LOG(
-        Info,
-        "Preparing inputs tensors for method %s failed with status 0x%" PRIx32,
-        method_name,
-        inputs.error());
+#if defined(ET_BUNDLE_IO)
+  if (bundle_io) {
+    // Get inputs from bundled IO ".bpte" data
+    // Useful for testing
+    ET_LOG(Info, "Input testset[%d] from bundled bpte", testset_idx);
+    Error status = executorch::bundled_program::load_bundled_input(
+        *method, model_pte, testset_idx);
+    ET_CHECK_MSG(
+        status == Error::Ok,
+        "load_bundled_input failed with status 0x%" PRIx32,
+        status);
+  } else
+#endif
+  {
+    // Here you would add code to get input from your hardware
+    // Get inputs from SEMIHOSTING or fake it with a lot of "1"
+    // Use "static" to keep this alive after the block goes out of
+    // scope
+    static auto prepared_inputs =
+        ::prepare_input_tensors(*method, method_allocator, input_buffers);
+
+    if (!prepared_inputs.ok()) {
+      ET_LOG(
+          Info,
+          "Preparing input tensors for method %s failed with status 0x%" PRIx32,
+          method_name,
+          prepared_inputs.error());
+    }
   }

+#ifdef DUMP_INPUT
+  {
+    std::vector<EValue> inputs(method->inputs_size());
+    ET_LOG(Info, "%zu inputs: ", inputs.size());
+    Error status = method->get_inputs(inputs.data(), inputs.size());
+    ET_CHECK(status == Error::Ok);
+
+    for (int i = 0; i < inputs.size(); ++i) {
+      Tensor t = inputs[i].toTensor();
+      // The output might be collected and parsed so printf() is used instead
+      // of ET_LOG() here
+      for (int j = 0; j < inputs[i].toTensor().numel(); ++j) {
+        if (t.scalar_type() == ScalarType::Int) {
+          printf(
+              "Input[%d][%d]: (int) %d\n",
+              i,
+              j,
+              inputs[i].toTensor().const_data_ptr<int>()[j]);
+        } else if (t.scalar_type() == ScalarType::Float) {
+          printf(
+              "Input[%d][%d]: (float) %f\n",
+              i,
+              j,
+              inputs[i].toTensor().const_data_ptr<float>()[j]);
+        } else if (t.scalar_type() == ScalarType::Char) {
+          printf(
+              "Input[%d][%d]: (char) %d\n",
+              i,
+              j,
+              inputs[i].toTensor().const_data_ptr<char>()[j]);
+        }
+      }
+    }
+  }
+#endif
   size_t input_memsize = method_allocator.used_size() - input_membase;
   ET_LOG(Info, "Input prepared.");
@@ -524,7 +627,8 @@
   StopMeasurements();
   size_t executor_memsize = method_allocator.used_size() - executor_membase;

-  ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size);
+  ET_LOG(Info, "model_pte_program_size: %lu bytes.", program_data_len);
+  ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size);
 #if defined(SEMIHOSTING)
   if (input_file_allocator.size() > 0) {
     ET_LOG(
@@ -575,50 +679,34 @@
   ET_LOG(Info, "%zu outputs: ", outputs.size());
   status = method->get_outputs(outputs.data(), outputs.size());
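   // get_outputs() only copies out EValue handles; the tensor storage itself
   // remains owned by the method. In ET_BUNDLE_IO builds these outputs are
   // additionally compared against the bundled reference outputs further down.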
 ET_CHECK(status == Error::Ok);
+
 for (int i = 0; i < outputs.size(); ++i) {
   Tensor t = outputs[i].toTensor();
 #if !defined(SEMIHOSTING)
+#if !defined(ET_BUNDLE_IO)
   // The output might be collected and parsed so printf() is used instead
   // of ET_LOG() here
   for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
     if (t.scalar_type() == ScalarType::Int) {
       printf(
-          "Output[%d][%d]: %d\n",
+          "Output[%d][%d]: (int) %d\n",
           i,
           j,
           outputs[i].toTensor().const_data_ptr<int>()[j]);
     } else if (t.scalar_type() == ScalarType::Float) {
       printf(
-          "Output[%d][%d]: %f\n",
+          "Output[%d][%d]: (float) %f\n",
           i,
           j,
           outputs[i].toTensor().const_data_ptr<float>()[j]);
     } else if (t.scalar_type() == ScalarType::Char) {
       printf(
-          "Output[%d][%d]: %d\n",
+          "Output[%d][%d]: (char) %d\n",
           i,
           j,
           outputs[i].toTensor().const_data_ptr<char>()[j]);
     }
   }
-#if defined(ET_EVENT_TRACER_ENABLED)
-    ETDumpResult result = etdump_gen.get_etdump_data();
-    if (result.buf != nullptr && result.size > 0) {
-      // On a device with no file system we can't just write it out
-      // to the file-system so we base64 encode it and dump it on the log.
-      int mode = 0;
-      size_t len = result.size;
-      size_t encoded_len = base64_encoded_size(result.size, mode);
-      uint8_t* encoded_buf = reinterpret_cast<uint8_t*>(
-          method_allocator.allocate(encoded_len + 1));
-      int ret = base64_encode(
-          encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode);
-      encoded_buf[encoded_len] = 0x00; // Ensure null termination
-      ET_LOG(Info, "Writing etdump.bin [base64]");
-      printf(
-          "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n",
-          encoded_buf);
-    }
 #endif
 #else
   char out_filename[255];
@@ -631,21 +719,66 @@
       outputs[i].toTensor().nbytes(),
       out_file);
   fclose(out_file);
-#if defined(ET_EVENT_TRACER_ENABLED)
-    etdump_result result = etdump_gen.get_etdump_data();
-    if (result.buf != nullptr && result.size > 0) {
-      // On a device with a file system we can just write it out
-      // to the file-system.
-      char etdump_filename = "etdump.bin";
-      ET_LOG(Info, "Writing etdump to file: %s", etdump_filename);
-      FILE* f = fopen(etdump_filename, "w+");
-      fwrite((uint8_t*)result.buf, 1, result.size, f);
-      fclose(f);
-      free(result.buf);
-    }
 #endif
+  }
+
+#if defined(ET_BUNDLE_IO)
+  if (bundle_io) {
+    // Verify the result.
+    status = executorch::bundled_program::verify_method_outputs(
+        *method, model_pte, testset_idx, et_rtol, et_atol);
+    if (status == Error::Ok) {
+      ET_LOG(Info, "Model output matches expected BundleIO bpte ref data.");
+      ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx);
+    } else {
+      ET_LOG(
+          Error,
+          "Model output doesn't match expected BundleIO bpte ref data. rtol=%f atol=%f",
+          et_rtol,
+          et_atol);
+      ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx);
+    }
+    ET_CHECK_MSG(
+        status == Error::Ok,
+        "Bundle verification failed with status 0x%" PRIx32,
+        status);
+  }
+#endif
+
+#if defined(ET_EVENT_TRACER_ENABLED)
+#if !defined(SEMIHOSTING)
+  ETDumpResult result = etdump_gen.get_etdump_data();
+  if (result.buf != nullptr && result.size > 0) {
+    // On a device with no file system we can't just write it out
+    // to the file-system so we base64 encode it and dump it on the log.
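+    // The printf below emits a ready-to-paste shell snippet: it decodes the
+    // base64 payload back into etdump.bin and then runs the devtools
+    // inspector CLI on it (both time scales are set to raw cycles).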
+    int mode = 0;
+    size_t len = result.size;
+    size_t encoded_len = base64_encoded_size(result.size, mode);
+    uint8_t* encoded_buf =
+        reinterpret_cast<uint8_t*>(method_allocator.allocate(encoded_len + 1));
+    int ret = base64_encode(
+        encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode);
+    encoded_buf[encoded_len] = 0x00; // Ensure null termination
+    ET_LOG(Info, "Writing etdump.bin [base64]");
+    printf(
+        "#---\nbase64 -i -d <<<\"\\\n%s\\\n\" >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#---\n",
+        encoded_buf);
+  }
+#else
+  etdump_result result = etdump_gen.get_etdump_data();
+  if (result.buf != nullptr && result.size > 0) {
+    // On a device with a file system we can just write it out
+    // to the file-system.
+    const char* etdump_filename = "etdump.bin";
+    ET_LOG(Info, "Writing etdump to file: %s", etdump_filename);
+    FILE* f = fopen(etdump_filename, "w+");
+    fwrite((uint8_t*)result.buf, 1, result.size, f);
+    fclose(f);
+    free(result.buf);
  }
+#endif
+#endif
+
 out:
 ET_LOG(Info, "Program complete, exiting.");
 #if defined(SEMIHOSTING)
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 1a50f59d454..3125460aea2 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -9,45 +9,52 @@
 set -eu
-
-
 ########
 ### Hardcoded constants
 ########
 script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+et_root_dir=$(cd ${script_dir}/../.. && pwd)
+et_root_dir=$(realpath ${et_root_dir})

-# Default Ethos-u tool folder override with --scratch-dir=
-root_dir=${script_dir}/ethos-u-scratch
 model_name=""
+model_input_set=false
+model_input=""
 aot_arm_compiler_flags="--delegate --quantize"
 portable_kernels="aten::_softmax.out"
 target="ethos-u55-128"
 output_folder_set=false
 output_folder="."
+bundleio=false
 build_with_etdump=false
 build_type="Release"
 extra_build_flags=""
 build_only=false
 system_config=""
 memory_mode=""
+et_build_root="${et_root_dir}/arm_test"
+ethos_u_scratch_dir=${script_dir}/ethos-u-scratch

-help() {
+function help() {
     echo "Usage: $(basename $0) [options]"
     echo "Options:"
-    echo " --model_name= Model to run, can be a builtin, examples/models or a filename Default to all builtin models"
+    echo " --model_name= Model file .py/.pth/.pt, builtin model or a model from examples/models. Passed to aot_arm_compiler"
+    echo " --model_input= Provide model input .pt file to override the input in the model file. Passed to aot_arm_compiler"
+    echo "                NOTE: Inference in FVP is done with a dummy input full of ones. Use the --bundleio flag to run the model in FVP with the custom input or the input from the model file."
echo " --aot_arm_compiler_flags= Only used if --model_name is used Default: ${aot_arm_compiler_flags}" echo " --portable_kernels= Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}" echo " --target= Target to build and run for Default: ${target}" - echo " --output= Output folder Default: ${output_folder}" + echo " --output= Target build output folder Default: ${output_folder}" + echo " --bundleio Create Bundled pte using Devtools BundelIO with Input/RefOutput included" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" - echo " --debug_build Build with debug flag, default is Release" - echo " --extra_build_flags Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " + echo " --build_type= Build with Release, Debug or RelWithDebInfo, default is ${build_type}" + echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --build_only Only build, don't run FVP" - echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt." echo " --memory_mode= Memory mode to select from the Vela configuration file (see vela.ini), e.g. Shared_Sram/Sram_Only. Default: 'Shared_Sram' for Ethos-U55 targets, 'Sram_Only' for Ethos-U85 targets" + echo " --et_build_root= Executorch build output root folder to use, defaults to ${et_build_root}" + echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}" exit 0 } @@ -55,48 +62,33 @@ for arg in "$@"; do case $arg in -h|--help) help ;; --model_name=*) model_name="${arg#*=}";; + --model_input=*) model_input="${arg#*=}" ; model_input_set=true ;; --aot_arm_compiler_flags=*) aot_arm_compiler_flags="${arg#*=}";; --portable_kernels=*) portable_kernels="${arg#*=}";; --target=*) target="${arg#*=}";; --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;; + --bundleio) bundleio=true ;; --etdump) build_with_etdump=true ;; - --debug_build) build_type="Debug" ;; + --build_type=*) build_type="${arg#*=}";; --extra_build_flags=*) extra_build_flags="${arg#*=}";; --build_only) build_only=true ;; - --scratch-dir=*) root_dir="${arg#*=}";; --system_config=*) system_config="${arg#*=}";; --memory_mode=*) memory_mode="${arg#*=}";; + --et_build_root=*) et_build_root="${arg#*=}";; + --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}";; *) ;; esac done -root_dir=$(realpath ${root_dir}) -output_folder=$(realpath ${output_folder}) -mkdir -p ${output_folder} -if [ "$output_folder_set" = true ] ; then - executor_runner_path=${output_folder} -else - executor_runner_path=${script_dir}/executor_runner -fi -executor_runner_path=$(realpath ${executor_runner_path}) +# Default Ethos-u tool folder override with --scratch-dir= +ethos_u_scratch_dir=$(realpath ${ethos_u_scratch_dir}) +setup_path_script=${ethos_u_scratch_dir}/setup_path.sh +toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake +_setup_msg="please refer to ${script_dir}/setup.sh to properly install necessary tools." 
-mkdir -p ${root_dir}/ethos-u -ethos_u_root_dir="$(cd ${root_dir}/ethos-u && pwd)" -setup_path_script=${root_dir}/setup_path.sh - -# Executorch -et_root_dir=$(cd ${script_dir}/../.. && pwd) -et_build_dir=${et_root_dir}/cmake-out # Set target based variables -fvp_model=FVP_Corstone_SSE-300_Ethos-U55 -if [[ ${target} =~ "ethos-u85" ]] -then - echo "target is ethos-u85 variant so switching to CS320 FVP" - fvp_model=FVP_Corstone_SSE-320 -fi - if [[ ${system_config} == "" ]] then system_config="Ethos_U55_High_End_Embedded" @@ -115,227 +107,6 @@ then fi fi -toolchain_cmake=${script_dir}/ethos-u-setup/arm-none-eabi-gcc.cmake -_setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools." - -if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then - echo " ERROR: specified argument --portable_kernels=${portable_kernels}" - echo " is in the wrong format please use \"aten::.out,aten::.out,...\"" - echo " e.g. \"aten::_softmax.out,aten::add.out\"" - exit 1 -fi - -# Generate a pte file -# output from this function is the pte filename e.g. echo should be avoided or directed to stderr e.g. >&2 -function generate_pte_file() { - [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}]" "Expecting model and model_compiler_flags flag, got, $*"; exit 1; } - local model=${1} - local model_short_name=$(basename -- "${model}" ".py") - local model_compiler_flags=${2} - - local model_filename=${model_short_name}_arm_${target}.pte - if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then - # Name aligned with default aot_arm_compiler output - model_filename=${model_short_name}_arm_delegate_${target}.pte - fi - cd $et_root_dir - - local pte_file - pte_file=$(realpath ${output_folder}/${model_filename}) - rm -f "${pte_file}" - - SO_EXT=$(python3 -c 'import platform; print({"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get(platform.system(), None))') - # We are using the aot_lib from build_quantization_aot_lib below - SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - - local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" - echo "CALL ${ARM_AOT_CMD}" >&2 - ${ARM_AOT_CMD} 1>&2 - - [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } - echo "${pte_file}" -} - -# build ExecuTorch Libraries -function build_executorch() { - set -x - - [[ -d "${et_build_dir}" ]] \ - && echo "[${FUNCNAME[0]}] Warn: using already existing build-dir for executorch: ${et_build_dir}!!" 
- mkdir -p "${et_build_dir}" - - cd "${et_root_dir}" - - build_with_etdump_flags="" - if [ "$build_with_etdump" = true ] ; then - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_root_dir} - cmake-out-host-tools/bin/flatcc" ; - echo "--------------------------------------------------------------------------------" ) - - - # Build host flatcc bin - mkdir -p cmake-out-host-tools - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=ON \ - -DFLATCC_ALLOW_WERROR=OFF \ - -DFLATC_EXECUTABLE="$(which flatc)" \ - ${extra_build_flags} \ - -Bcmake-out-host-tools \ - "${et_root_dir}" - - mkdir -p cmake-out-host-tools/bin - cp third-party/flatcc/bin/flatcc cmake-out-host-tools/bin - - build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \ - -DFLATCC_ALLOW_WERROR=OFF \ - -DFLATCC_EXECUTABLE=${et_root_dir}/cmake-out-host-tools/bin/flatcc " - fi - - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries target libs with --target install ${build_type} into '${et_root_dir}' - '${et_build_dir}'" ; - echo "--------------------------------------------------------------------------------" ) - - # Build - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ - -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - ${build_with_etdump_flags} \ - -DFLATC_EXECUTABLE="$(which flatc)" \ - ${extra_build_flags} \ - -B${et_build_dir} \ - "${et_root_dir}" - - echo "[${FUNCNAME[0]}] Configured CMAKE" - - cmake --build ${et_build_dir} --parallel --target install --config ${build_type} -- - - ( set +x ; - echo "--------------------------------------------------------------------------------" ; - echo "Build ExecuTorch Libraries ${build_type} into '${et_root_dir}/examples/arm' - '${et_build_dir}/examples/arm'" ; - echo "--------------------------------------------------------------------------------" ) - - cmake \ - -DCMAKE_INSTALL_PREFIX=${et_build_dir} \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \ - -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \ - -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ - ${extra_build_flags} \ - -B"${et_build_dir}/examples/arm" \ - "${et_root_dir}/examples/arm" - - cmake --build "${et_build_dir}/examples/arm" --parallel --config ${build_type} -- - - set +x - - cd "${et_build_dir}" - echo "[${FUNCNAME[0]}] Generated static libraries for ExecuTorch:" - find . 
-name "*.a" -exec ls -al {} \; -} - -# build Arm Baremetal executor_runner -function build_executorch_runner() { - echo "[${FUNCNAME[0]}] Generating ExecuTorch libraries" - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expecting a single pte file as argument got, $*"; exit 1; } - local pte=${1} - if [[ ${target} == *"ethos-u55"* ]]; then - local target_cpu=cortex-m55 - else - local target_cpu=cortex-m85 - fi - echo "--------------------------------------------------------------------------------" - echo "Build Arm Baremetal executor_runner for ${target} - '${executor_runner_path}/cmake-out'" - echo "--------------------------------------------------------------------------------" - - cd ${script_dir}/executor_runner - - build_with_etdump_flags="" - if [ "$build_with_etdump" = true ] ; then - build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " - fi - - cmake \ - -DCMAKE_BUILD_TYPE=${build_type} \ - -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \ - -DTARGET_CPU=${target_cpu} \ - -DET_DIR_PATH:PATH=${et_root_dir} \ - -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ - -DET_PTE_FILE_PATH:PATH="${pte}" \ - -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DETHOSU_TARGET_NPU_CONFIG=${target} \ - ${build_with_etdump_flags} \ - -DPYTHON_EXECUTABLE=$(which python3) \ - -DSYSTEM_CONFIG=${system_config} \ - ${extra_build_flags} \ - -B ${executor_runner_path}/cmake-out - - echo "[${FUNCNAME[0]}] Configured CMAKE" - - cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner - echo "[${FUNCNAME[0]}] Generated baremetal elf file:" - find ${executor_runner_path}/cmake-out -name "arm_executor_runner" - echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes" - echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes" - echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes" -} - -# Execute the executor_runner on FVP Simulator -function run_fvp() { - [[ $# -ne 1 ]] && { echo "[${FUNCNAME[0]}]" "Expexted elf binary name, got $*"; exit 1; } - local elf_name=${1} - elf=$(find ${executor_runner_path} -name "${elf_name}") - [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } - num_macs=$(echo ${target} | cut -d - -f 3) - - if [[ ${target} == *"ethos-u55"* ]]; then - echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" - ${fvp_model} \ - -C ethosu.num_macs=${num_macs} \ - -C mps3_board.visualisation.disable-visualisation=1 \ - -C mps3_board.telnetterminal0.start_telnet=0 \ - -C mps3_board.uart0.out_file='-' \ - -C mps3_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 220 || true # seconds - echo "[${FUNCNAME[0]}] Simulation complete, $?" - elif [[ ${target} == *"ethos-u85"* ]]; then - echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" - ${fvp_model} \ - -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ - -C mps4_board.visualisation.disable-visualisation=1 \ - -C vis_hdlcd.disable_visualisation=1 \ - -C mps4_board.telnetterminal0.start_telnet=0 \ - -C mps4_board.uart0.out_file='-' \ - -C mps4_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 220 || true # seconds - echo "[${FUNCNAME[0]}] Simulation complete, $?" 
-  else
-    echo "Running ${elf} for ${target} is not supported"
-    exit 1
-  fi
-}
-
#######
### Main
#######

@@ -343,12 +114,10 @@ function run_fvp() {
# This should be prepared by the setup.sh
[[ -f ${setup_path_script} ]] \
    || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; }
-source ${root_dir}/setup_path.sh
-# basic checks before we get started
-hash ${fvp_model} \
-    || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; }
+source ${setup_path_script}

+# basic checks before we get started
hash arm-none-eabi-gcc \
    || { echo "Could not find arm baremetal toolchain on PATH, ${_setup_msg}"; exit 1; }

@@ -358,9 +127,32 @@ hash arm-none-eabi-gcc \
[[ -f ${et_root_dir}/CMakeLists.txt ]] \
    || { echo "Executorch repo doesn't contain CMakeLists.txt file at root level"; exit 1; }

-# build executorch libraries
-build_executorch
-cd $et_root_dir && backends/arm/scripts/build_quantized_ops_aot_lib.sh $build_type
+# Build executorch libraries
+cd $et_root_dir
+devtools_flag=""
+bundleio_flag=""
+et_dump_flag=""
+if [ "$build_with_etdump" = true ] ; then
+    devtools_flag="--devtools --etdump"
+    et_dump_flag="--etdump"
+fi
+
+if [ "$bundleio" = true ] ; then
+    devtools_flag="--devtools --etdump"
+    bundleio_flag="--bundleio"
+    et_dump_flag="--etdump"
+fi
+
+backends/arm/scripts/build_executorch.sh --et_build_root="${et_build_root}" --build_type=$build_type $devtools_flag
+backends/arm/scripts/build_portable_kernels.sh --et_build_root="${et_build_root}" --build_type=$build_type --portable_kernels=$portable_kernels
+
+# Build the quantized_ops_aot_lib
+backends/arm/scripts/build_quantized_ops_aot_lib.sh --et_build_root="${et_build_root}" --build_type=$build_type
+
+SO_EXT=$(python3 -c 'import platform; print({"Darwin": "dylib", "Linux": "so", "Windows": "dll"}.get(platform.system(), None))')
+# We are using the aot_lib built by build_quantized_ops_aot_lib.sh above
+SO_LIB=$(find "${et_build_root}/cmake-out-aot-lib" -name libquantized_ops_aot_lib.${SO_EXT})
+

if [[ -z "$model_name" ]]; then
    # the test models run, and whether to delegate
@@ -373,19 +165,68 @@
fi

# loop over running the AoT flow and executing the model on device
for i in "${!test_model[@]}"; do
+    model="${test_model[i]}"
+    model_compiler_flags="${model_compiler_flags[i]}"
+
    echo "--------------------------------------------------------------------------------"
-   printf "Running e2e flow for model '%s' with flags '%s'\n" "${test_model[i]}" "${model_compiler_flags[i]}"
+   printf "Running e2e flow for model '%s' with flags '%s'\n" "${model}" "${model_compiler_flags}"
    echo "--------------------------------------------------------------------------------"
-   pte=$(generate_pte_file "${test_model[i]}" "${model_compiler_flags[i]}")
-   stat --printf="Generated pte_data_size: %s bytes\npte_file:%n\n" ${pte}
+
+    cd $et_root_dir
+    # Remove path and file extension to get model_short_name
+    ext=${model##*.}
+    model_short_name=$(basename -- "${model}" .$ext)
+    model_filename=${model_short_name}_arm_${target}
+
+    if [[ "${model_compiler_flags}" == *"--delegate"* ]]; then
+        # Name aligned with default aot_arm_compiler output
+        model_filename=${model_short_name}_arm_delegate_${target}
+    fi
+    elf_folder=${model_filename}
+
+    if [ "$bundleio" = true ] ; then
+        model_filename=${model_filename}.bpte
+    else
+        model_filename=${model_filename}.pte
+    fi
+
+    if [ "$output_folder_set" = false ] ; then
+        output_folder=${et_build_root}/${model_short_name}
+    fi
+
+    mkdir -p ${output_folder}
+    output_folder=$(realpath ${output_folder})
+
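+    # From here on ${output_folder} is an absolute path: aot_arm_compiler below
+    # writes the .pte/.bpte into it, and build_executorch_runner.sh then bakes
+    # that file into the arm_executor_runner elf as a C array.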
pte_file="${output_folder}/${model_filename}" + + # Remove old pte files + rm -f "${output_folder}/${model_filename}" + + if [ "$model_input_set" = true ]; then + model_compiler_flags="${model_compiler_flags} --model_input=${model_input}" + fi + + ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag" + echo "CALL ${ARM_AOT_CMD}" >&2 + ${ARM_AOT_CMD} 1>&2 + + pte_file=$(realpath ${pte_file}) + + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "pte_data_size: $(wc -c ${pte_file})" + echo "pte_file: ${pte_file}" + if [[ ${target} == *"TOSA"* ]]; then - echo "Build for ${target} skip generating .elf and running" + echo "Build for ${target} skip generating a .elf and running it" else + set -x # Rebuild the application as the pte is imported as a header/c array - build_executorch_runner "${pte}" + backends/arm/scripts/build_executorch_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" if [ "$build_only" = false ] ; then - run_fvp arm_executor_runner + # Execute the executor_runner on FVP Simulator + elf_file="${output_folder}/${elf_folder}/cmake-out/arm_executor_runner" + backends/arm/scripts/run_fvp.sh --elf=${elf_file} --target=$target fi + set +x fi done diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 800dfb8d6d4..cc8774ba7a0 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -65,8 +65,7 @@ tosa_reference_model_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a" # vela vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela" -vela_rev="e131bf4f528f0d461868229972e07f371dcbc881" - +vela_rev="425541302c7e4b6fbeca7c0061286b131ee507c3" ######## ### Optional user args @@ -85,8 +84,10 @@ function setup_fvp() { # Mandatory user arg --i-agree-to-the-contained-eula eula_acceptance="${1:-'.'}" + eula_acceptance_by_variable="${ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA:-False}" + if [[ "${eula_acceptance}" != "--i-agree-to-the-contained-eula" ]]; then - if [[ ${ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA} != "True" ]]; then + if [[ ${eula_acceptance_by_variable} != "True" ]]; then echo "Must pass first positional argument '--i-agree-to-the-contained-eula' to agree to EULA associated with downloading the FVP. Exiting!" exit 1 else diff --git a/examples/cadence/CMakeLists.txt b/examples/cadence/CMakeLists.txt index f1d5ccbd2e5..757009bd4df 100644 --- a/examples/cadence/CMakeLists.txt +++ b/examples/cadence/CMakeLists.txt @@ -19,7 +19,7 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
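   # If the caller has not set EXECUTORCH_ROOT, fall back to the repository root
   # two levels up; Utils.cmake is then included below from its new location
   # under tools/cmake/ instead of the old build/ directory.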
endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS deleted file mode 100644 index 67f2bab681a..00000000000 --- a/examples/cadence/operators/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load("targets.bzl", "define_common_targets") - -oncall("odai_jarvis") - -define_common_targets() diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl deleted file mode 100644 index 68d90e8238f..00000000000 --- a/examples/cadence/operators/targets.bzl +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") - -TESTS_LIST = [ - "add_op", - "g3_ops", - "quantized_conv1d_op", - "quantized_linear_op", -] - -def define_common_targets(): - for op in TESTS_LIST: - _define_test_target(op) - -def _define_test_target(test_name): - file_name = "test_{}".format(test_name) - python_unittest( - name = file_name, - srcs = [ - "{}.py".format(file_name), - ], - typing = True, - supports_static_listing = False, - deps = [ - "fbsource//third-party/pypi/parameterized:parameterized", - "fbcode//caffe2:torch", - "fbcode//executorch/backends/cadence/aot:ops_registrations", - "fbcode//executorch/backends/cadence/aot:export_example", - "fbcode//executorch/backends/cadence/aot:compiler", - "fbcode//executorch/backends/cadence/utils:facto_util", - ], - ) diff --git a/examples/cadence/operators/test_requantize_op.py b/examples/cadence/operators/test_requantize_op.py new file mode 100644 index 00000000000..4e4528d330a --- /dev/null +++ b/examples/cadence/operators/test_requantize_op.py @@ -0,0 +1,105 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +# Example script for exporting simple models to flatbuffer + +import logging +import unittest + +import torch + +from executorch.backends.cadence.aot.ops_registrations import * # noqa +from executorch.backends.cadence.aot.ref_implementations import * # noqa + +import itertools + +from executorch.backends.cadence.aot.export_example import export_model +from parameterized import parameterized + + +FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" +logging.basicConfig(level=logging.INFO, format=FORMAT) + + +def create_tensor_with_dtype( + shape: tuple[int], dtype: torch.dtype = torch.float32, _max: float = 1 +) -> torch.Tensor: + """ + Create a tensor with the given shape and dtype. '_max' indicates the maximum + value in the tensor. + """ + new_tensor: torch.Tensor = torch.rand(shape) * _max + return new_tensor.to(dtype=dtype) + + +class CadenceRequantizeOpCases(unittest.TestCase): + @parameterized.expand( + # Check cross-product of in and out dtypes. 
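+        # Requantize semantics, for reference: q_in represents the real value
+        # in_scale * (q_in - in_zero_point); requantize re-encodes it as
+        # q_out = clamp(round(real / out_scale) + out_zero_point, out_dtype range).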
+ [ + [(5, 2), 0.01, 0, 0.02, 1, in_dtype, out_dtype] + for in_dtype, out_dtype in itertools.product( + [torch.int8, torch.uint8, torch.int16, torch.uint16], + repeat=2, + ) + ] + ) + def test_cadence_requantize_out( + self, + shape: tuple[int], + in_scale: float, + in_zero_point: int, + out_scale: float, + out_zero_point: int, + in_dtype: torch.dtype, + out_dtype: torch.dtype, + ) -> None: + class QuantModel(torch.nn.Module): + def __init__( + self, + in_scale: float, + in_zero_point: int, + out_scale: float, + out_zero_point: float, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.in_scale = torch.tensor(in_scale) + self.in_zero_point = torch.tensor(in_zero_point, dtype=torch.int32) + self.out_scale = torch.tensor(out_scale) + self.out_zero_point = torch.tensor(out_zero_point, dtype=torch.int32) + self.dtype = dtype + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.cadence.requantize.default( + x, + self.in_scale, + self.in_zero_point, + self.out_scale, + self.out_zero_point, + self.dtype, + ) + + model = QuantModel( + in_scale, in_zero_point, out_scale, out_zero_point, out_dtype + ) + dtype_info = torch.iinfo(in_dtype) + inputs = ( + create_tensor_with_dtype(shape, in_dtype, _max=float(dtype_info.max)), + ) + + # Run and verify correctness + # Since this test is handling integers, its inputs and outputs might have + # a larger MSE loss, and that's alright. + # For example, if the ref output is [33, 50] and the real output is [33, 49], + # the MSE loss is around 0.5, but the relative error is < 2%. So we set + # the epsilon to a higher value. + export_model(model, inputs, eps_error=1.0) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 931509891a6..c6ee756458f 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -17,7 +17,7 @@ This guide explains how to setup ExecuTorch for Android using a demo app. The ap * Refer to [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. * Download and install [Android Studio and SDK](https://developer.android.com/studio). * Supported Host OS: CentOS, macOS Ventura (M1/x86_64). See below for Qualcomm HTP specific requirements. -* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](build-run-qualcomm-ai-engine-direct-backend.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. +* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](backends-qualcomm.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. ::: :::: @@ -39,146 +39,79 @@ We generate the model file for the ExecuTorch runtime in Android Demo App. 
 For delegating DeepLab v3 to XNNPACK backend, please do the following to export the model:

 ```bash
+cd executorch # go to executorch root
 python3 -m examples.xnnpack.aot_compiler --model_name="dl3" --delegate
-mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
-cp dl3_xnnpack_fp32.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
 ```

-For more detailed tutorial of lowering to XNNPACK, please see [XNNPACK backend](tutorial-xnnpack-delegate-lowering.md).
-
-#### Qualcomm Hexagon NPU
-
-For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build-run-qualcomm-ai-engine-direct-backend.md).
-
-After generating the model, copy the model to `assets` directory.
+Then push the pte file to the Android device:

 ```bash
-python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s
-cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
+adb push dl3_xnnpack_fp32.pte /data/local/tmp/dl3_xnnpack_fp32.pte
 ```

-### Runtime
-
-We build the required ExecuTorch runtime library to run the model.
+For a more detailed tutorial on lowering to XNNPACK, please see [XNNPACK backend](backends-xnnpack.md).

-#### XNNPACK
+#### Qualcomm Hexagon NPU

-1. Build the CMake target for the library with XNNPACK backend:
+For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](backends-qualcomm.md).

 ```bash
-export ANDROID_NDK=
-export ANDROID_ABI=arm64-v8a
-
-# Run the following lines from the `executorch/` folder
-./install_executorch.sh --clean
-mkdir cmake-android-out
-
-# Build the core executorch library
-cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-  -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
-  -DANDROID_ABI="${ANDROID_ABI}" \
-  -DEXECUTORCH_BUILD_XNNPACK=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-  -Bcmake-android-out
-
-cmake --build cmake-android-out -j16 --target install
+python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s
 ```

-When we set `EXECUTORCH_BUILD_XNNPACK=ON`, we will build the target [`xnnpack_backend`](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt) which in turn is linked into libexecutorch_jni via [CMake](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/jni/CMakeLists.txt).
-
-2. Build the Android extension:
+Then push the pte file to the Android device:

 ```bash
-
-# Build the android extension
-cmake extension/android \
-  -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \
-  -DANDROID_ABI="${ANDROID_ABI}" \
-  -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-  -Bcmake-android-out/extension/android
-
-cmake --build cmake-android-out/extension/android -j16
+adb push deeplab_v3/dlv3_qnn.pte /data/local/tmp/dlv3_qnn.pte
 ```

-`libexecutorch_jni.so` wraps up the required XNNPACK Backend runtime library from `xnnpack_backend`, and adds an additional JNI layer using fbjni. This is later exposed to Java app.
+### Runtime

-#### Qualcomm Hexagon NPU
+We build the required ExecuTorch runtime library (AAR) to run the model.

-1.
Build the CMake target for the library with Qualcomm Hexagon NPU (HTP) backend (XNNPACK also included): +#### XNNPACK ```bash +# go to ExecuTorch repo root export ANDROID_NDK= -export ANDROID_ABI=arm64-v8a -export QNN_SDK_ROOT= +export ANDROID_ABIS=arm64-v8a +# Run the following lines from the `executorch/` folder ./install_executorch.sh --clean -mkdir cmake-android-out -cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -Bcmake-android-out - -cmake --build cmake-android-out -j16 --target install -``` -Similar to the XNNPACK library, with this setup, we compile `libexecutorch_jni.so` but it adds an additional static library `qnn_executorch_backend` which wraps up Qualcomm HTP runtime library and registers the Qualcomm HTP backend. This is later exposed to Java app. - -`qnn_executorch_backend` is built when we turn on CMake option `EXECUTORCH_BUILD_QNN`. It will include the [CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/backends/qualcomm/CMakeLists.txt) from backends/qualcomm where we `add_library(qnn_executorch_backend STATIC)`. -2. Build the Android extension: - -```bash -cmake extension/android \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -Bcmake-android-out/extension/android +# Create a new directory `app/libs` for the AAR to live +pushd examples/demo-apps/android/ExecuTorchDemo +mkdir -p app/libs +popd -cmake --build cmake-android-out/extension/android -j16 +# Build the AAR. It will include XNNPACK backend by default. +export BUILD_AAR_DIR=$(realpath examples/demo-apps/android/ExecuTorchDemo/app/libs) +sh scripts/build_android_library.sh ``` -## Deploying on Device via Demo App - -### Steps for Deploying Model via XNNPACK +#### Qualcomm Hexagon NPU ```bash -mkdir -p examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a -cp cmake-android-out/extension/android/libexecutorch_jni.so \ - examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so -``` - -This allows the Android app to load ExecuTorch runtime with XNNPACK backend as a JNI library. Later, this shared library will be loaded by `NativePeer.java` in Java code. - -### Steps for Deploying Model via Qualcomm's AI Engine Direct +# go to ExecuTorch repo root +export ANDROID_NDK= +export ANDROID_ABIS=arm64-v8a +export QNN_SDK_ROOT= -```bash -mkdir -p ../examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a -``` +# Run the following lines from the `executorch/` folder +./install_executorch.sh --clean -We need to push some additional Qualcomm HTP backend libraries to the app. Please refer to [Qualcomm docs](build-run-qualcomm-ai-engine-direct-backend.md) here. 
+
+# Create a new directory `app/libs` for the AAR to live in
+pushd examples/demo-apps/android/ExecuTorchDemo
+mkdir -p app/libs
+popd
-```bash
-cp ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so \
-   examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a
+# Build the AAR. It will include XNNPACK backend by default.
+export BUILD_AAR_DIR=$(realpath examples/demo-apps/android/ExecuTorchDemo/app/libs)
+sh scripts/build_android_library.sh
 ```

-Copy the core libraries:
-
-```bash
-cp cmake-android-out/extension/android/libexecutorch_jni.so \
-   examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libexecutorch.so
-cp cmake-android-out/lib/libqnn_executorch_backend.so \
-   examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs/arm64-v8a/libqnn_executorch_backend.so
-```
+This is very similar to the XNNPACK setup, but users now need to define `QNN_SDK_ROOT` so that
+the QNN backend is built into the AAR.

 ## Running the App
diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts
index 615fee860f8..ca06671f328 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts
+++ b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts
@@ -57,9 +57,7 @@ dependencies {
   implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12")
   implementation("com.facebook.soloader:soloader:0.10.5")
   implementation("com.facebook.fbjni:fbjni:0.5.1")
-  implementation("org.pytorch.executorch:executorch") {
-    exclude("com.facebook.fbjni", "fbjni-java-only")
-  }
+  implementation(files("libs/executorch.aar"))
   testImplementation("junit:junit:4.13.2")
   androidTestImplementation("androidx.test.ext:junit:1.1.5")
   androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
@@ -72,8 +70,8 @@ dependencies {
 tasks.register("setup") {
   doFirst {
     exec {
-      commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh")
-      workingDir("../../../../../")
+      commandLine("sh", "setup.sh")
+      workingDir("../")
     }
   }
 }
diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml
index 4c16e3a994e..8d71b156398 100644
--- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml
+++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml
@@ -27,18 +27,6 @@
-
-
-
-
-
-
-
-
 maxScore) {
-          maxScore = scores[i];
-          maxScoreIdx = i;
-        }
-      }
-
-      String className = ImageNetClasses.IMAGENET_CLASSES[maxScoreIdx];
-
-      // showing className on UI
-      TextView textView = findViewById(R.id.text);
-      textView.setText(className);
-    }
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-      super.onCreate(savedInstanceState);
-      setContentView(R.layout.activity_classification);
-
-      final Button classificationDemoButton = findViewById(R.id.segmentationDemoButton);
-      classificationDemoButton.setOnClickListener(
-          new View.OnClickListener() {
-            public void onClick(View v) {
-              openSegmentationActivity();
-            }
-          });
-
-      final Button forwardButton = findViewById(R.id.forward);
-      forwardButton.setOnClickListener(
-          new View.OnClickListener() {
-            public void onClick(View v) {
-              TextView textView = findViewById(R.id.text);
-              textView.setText("Running");
-              ClassificationActivity.this.run();
-            }
-          });
- - populateBitmap("corgi2.jpg"); - } -} diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ImageNetClasses.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ImageNetClasses.java deleted file mode 100644 index 08e966e6bc8..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ImageNetClasses.java +++ /dev/null @@ -1,1021 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchdemo; - -public class ImageNetClasses { - public static String[] IMAGENET_CLASSES = - new String[] { - "tench, Tinca tinca", - "goldfish, Carassius auratus", - "great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias", - "tiger shark, Galeocerdo cuvieri", - "hammerhead, hammerhead shark", - "electric ray, crampfish, numbfish, torpedo", - "stingray", - "cock", - "hen", - "ostrich, Struthio camelus", - "brambling, Fringilla montifringilla", - "goldfinch, Carduelis carduelis", - "house finch, linnet, Carpodacus mexicanus", - "junco, snowbird", - "indigo bunting, indigo finch, indigo bird, Passerina cyanea", - "robin, American robin, Turdus migratorius", - "bulbul", - "jay", - "magpie", - "chickadee", - "water ouzel, dipper", - "kite", - "bald eagle, American eagle, Haliaeetus leucocephalus", - "vulture", - "great grey owl, great gray owl, Strix nebulosa", - "European fire salamander, Salamandra salamandra", - "common newt, Triturus vulgaris", - "eft", - "spotted salamander, Ambystoma maculatum", - "axolotl, mud puppy, Ambystoma mexicanum", - "bullfrog, Rana catesbeiana", - "tree frog, tree-frog", - "tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui", - "loggerhead, loggerhead turtle, Caretta caretta", - "leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea", - "mud turtle", - "terrapin", - "box turtle, box tortoise", - "banded gecko", - "common iguana, iguana, Iguana iguana", - "American chameleon, anole, Anolis carolinensis", - "whiptail, whiptail lizard", - "agama", - "frilled lizard, Chlamydosaurus kingi", - "alligator lizard", - "Gila monster, Heloderma suspectum", - "green lizard, Lacerta viridis", - "African chameleon, Chamaeleo chamaeleon", - "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis", - "African crocodile, Nile crocodile, Crocodylus niloticus", - "American alligator, Alligator mississipiensis", - "triceratops", - "thunder snake, worm snake, Carphophis amoenus", - "ringneck snake, ring-necked snake, ring snake", - "hognose snake, puff adder, sand viper", - "green snake, grass snake", - "king snake, kingsnake", - "garter snake, grass snake", - "water snake", - "vine snake", - "night snake, Hypsiglena torquata", - "boa constrictor, Constrictor constrictor", - "rock python, rock snake, Python sebae", - "Indian cobra, Naja naja", - "green mamba", - "sea snake", - "horned viper, cerastes, sand viper, horned asp, Cerastes cornutus", - "diamondback, diamondback rattlesnake, Crotalus adamanteus", - "sidewinder, horned rattlesnake, Crotalus cerastes", - "trilobite", - "harvestman, daddy longlegs, Phalangium opilio", - "scorpion", - "black and gold garden spider, Argiope aurantia", - "barn spider, Araneus cavaticus", - "garden spider, Aranea diademata", - "black widow, 
Latrodectus mactans", - "tarantula", - "wolf spider, hunting spider", - "tick", - "centipede", - "black grouse", - "ptarmigan", - "ruffed grouse, partridge, Bonasa umbellus", - "prairie chicken, prairie grouse, prairie fowl", - "peacock", - "quail", - "partridge", - "African grey, African gray, Psittacus erithacus", - "macaw", - "sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita", - "lorikeet", - "coucal", - "bee eater", - "hornbill", - "hummingbird", - "jacamar", - "toucan", - "drake", - "red-breasted merganser, Mergus serrator", - "goose", - "black swan, Cygnus atratus", - "tusker", - "echidna, spiny anteater, anteater", - "platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus", - "wallaby, brush kangaroo", - "koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus", - "wombat", - "jellyfish", - "sea anemone, anemone", - "brain coral", - "flatworm, platyhelminth", - "nematode, nematode worm, roundworm", - "conch", - "snail", - "slug", - "sea slug, nudibranch", - "chiton, coat-of-mail shell, sea cradle, polyplacophore", - "chambered nautilus, pearly nautilus, nautilus", - "Dungeness crab, Cancer magister", - "rock crab, Cancer irroratus", - "fiddler crab", - "king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica", - "American lobster, Northern lobster, Maine lobster, Homarus americanus", - "spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish", - "crayfish, crawfish, crawdad, crawdaddy", - "hermit crab", - "isopod", - "white stork, Ciconia ciconia", - "black stork, Ciconia nigra", - "spoonbill", - "flamingo", - "little blue heron, Egretta caerulea", - "American egret, great white heron, Egretta albus", - "bittern", - "crane", - "limpkin, Aramus pictus", - "European gallinule, Porphyrio porphyrio", - "American coot, marsh hen, mud hen, water hen, Fulica americana", - "bustard", - "ruddy turnstone, Arenaria interpres", - "red-backed sandpiper, dunlin, Erolia alpina", - "redshank, Tringa totanus", - "dowitcher", - "oystercatcher, oyster catcher", - "pelican", - "king penguin, Aptenodytes patagonica", - "albatross, mollymawk", - "grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus", - "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", - "dugong, Dugong dugon", - "sea lion", - "Chihuahua", - "Japanese spaniel", - "Maltese dog, Maltese terrier, Maltese", - "Pekinese, Pekingese, Peke", - "Shih-Tzu", - "Blenheim spaniel", - "papillon", - "toy terrier", - "Rhodesian ridgeback", - "Afghan hound, Afghan", - "basset, basset hound", - "beagle", - "bloodhound, sleuthhound", - "bluetick", - "black-and-tan coonhound", - "Walker hound, Walker foxhound", - "English foxhound", - "redbone", - "borzoi, Russian wolfhound", - "Irish wolfhound", - "Italian greyhound", - "whippet", - "Ibizan hound, Ibizan Podenco", - "Norwegian elkhound, elkhound", - "otterhound, otter hound", - "Saluki, gazelle hound", - "Scottish deerhound, deerhound", - "Weimaraner", - "Staffordshire bullterrier, Staffordshire bull terrier", - "American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull" - + " terrier", - "Bedlington terrier", - "Border terrier", - "Kerry blue terrier", - "Irish terrier", - "Norfolk terrier", - "Norwich terrier", - "Yorkshire terrier", - "wire-haired fox terrier", - "Lakeland terrier", - "Sealyham terrier, Sealyham", - "Airedale, Airedale terrier", - "cairn, cairn terrier", - "Australian terrier", - "Dandie Dinmont, Dandie Dinmont 
terrier", - "Boston bull, Boston terrier", - "miniature schnauzer", - "giant schnauzer", - "standard schnauzer", - "Scotch terrier, Scottish terrier, Scottie", - "Tibetan terrier, chrysanthemum dog", - "silky terrier, Sydney silky", - "soft-coated wheaten terrier", - "West Highland white terrier", - "Lhasa, Lhasa apso", - "flat-coated retriever", - "curly-coated retriever", - "golden retriever", - "Labrador retriever", - "Chesapeake Bay retriever", - "German short-haired pointer", - "vizsla, Hungarian pointer", - "English setter", - "Irish setter, red setter", - "Gordon setter", - "Brittany spaniel", - "clumber, clumber spaniel", - "English springer, English springer spaniel", - "Welsh springer spaniel", - "cocker spaniel, English cocker spaniel, cocker", - "Sussex spaniel", - "Irish water spaniel", - "kuvasz", - "schipperke", - "groenendael", - "malinois", - "briard", - "kelpie", - "komondor", - "Old English sheepdog, bobtail", - "Shetland sheepdog, Shetland sheep dog, Shetland", - "collie", - "Border collie", - "Bouvier des Flandres, Bouviers des Flandres", - "Rottweiler", - "German shepherd, German shepherd dog, German police dog, alsatian", - "Doberman, Doberman pinscher", - "miniature pinscher", - "Greater Swiss Mountain dog", - "Bernese mountain dog", - "Appenzeller", - "EntleBucher", - "boxer", - "bull mastiff", - "Tibetan mastiff", - "French bulldog", - "Great Dane", - "Saint Bernard, St Bernard", - "Eskimo dog, husky", - "malamute, malemute, Alaskan malamute", - "Siberian husky", - "dalmatian, coach dog, carriage dog", - "affenpinscher, monkey pinscher, monkey dog", - "basenji", - "pug, pug-dog", - "Leonberg", - "Newfoundland, Newfoundland dog", - "Great Pyrenees", - "Samoyed, Samoyede", - "Pomeranian", - "chow, chow chow", - "keeshond", - "Brabancon griffon", - "Pembroke, Pembroke Welsh corgi", - "Cardigan, Cardigan Welsh corgi", - "toy poodle", - "miniature poodle", - "standard poodle", - "Mexican hairless", - "timber wolf, grey wolf, gray wolf, Canis lupus", - "white wolf, Arctic wolf, Canis lupus tundrarum", - "red wolf, maned wolf, Canis rufus, Canis niger", - "coyote, prairie wolf, brush wolf, Canis latrans", - "dingo, warrigal, warragal, Canis dingo", - "dhole, Cuon alpinus", - "African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus", - "hyena, hyaena", - "red fox, Vulpes vulpes", - "kit fox, Vulpes macrotis", - "Arctic fox, white fox, Alopex lagopus", - "grey fox, gray fox, Urocyon cinereoargenteus", - "tabby, tabby cat", - "tiger cat", - "Persian cat", - "Siamese cat, Siamese", - "Egyptian cat", - "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", - "lynx, catamount", - "leopard, Panthera pardus", - "snow leopard, ounce, Panthera uncia", - "jaguar, panther, Panthera onca, Felis onca", - "lion, king of beasts, Panthera leo", - "tiger, Panthera tigris", - "cheetah, chetah, Acinonyx jubatus", - "brown bear, bruin, Ursus arctos", - "American black bear, black bear, Ursus americanus, Euarctos americanus", - "ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus", - "sloth bear, Melursus ursinus, Ursus ursinus", - "mongoose", - "meerkat, mierkat", - "tiger beetle", - "ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle", - "ground beetle, carabid beetle", - "long-horned beetle, longicorn, longicorn beetle", - "leaf beetle, chrysomelid", - "dung beetle", - "rhinoceros beetle", - "weevil", - "fly", - "bee", - "ant, emmet, pismire", - "grasshopper, hopper", - "cricket", - "walking stick, walkingstick, stick insect", - "cockroach, 
roach", - "mantis, mantid", - "cicada, cicala", - "leafhopper", - "lacewing, lacewing fly", - "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake" - + " doctor, mosquito hawk, skeeter hawk", - "damselfly", - "admiral", - "ringlet, ringlet butterfly", - "monarch, monarch butterfly, milkweed butterfly, Danaus plexippus", - "cabbage butterfly", - "sulphur butterfly, sulfur butterfly", - "lycaenid, lycaenid butterfly", - "starfish, sea star", - "sea urchin", - "sea cucumber, holothurian", - "wood rabbit, cottontail, cottontail rabbit", - "hare", - "Angora, Angora rabbit", - "hamster", - "porcupine, hedgehog", - "fox squirrel, eastern fox squirrel, Sciurus niger", - "marmot", - "beaver", - "guinea pig, Cavia cobaya", - "sorrel", - "zebra", - "hog, pig, grunter, squealer, Sus scrofa", - "wild boar, boar, Sus scrofa", - "warthog", - "hippopotamus, hippo, river horse, Hippopotamus amphibius", - "ox", - "water buffalo, water ox, Asiatic buffalo, Bubalus bubalis", - "bison", - "ram, tup", - "bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis" - + " canadensis", - "ibex, Capra ibex", - "hartebeest", - "impala, Aepyceros melampus", - "gazelle", - "Arabian camel, dromedary, Camelus dromedarius", - "llama", - "weasel", - "mink", - "polecat, fitch, foulmart, foumart, Mustela putorius", - "black-footed ferret, ferret, Mustela nigripes", - "otter", - "skunk, polecat, wood pussy", - "badger", - "armadillo", - "three-toed sloth, ai, Bradypus tridactylus", - "orangutan, orang, orangutang, Pongo pygmaeus", - "gorilla, Gorilla gorilla", - "chimpanzee, chimp, Pan troglodytes", - "gibbon, Hylobates lar", - "siamang, Hylobates syndactylus, Symphalangus syndactylus", - "guenon, guenon monkey", - "patas, hussar monkey, Erythrocebus patas", - "baboon", - "macaque", - "langur", - "colobus, colobus monkey", - "proboscis monkey, Nasalis larvatus", - "marmoset", - "capuchin, ringtail, Cebus capucinus", - "howler monkey, howler", - "titi, titi monkey", - "spider monkey, Ateles geoffroyi", - "squirrel monkey, Saimiri sciureus", - "Madagascar cat, ring-tailed lemur, Lemur catta", - "indri, indris, Indri indri, Indri brevicaudatus", - "Indian elephant, Elephas maximus", - "African elephant, Loxodonta africana", - "lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens", - "giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca", - "barracouta, snoek", - "eel", - "coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch", - "rock beauty, Holocanthus tricolor", - "anemone fish", - "sturgeon", - "gar, garfish, garpike, billfish, Lepisosteus osseus", - "lionfish", - "puffer, pufferfish, blowfish, globefish", - "abacus", - "abaya", - "academic gown, academic robe, judge's robe", - "accordion, piano accordion, squeeze box", - "acoustic guitar", - "aircraft carrier, carrier, flattop, attack aircraft carrier", - "airliner", - "airship, dirigible", - "altar", - "ambulance", - "amphibian, amphibious vehicle", - "analog clock", - "apiary, bee house", - "apron", - "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel," - + " trash bin", - "assault rifle, assault gun", - "backpack, back pack, knapsack, packsack, rucksack, haversack", - "bakery, bakeshop, bakehouse", - "balance beam, beam", - "balloon", - "ballpoint, ballpoint pen, ballpen, Biro", - "Band Aid", - "banjo", - "bannister, banister, balustrade, balusters, handrail", - "barbell", - "barber chair", - "barbershop", - "barn", - 
"barometer", - "barrel, cask", - "barrow, garden cart, lawn cart, wheelbarrow", - "baseball", - "basketball", - "bassinet", - "bassoon", - "bathing cap, swimming cap", - "bath towel", - "bathtub, bathing tub, bath, tub", - "beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon", - "beacon, lighthouse, beacon light, pharos", - "beaker", - "bearskin, busby, shako", - "beer bottle", - "beer glass", - "bell cote, bell cot", - "bib", - "bicycle-built-for-two, tandem bicycle, tandem", - "bikini, two-piece", - "binder, ring-binder", - "binoculars, field glasses, opera glasses", - "birdhouse", - "boathouse", - "bobsled, bobsleigh, bob", - "bolo tie, bolo, bola tie, bola", - "bonnet, poke bonnet", - "bookcase", - "bookshop, bookstore, bookstall", - "bottlecap", - "bow", - "bow tie, bow-tie, bowtie", - "brass, memorial tablet, plaque", - "brassiere, bra, bandeau", - "breakwater, groin, groyne, mole, bulwark, seawall, jetty", - "breastplate, aegis, egis", - "broom", - "bucket, pail", - "buckle", - "bulletproof vest", - "bullet train, bullet", - "butcher shop, meat market", - "cab, hack, taxi, taxicab", - "caldron, cauldron", - "candle, taper, wax light", - "cannon", - "canoe", - "can opener, tin opener", - "cardigan", - "car mirror", - "carousel, carrousel, merry-go-round, roundabout, whirligig", - "carpenter's kit, tool kit", - "carton", - "car wheel", - "cash machine, cash dispenser, automated teller machine, automatic teller machine," - + " automated teller, automatic teller, ATM", - "cassette", - "cassette player", - "castle", - "catamaran", - "CD player", - "cello, violoncello", - "cellular telephone, cellular phone, cellphone, cell, mobile phone", - "chain", - "chainlink fence", - "chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour", - "chain saw, chainsaw", - "chest", - "chiffonier, commode", - "chime, bell, gong", - "china cabinet, china closet", - "Christmas stocking", - "church, church building", - "cinema, movie theater, movie theatre, movie house, picture palace", - "cleaver, meat cleaver, chopper", - "cliff dwelling", - "cloak", - "clog, geta, patten, sabot", - "cocktail shaker", - "coffee mug", - "coffeepot", - "coil, spiral, volute, whorl, helix", - "combination lock", - "computer keyboard, keypad", - "confectionery, confectionary, candy store", - "container ship, containership, container vessel", - "convertible", - "corkscrew, bottle screw", - "cornet, horn, trumpet, trump", - "cowboy boot", - "cowboy hat, ten-gallon hat", - "cradle", - "crane", - "crash helmet", - "crate", - "crib, cot", - "Crock Pot", - "croquet ball", - "crutch", - "cuirass", - "dam, dike, dyke", - "desk", - "desktop computer", - "dial telephone, dial phone", - "diaper, nappy, napkin", - "digital clock", - "digital watch", - "dining table, board", - "dishrag, dishcloth", - "dishwasher, dish washer, dishwashing machine", - "disk brake, disc brake", - "dock, dockage, docking facility", - "dogsled, dog sled, dog sleigh", - "dome", - "doormat, welcome mat", - "drilling platform, offshore rig", - "drum, membranophone, tympan", - "drumstick", - "dumbbell", - "Dutch oven", - "electric fan, blower", - "electric guitar", - "electric locomotive", - "entertainment center", - "envelope", - "espresso maker", - "face powder", - "feather boa, boa", - "file, file cabinet, filing cabinet", - "fireboat", - "fire engine, fire truck", - "fire screen, fireguard", - "flagpole, flagstaff", - "flute, transverse flute", - "folding chair", - "football helmet", - "forklift", - 
"fountain", - "fountain pen", - "four-poster", - "freight car", - "French horn, horn", - "frying pan, frypan, skillet", - "fur coat", - "garbage truck, dustcart", - "gasmask, respirator, gas helmet", - "gas pump, gasoline pump, petrol pump, island dispenser", - "goblet", - "go-kart", - "golf ball", - "golfcart, golf cart", - "gondola", - "gong, tam-tam", - "gown", - "grand piano, grand", - "greenhouse, nursery, glasshouse", - "grille, radiator grille", - "grocery store, grocery, food market, market", - "guillotine", - "hair slide", - "hair spray", - "half track", - "hammer", - "hamper", - "hand blower, blow dryer, blow drier, hair dryer, hair drier", - "hand-held computer, hand-held microcomputer", - "handkerchief, hankie, hanky, hankey", - "hard disc, hard disk, fixed disk", - "harmonica, mouth organ, harp, mouth harp", - "harp", - "harvester, reaper", - "hatchet", - "holster", - "home theater, home theatre", - "honeycomb", - "hook, claw", - "hoopskirt, crinoline", - "horizontal bar, high bar", - "horse cart, horse-cart", - "hourglass", - "iPod", - "iron, smoothing iron", - "jack-o'-lantern", - "jean, blue jean, denim", - "jeep, landrover", - "jersey, T-shirt, tee shirt", - "jigsaw puzzle", - "jinrikisha, ricksha, rickshaw", - "joystick", - "kimono", - "knee pad", - "knot", - "lab coat, laboratory coat", - "ladle", - "lampshade, lamp shade", - "laptop, laptop computer", - "lawn mower, mower", - "lens cap, lens cover", - "letter opener, paper knife, paperknife", - "library", - "lifeboat", - "lighter, light, igniter, ignitor", - "limousine, limo", - "liner, ocean liner", - "lipstick, lip rouge", - "Loafer", - "lotion", - "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", - "loupe, jeweler's loupe", - "lumbermill, sawmill", - "magnetic compass", - "mailbag, postbag", - "mailbox, letter box", - "maillot", - "maillot, tank suit", - "manhole cover", - "maraca", - "marimba, xylophone", - "mask", - "matchstick", - "maypole", - "maze, labyrinth", - "measuring cup", - "medicine chest, medicine cabinet", - "megalith, megalithic structure", - "microphone, mike", - "microwave, microwave oven", - "military uniform", - "milk can", - "minibus", - "miniskirt, mini", - "minivan", - "missile", - "mitten", - "mixing bowl", - "mobile home, manufactured home", - "Model T", - "modem", - "monastery", - "monitor", - "moped", - "mortar", - "mortarboard", - "mosque", - "mosquito net", - "motor scooter, scooter", - "mountain bike, all-terrain bike, off-roader", - "mountain tent", - "mouse, computer mouse", - "mousetrap", - "moving van", - "muzzle", - "nail", - "neck brace", - "necklace", - "nipple", - "notebook, notebook computer", - "obelisk", - "oboe, hautboy, hautbois", - "ocarina, sweet potato", - "odometer, hodometer, mileometer, milometer", - "oil filter", - "organ, pipe organ", - "oscilloscope, scope, cathode-ray oscilloscope, CRO", - "overskirt", - "oxcart", - "oxygen mask", - "packet", - "paddle, boat paddle", - "paddlewheel, paddle wheel", - "padlock", - "paintbrush", - "pajama, pyjama, pj's, jammies", - "palace", - "panpipe, pandean pipe, syrinx", - "paper towel", - "parachute, chute", - "parallel bars, bars", - "park bench", - "parking meter", - "passenger car, coach, carriage", - "patio, terrace", - "pay-phone, pay-station", - "pedestal, plinth, footstall", - "pencil box, pencil case", - "pencil sharpener", - "perfume, essence", - "Petri dish", - "photocopier", - "pick, plectrum, plectron", - "pickelhaube", - "picket fence, paling", - "pickup, pickup truck", - "pier", - "piggy 
bank, penny bank", - "pill bottle", - "pillow", - "ping-pong ball", - "pinwheel", - "pirate, pirate ship", - "pitcher, ewer", - "plane, carpenter's plane, woodworking plane", - "planetarium", - "plastic bag", - "plate rack", - "plow, plough", - "plunger, plumber's helper", - "Polaroid camera, Polaroid Land camera", - "pole", - "police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria", - "poncho", - "pool table, billiard table, snooker table", - "pop bottle, soda bottle", - "pot, flowerpot", - "potter's wheel", - "power drill", - "prayer rug, prayer mat", - "printer", - "prison, prison house", - "projectile, missile", - "projector", - "puck, hockey puck", - "punching bag, punch bag, punching ball, punchball", - "purse", - "quill, quill pen", - "quilt, comforter, comfort, puff", - "racer, race car, racing car", - "racket, racquet", - "radiator", - "radio, wireless", - "radio telescope, radio reflector", - "rain barrel", - "recreational vehicle, RV, R.V.", - "reel", - "reflex camera", - "refrigerator, icebox", - "remote control, remote", - "restaurant, eating house, eating place, eatery", - "revolver, six-gun, six-shooter", - "rifle", - "rocking chair, rocker", - "rotisserie", - "rubber eraser, rubber, pencil eraser", - "rugby ball", - "rule, ruler", - "running shoe", - "safe", - "safety pin", - "saltshaker, salt shaker", - "sandal", - "sarong", - "sax, saxophone", - "scabbard", - "scale, weighing machine", - "school bus", - "schooner", - "scoreboard", - "screen, CRT screen", - "screw", - "screwdriver", - "seat belt, seatbelt", - "sewing machine", - "shield, buckler", - "shoe shop, shoe-shop, shoe store", - "shoji", - "shopping basket", - "shopping cart", - "shovel", - "shower cap", - "shower curtain", - "ski", - "ski mask", - "sleeping bag", - "slide rule, slipstick", - "sliding door", - "slot, one-armed bandit", - "snorkel", - "snowmobile", - "snowplow, snowplough", - "soap dispenser", - "soccer ball", - "sock", - "solar dish, solar collector, solar furnace", - "sombrero", - "soup bowl", - "space bar", - "space heater", - "space shuttle", - "spatula", - "speedboat", - "spider web, spider's web", - "spindle", - "sports car, sport car", - "spotlight, spot", - "stage", - "steam locomotive", - "steel arch bridge", - "steel drum", - "stethoscope", - "stole", - "stone wall", - "stopwatch, stop watch", - "stove", - "strainer", - "streetcar, tram, tramcar, trolley, trolley car", - "stretcher", - "studio couch, day bed", - "stupa, tope", - "submarine, pigboat, sub, U-boat", - "suit, suit of clothes", - "sundial", - "sunglass", - "sunglasses, dark glasses, shades", - "sunscreen, sunblock, sun blocker", - "suspension bridge", - "swab, swob, mop", - "sweatshirt", - "swimming trunks, bathing trunks", - "swing", - "switch, electric switch, electrical switch", - "syringe", - "table lamp", - "tank, army tank, armored combat vehicle, armoured combat vehicle", - "tape player", - "teapot", - "teddy, teddy bear", - "television, television system", - "tennis ball", - "thatch, thatched roof", - "theater curtain, theatre curtain", - "thimble", - "thresher, thrasher, threshing machine", - "throne", - "tile roof", - "toaster", - "tobacco shop, tobacconist shop, tobacconist", - "toilet seat", - "torch", - "totem pole", - "tow truck, tow car, wrecker", - "toyshop", - "tractor", - "trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi", - "tray", - "trench coat", - "tricycle, trike, velocipede", - "trimaran", - "tripod", - "triumphal arch", - "trolleybus, trolley coach, trackless 
trolley", - "trombone", - "tub, vat", - "turnstile", - "typewriter keyboard", - "umbrella", - "unicycle, monocycle", - "upright, upright piano", - "vacuum, vacuum cleaner", - "vase", - "vault", - "velvet", - "vending machine", - "vestment", - "viaduct", - "violin, fiddle", - "volleyball", - "waffle iron", - "wall clock", - "wallet, billfold, notecase, pocketbook", - "wardrobe, closet, press", - "warplane, military plane", - "washbasin, handbasin, washbowl, lavabo, wash-hand basin", - "washer, automatic washer, washing machine", - "water bottle", - "water jug", - "water tower", - "whiskey jug", - "whistle", - "wig", - "window screen", - "window shade", - "Windsor tie", - "wine bottle", - "wing", - "wok", - "wooden spoon", - "wool, woolen, woollen", - "worm fence, snake fence, snake-rail fence, Virginia fence", - "wreck", - "yawl", - "yurt", - "web site, website, internet site, site", - "comic book", - "crossword puzzle, crossword", - "street sign", - "traffic light, traffic signal, stoplight", - "book jacket, dust cover, dust jacket, dust wrapper", - "menu", - "plate", - "guacamole", - "consomme", - "hot pot, hotpot", - "trifle", - "ice cream, icecream", - "ice lolly, lolly, lollipop, popsicle", - "French loaf", - "bagel, beigel", - "pretzel", - "cheeseburger", - "hotdog, hot dog, red hot", - "mashed potato", - "head cabbage", - "broccoli", - "cauliflower", - "zucchini, courgette", - "spaghetti squash", - "acorn squash", - "butternut squash", - "cucumber, cuke", - "artichoke, globe artichoke", - "bell pepper", - "cardoon", - "mushroom", - "Granny Smith", - "strawberry", - "orange", - "lemon", - "fig", - "pineapple, ananas", - "banana", - "jackfruit, jak, jack", - "custard apple", - "pomegranate", - "hay", - "carbonara", - "chocolate sauce, chocolate syrup", - "dough", - "meat loaf, meatloaf", - "pizza, pizza pie", - "potpie", - "burrito", - "red wine", - "espresso", - "cup", - "eggnog", - "alp", - "bubble", - "cliff, drop, drop-off", - "coral reef", - "geyser", - "lakeside, lakeshore", - "promontory, headland, head, foreland", - "sandbar, sand bar", - "seashore, coast, seacoast, sea-coast", - "valley, vale", - "volcano", - "ballplayer, baseball player", - "groom, bridegroom", - "scuba diver", - "rapeseed", - "daisy", - "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium" - + " parviflorum", - "corn", - "acorn", - "hip, rose hip, rosehip", - "buckeye, horse chestnut, conker", - "coral fungus", - "agaric", - "gyromitra", - "stinkhorn, carrion fungus", - "earthstar", - "hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa", - "bolete", - "ear, spike, capitulum", - "toilet tissue, toilet paper, bathroom tissue" - }; -} diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java index 85b011d3942..9ac800b49a3 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java @@ -97,14 +97,7 @@ protected void onCreate(Bundle savedInstanceState) { finish(); } - try { - mModule = - Module.load(MainActivity.assetFilePath(getApplicationContext(), "dl3_xnnpack_fp32.pte")); - - } catch (IOException e) { - Log.e("ImageSegmentation", "Error reading assets", e); - finish(); - } + mModule = 
Module.load("/data/local/tmp/dl3_xnnpack_fp32.pte"); mImageView = findViewById(R.id.imageView); mImageView.setImageBitmap(mBitmap); @@ -130,16 +123,8 @@ public void onClick(View v) { mButtonXnnpack.setOnClickListener( new View.OnClickListener() { public void onClick(View v) { - try { - mModule.destroy(); - mModule = - Module.load( - MainActivity.assetFilePath(getApplicationContext(), "dl3_xnnpack_fp32.pte")); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error reading assets", e); - finish(); - } - + mModule.destroy(); + mModule = Module.load("/data/local/tmp/dl3_xnnpack_fp32.pte"); mButtonXnnpack.setEnabled(false); mProgressBar.setVisibility(ProgressBar.VISIBLE); mButtonXnnpack.setText(getString(R.string.run_model)); @@ -152,14 +137,8 @@ public void onClick(View v) { mButtonHtp.setOnClickListener( new View.OnClickListener() { public void onClick(View v) { - try { - mModule.destroy(); - mModule = - Module.load(MainActivity.assetFilePath(getApplicationContext(), "dlv3_qnn.pte")); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error reading assets", e); - finish(); - } + mModule.destroy(); + mModule = Module.load("/data/local/tmp/dlv3_qnn.pte"); mButtonHtp.setEnabled(false); mProgressBar.setVisibility(ProgressBar.VISIBLE); mButtonHtp.setText(getString(R.string.run_model)); diff --git a/examples/demo-apps/android/ExecuTorchDemo/settings.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/settings.gradle.kts index 40adb48f270..ba0e809fd98 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/settings.gradle.kts +++ b/examples/demo-apps/android/ExecuTorchDemo/settings.gradle.kts @@ -25,5 +25,3 @@ dependencyResolutionManagement { rootProject.name = "ExecuTorch Demo" include(":app") - -includeBuild("../../../../extension/android") diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 00d9201b092..28ecc083bba 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -7,35 +7,6 @@ set -eu -CMAKE_OUT="${CMAKE_OUT:-cmake-out-android}" -# Note: Set up ANDROID_NDK and ANDROID_ABI -cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TESNOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}" - -if [ "$(uname)" == "Darwin" ]; then - CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) -else - CMAKE_JOBS=$(( $(nproc) - 1 )) -fi -cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - -cmake extension/android \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/extension/android - -cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config Release - -JNI_LIBS_PATH="examples/demo-apps/android/ExecuTorchDemo/app/src/main/jniLibs" -mkdir -p "${JNI_LIBS_PATH}/${ANDROID_ABI}" -cp "${CMAKE_OUT}"/extension/android/libexecutorch_jni.so "${JNI_LIBS_PATH}/${ANDROID_ABI}/" +BASEDIR=$(dirname "$0") +mkdir -p "$BASEDIR"/app/libs +curl -o "$BASEDIR"/app/libs/executorch.aar https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 48927e2f801..a735b48dee1 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -75,7 +75,7 @@ Optional Parameters: ```java // Upon returning to the Main Chat Activity -mModule = new LlamaModule( +mModule = new LlmModule( ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), modelPath, tokenizerPath, diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java index 221a9bd7417..21ac285d3b0 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java @@ -20,11 +20,11 @@ import java.util.List; import org.junit.Test; import org.junit.runner.RunWith; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.LlamaModule; +import org.pytorch.executorch.extension.llm.LlmCallback; +import org.pytorch.executorch.extension.llm.LlmModule; @RunWith(AndroidJUnit4.class) -public class PerfTest implements LlamaCallback { +public class PerfTest implements LlmCallback { private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; private static final String TOKENIZER_BIN = "tokenizer.bin"; @@ -41,7 +41,7 @@ public void testTokensPerSecond() { .filter(file -> file.getName().endsWith(".pte")) .forEach( model -> { - LlamaModule mModule = new LlamaModule(model.getPath(), tokenizerPath, 0.8f); + LlmModule mModule = new LlmModule(model.getPath(), tokenizerPath, 0.8f); // Print the model name because there might be more than one of them report("ModelName", model.getName()); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 
7b88d16d708..e19155b83e8 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -49,17 +49,17 @@ import java.util.List; import java.util.concurrent.Executor; import java.util.concurrent.Executors; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.LlamaModule; +import org.pytorch.executorch.extension.llm.LlmCallback; +import org.pytorch.executorch.extension.llm.LlmModule; -public class MainActivity extends AppCompatActivity implements Runnable, LlamaCallback { +public class MainActivity extends AppCompatActivity implements Runnable, LlmCallback { private EditText mEditTextMessage; private ImageButton mSendButton; private ImageButton mGalleryButton; private ImageButton mCameraButton; private ListView mMessagesView; private MessageAdapter mMessageAdapter; - private LlamaModule mModule = null; + private LlmModule mModule = null; private Message mResultMessage = null; private ImageButton mSettingsButton; private TextView mMemoryView; @@ -124,7 +124,7 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera } long runStartTime = System.currentTimeMillis(); mModule = - new LlamaModule( + new LlmModule( ModelUtils.getModelCategory( mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()), modelPath, @@ -714,7 +714,7 @@ private void onModelRunStopped() { // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list - // Note: This has to be done after imageURIs are no longer needed by LlamaModule + // Note: This has to be done after imageURIs are no longer needed by LlmModule mSelectedImageUri = null; promptID++; Runnable runnable = diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java index 4dc32d14756..78cfee993c4 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java @@ -13,12 +13,12 @@ import android.os.Looper; import android.os.Message; import androidx.annotation.NonNull; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.LlamaModule; +import org.pytorch.executorch.extension.llm.LlmCallback; +import org.pytorch.executorch.extension.llm.LlmModule; /** A helper class to handle all model running logic within this class. 
*/ -public class ModelRunner implements LlamaCallback { - LlamaModule mModule = null; +public class ModelRunner implements LlmCallback { + LlmModule mModule = null; String mModelFilePath = ""; String mTokenizerFilePath = ""; @@ -45,7 +45,7 @@ public class ModelRunner implements LlamaCallback { mTokenizerFilePath = tokenizerFilePath; mCallback = callback; - mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, 0.8f); + mModule = new LlmModule(mModelFilePath, mTokenizerFilePath, 0.8f); mHandlerThread = new HandlerThread("ModelRunner"); mHandlerThread.start(); mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index c922e32e761..416dc0b0aac 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -189,7 +189,7 @@ public void onTextChanged(CharSequence s, int start, int before, int count) {} public void afterTextChanged(Editable s) { mSetTemperature = Double.parseDouble(s.toString()); // This is needed because temperature is changed together with model loading - // Once temperature is no longer in LlamaModule constructor, we can remove this + // Once temperature is no longer in LlmModule constructor, we can remove this mSettingsFields.saveLoadModelAction(true); saveSettings(); } diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md index 6351640dcc0..4d1346963c7 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md @@ -123,7 +123,7 @@ The Mediatek runner (`examples/mediatek/executor_runner/mtk_llama_runner.cpp`) c Next we need to build and compile the MediaTek backend and MediaTek Llama runner. By setting `NEURON_BUFFER_ALLOCATOR_LIB`, the script will build the MediaTek backend. ``` -sh build/build_android_llm_demo.sh +sh scripts/build_android_library.sh ``` **Output**: This will generate an .aar file that is already imported into the expected directory for the Android app. It will live in `examples/demo-apps/android/LlamaDemo/app/libs`. 
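For orientation, the flow the updated MediaTek doc describes looks roughly like the sketch below. This is a hedged illustration, not part of the change: the NDK and allocator paths and the model/tokenizer filenames are placeholders; only `NEURON_BUFFER_ALLOCATOR_LIB`, `scripts/build_android_library.sh`, and the `/data/local/tmp/llama/` device path come from this diff.

```
# Placeholders: point these at your own NDK and MediaTek SDK.
export ANDROID_NDK=/path/to/android-ndk
export NEURON_BUFFER_ALLOCATOR_LIB=/path/to/libneuron_buffer_allocator.so

# Setting NEURON_BUFFER_ALLOCATOR_LIB makes the script build the MediaTek
# backend; the resulting executorch.aar lands in the app's libs directory.
sh scripts/build_android_library.sh

# The demo apps in this change load models from device storage rather than
# bundled assets, so push the artifacts to the path the app reads from
# (the same directory PerfTest uses above).
adb push model.pte /data/local/tmp/llama/
adb push tokenizer.bin /data/local/tmp/llama/
```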
diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 044d80832de..0f1cde1a06f 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -13,13 +13,7 @@ if [ -z "$QNN_SDK_ROOT" ]; then fi BASEDIR=$(dirname "$0") -source "$BASEDIR"/../../../../build/build_android_llm_demo.sh +ANDROID_ABIS="arm64-v8a" bash "$BASEDIR"/setup.sh BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR - -build_jar -build_android_native_library "arm64-v8a" -build_aar -mkdir -p "$BASEDIR"/app/libs -cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch.aar diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 4d70c67ede0..c7e3a4a95d0 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -7,15 +7,11 @@ set -eu -BASEDIR=$(dirname "$0") -source "$BASEDIR"/../../../../build/build_android_llm_demo.sh - BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -build_jar -build_android_native_library "arm64-v8a" -build_android_native_library "x86_64" -build_aar +BASEDIR=$(dirname "$0") mkdir -p "$BASEDIR"/app/libs +bash "$BASEDIR"/../../../../scripts/build_android_library.sh + cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch.aar diff --git a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj index f08d61396d2..2ee4db5361d 100644 --- a/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo.xcodeproj/project.pbxproj @@ -806,7 +806,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = "swiftpm-0.5.0.20250130"; + branch = "swiftpm-0.5.0.20250317"; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index 2cc93808799..a067873a0b9 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -25,15 +25,8 @@ 03729F0C2BB203B300152F2E /* util.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F092BB203B300152F2E /* util.h */; }; 03729F122BB2042B00152F2E /* sampler.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F102BB2042B00152F2E /* sampler.h */; }; 03729F132BB2042B00152F2E /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F112BB2042B00152F2E /* sampler.cpp */; }; - 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */; }; - 03729F172BB2043600152F2E /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03729F152BB2043600152F2E /* tokenizer.h */; }; 0372C3142C89418E00CD942A /* llava_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 0372C3122C89418E00CD942A /* llava_runner.h */; }; 0372C3152C89418E00CD942A /* llava_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0372C3132C89418E00CD942A /* llava_runner.cpp */; }; - 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */; }; - 038D678D2C482C1E00B88CF2 /* llama_tiktoken.h in Headers */ = 
{isa = PBXBuildFile; fileRef = 038D678B2C482C1E00B88CF2 /* llama_tiktoken.h */; }; - 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */; }; - 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */; }; - 03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */; }; 03CF43962CEC5CEC00C7113B /* backend_coreml in Frameworks */ = {isa = PBXBuildFile; productRef = 03CF43952CEC5CEC00C7113B /* backend_coreml */; }; 03CF43982CEC5CEC00C7113B /* backend_coreml_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 03CF43972CEC5CEC00C7113B /* backend_coreml_debug */; }; 03CF439A2CEC5CEC00C7113B /* backend_mps in Frameworks */ = {isa = PBXBuildFile; productRef = 03CF43992CEC5CEC00C7113B /* backend_mps */; }; @@ -53,8 +46,20 @@ 03D03DA82C7823620088D6A7 /* text_prefiller.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DA62C7823620088D6A7 /* text_prefiller.h */; }; 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */; }; 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */ = {isa = PBXBuildFile; fileRef = 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */; }; - 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = 03DDA0FA2BD6368100D234B3 /* base64.h */; }; 26A6A4282C8A3769005A761E /* ImagePicker.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26A6A4272C8A3769005A761E /* ImagePicker.swift */; }; + F292B0752D88B0C200BE6839 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B06F2D88B0C200BE6839 /* tiktoken.cpp */; }; + F292B0762D88B0C200BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B06C2D88B0C200BE6839 /* llama2c_tokenizer.cpp */; }; + F292B0772D88B0C200BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B06A2D88B0C200BE6839 /* bpe_tokenizer_base.cpp */; }; + F292B0882D88B0D200BE6839 /* llama2c_tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B07C2D88B0D200BE6839 /* llama2c_tokenizer.h */; }; + F292B0892D88B0D200BE6839 /* tokenizer.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B0832D88B0D200BE6839 /* tokenizer.h */; }; + F292B08B2D88B0D200BE6839 /* result.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B07F2D88B0D200BE6839 /* result.h */; }; + F292B08D2D88B0D200BE6839 /* error.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B07A2D88B0D200BE6839 /* error.h */; }; + F292B08E2D88B0D200BE6839 /* bpe_tokenizer_base.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B0792D88B0D200BE6839 /* bpe_tokenizer_base.h */; }; + F292B08F2D88B0D200BE6839 /* log.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B07D2D88B0D200BE6839 /* log.h */; }; + F292B0912D88B0D200BE6839 /* tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B0812D88B0D200BE6839 /* tiktoken.h */; }; + F292B0922D88B0D200BE6839 /* base64.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B0782D88B0D200BE6839 /* base64.h */; }; + F292B1012D88B20C00BE6839 /* llama_tiktoken.h in Headers */ = {isa = PBXBuildFile; fileRef = F292B0FF2D88B20C00BE6839 /* llama_tiktoken.h */; }; + F292B1022D88B20C00BE6839 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B1002D88B20C00BE6839 /* llama_tiktoken.cpp */; }; /* End PBXBuildFile 
section */ /* Begin PBXContainerItemProxy section */ @@ -103,23 +108,28 @@ 03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; - 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer.cpp; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.cpp; sourceTree = ""; }; - 03729F152BB2043600152F2E /* tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = tokenizer.h; path = ../../../../extension/llm/tokenizer/tokenizer.h; sourceTree = ""; }; 0372C3122C89418E00CD942A /* llava_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llava_runner.h; path = ../../../examples/models/llava/runner/llava_runner.h; sourceTree = ""; }; 0372C3132C89418E00CD942A /* llava_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llava_runner.cpp; path = ../../../examples/models/llava/runner/llava_runner.cpp; sourceTree = ""; }; - 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = llama_tiktoken.cpp; sourceTree = ""; }; - 038D678B2C482C1E00B88CF2 /* llama_tiktoken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = llama_tiktoken.h; sourceTree = ""; }; - 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = bpe_tokenizer.h; path = ../../../../extension/llm/tokenizer/bpe_tokenizer.h; sourceTree = ""; }; - 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = ../../../../extension/llm/tokenizer/tiktoken.cpp; sourceTree = ""; }; - 03BADE222BD2EB6700DDFDC2 /* tiktoken.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = tiktoken.h; path = ../../../../extension/llm/tokenizer/tiktoken.h; sourceTree = ""; }; 03C5F51C2CE7D35C00D6CE3F /* Release.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = Release.xcconfig; sourceTree = ""; }; 03C5F51D2CE7D37100D6CE3F /* Debug.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = Debug.xcconfig; sourceTree = ""; }; 03D03DA52C7823620088D6A7 /* text_prefiller.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = text_prefiller.cpp; sourceTree = ""; }; 03D03DA62C7823620088D6A7 /* text_prefiller.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_prefiller.h; sourceTree = ""; }; 03D03DA92C7823830088D6A7 /* text_decoder_runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = text_decoder_runner.cpp; sourceTree = ""; }; 03D03DAA2C7823830088D6A7 /* text_decoder_runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = text_decoder_runner.h; 
sourceTree = ""; }; - 03DDA0FA2BD6368100D234B3 /* base64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = base64.h; path = ../../../../extension/llm/tokenizer/base64.h; sourceTree = ""; }; 26A6A4272C8A3769005A761E /* ImagePicker.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImagePicker.swift; sourceTree = ""; }; + F292B06A2D88B0C200BE6839 /* bpe_tokenizer_base.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer_base.cpp; path = src/bpe_tokenizer_base.cpp; sourceTree = ""; }; + F292B06C2D88B0C200BE6839 /* llama2c_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama2c_tokenizer.cpp; path = src/llama2c_tokenizer.cpp; sourceTree = ""; }; + F292B06F2D88B0C200BE6839 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = src/tiktoken.cpp; sourceTree = ""; }; + F292B0782D88B0D200BE6839 /* base64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = base64.h; sourceTree = ""; }; + F292B0792D88B0D200BE6839 /* bpe_tokenizer_base.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bpe_tokenizer_base.h; sourceTree = ""; }; + F292B07A2D88B0D200BE6839 /* error.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = error.h; sourceTree = ""; }; + F292B07C2D88B0D200BE6839 /* llama2c_tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = llama2c_tokenizer.h; sourceTree = ""; }; + F292B07D2D88B0D200BE6839 /* log.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = log.h; sourceTree = ""; }; + F292B07F2D88B0D200BE6839 /* result.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = result.h; sourceTree = ""; }; + F292B0812D88B0D200BE6839 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; + F292B0832D88B0D200BE6839 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; + F292B0FF2D88B20C00BE6839 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = /Users/larryliu/CLionProjects/executorch/examples/models/llama/tokenizer/llama_tiktoken.h; sourceTree = ""; }; + F292B1002D88B20C00BE6839 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = /Users/larryliu/CLionProjects/executorch/examples/models/llama/tokenizer/llama_tiktoken.cpp; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -214,7 +224,7 @@ 0324D69B2BAACB7C00DEF36F /* Exported */, 03729F062BB2035900152F2E /* runner */, 03729F0F2BB203E100152F2E /* sampler */, - 03729F0E2BB203D700152F2E /* tokenizer */, + 03729F0E2BB203D700152F2E /* tokenizers */, ); path = LLaMARunner; sourceTree = ""; @@ -264,20 +274,18 @@ path = ../../../../../../extension/llm/runner; sourceTree = ""; }; - 03729F0E2BB203D700152F2E /* tokenizer */ = { + 03729F0E2BB203D700152F2E /* tokenizers */ = { isa = PBXGroup; children = ( - 03DDA0FA2BD6368100D234B3 /* base64.h */, - 03729F142BB2043600152F2E /* bpe_tokenizer.cpp */, - 03BADE1F2BD2E88600DDFDC2 /* bpe_tokenizer.h */, - 038D678A2C482C1D00B88CF2 /* llama_tiktoken.cpp */, - 038D678B2C482C1E00B88CF2 /* llama_tiktoken.h */, - 03BADE212BD2EB6600DDFDC2 /* tiktoken.cpp */, - 
03BADE222BD2EB6700DDFDC2 /* tiktoken.h */, - 03729F152BB2043600152F2E /* tokenizer.h */, - ); - name = tokenizer; - path = ../../../../../models/llama/tokenizer; + F292B0FF2D88B20C00BE6839 /* llama_tiktoken.h */, + F292B1002D88B20C00BE6839 /* llama_tiktoken.cpp */, + F292B0862D88B0D200BE6839 /* include */, + F292B06A2D88B0C200BE6839 /* bpe_tokenizer_base.cpp */, + F292B06C2D88B0C200BE6839 /* llama2c_tokenizer.cpp */, + F292B06F2D88B0C200BE6839 /* tiktoken.cpp */, + ); + name = tokenizers; + path = ../../../../../../extension/llm/tokenizers; sourceTree = ""; }; 03729F0F2BB203E100152F2E /* sampler */ = { @@ -290,6 +298,37 @@ path = ../../../../../../extension/llm/sampler; sourceTree = ""; }; + F292B0842D88B0D200BE6839 /* tokenizers */ = { + isa = PBXGroup; + children = ( + F292B0782D88B0D200BE6839 /* base64.h */, + F292B0792D88B0D200BE6839 /* bpe_tokenizer_base.h */, + F292B07A2D88B0D200BE6839 /* error.h */, + F292B07C2D88B0D200BE6839 /* llama2c_tokenizer.h */, + F292B07D2D88B0D200BE6839 /* log.h */, + F292B07F2D88B0D200BE6839 /* result.h */, + F292B0812D88B0D200BE6839 /* tiktoken.h */, + F292B0832D88B0D200BE6839 /* tokenizer.h */, + ); + path = tokenizers; + sourceTree = ""; + }; + F292B0852D88B0D200BE6839 /* pytorch */ = { + isa = PBXGroup; + children = ( + F292B0842D88B0D200BE6839 /* tokenizers */, + ); + path = pytorch; + sourceTree = ""; + }; + F292B0862D88B0D200BE6839 /* include */ = { + isa = PBXGroup; + children = ( + F292B0852D88B0D200BE6839 /* pytorch */, + ); + path = include; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -297,16 +336,20 @@ isa = PBXHeadersBuildPhase; buildActionMask = 2147483647; files = ( - 03BADE202BD2E88600DDFDC2 /* bpe_tokenizer.h in Headers */, - 03729F172BB2043600152F2E /* tokenizer.h in Headers */, 03729EE22BB1F93E00152F2E /* LLaMARunner.h in Headers */, 03D03DA82C7823620088D6A7 /* text_prefiller.h in Headers */, + F292B0882D88B0D200BE6839 /* llama2c_tokenizer.h in Headers */, + F292B0892D88B0D200BE6839 /* tokenizer.h in Headers */, + F292B08B2D88B0D200BE6839 /* result.h in Headers */, + F292B08D2D88B0D200BE6839 /* error.h in Headers */, + F292B08E2D88B0D200BE6839 /* bpe_tokenizer_base.h in Headers */, + F292B08F2D88B0D200BE6839 /* log.h in Headers */, + F292B0912D88B0D200BE6839 /* tiktoken.h in Headers */, + F292B0922D88B0D200BE6839 /* base64.h in Headers */, 03D03DAC2C7823830088D6A7 /* text_decoder_runner.h in Headers */, - 03DDA0FB2BD6368100D234B3 /* base64.h in Headers */, - 03BADE242BD2EB6700DDFDC2 /* tiktoken.h in Headers */, 03729F122BB2042B00152F2E /* sampler.h in Headers */, - 038D678D2C482C1E00B88CF2 /* llama_tiktoken.h in Headers */, 03729F0C2BB203B300152F2E /* util.h in Headers */, + F292B1012D88B20C00BE6839 /* llama_tiktoken.h in Headers */, 03729F0B2BB203B300152F2E /* runner.h in Headers */, 0372C3142C89418E00CD942A /* llava_runner.h in Headers */, ); @@ -448,7 +491,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . --prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../../extension/llm/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -475,11 +518,12 @@ buildActionMask = 2147483647; files = ( 03729EE12BB1F93800152F2E /* LLaMARunner.mm in Sources */, - 03BADE232BD2EB6700DDFDC2 /* tiktoken.cpp in Sources */, - 038D678C2C482C1E00B88CF2 /* llama_tiktoken.cpp in Sources */, 0372C3152C89418E00CD942A /* llava_runner.cpp in Sources */, + F292B1022D88B20C00BE6839 /* llama_tiktoken.cpp in Sources */, 03D03DAB2C7823830088D6A7 /* text_decoder_runner.cpp in Sources */, - 03729F162BB2043600152F2E /* bpe_tokenizer.cpp in Sources */, + F292B0752D88B0C200BE6839 /* tiktoken.cpp in Sources */, + F292B0762D88B0C200BE6839 /* llama2c_tokenizer.cpp in Sources */, + F292B0772D88B0C200BE6839 /* bpe_tokenizer_base.cpp in Sources */, 03729F0A2BB203B300152F2E /* runner.cpp in Sources */, 03729F132BB2042B00152F2E /* sampler.cpp in Sources */, 03D03DA72C7823620088D6A7 /* text_prefiller.cpp in Sources */, @@ -808,7 +852,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch"; requirement = { - branch = "swiftpm-0.5.0.20250130"; + branch = "swiftpm-0.5.0.20250228"; kind = branch; }; }; diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Debug.xcconfig b/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Debug.xcconfig index e674c783b2c..4db30506e82 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Debug.xcconfig +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Debug.xcconfig @@ -3,20 +3,21 @@ ET_PLATFORM[sdk=iphoneos*] = ios ET_PLATFORM[sdk=macos*] = macos OTHER_LDFLAGS = $(inherited) \ - -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-$(ET_PLATFORM)-debug.a \ + -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_debug_$(ET_PLATFORM).a \ @$(TEMP_DIR)/cmake/linker_flags // LLaMARunner requires additional dependencies built with CMake in a custom run script phase. // Include headers and libraries from $(TEMP_DIR)/cmake for it. HEADER_SEARCH_PATHS = $(inherited) \ $(SRCROOT)/../../../../.. 
\ - $(TEMP_DIR)/cmake/include + $(TEMP_DIR)/cmake/include \ + $(SRCROOT)/../../../../extension/llm/tokenizers/include LIBRARY_SEARCH_PATHS = $(inherited) \ $(TEMP_DIR)/cmake/lib diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Release.xcconfig b/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Release.xcconfig index fcf63012cc4..d30a2c7957b 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Release.xcconfig +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA/SupportingFiles/Release.xcconfig @@ -5,20 +5,21 @@ ET_PLATFORM[sdk=macos*] = macos // Link the Debug version of ExecuTorch runtime to keep the logs. // Switch to Release for better performance if logs are not needed. OTHER_LDFLAGS = $(inherited) \ - -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-$(ET_PLATFORM)-debug.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-$(ET_PLATFORM)-release.a \ + -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_$(ET_PLATFORM).a \ @$(TEMP_DIR)/cmake/linker_flags // LLaMARunner requires additional dependencies built with CMake in a custom run script phase. // Include headers and libraries from $(TEMP_DIR)/cmake for it. HEADER_SEARCH_PATHS = $(inherited) \ $(SRCROOT)/../../../../.. \ - $(TEMP_DIR)/cmake/include + $(TEMP_DIR)/cmake/include \ + $(SRCROOT)/../../../../extension/llm/tokenizers/include LIBRARY_SEARCH_PATHS = $(inherited) \ $(TEMP_DIR)/cmake/lib diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index e1a1530acf9..f5292fe5c05 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -76,7 +76,7 @@ sudo /Applications/CMake.app/Contents/bin/cmake-gui --install The prebuilt ExecuTorch runtime, backend, and kernels are available as a Swift PM package. ### Xcode -Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “swiftpm-0.5.0”, or a branch name in format "swiftpm-<version>.<date>" (e.g. "swiftpm-0.5.0-20250130") for a nightly build on a specific date. +Open the project in Xcode. In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “swiftpm-0.5.0”, or a branch name in format "swiftpm-<version>.<date>" (e.g. "swiftpm-0.5.0.20250228") for a nightly build on a specific date. 
Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again. diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index b127bad10e2..c45871a1fe5 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -130,7 +130,7 @@ While we recommended using the latest prebuilt package pre-configured with the X Go to Project Navigator, click on LLaMA. `Project --> LLaMA --> Package Dependencies`, and update the package dependencies to any of the available options below: -- Branch --> swiftpm-0.5.0.20250130 (amend to match the latest nightly build) +- Branch --> swiftpm-0.5.0.20250228 (amend to match the latest nightly build) - Branch --> swiftpm-0.5.0 - Branch --> swiftpm-0.4.0 @@ -149,7 +149,7 @@ curl -LO "https://github.com/facebook/buck2/releases/download/$BUCK2_RELEASE_DAT zstd -cdq "$BUCK2_ARCHIVE" > "$BUCK2" && chmod +x "$BUCK2" rm "$BUCK2_ARCHIVE" -./build/build_apple_frameworks.sh --buck2="$(realpath $BUCK2)" --coreml --custom --mps --optimized --portable --quantized --xnnpack +./scripts/build_apple_frameworks.sh --buck2="$(realpath $BUCK2)" --coreml --custom --mps --optimized --portable --quantized --xnnpack ``` After the build finishes successfully, the resulting frameworks can be found in the `cmake-out` directory. Copy them to your project and link them against your targets. diff --git a/examples/demo-apps/react-native/rnllama/ios/Release.xcconfig b/examples/demo-apps/react-native/rnllama/ios/Release.xcconfig index b98b21b9c6e..6893e1252e7 100644 --- a/examples/demo-apps/react-native/rnllama/ios/Release.xcconfig +++ b/examples/demo-apps/react-native/rnllama/ios/Release.xcconfig @@ -5,13 +5,13 @@ ET_PLATFORM[sdk=macos*] = macos // Link the Debug version of ExecuTorch runtime to keep the logs. // Switch to Release for better performance if logs are not needed. OTHER_LDFLAGS = $(inherited) \ - -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-$(ET_PLATFORM)-release.a \ + -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_debug_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_$(ET_PLATFORM).a \ @$(TEMP_DIR)/cmake/linker_flags // LLaMARunner requires additional dependencies built with CMake in a custom run script phase. 
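The xcconfig hunks above all make the same mechanical substitution: the prebuilt static libraries are now named in the `lib<product>[_debug]_$(ET_PLATFORM).a` style instead of `lib<product>-$(ET_PLATFORM)-<config>.a`. As a purely illustrative sanity check, not part of this change, one could list the products directory from an Xcode run-script phase (where the `BUILT_PRODUCTS_DIR` variable used in these files is defined) and confirm the renamed archives are present:

```
# Run inside an Xcode build phase; BUILT_PRODUCTS_DIR is set by Xcode.
ls "$BUILT_PRODUCTS_DIR" | grep -E '^lib(executorch|backend_|kernels_).*\.a$'
```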
diff --git a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj index 1a587970640..612dd410a1a 100644 --- a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj +++ b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj @@ -557,7 +557,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"/Users/jh/dev/executorch/third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . --prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"/Users/jh/dev/executorch/extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"/Users/jh/dev/executorch/extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"/Users/jh/dev/executorch/extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n\n\n\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$PROJECT_DIR/../../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . --prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n\n\n\n"; }; F7CCCCE770493310D0125117 /* [Expo] Configure project */ = { isa = PBXShellScriptBuildPhase; @@ -827,7 +827,7 @@ CURRENT_PROJECT_VERSION = 1; DEBUG_INFORMATION_FORMAT = dwarf; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = CLFN2N8XXS; + DEVELOPMENT_TEAM = ""; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -878,7 +878,7 @@ CURRENT_PROJECT_VERSION = 1; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = CLFN2N8XXS; + DEVELOPMENT_TEAM = ""; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -947,7 +947,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/pytorch/executorch.git"; requirement = { - branch = "swiftpm-0.5.0.20250130"; + branch = "swiftpm-0.5.0.20250228"; kind = branch; }; }; diff --git a/examples/devtools/CMakeLists.txt b/examples/devtools/CMakeLists.txt index 7ed5232ba41..74cbf5e78e6 100644 --- a/examples/devtools/CMakeLists.txt +++ b/examples/devtools/CMakeLists.txt @@ -22,8 +22,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/devtools/example_runner/example_runner.cpp b/examples/devtools/example_runner/example_runner.cpp index 1b3f1fd6408..eafcd58c22e 100644 --- a/examples/devtools/example_runner/example_runner.cpp +++ b/examples/devtools/example_runner/example_runner.cpp @@ -17,12 +17,14 @@ * all fp32 tensors. 
*/ +#include #include #include #include #include +#include #include #include #include @@ -75,6 +77,7 @@ DEFINE_int32( 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); +using executorch::etdump::BufferDataSink; using executorch::etdump::ETDumpGen; using executorch::etdump::ETDumpResult; using executorch::extension::BufferDataLoader; @@ -215,27 +218,38 @@ int main(int argc, char** argv) { // be used by a single thread at a time, but it can be reused. // ETDumpGen etdump_gen; - Result method = - program->load_method(method_name, &memory_manager, &etdump_gen); - ET_CHECK_MSG( - method.ok(), - "Loading of method %s failed with status 0x%" PRIx32, - method_name, - static_cast(method.error())); - ET_LOG(Info, "Method loaded."); + // Malloc debug buffer and create the data sink if and only if we need to log intermediate + // tensor outputs + void* debug_buffer = malloc(FLAGS_debug_buffer_size); + Result data_sink_ret = + BufferDataSink::create(debug_buffer, FLAGS_debug_buffer_size); + ET_CHECK_MSG( + data_sink_ret.ok(), + "Creating Datasink for etdump failed with status 0x%" PRIx32, + static_cast(data_sink_ret.error())); + + BufferDataSink* data_sink = &data_sink_ret.get(); + if (FLAGS_dump_intermediate_outputs) { - Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( EventTracerDebugLogLevel::kIntermediateOutputs); + etdump_gen.set_data_sink(data_sink); } else if (FLAGS_dump_outputs) { - Span buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size); - etdump_gen.set_debug_buffer(buffer); etdump_gen.set_event_tracer_debug_level( EventTracerDebugLogLevel::kProgramOutputs); + etdump_gen.set_data_sink(data_sink); } + + Result method = + program->load_method(method_name, &memory_manager, &etdump_gen); + ET_CHECK_MSG( + method.ok(), + "Loading of method %s failed with status 0x%" PRIx32, + method_name, + static_cast(method.error())); + ET_LOG(Info, "Method loaded."); + // Use the inputs embedded in the bundled program. status = executorch::bundled_program::load_bundled_input( *method, file_data.data(), FLAGS_testset_idx); diff --git a/examples/llm_pte_finetuning/README.md b/examples/llm_pte_finetuning/README.md index f3e946f28ca..bdd317109e5 100644 --- a/examples/llm_pte_finetuning/README.md +++ b/examples/llm_pte_finetuning/README.md @@ -6,21 +6,43 @@ In this tutorial, we show how to fine-tune an LLM using ExecuTorch. You will need to have a model's checkpoint in the Hugging Face format. For example: -``` -git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct +```console +git clone https://huggingface.co/Qwen/Qwen2-0.5B-Instruct ``` You will need to install [torchtune](https://github.com/pytorch/torchtune) following [its installation instructions](https://github.com/pytorch/torchtune?tab=readme-ov-file#installation). +You might run into an issue with the `triton` package when installing `torchtune`. You can build `triton` locally following the [instructions in their repo](https://github.com/triton-lang/triton?tab=readme-ov-file#install-from-source).
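As context for the config files described in the next section: they follow torchtune's `_component_` convention, where each `_component_` key names a class or factory that is imported and called with the sibling keys as keyword arguments. The sketch below is illustrative only; it assumes torchtune's `config.instantiate` helper and the `qwen_05b_config.yaml` file shipped in this directory, and is not part of the example's code:

```python
# Hedged sketch: how a torchtune-style YAML config resolves to Python objects.
# Assumes torchtune is installed and this is run from the ExecuTorch repo root.
from omegaconf import OmegaConf
from torchtune import config

cfg = OmegaConf.load("examples/llm_pte_finetuning/qwen_05b_config.yaml")

# instantiate() imports the `_component_` target and calls it with the
# remaining keys as keyword arguments.
tokenizer = config.instantiate(cfg.tokenizer)
loss_fn = config.instantiate(cfg.loss)
```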
+ + ## Config Files +The directory structure of the `llm_pte_finetuning` example is: + +```console +examples/llm_pte_finetuning +├── README.md +├── TARGETS +├── __init__.py +├── model_exporter.py +├── model_loading_lib.py +├── phi3_alpaca_code_config.yaml +├── phi3_config.yaml +├── qwen_05b_config.yaml +├── runner.py +└── training_lib.py +``` + +We already provide configs out of the box. The following sections explain how you can set up the config for your own model or dataset. + As mentioned in the previous section, we internally use `torchtune` APIs, and thus, we use config files that follow `torchtune`'s structure. In the following sections, we go through a working example, which can be found in the `phi3_config.yaml` config file. ### Tokenizer We need to define the tokenizer. Let's suppose we would like to use the [PHI3 Mini Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model from Microsoft. We need to define the tokenizer component: -``` +```yaml tokenizer: _component_: torchtune.models.phi3.phi3_mini_tokenizer path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model @@ -33,7 +55,7 @@ This will load the tokenizer, and set the max sequence length to 1024. The class In this example we use the [Alpaca-Cleaned dataset](https://huggingface.co/datasets/yahma/alpaca-cleaned). We need to define the following parameters: -``` +```yaml dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null @@ -47,7 +69,7 @@ Torchtune supports datasets using huggingface dataloaders, so custom datasets co For the loss function, we use PyTorch losses. In this example we use the `CrossEntropyLoss`: -``` +```yaml loss: _component_: torch.nn.CrossEntropyLoss ``` @@ -56,7 +78,7 @@ loss: Model parameters can be set; in this example we replicate the configuration for the phi3 mini instruct benchmarks: -``` +```yaml model: _component_: torchtune.models.phi3.lora_phi3_mini lora_attn_modules: ['q_proj', 'v_proj'] @@ -70,7 +92,7 @@ model: Depending on how your model is defined, you will need to instantiate different components. In these examples we use checkpoints from HF (Hugging Face format), and thus we will need to instantiate a `FullModelHFCheckpointer` object. We need to pass the checkpoint directory, the files with the tensors, the output directory for training, and the model type: -``` +```yaml checkpointer: _component_: torchtune.training.FullModelHFCheckpointer checkpoint_dir: /tmp/Phi-3-mini-4k-instruct @@ -87,7 +109,7 @@ checkpointer: Torchtune supports `cuda` and `bf16` tensors. However, for ExecuTorch training we only support `cpu` and `fp32`: -``` +```yaml device: cpu dtype: fp32 ``` @@ -101,28 +123,34 @@ The `model_exporter.py` exports the LLM checkpoint into an ExecuTorch checkpoint * `cfg`: Configuration file * `output_file`: The `.pte` output path -``` -python model_exporter.py --cfg=phi3_config.yaml --output_file=phi3_mini_lora.pte +```console +python model_exporter.py \ + --cfg=qwen_05b_config.yaml \ + --output_file=qwen2_0_5B.pte ``` ### Step 2: Run the fine-tuning job To run the fine-tuning job: -``` -python runner.py --cfg=phi3_config.yaml --model_file=phi3_mini_lora.pte +```console +python runner.py \ + --cfg=qwen_05b_config.yaml \ + --model_file=qwen2_0_5B.pte \ + --num_training_steps=10 \ + --num_eval_steps=5 ``` You need to use **the same** config file from the previous step. The `model_file` arg is the `.pte` model from the previous step.
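For reference, the core of `runner.py` amounts to loading the exported `.pte` program with the portable pybindings and driving it with the helpers from `training_lib.py`. A condensed, hedged sketch follows; `val_dataloader`, `loss_fn`, and `max_seq_len` stand in for objects built from the config file (see `get_dataloader` and the tokenizer setup in `runner.py`), so this is not a drop-in script:

```python
# Sketch of runner.py's evaluation path, assuming the portable pybindings
# are built. The dataloader/loss/sequence-length placeholders come from
# the YAML config in the real script.
from executorch.examples.llm_pte_finetuning.training_lib import eval_model
from executorch.extension.pybindings.portable_lib import (
    _load_for_executorch_from_buffer,
)

with open("qwen2_0_5B.pte", "rb") as f:
    et_mod = _load_for_executorch_from_buffer(f.read())  # load the PTE program

eval_loss = eval_model(
    model=et_mod,
    dataloader=val_dataloader,  # placeholder: built via get_dataloader(...)
    loss_fn=loss_fn,            # placeholder: e.g. torch.nn.CrossEntropyLoss from the config
    max_seq_len=max_seq_len,    # placeholder: cfg.tokenizer.max_seq_len
    num_eval_steps=5,
)
print("Eval loss: ", eval_loss)
```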
Example output: -``` -Evaluating the model before training... -100%|██████████████████████████████████████████████████████████████████████████████████████| 3/3 [31:23<00:00, 627.98s/it] -Eval loss: tensor(2.3778) -100%|██████████████████████████████████████████████████████████████████████████████████████| 5/5 [52:29<00:00, 629.84s/it] -Losses: [2.7152762413024902, 0.7890686988830566, 2.249271869659424, 1.4777560234069824, 0.8378427624702454] -100%|██████████████████████████████████████████████████████████████████████████████████████| 3/3 [30:35<00:00, 611.90s/it] -Eval loss: tensor(0.8464) +```console +Evaluating the model before training +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:47<00:00, 9.45s/it] +Eval loss: tensor(0.9441) +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:30<00:00, 9.09s/it] +Losses: [0.5646533966064453, 1.3464953899383545, 1.297974705696106, 1.2249481678009033, 0.6750457286834717, 0.7721152901649475, 1.0774847269058228, 0.7962403893470764, 0.8448256850242615, 0.8731598854064941] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:45<00:00, 9.18s/it] +Eval loss: tensor(0.7679) ``` diff --git a/examples/llm_pte_finetuning/TARGETS b/examples/llm_pte_finetuning/TARGETS index fee67914909..5ba24c11203 100644 --- a/examples/llm_pte_finetuning/TARGETS +++ b/examples/llm_pte_finetuning/TARGETS @@ -12,7 +12,7 @@ python_library( "fbcode//caffe2:torch", "fbcode//executorch/examples/llm_pte_finetuning:training_lib", "fbcode//executorch/exir:lib", - "fbcode//executorch/extension/pybindings:aten_lib", # @manual For PTE loader + "fbcode//executorch/extension/pybindings:portable_lib", # @manual For PTE loader "fbcode//pytorch/torchtune:lib", "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer "fbsource//third-party/pypi/omegaconf:omegaconf", @@ -27,11 +27,12 @@ python_library( ], deps = [ "fbcode//caffe2:torch", - "fbcode//executorch/extension/pybindings:aten_lib", # @manual For PTE loader + "fbcode//executorch/extension/pybindings:portable_lib", # @manual For PTE loader "fbcode//pytorch/torchtune:lib", "fbsource//third-party/pypi/blobfile:blobfile", # @manual For tokenizer "fbsource//third-party/pypi/tiktoken:tiktoken", # @manual For tokenizer "fbsource//third-party/pypi/tqdm:tqdm", + "fbcode//executorch/backends/xnnpack/partition:xnnpack_partitioner", ], ) diff --git a/examples/llm_pte_finetuning/__init__.py b/examples/llm_pte_finetuning/__init__.py new file mode 100644 index 00000000000..1db9dd0e3be --- /dev/null +++ b/examples/llm_pte_finetuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from .model_loading_lib import export_model_lora_training, load_checkpoint, setup_model +from .training_lib import eval_model, get_dataloader, TrainingModule, update_function + +__all__ = [ + "eval_model", + "get_dataloader", + "update_function", + "TrainingModule", + "export_model_lora_training", + "load_checkpoint", + "setup_model", +] diff --git a/examples/llm_pte_finetuning/model_loading_lib.py b/examples/llm_pte_finetuning/model_loading_lib.py index 3372a97e269..2c42a0e7635 100644 --- a/examples/llm_pte_finetuning/model_loading_lib.py +++ b/examples/llm_pte_finetuning/model_loading_lib.py @@ -9,8 +9,9 @@ from typing import Any, Dict, Tuple import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.examples.llm_pte_finetuning.training_lib import TrainingModule -from executorch.exir import to_edge +from executorch.exir import EdgeCompileConfig, to_edge from omegaconf import DictConfig from torch.export import export, ExportedProgram @@ -72,16 +73,70 @@ def export_model_lora_training( exported_graph: ExportedProgram = export(model, example_args, strict=False) print("Creating a joint forward-backwards graph for training") joint_graph = _export_forward_backward(exported_graph) + ep = joint_graph + + # Currently there is no implementation of empty_permuted for edge dialect. + # We manually make a pass to rewrite the empty_permuted to empty and permute. + for node in ep.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.aten.empty_permuted.out + ): + print("found empty_permute: ", node) + empty_permuted_node = node + with ep.graph.inserting_before(empty_permuted_node): + empty_node = ep.graph.create_node( + "call_function", + torch.ops.aten.empty.memory_format, + (node.args[0],), + empty_permuted_node.kwargs, + ) + permute_node = ep.graph.create_node( + "call_function", + torch.ops.aten.permute, + (empty_node, node.args[1]), + ) + for user in empty_permuted_node.users.copy(): + user.replace_input_with(empty_permuted_node, permute_node) + if ( + node.op == "call_function" + and node.target == torch.ops.aten.empty_permuted.default + ): + print("found empty_permute default: ", node) + empty_permuted_node = node + with ep.graph.inserting_before(empty_permuted_node): + empty_node = ep.graph.create_node( + "call_function", + torch.ops.aten.empty.memory_format, + (node.args[0],), + empty_permuted_node.kwargs, + ) + permute_node = ep.graph.create_node( + "call_function", + torch.ops.aten.permute.default, + (empty_node, node.args[1]), + ) + for user in empty_permuted_node.users.copy(): + user.replace_input_with(empty_permuted_node, permute_node) # 2. to_edge: Make optimizations for Edge devices. print("Lowering to edge dialect") - edge_program = to_edge(joint_graph) + edge_program = to_edge( + joint_graph, + compile_config=EdgeCompileConfig( + _core_aten_ops_exception_list=[torch.ops.aten.empty_permuted.default] + ), + ) print(edge_program._edge_programs["forward"].graph_module) # 3. to_executorch: Convert the graph to an ExecuTorch program. 
print("Exporting to executorch") + edge_program = edge_program.to_backend( + XnnpackPartitioner(force_fp32_dynamic_linear=True) + ) executorch_program = edge_program.to_executorch() + print(executorch_program.exported_program().graph_signature) print(f"Saving to {output_file}") with open(output_file, "wb") as file: diff --git a/examples/llm_pte_finetuning/qwen_05b_config.yaml b/examples/llm_pte_finetuning/qwen_05b_config.yaml index b93517b8fda..f5ab2dbad68 100644 --- a/examples/llm_pte_finetuning/qwen_05b_config.yaml +++ b/examples/llm_pte_finetuning/qwen_05b_config.yaml @@ -27,7 +27,7 @@ checkpointer: model.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Qwen2-0.5B-Instruct + output_dir: /tmp/qwen_0.5B_ft-output model_type: QWEN2 resume_from_checkpoint: False save_adapter_weights_only: False diff --git a/examples/llm_pte_finetuning/runner.py b/examples/llm_pte_finetuning/runner.py index 0deebcf010d..0baf160a56b 100644 --- a/examples/llm_pte_finetuning/runner.py +++ b/examples/llm_pte_finetuning/runner.py @@ -15,7 +15,7 @@ update_function, ) -from executorch.extension.pybindings.aten_lib import ( # @manual +from executorch.extension.pybindings.portable_lib import ( # @manual _load_for_executorch_from_buffer, ) from omegaconf import OmegaConf @@ -30,6 +30,18 @@ ) parser.add_argument("--cfg", type=str, help="Path to the config file.") parser.add_argument("--model_file", type=str, help="Path to the ET model file.") +parser.add_argument( + "--num_training_steps", + type=int, + help="Number of training steps, assuming 1 epoch.", + default=100, +) +parser.add_argument( + "--num_eval_steps", + type=int, + help="Number of eval steps, assuming 1 epoch.", + default=5, +) def main() -> None: @@ -47,10 +59,11 @@ def main() -> None: train_set, val_set = torch.utils.data.random_split(ds, [0.8, 0.2]) train_dataloader = get_dataloader(cfg, train_set, tokenizer, loss_fn) val_dataloader = get_dataloader(cfg, val_set, tokenizer, loss_fn) + num_training_steps = args.num_training_steps + num_eval_steps = args.num_eval_steps max_seq_len = cfg.tokenizer.max_seq_len # Num of steps to run training. Assume 1 epoch - num_steps = 100 with open(file, "rb") as f: model_bytes = f.read() et_mod = _load_for_executorch_from_buffer(model_bytes) @@ -62,7 +75,7 @@ def main() -> None: dataloader=val_dataloader, loss_fn=loss_fn, max_seq_len=max_seq_len, - num_eval_steps=10, + num_eval_steps=num_eval_steps, ) print("Eval loss: ", eval_loss) @@ -74,9 +87,9 @@ def main() -> None: learning_rate = 5e-3 f.seek(0) losses = [] - for i, batch in tqdm(enumerate(train_dataloader), total=num_steps): + for i, batch in tqdm(enumerate(train_dataloader), total=num_training_steps): # Run for a limited number of steps. 
- if i >= num_steps: + if i >= num_training_steps: break tokens, labels = batch["tokens"], batch["labels"] token_size = tokens.shape[1] @@ -113,7 +126,7 @@ def main() -> None: dataloader=val_dataloader, loss_fn=loss_fn, max_seq_len=max_seq_len, - num_eval_steps=10, + num_eval_steps=num_eval_steps, ) print("Eval loss: ", eval_loss) diff --git a/examples/llm_pte_finetuning/training_lib.py b/examples/llm_pte_finetuning/training_lib.py index dfdaf9b115a..f8cae70d39c 100644 --- a/examples/llm_pte_finetuning/training_lib.py +++ b/examples/llm_pte_finetuning/training_lib.py @@ -10,7 +10,7 @@ from typing import Any import torch -from executorch.extension.pybindings.aten_lib import ExecuTorchModule # @manual +from executorch.extension.pybindings.portable_lib import ExecuTorchModule # @manual from torch.nn import functional as F from torch.utils.data import DataLoader, Dataset, DistributedSampler diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 61906870e1d..8d33244fdfa 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -19,8 +19,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -122,33 +122,33 @@ if(${ANDROID}) ) # Build ABSL and RE2 set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm) - set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp) - set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2) + set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/abseil-cpp) + set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/re2) set(ABSL_ENABLE_INSTALL ON) set(ABSL_PROPAGATE_CXX_STD ON) set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory( - ${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil + ${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/abseil ) add_subdirectory( - ${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2 + ${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/re2 ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) # Build tokenizers - set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer) + set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizers) add_library(tokenizer STATIC) target_include_directories( tokenizer PUBLIC ${_common_include_directories} ${THIRD_PARTY_ABSL_DIR} - ${THIRD_PARTY_RE2_DIR} + ${THIRD_PARTY_RE2_DIR} ${LLAMA2_TOKENIZER_DIR}/include ) target_link_libraries(tokenizer PRIVATE re2::re2) target_sources( tokenizer PRIVATE - ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp - ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp + ${LLAMA2_TOKENIZER_DIR}/src/tiktoken.cpp + ${LLAMA2_TOKENIZER_DIR}/src/llama2c_tokenizer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp ) diff --git a/examples/mediatek/executor_runner/mtk_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_executor_runner.cpp index 1d9d5522161..b8369c63005 100644 --- a/examples/mediatek/executor_runner/mtk_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_executor_runner.cpp @@ -1,10 +1,9 @@ /* - * Copyright (c) Meta Platforms, Inc. and affiliates. * Copyright (c) 2024 MediaTek Inc. 
- * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. */ /** diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 4fba0e20a81..012206e5142 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -1,47 +1,9 @@ /* - * Copyright (c) Meta Platforms, Inc. and affiliates. * Copyright (c) 2024 MediaTek Inc. - * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* Copyright Statement: - * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2024. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. + * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. 
*/ #include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h" @@ -68,8 +30,8 @@ #include "llama_runner/llm_helper/include/llm_types.h" #include -#include -#include +#include +#include // Llama model options DEFINE_uint64( @@ -140,10 +102,10 @@ using example::utils::read_file; using example::utils::split; using example::utils::Timer; using example::utils::to_string; -using executorch::extension::llm::BPETokenizer; -using executorch::extension::llm::Tokenizer; using executorch::runtime::Error; using executorch::runtime::Result; +using tokenizers::Llama2cTokenizer; +using tokenizers::Tokenizer; LlamaModelOptions get_model_options() { LlamaModelOptions options = { diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_runner.cpp index de22171d179..5274d0925ae 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_runner.cpp @@ -1,47 +1,9 @@ /* - * Copyright (c) Meta Platforms, Inc. and affiliates. * Copyright (c) 2024 MediaTek Inc. - * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* Copyright Statement: - * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2024. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. 
+ * Licensed under the BSD License (the "License"); you may not use this file + * except in compliance with the License. See the license file in the root + * directory of this source tree for more details. */ #include diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 4c7b35d1a88..0f76f610a7e 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -14,8 +14,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -28,9 +28,9 @@ using Stats = ::executorch::llm::Stats; using example::LlamaModelOptions; using example::LlamaModelPaths; using example::LlamaRuntime; -using executorch::extension::llm::Tokenizer; using executorch::runtime::Error; using executorch::runtime::Result; +using tokenizers::Tokenizer; class MTKLlamaRunner : public executorch::extension::llm::IRunner { public: diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 822d55fc09d..80ba6801a6c 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -5,35 +5,82 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from enum import Enum + + +class Model(str, Enum): + Mul = "mul" + Linear = "linear" + Add = "add" + AddMul = "add_mul" + Softmax = "softmax" + Dl3 = "dl3" + Edsr = "edsr" + EmformerTranscribe = "emformer_transcribe" + EmformerPredict = "emformer_predict" + EmformerJoin = "emformer_join" + Llama2 = "llama2" + Llama = "llama" + Llama32VisionEncoder = "llama3_2_vision_encoder" + Lstm = "lstm" + MobileBert = "mobilebert" + Mv2 = "mv2" + Mv2Untrained = "mv2_untrained" + Mv3 = "mv3" + Vit = "vit" + W2l = "w2l" + Ic3 = "ic3" + Ic4 = "ic4" + ResNet18 = "resnet18" + ResNet50 = "resnet50" + Llava = "llava" + EfficientSam = "efficient_sam" + Qwen25 = "qwen2_5" + Phi4Mini = "phi-4-mini" + + def __str__(self) -> str: + return self.value + + +class Backend(str, Enum): + XnnpackQuantizationDelegation = "xnnpack-quantization-delegation" + CoreMlTest = "coreml-test" + + def __str__(self) -> str: + return self.value + + MODEL_NAME_TO_MODEL = { - "mul": ("toy_model", "MulModule"), - "linear": ("toy_model", "LinearModule"), - "add": ("toy_model", "AddModule"), - "add_mul": ("toy_model", "AddMulModule"), - "softmax": ("toy_model", "SoftmaxModule"), - "dl3": ("deeplab_v3", "DeepLabV3ResNet50Model"), - "edsr": ("edsr", "EdsrModel"), - "emformer_transcribe": ("emformer_rnnt", "EmformerRnntTranscriberModel"), - "emformer_predict": ("emformer_rnnt", "EmformerRnntPredictorModel"), - "emformer_join": ("emformer_rnnt", "EmformerRnntJoinerModel"), - "llama2": ("llama", "Llama2Model"), - "llama": ("llama", "Llama2Model"), - "llama3_2_vision_encoder": ("llama3_2_vision", "FlamingoVisionEncoderModel"), + str(Model.Mul): ("toy_model", "MulModule"), + str(Model.Linear): ("toy_model", "LinearModule"), + str(Model.Add): ("toy_model", "AddModule"), + str(Model.AddMul): ("toy_model", "AddMulModule"), + str(Model.Softmax): ("toy_model", "SoftmaxModule"), + str(Model.Dl3): ("deeplab_v3", "DeepLabV3ResNet50Model"), + str(Model.Edsr): ("edsr", "EdsrModel"), + str(Model.EmformerTranscribe): ("emformer_rnnt", "EmformerRnntTranscriberModel"), + str(Model.EmformerPredict): ("emformer_rnnt", "EmformerRnntPredictorModel"), + str(Model.EmformerJoin): ("emformer_rnnt", "EmformerRnntJoinerModel"), + str(Model.Llama2): ("llama", "Llama2Model"), + 
str(Model.Llama): ("llama", "Llama2Model"), + str(Model.Llama32VisionEncoder): ("llama3_2_vision", "FlamingoVisionEncoderModel"), # TODO: This takes too long to export on both Linux and macOS (> 6 hours) # "llama3_2_text_decoder": ("llama3_2_vision", "Llama3_2Decoder"), - "lstm": ("lstm", "LSTMModel"), - "mobilebert": ("mobilebert", "MobileBertModelExample"), - "mv2": ("mobilenet_v2", "MV2Model"), - "mv2_untrained": ("mobilenet_v2", "MV2UntrainedModel"), - "mv3": ("mobilenet_v3", "MV3Model"), - "vit": ("torchvision_vit", "TorchVisionViTModel"), - "w2l": ("wav2letter", "Wav2LetterModel"), - "ic3": ("inception_v3", "InceptionV3Model"), - "ic4": ("inception_v4", "InceptionV4Model"), - "resnet18": ("resnet", "ResNet18Model"), - "resnet50": ("resnet", "ResNet50Model"), - "llava": ("llava", "LlavaModel"), - "efficient_sam": ("efficient_sam", "EfficientSAM"), + str(Model.Lstm): ("lstm", "LSTMModel"), + str(Model.MobileBert): ("mobilebert", "MobileBertModelExample"), + str(Model.Mv2): ("mobilenet_v2", "MV2Model"), + str(Model.Mv2Untrained): ("mobilenet_v2", "MV2UntrainedModel"), + str(Model.Mv3): ("mobilenet_v3", "MV3Model"), + str(Model.Vit): ("torchvision_vit", "TorchVisionViTModel"), + str(Model.W2l): ("wav2letter", "Wav2LetterModel"), + str(Model.Ic3): ("inception_v3", "InceptionV3Model"), + str(Model.Ic4): ("inception_v4", "InceptionV4Model"), + str(Model.ResNet18): ("resnet", "ResNet18Model"), + str(Model.ResNet50): ("resnet", "ResNet50Model"), + str(Model.Llava): ("llava", "LlavaModel"), + str(Model.EfficientSam): ("efficient_sam", "EfficientSAM"), + str(Model.Qwen25): ("qwen2_5", "Qwen2_5Model"), + str(Model.Phi4Mini): ("phi-4-mini", "Phi4MiniModel"), } __all__ = [ diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py index ee3fb560429..57a5b0ffaca 100644 --- a/examples/models/checkpoint.py +++ b/examples/models/checkpoint.py @@ -9,6 +9,8 @@ from pathlib import Path from typing import Any, Dict, Optional +import torch + def get_default_model_resource_dir(model_file_path: str) -> Path: """ @@ -52,7 +54,7 @@ def get_default_model_resource_dir(model_file_path: str) -> Path: return resource_dir -def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[str]: +def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[torch.dtype]: """ Get the dtype of the checkpoint, returning "None" if the checkpoint is empty. """ @@ -64,7 +66,7 @@ def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[str]: mismatched_dtypes = [ (key, value.dtype) for key, value in checkpoint.items() - if value.dtype != dtype + if hasattr(value, "dtype") and value.dtype != dtype ] if len(mismatched_dtypes) > 0: print( diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 5f49581ea25..e6d45424bd4 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -15,7 +15,7 @@ # ~~~ # It should also be cmake-lint clean. # -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(llama_runner) # Duplicating options as root CMakeLists.txt @@ -46,7 +46,7 @@ endif() set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llama_runner library add_subdirectory(runner) @@ -119,28 +111,33 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) - list(APPEND link_libraries custom_ops) + list(APPEND link_libraries $) endif() if(EXECUTORCH_BUILD_TORCHAO) - set(TORCHAO_BUILD_EXECUTORCH_OPS ON) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental) - target_link_options_shared_lib(torchao_ops_executorch) - list(APPEND link_libraries torchao_ops_executorch) + # Currently only enable this on Arm-based Macs if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(TORCHAO_BUILD_EXECUTORCH_OPS ON) + set(TORCHAO_BUILD_CPU_AARCH64 ON) add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps - ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) - target_link_options_shared_lib(torchao_ops_mps_executorch) - list(APPEND link_libraries torchao_ops_mps_executorch) + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental + ) + target_link_options_shared_lib(torchao_ops_executorch) + list(APPEND link_libraries torchao_ops_executorch) + if(EXECUTORCH_BUILD_MPS) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) + target_link_options_shared_lib(torchao_ops_mps_executorch) + list(APPEND link_libraries torchao_ops_mps_executorch) + endif() endif() endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include @@ -219,7 +216,11 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") endif() endif() -target_include_directories(llama_main PUBLIC ${_common_include_directories}) +target_include_directories( + llama_main + PUBLIC ${_common_include_directories} + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include +) target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries}) target_compile_options(llama_main PUBLIC ${_common_compile_options}) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 4c0cce4dd9e..95f92ddb887 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -380,6 +380,83 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ### Android Please refer to [this 
tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) for full instructions on building the Android LLAMA Demo App. +## Running with low-bit kernels + +We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental and require you to do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently, dynamic shapes must be disabled when exporting a model with these kernels. + +First export your model for low-bit quantization (step 2 above): + +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama3_2.pte" \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 +``` + +A few notes: +- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized with weight zeros or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and uses weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32, but is quantized with scales-only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations. +- To do channelwise quantization, set group_size to 0. This works for both linear and embedding layers. + +Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. + +The first step is to install ExecuTorch (the same as step 3.1 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out .
+cmake --build cmake-out -j16 --target install --config Release +``` + +Next install the llama runner with torchao kernels enabled (similar to step 3.2 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release +``` + +Finally run your model (similar to step 3.3 above): + +``` +cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= +``` ## Utility tools for Llama enablement diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index ad978205245..48c48532f7b 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -16,6 +16,7 @@ runtime.python_library( "rope.py", "attention.py", "model_args.py", + "norm.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama", @@ -95,11 +96,12 @@ runtime.command_alias( ) runtime.python_library( - name = "export_library", + name = "source_transformation", + visibility = [ + "//executorch/examples/...", + "@EXECUTORCH_CLIENTS", + ], srcs = [ - "export_llama.py", - "export_llama_lib.py", - "model.py", "source_transformation/apply_spin_quant_r1_r2.py", "source_transformation/attention.py", "source_transformation/lora.py", @@ -114,6 +116,15 @@ runtime.python_library( "source_transformation/vulkan_rope.py", "source_transformation/attention_sink.py", ], +) + +runtime.python_library( + name = "export_library", + srcs = [ + "export_llama.py", + "export_llama_lib.py", + "model.py", + ], _is_external_target = True, base_module = "executorch.examples.models.llama", visibility = [ @@ -123,6 +134,7 @@ runtime.python_library( "@EXECUTORCH_CLIENTS", ], deps = [ + ":source_transformation", "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", "//executorch/backends/vulkan/_passes:vulkan_passes", diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py index 91168a388d3..54f738ba737 100644 --- a/examples/models/llama/attention.py +++ b/examples/models/llama/attention.py @@ -5,6 +5,7 @@ import torch.nn as nn import torch.nn.functional as F from executorch.examples.models.llama.model_args import ModelArgs +from executorch.examples.models.llama.norm import RMSNorm from executorch.examples.models.llama.rope import Rope @@ -175,9 +176,24 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): self.max_batch_size = args.max_batch_size self.max_context_len = args.max_context_len self.dim = args.dim - self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False) - self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False) + self.attention_qkv_bias = args.attention_qkv_bias + self.use_qk_norm = args.use_qk_norm + + if self.use_qk_norm: + q_norm_dim = self.head_dim + k_norm_dim = self.head_dim + self.q_norm_fn = RMSNorm(q_norm_dim, eps=args.norm_eps) + self.k_norm_fn = RMSNorm(k_norm_dim, eps=args.norm_eps) + + self.wq = nn.Linear( + self.dim, self.n_heads * self.head_dim, bias=self.attention_qkv_bias + ) + self.wk = nn.Linear( + self.dim, 
self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias + ) + self.wv = nn.Linear( + self.dim, self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias + ) self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) self.layer_id = layer_id @@ -234,6 +250,10 @@ def forward( k = k.transpose(1, 2) v = v.transpose(1, 2) + if self.use_qk_norm: + q = self.q_norm_fn(q) + k = self.k_norm_fn(k) + if self.use_kv_cache: assert input_pos is not None k, v = self.kv_cache.update(input_pos, k, v) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 4ad92903534..1620924f4f6 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -15,6 +16,7 @@ import re import shlex from enum import Enum +from functools import partial from json import JSONDecodeError from pathlib import Path from typing import Callable, List, Optional, Union @@ -84,6 +86,7 @@ verbosity_setting = None +# All models that leverage the transformer architecture defined in llama_transformer.py. EXECUTORCH_DEFINED_MODELS = [ "stories110m", "llama2", @@ -91,6 +94,9 @@ "llama3_1", "llama3_2", "static_llama", + "qwen2_5", + "phi-4-mini", + "smollm2", ] TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] @@ -119,26 +125,19 @@ def verbose_export(): def build_model( - modelname: str = "llama3", - extra_opts: str = "", - *, - par_local_output: bool = False, - resource_pkg_name: str = __name__, + model: str, + checkpoint: str, + params: str, + output_dir: Optional[str] = ".", + extra_opts: Optional[str] = "", ) -> str: - if False: # par_local_output: - output_dir_path = "par:." - else: - output_dir_path = "." - - argString = f"--model {modelname} --checkpoint par:model_ckpt.pt --params par:model_params.json {extra_opts} --output-dir {output_dir_path}" + argString = f"--model {model} --checkpoint {checkpoint} --params {params} {extra_opts} --output-dir {output_dir}" parser = build_args_parser() args = parser.parse_args(shlex.split(argString)) - # pkg_name = resource_pkg_name return export_llama(args) def build_args_parser() -> argparse.ArgumentParser: - ckpt_dir = f"{Path(__file__).absolute().parent.as_posix()}" parser = argparse.ArgumentParser() parser.add_argument("-o", "--output-dir", default=".", help="output directory") # parser.add_argument( @@ -157,6 +156,11 @@ def build_args_parser() -> argparse.ArgumentParser: type=str, help="type of embedding quantization, ',', e.g., '8,1024'.", ) + parser.add_argument( + "--use_shared_embedding", + action="store_true", + help="Whether the embedding/unembedding weights should be shared. Only available with torchao kernels.", + ) parser.add_argument( "--pt2e_quantize", default=None, @@ -187,8 +191,8 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "-c", "--checkpoint", - default=f"{ckpt_dir}/params/demo_rand_params.pth", - help="checkpoint path", + required=False, + help="Path to the checkpoint .pth file. 
When not provided, the model will be initialized with random weights.", ) parser.add_argument( @@ -269,8 +273,8 @@ def build_args_parser() -> argparse.ArgumentParser: parser.add_argument( "-p", "--params", - default=f"{ckpt_dir}/params/demo_config.json", - help="config.json", + required=False, + help="Config file for model parameters. When not provided, the model will fall back on default values defined in examples/models/llama/model_args.py.", ) parser.add_argument( "--optimized_rotation_path", @@ -318,8 +322,8 @@ def build_args_parser() -> argparse.ArgumentParser: default="fp32", type=str, choices=["fp32", "fp16", "bf16"], - help="Override the dtype of the model (default is the checkpoint dtype)." - "Options: fp32, fp16, bf16. Please be aware that only some backends support fp16 and bf16.", + help="Provide the dtype of the model. This must match up with the supported dtypes of the backends that you are using." + "Please be aware that only some backends support fp16 and bf16.", ) parser.add_argument( @@ -557,47 +561,73 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager: checkpoint_dir = ( canonical_path(args.checkpoint_dir) if args.checkpoint_dir else None ) - params_path = canonical_path(args.params) + params_path = canonical_path(args.params) if args.params else None output_dir_path = canonical_path(args.output_dir, dir=True) weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA - # dtype override - if args.dtype_override is not None: - dtype_override = DType[args.dtype_override] - elif args.quantization_mode in ["8da4w", "8da4w-gptq"]: - dtype_override = DType["fp16"] - else: - dtype_override = None + # Convert dtype override string arg to actual type. + dtype_override = DType[args.dtype_override] + + edge_manager = _load_llama_model( + args.model, + checkpoint=checkpoint_path, + checkpoint_dir=checkpoint_dir, + params_path=params_path, + use_kv_cache=args.use_kv_cache, + use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache, + generate_full_logits=args.generate_full_logits, + weight_type=weight_type, + enable_dynamic_shape=args.enable_dynamic_shape, + calibration_tasks=args.calibration_tasks, + calibration_limit=args.calibration_limit, + calibration_seq_length=args.calibration_seq_length, + calibration_data=args.calibration_data, + tokenizer_path=args.tokenizer_path, + verbose=args.verbose, + max_seq_len=args.max_seq_length, + max_context_len=args.max_context_length, + input_prune_map_path=args.input_prune_map, + output_prune_map_path=args.output_prune_map, + metadata_str=args.metadata, + dtype_override=dtype_override, + args=args, + ) - return ( - _load_llama_model( - args.model, - checkpoint=checkpoint_path, - checkpoint_dir=checkpoint_dir, - params_path=params_path, - use_kv_cache=args.use_kv_cache, - use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache, - generate_full_logits=args.generate_full_logits, - weight_type=weight_type, - enable_dynamic_shape=args.enable_dynamic_shape, - calibration_tasks=args.calibration_tasks, - calibration_limit=args.calibration_limit, - calibration_seq_length=args.calibration_seq_length, - calibration_data=args.calibration_data, - tokenizer_path=args.tokenizer_path, - verbose=args.verbose, - max_seq_len=args.max_seq_length, - max_context_len=args.max_context_length, - input_prune_map_path=args.input_prune_map, - output_prune_map_path=args.output_prune_map, - metadata_str=args.metadata, + # At this point, the model is loaded in the default fp32. + + # Checkpoint dtype should be of lower or equal precision than the dtype override.
+ checkpoint_dtype = edge_manager.model.checkpoint_dtype + if not ( + checkpoint_dtype == dtype_override.to_torch_dtype() + or ( + checkpoint_dtype == torch.float16 + and dtype_override.to_torch_dtype() == torch.float32 + ) + or ( + checkpoint_dtype == torch.bfloat16 + and dtype_override.to_torch_dtype() == torch.float32 + ) + ): + logging.warning( + f"Checkpoint dtype {checkpoint_dtype} precision is higher than dtype override {dtype_override.to_torch_dtype()}." + ) + + edge_manager.model = edge_manager.model.to(dtype=dtype_override.to_torch_dtype()) + + # We want to quantize (in the source transforms) the weights of the model + # in the checkpoint dtype. + logging.info(f"Checkpoint dtype: {edge_manager.model.checkpoint_dtype}") + edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform( + _get_source_transforms( + modelname=args.model, dtype_override=dtype_override, + checkpoint_dtype=DType.from_torch_dtype(checkpoint_dtype), args=args, ) - .set_output_dir(output_dir_path) - .source_transform(_get_source_transforms(args.model, dtype_override, args)) ) + return edge_manager + def get_quantizer_and_quant_params(args): pt2e_quant_params = get_pt2e_quantization_params( @@ -660,6 +690,15 @@ def _validate_args(args): if args.num_sharding > 0 and not args.qnn: raise ValueError("Model shard is only supported with qnn backend now.") + if args.use_shared_embedding: + if not ( + args.embedding_quantize is not None + and args.embedding_quantize.startswith("torchao:") + ): + raise ValueError( + "Shared embedding is only supported with torchao quantization." + ) + if ( args.quantization_mode is not None and args.quantization_mode.startswith("torchao:") @@ -674,47 +713,62 @@ def _validate_args(args): ) -def _export_llama(args) -> LLMEdgeManager: # noqa: C901 - _validate_args(args) - - pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) - - # export_to_edge - builder_exported = _prepare_for_llama_export(args).export() - - builder_exported.run_canonical_optimizations() - - if args.export_only: - exit() - - builder_exported_to_edge = builder_exported.pt2e_quantize( - quantizers - ).export_to_edge() - - modelname = builder_exported_to_edge.modelname - - # to_backend +def _to_edge_and_lower_llama_xnnpack( + builder_exported, + modelname, + additional_passes, + pt2e_quant_params, + quantizers, + quant_dtype, + args, +) -> LLMEdgeManager: # noqa: C901 partitioners = [] # Order matters here, dynamic quantization should be applied first when both xnnpack and xnnpack_extended_ops are enabled - if ( - pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None - ) or (args.xnnpack): - partitioners.append( - get_xnnpack_partitioner(dynamic_quant_only_partitioner=True) - ) + partitioners.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=True)) - # force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False - args.xnnpack = True - modelname = f"xnnpack_dq_{modelname}" + modelname = f"xnnpack_dq_{modelname}" if args.xnnpack_extended_ops: - assert args.xnnpack, "xnnpack_extended_ops requires xnnpack to be enabled" partitioners.append( get_xnnpack_partitioner(dynamic_quant_only_partitioner=False) ) modelname = f"xnnpack_{modelname}" + logging.info("Lowering model using following partitioner(s): ") + for partitioner in partitioners: + logging.info(f"--> {partitioner.__class__.__name__}") + + # TODO: Enable generating ETRecord with XNNPack and to_edge_transform_and_lower(). 
+ if args.generate_etrecord: + raise NotImplementedError( + "export_llama does not support XNNPack and generating ETRecord at the moment." + ) + + builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower( + partitioners + ) + if args.verbose: + print_delegation_info(builder.edge_manager.exported_program().graph_module) + + return builder.to_executorch(passes=additional_passes) + + +def _to_edge_and_lower_llama( # noqa: C901 + builder_exported, + modelname, + additional_passes, + pt2e_quant_params, + quantizers, + quant_dtype, + args, +): + builder_exported_to_edge = builder_exported.pt2e_quantize( + quantizers + ).export_to_edge() + + # to_backend + partitioners = [] if args.vulkan: partitioners.append( get_vulkan_partitioner( @@ -729,7 +783,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 modelname = f"vulkan_{modelname}" # Need to remove asserts from the graph to prevent graph breaks - # pyre-ignore: Undefined attribute [16]: `Optional` has no attribute `exported_program`. remove_asserts(builder_exported_to_edge.edge_manager.exported_program()) if args.mps: @@ -758,19 +811,15 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils` from executorch.backends.qualcomm.utils.utils import _transform, tag_quant_io - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program` _transform(builder_exported_to_edge.edge_manager.exported_program()) if args.num_sharding > 0: model_sharding.split_graph( builder_exported_to_edge.edge_manager.exported_program(), - # pyre-fixme[16]: `Optional` has no attribute `__getitem__`. builder_exported_to_edge.metadata["get_n_layers"], shares=args.num_sharding, ) - from functools import partial - # pyre-ignore from executorch.backends.qualcomm.quantizer.custom_annotation import ( get_custom_quant_ios_dtype, @@ -790,19 +839,15 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 atten.head_dim, ) ) - # pyre-ignore tag_quant_io( builder_exported_to_edge.edge_manager.exported_program().graph_module, - partial(get_custom_quant_ios_dtype, cache_shape), # pyre-ignore + partial(get_custom_quant_ios_dtype, cache_shape), ) logging.info("Lowering model using following partitioner(s): ") for partitioner in partitioners: logging.info(f"--> {partitioner.__class__.__name__}") - additional_passes = [] - if args.model in TORCHTUNE_DEFINED_MODELS: - additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])] if args.generate_etrecord: if not builder_exported_to_edge.edge_manager: raise ValueError("Unable to generate etrecord due to missing edge manager.") @@ -816,7 +861,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`. canonicalize_program(builder.edge_manager.exported_program()) builder = builder.to_executorch( @@ -838,11 +882,55 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 if args.num_sharding > 0 and args.qnn: from executorch.backends.qualcomm.utils.utils import canonicalize_program - # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`. 
canonicalize_program(builder.edge_manager.exported_program()) builder = builder.to_executorch(passes=additional_passes) + return builder + + +def _export_llama(args) -> LLMEdgeManager: # noqa: C901 + _validate_args(args) + + pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args) + + additional_passes = [] + if args.model in TORCHTUNE_DEFINED_MODELS: + additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])] + + # export_to_edge + builder_exported = _prepare_for_llama_export(args).export() + builder_exported.run_canonical_optimizations() + modelname = builder_exported.modelname + + if args.export_only: + exit() + + if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None: + # Force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False + args.xnnpack = True + + if args.xnnpack: + builder = _to_edge_and_lower_llama_xnnpack( + builder_exported, + modelname, + additional_passes, + pt2e_quant_params, + quantizers, + quant_dtype, + args, + ) + else: + builder = _to_edge_and_lower_llama( + builder_exported, + modelname, + additional_passes, + pt2e_quant_params, + quantizers, + quant_dtype, + args, + ) + if args.profile_memory: generate_memory_trace(builder.export_program, "memory_profile.json") @@ -864,7 +952,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 output_file = f"{builder.output_dir}/{modelname}.pte" builder.save_to_pte(output_file) - return builder @@ -906,7 +993,7 @@ def _load_llama_model( *, checkpoint: Optional[str] = None, checkpoint_dir: Optional[str] = None, - params_path: str, + params_path: Optional[str] = None, use_kv_cache: bool = False, use_sdpa_with_kv_cache: bool = False, generate_full_logits: bool = False, @@ -933,13 +1020,6 @@ def _load_llama_model( An instance of LLMEdgeManager which contains the eager mode model. """ - assert ( - checkpoint or checkpoint_dir - ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty" - logging.info( - f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" - ) - if modelname in EXECUTORCH_DEFINED_MODELS: module_name = "llama" model_class_name = "Llama2Model" # TODO: Change to "LlamaModel" in examples/models/llama/model.py. 
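Taken together, the hunks above replace the old monolithic _export_llama with a shared front end and two lowering back ends: _to_edge_and_lower_llama_xnnpack quantizes and lowers in a single to_edge_transform_and_lower() call, while _to_edge_and_lower_llama keeps the two-step export_to_edge() plus to_backend() flow that Vulkan, MPS, CoreML, and QNN rely on. A condensed sketch of the resulting dispatch (a restatement of the diff's own control flow, not standalone runnable code; the export_only early exit and the TorchTune-specific passes are elided):

    def _export_llama_sketch(args):  # hypothetical name, mirrors _export_llama
        _validate_args(args)
        pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)

        builder_exported = _prepare_for_llama_export(args).export()
        builder_exported.run_canonical_optimizations()
        modelname = builder_exported.modelname

        # pt2e dynamic linear quantization implies the XNNPACK path.
        if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None:
            args.xnnpack = True

        lower = _to_edge_and_lower_llama_xnnpack if args.xnnpack else _to_edge_and_lower_llama
        return lower(
            builder_exported, modelname, [], pt2e_quant_params, quantizers, quant_dtype, args
        )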
@@ -952,6 +1032,8 @@ def _load_llama_model(
     else:
         raise ValueError(f"{modelname} is not a valid Llama model.")
 
+    torch_dtype = dtype_override.to_torch_dtype() if dtype_override else None
+
     model, example_inputs, example_kwarg_inputs, dynamic_shapes = (
         EagerModelFactory.create_model(
             module_name,
@@ -968,41 +1050,16 @@ def _load_llama_model(
             enable_dynamic_shape=enable_dynamic_shape,
             input_prune_map_path=input_prune_map_path,
             output_prune_map_path=output_prune_map_path,
+            dtype=torch_dtype,
             args=args,
         )
     )
 
-    if dtype_override:
-        assert isinstance(
-            dtype_override, DType
-        ), "Override dtype needs to be of type "
-        torch_dtype = dtype_override.to_torch_dtype()
-        logging.info(f"model.to {torch_dtype}")
-        model = model.to(dtype=torch_dtype)
-        dtype = dtype_override
-    else:
-        state_dict = model.state_dict()
-        dtype = state_dict[next(iter(state_dict))].dtype
-        assert dtype in [
-            torch.bfloat16,
-            torch.float16,
-            torch.float32,
-        ], f"Only support bfloat16, fp16 or fp32 got {dtype}"
-        logging.info(f"Loaded model with dtype={dtype}")
-
-        if dtype == torch.bfloat16:
-            dtype = DType.bf16
-        elif dtype == torch.float16:
-            dtype = DType.fp16
-        elif dtype == torch.float32:
-            dtype = DType.fp32
-        else:
-            raise ValueError(f"Unsupported dtype {dtype}")
 
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
         max_seq_len=model.max_seq_len,
-        dtype=dtype,
+        dtype=dtype_override,
         use_kv_cache=use_kv_cache,
         generate_full_logits=generate_full_logits,
         example_inputs=example_inputs,
@@ -1039,8 +1096,31 @@ def _load_llama_model(
 
 
 def _get_source_transforms(  # noqa
-    modelname: str, dtype_override: Optional[DType], args
+    modelname: str,
+    dtype_override: DType,
+    *,
+    checkpoint_dtype: Optional[DType] = None,
+    args,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
+    """
+    Return a list of functions that transform a graph.
+
+    Args:
+        modelname: The name of the model.
+        dtype_override: The dtype to use for the model.
+        checkpoint_dtype: The dtype of the checkpoint. At the moment, if this is specified,
+            it means that you want to run quantize transformations on the weights represented
+            in their original dtype, while the overall dtype of the model may be something
+            different. If not specified, defaults to dtype_override.
+        args: The arguments passed to the script.
+
+    Returns:
+        A list of transformation functions.
+    """
+
+    if not checkpoint_dtype:
+        checkpoint_dtype = dtype_override
+
     transforms = []
 
     if args.use_spin_quant:
@@ -1057,6 +1137,21 @@ def _get_source_transforms(  # noqa
 
             transforms.append(inject_fast_hadamard_transform_native_for_spin_quant)
 
+    if args.embedding_quantize:
+        """
+        When this option is selected, it finds all embedding layers and transforms
+        them into the equivalent quantized embedding module.
+
+        There are cases where the checkpoint is already quantized, for example
+        when use_spin_quant is enabled. In that case, it will do the appropriate
+        transformations based on the given checkpoint first. In those cases,
+        this will be a no-op.
+ """ + modelname = f"{modelname}_e" + transforms.append(get_quant_embedding_transform(args, checkpoint_dtype)) + + # quantization_mode should be applied after embedding_quantize + # to support shared_embedding if args.quantization_mode: """ When this option is selected, it finds all linear layers and transforms @@ -1073,22 +1168,13 @@ def _get_source_transforms( # noqa """ modelname = f"{modelname}_q" transforms.append( - get_quant_weight_transform(args, dtype_override, verbose_export()) + get_quant_weight_transform( + args=args, + computation_dtype=dtype_override, + checkpoint_dtype=checkpoint_dtype, + ) ) - if args.embedding_quantize: - """ - When this option is selected, it finds all embedding layers and transforms - into quantized embedding equivalent module. - - There are cases where the checkpoint is already quantized, for example - on use_spin_quant is enabled. In that case, it will do the appropriate - transformations based on the given checkpoint first. In those cases, - this wil be a no-op. - """ - modelname = f"{modelname}_e" - transforms.append(get_quant_embedding_transform(args)) - if args.expand_rope_table: transforms.append(materialze_broadcast_of_rope_freq_cis) @@ -1148,3 +1234,14 @@ def _get_source_transforms( # noqa transforms.append(replace_with_vulkan_rotary_emb) return transforms + + +def get_llama_model(args): + _validate_args(args) + e_mgr = _prepare_for_llama_export(args) + model = ( + e_mgr.model.eval().to(device="cuda") + if torch.cuda.is_available() + else e_mgr.model.eval().to(device="cpu") + ) + return model, e_mgr.example_inputs, e_mgr.metadata diff --git a/examples/models/llama/llama_test.py b/examples/models/llama/llama_test.py deleted file mode 100644 index f3c1c1e9070..00000000000 --- a/examples/models/llama/llama_test.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -from .export_llama import build_model - - -class LlamaTest(unittest.TestCase): - def test_quantized_llama(self): - _ = build_model( - modelname="model", - extra_opts="--fairseq2 -qmode int8", - par_local_output=True, - resource_pkg_name=__name__, - ) - - def test_half_llama(self): - _ = build_model( - modelname="model", - extra_opts="--fairseq2 -d fp16", - par_local_output=True, - resource_pkg_name=__name__, - ) - - -# def test_half_xnnpack_llama(self): -# output_path = build_model( -# modelname="model", -# extra_opts="--fairseq2 -d fp16 -X", -# par_local_output=True, -# resource_pkg_name=__name__, -# ) diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 7a0db6adf02..5c8db7f208d 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -18,59 +18,11 @@ ) from executorch.examples.models.llama.model_args import ModelArgs - +from executorch.examples.models.llama.norm import RMSNorm from executorch.examples.models.llama.rope import Rope - from torch import nn -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - """ - Initialize the RMSNorm normalization layer. - - Args: - dim (int): The dimension of the input tensor. - eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. - - Attributes: - eps (float): A small value added to the denominator for numerical stability. 
- weight (nn.Parameter): Learnable scaling parameter. - - """ - super().__init__() - self.dim = dim - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - """ - Apply the RMSNorm normalization to the input tensor. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The normalized tensor. - - """ - return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - """ - Forward pass through the RMSNorm layer. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The output tensor after applying RMSNorm. - - """ - output = self._norm(x.float()).type_as(x) - return output * self.weight - - class FeedForward(nn.Module): def __init__(self, args: ModelArgs): super().__init__() @@ -170,14 +122,24 @@ def __init__(self, params: ModelArgs): self.params = params self.vocab_size = params.vocab_size self.n_layers = params.n_layers + self.apply_embedding = params.apply_embedding + self.apply_output = params.apply_output - self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.tok_embeddings = ( + nn.Embedding(params.vocab_size, params.dim) + if self.apply_embedding + else None + ) self.rope = Rope(params) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): self.layers.append(TransformerBlock(layer_id, params, self.rope)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + self.output = ( + nn.Linear(params.dim, params.vocab_size, bias=False) + if self.apply_output + else None + ) self.use_kv_cache = params.use_kv_cache self.generate_full_logits = params.generate_full_logits self.max_seq_len = params.max_seq_len @@ -195,7 +157,7 @@ def forward( raise ValueError( "You cannot specify both tokens and h at the same time, and must specify either one" ) - if tokens is not None and h is None: + if self.apply_embedding and tokens is not None and h is None: h = self.tok_embeddings(tokens) if attn_options is None: @@ -219,29 +181,32 @@ def forward( h = self.norm(h) - logits = self.output(h) - - if self.output_prune_map is not None: - # expand to original size so that downstream applications can use the logits as-is. - if self.generate_full_logits: - # (1, seq_len, pruned_size) -> (1, seq_len, original_size) - expanded_logits = torch.full( - [logits.shape[0], logits.shape[1], self.vocab_size], - float("-inf"), - device=logits.device, - dtype=logits.dtype, - ) - expanded_logits[:, :, list(self.output_prune_map.values())] = logits - else: - # (1, pruned_size) -> (1, original_size) - expanded_logits = torch.full( - [logits.shape[0], self.vocab_size], - float("-inf"), - device=logits.device, - dtype=logits.dtype, - ) - expanded_logits[:, list(self.output_prune_map.values())] = logits - logits = expanded_logits + if self.apply_output: + logits = self.output(h) + + if self.output_prune_map is not None: + # expand to original size so that downstream applications can use the logits as-is. 
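+                # For example, with vocab_size=32000 and an output prune map keeping
+                # 1000 token ids, logits of shape (1, 1000) are scattered back into a
+                # (1, 32000) tensor pre-filled with -inf at the original token ids
+                # (illustrative numbers, not from this diff).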
+ if self.generate_full_logits: + # (1, seq_len, pruned_size) -> (1, seq_len, original_size) + expanded_logits = torch.full( + [logits.shape[0], logits.shape[1], self.vocab_size], + float("-inf"), + device=logits.device, + dtype=logits.dtype, + ) + expanded_logits[:, :, list(self.output_prune_map.values())] = logits + else: + # (1, pruned_size) -> (1, original_size) + expanded_logits = torch.full( + [logits.shape[0], self.vocab_size], + float("-inf"), + device=logits.device, + dtype=logits.dtype, + ) + expanded_logits[:, list(self.output_prune_map.values())] = logits + logits = expanded_logits + else: + logits = h if attn_options_update is not None: return logits, attn_options_update diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index 90582af4856..ec10ae5a649 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -38,14 +38,13 @@ def __init__(self, **kwargs): resource_dir = get_default_model_resource_dir(__file__) # Use single checkpoint file. - checkpoint_path = kwargs.get( - "checkpoint", resource_dir / "demo_rand_params.pth" - ) - params_path = kwargs.get("params", resource_dir / "demo_config.json") - + checkpoint_path = kwargs.get("checkpoint", None) # Check if checkpoint_dir was provided for a sharded checkpoint. checkpoint_dir = kwargs.get("checkpoint_dir", None) + # Params file. + params_path = kwargs.get("params", None) + self.use_kv_cache = kwargs.get("use_kv_cache", False) self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False) self.generate_full_logits = kwargs.get("generate_full_logits", False) @@ -66,6 +65,7 @@ def __init__(self, **kwargs): # flake8: noqa: TOR102 cps = [] # Load sharded checkpoint. + checkpoint = {} if checkpoint_dir is not None: # Load multiple checkpoint; ignore the single path. checkpoint_path = None @@ -93,7 +93,7 @@ def __init__(self, **kwargs): # Do not duplicate layers shared between each checkpoint. checkpoint[key] = cps[0][key] # Load single checkpoint. - else: + elif checkpoint_path: checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True) # If given checkpoint is fairseq, convert to llama checkpoint. @@ -122,11 +122,12 @@ def __init__(self, **kwargs): """ ) - # Get checkpoint dtype. - self.dtype = get_checkpoint_dtype(checkpoint) + # Get optional params. + params = {} + if params_path: + with open(params_path, "r") as f: + params = json.loads(f.read()) - with open(params_path, "r") as f: - params = json.loads(f.read()) output_prune_map = None if self.output_prune_map_path is not None: with open(self.output_prune_map_path, "r") as f: @@ -171,7 +172,13 @@ def __init__(self, **kwargs): # Within the device="meta" context, tensors that are created do not carry data. # They possess all other metadata a tensor carries such as size, stride, requires_grad. with torch.device("meta"): + # Model itself is loaded in default dtype, fp32. self.model_ = Transformer(model_args) + # Get checkpoint dtype. + if checkpoint: + self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint) + else: + self.model_.checkpoint_dtype = None if "int8" in str(checkpoint_path): print("Using int8 weight-only quantization!") @@ -236,14 +243,30 @@ def __init__(self, **kwargs): eviction_batch_size=eviction_batch_size, ) - # assign=True: load params/buffers by assignment instead of performing an in-place copy. - # Because we are using device="meta", tensors do not have memory associated with them - # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario. 
-        missing, unexpected = self.model_.load_state_dict(
-            checkpoint,
-            strict=False,
-            assign=True,
-        )  # self.model_ = Transformer(gptconf)
+        missing, unexpected = None, None
+        try:
+            # assign=True: load params/buffers by assignment instead of performing an in-place copy.
+            # Because we are using device="meta", tensors do not have memory associated with them
+            # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario.
+
+            # Also, the checkpoint is loaded and dtype promoted to the transformer's dtype, which is
+            # by default initialized to fp32. This is fine because every other supported type
+            # losslessly converts to fp32, so we don't lose precision here.
+            if checkpoint:
+                missing, unexpected = self.model_.load_state_dict(
+                    checkpoint,
+                    strict=False,
+                    assign=True,
+                )  # self.model_ = Transformer(gptconf)
+            else:
+                print("Checkpoint not provided, defaulting to uninitialized weights.")
+                self.model_.to_empty(device="cpu")
+        except RuntimeError as e:
+            print(
+                f"Could not load checkpoint into model and will default to uninitialized weights due to error: {e}."
+            )
+            # Need to provide concrete (empty) values for meta-initialized tensors for quantization.
+            self.model_.to_empty(device="cpu")
 
         if missing:
             missing_weights = [fqn for fqn in missing if fqn.endswith(".weight")]
@@ -268,14 +291,7 @@
             self.model_ = prune_output_vocab(self.model_, output_prune_map)
 
     def get_eager_model(self) -> torch.nn.Module:
-        if self.dtype:
-            # convert to the type of the provided checkpoint
-            # input and output are torch.long, so signature unchanged
-            return self.model_.to(self.dtype)
-        else:
-            # int8 quantization code has some bf16,
-            # switch all to FP32
-            return self.model_.to(torch.float32)
+        return self.model_
 
     def get_example_inputs(self):
         if self.use_kv_cache:
diff --git a/examples/models/llama/model_args.py b/examples/models/llama/model_args.py
index e1c4edb8e93..fdbd18eca1d 100644
--- a/examples/models/llama/model_args.py
+++ b/examples/models/llama/model_args.py
@@ -8,7 +8,7 @@ class ModelArgs:
     n_layers: int = 32
     n_heads: int = 32
     n_kv_heads: Optional[int] = None
-    vocab_size: int = -1  # defined later by tokenizer
+    vocab_size: int = 512  # Arbitrary value, should be defined later by tokenizer.
     hidden_dim: Optional[int] = None
     head_dim: Optional[int] = None  # Optional customized head_dim
     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
@@ -21,6 +21,7 @@ class ModelArgs:
     num_experts: int = 8  # Number of experts
     num_activated_experts: int = 2  # Number of experts to activate
     attention_type: str = "mha"  # Attention type, registered in attention.py
+    attention_qkv_bias: bool = False
     use_kv_cache: bool = False  # Use key/value cache
     use_sdpa_with_kv_cache_op: bool = (
         False  # Use custom sdpa op that updates kv cache in-place
@@ -34,7 +35,11 @@ class ModelArgs:
     input_prune_map: Optional[Dict[int, int]] = None
     # A dictionary mapping from pruned token-id to original token-id
     output_prune_map: Optional[Dict[int, int]] = None
+    apply_embedding: bool = True  # Use embedding inside the transformer
+    apply_output: bool = True  # Use output layer (unembedding) inside the transformer
+    use_qk_norm: bool = False  # apply normalization to q and k in the attention
     use_hf_rope: bool = False  # Use HuggingFace's RoPE implementation
+    partial_rotary_factor: float = 1.0
     rope_theta: Optional[float] = (
         None  # The official name to override self.rope_freq_base.
    )
diff --git a/examples/models/llama/norm.py b/examples/models/llama/norm.py
new file mode 100644
index 00000000000..3786e61cd05
--- /dev/null
+++ b/examples/models/llama/norm.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        """
+        Initialize the RMSNorm normalization layer.
+
+        Args:
+            dim (int): The dimension of the input tensor.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter.
+
+        """
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        """
+        Apply the RMSNorm normalization to the input tensor.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The normalized tensor.
+
+        """
+        return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        """
+        Forward pass through the RMSNorm layer.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py
index 01352f404df..02eb564ed76 100644
--- a/examples/models/llama/rope.py
+++ b/examples/models/llama/rope.py
@@ -114,6 +114,7 @@ def apply_rotary_emb_to_k(
     return xk_out.type_as(xk)
 
 
+# Wrap apply_rotary_emb in a module so that it can be module-swapped out.
 class RotaryEmbedding(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -133,11 +134,21 @@ def forward(
 
 
 # Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L77
-def hf_precompute_freqs_cis(dim: int, end: int, theta: float):
+# and https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L242.
+# Currently only supports non-long RoPE.
+def hf_precompute_freqs_cis(
+    dim: int, end: int, theta: float, partial_rotary_factor: float = 1.0
+):
+    # Partial rotary embeddings.
+    dim = int(dim * partial_rotary_factor)
+
+    # Short factor scaling.
     freqs = 1.0 / (
         theta ** (torch.arange(0, dim, 2, device="cpu", dtype=torch.int64).float() / dim)
     )
+    # TODO: support long factor scaling.
+    # pyre-ignore Undefined attribute [16]: `float` has no attribute `device`.
t = torch.arange(end, device=freqs.device, dtype=torch.int64).type_as( freqs # pyre-ignore @@ -179,8 +190,13 @@ def hf_apply_rotary_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """ cos = cos.unsqueeze(unsqueeze_dim) sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) + + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1) + k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1) return q_embed, k_embed @@ -213,14 +229,23 @@ class Rope(torch.nn.Module): def __init__(self, params: ModelArgs): super().__init__() self.params = params + + # Choose the appropriate RoPE implementation if self.params.use_hf_rope: - self.precompute_freqs_cis = hf_precompute_freqs_cis + self.precompute_freqs_cis = partial( + hf_precompute_freqs_cis, + partial_rotary_factor=self.params.partial_rotary_factor, + ) + self.apply_rotary_emb = hf_apply_rotary_emb else: self.precompute_freqs_cis = partial( precompute_freqs_cis, use_scaled=self.params.use_scaled_rope, scale_factor=self.params.rope_scale_factor, ) + self.apply_rotary_emb = RotaryEmbedding() + + # Precompute frequencies freqs_cos, freqs_sin = self.precompute_freqs_cis( self.params.head_dim, ( @@ -232,10 +257,6 @@ def __init__(self, params: ModelArgs): ) self.register_buffer("freqs_cos", freqs_cos, persistent=False) self.register_buffer("freqs_sin", freqs_sin, persistent=False) - if self.params.use_hf_rope: - self.apply_rotary_emb = hf_apply_rotary_emb - else: - self.apply_rotary_emb = RotaryEmbedding() def forward( self, diff --git a/examples/models/llama/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt index 79fcd267af0..04fe23e4d82 100644 --- a/examples/models/llama/runner/CMakeLists.txt +++ b/examples/models/llama/runner/CMakeLists.txt @@ -20,8 +20,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. 
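Stepping back to the rope.py hunks above: with partial_rotary_factor < 1.0, only the first int(head_dim * partial_rotary_factor) channels of each head are rotated and the remainder passes through unchanged, which is exactly what the q_rot/q_pass split in hf_apply_rotary_emb implements. A minimal standalone sketch with made-up shapes (rotate_half is re-declared here to mirror the HuggingFace helper the diff calls):

    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    head_dim, partial_rotary_factor = 64, 0.5
    rotary_dim = int(head_dim * partial_rotary_factor)  # only 32 of 64 channels rotate

    q = torch.randn(2, 8, 16, head_dim)        # (bsz, n_heads, seq_len, head_dim)
    cos = torch.ones(1, 1, 16, rotary_dim)     # placeholder frequencies
    sin = torch.zeros(1, 1, 16, rotary_dim)

    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    assert q_embed.shape == q.shape  # trailing head_dim - rotary_dim channels untouched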
@@ -43,7 +43,7 @@ target_include_directories( list( APPEND _llama_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/tiktoken.cpp ) list(APPEND _llama_runner__srcs ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp @@ -66,11 +66,11 @@ set(ABSL_PROPAGATE_CXX_STD ON) set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp ) add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/third-party/re2 + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2 ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) @@ -82,6 +82,11 @@ set(llama_runner_deps executorch extension_data_loader extension_module target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) target_include_directories( - llama_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT} + llama_runner + INTERFACE ${_common_include_directories} +) +target_include_directories( + llama_runner + PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) target_compile_options(llama_runner PUBLIC ${_preprocessor_flag}) diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py index 3e9ceb34af5..4ba645ffd87 100644 --- a/examples/models/llama/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import time from abc import ABC, abstractmethod from typing import List, Optional @@ -97,6 +98,7 @@ def generate( # noqa: C901 pos_base: int = 0, ) -> List[int]: # Prefill + prefill_start = time.time() logits = self.forward( tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device), input_pos=( @@ -105,11 +107,13 @@ def generate( # noqa: C901 else None ), ) + prefill_time = time.time() - prefill_start current_token = next_token(logits, temperature, top_p) print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) tokens = prompt_tokens + [current_token] + generate_start = time.time() while len(tokens) < max_seq_len: if self.use_kv_cache: logits = self.forward( @@ -140,6 +144,10 @@ def generate( # noqa: C901 print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True) print("\n") + generate_time = time.time() - generate_start + print(f"Prefill time: {prefill_time}") + print(f"Generation tok/s: {len(tokens) / generate_time}") + return tokens if echo else tokens[len(prompt_tokens) :] def text_completion( diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 42a1a632dc6..e0a317aaff3 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include namespace example { @@ -39,12 +39,12 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; Runner::Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature) + const float temperature, + std::optional data_path) // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. 
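+    // data_path, when provided, selects the two-argument Module constructor in
+    // the body below so that weights can be loaded from a file separate from
+    // the program (overload inferred from its use here, not from Module docs).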
: temperature_(temperature), - module_(std::make_unique(model_path, Module::LoadMode::File)), tokenizer_path_(tokenizer_path), metadata_({ {kEnableDynamicShape, false}, @@ -52,6 +52,12 @@ Runner::Runner( {kUseKVCache, true}, {kUseSDPAWithKVCache, false}, }) { + if (data_path.has_value()) { + module_ = std::make_unique( + model_path, data_path.value(), Module::LoadMode::File); + } else { + module_ = std::make_unique(model_path, Module::LoadMode::File); + } ET_LOG( Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", @@ -72,17 +78,21 @@ Error Runner::load() { // load tokenizer. Assuming tiktoken is the default tokenizer tokenizer_ = nullptr; tokenizer_ = get_tiktoken_for_llama(); - Error err = tokenizer_->load(tokenizer_path_); + ::tokenizers::Error err = tokenizer_->load(tokenizer_path_); // Rely on tiktoken to throw error if the artifact is incompatible. Then we // fallback to BPE tokenizer. - if (err == Error::InvalidArgument) { + if (err != ::tokenizers::Error::Ok) { ET_LOG( Info, "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); - tokenizer_ = std::make_unique(); - tokenizer_->load(tokenizer_path_); + tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>(); + err = tokenizer_->load(tokenizer_path_); + ET_CHECK_TK_OK_OR_RETURN_ERROR( + err, + "Failed to load %s as a llama2.c tokenizer artifact", + tokenizer_path_.c_str()); } ET_LOG(Info, "Reading metadata from model"); @@ -195,12 +205,12 @@ Error Runner::generate( ? seq_len : metadata_.at(kMaxSeqLen); - Result> encode_res = tokenizer_->encode( + ::tokenizers::Result> encode_res = tokenizer_->encode( prompt, /* bos */ 0, /* eos */ 0); - ET_CHECK_OK_OR_RETURN_ERROR( + ET_CHECK_TK_OK_OR_RETURN_ERROR( encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); // encode the (string) prompt into tokens sequence @@ -236,7 +246,8 @@ Error Runner::generate( uint64_t cur_token = prefill_res.get(); // print the first token from prefill. No prev_token so use cur_token for it. 
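+  // The TK-suffixed macros below bridge error domains: the tokenizer library
+  // reports ::tokenizers::Error while the runner returns
+  // ::executorch::runtime::Error, so tokenizer results are checked and
+  // unwrapped at this boundary (macro semantics inferred from usage in this
+  // diff).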
- wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token))); + wrapped_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); RUNNER_ET_LOG( warmup, "RSS after prompt prefill: %f MiB (0 if unsupported)", diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index 5b3bb010112..509fe234027 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -22,8 +23,8 @@ #include #include #include -#include #include +#include namespace example { @@ -32,7 +33,8 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { explicit Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature = 0.8f); + const float temperature = 0.8f, + std::optional data_path = std::nullopt); bool is_loaded() const; ::executorch::runtime::Error load(); @@ -56,7 +58,7 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { // model std::unique_ptr<::executorch::extension::Module> module_; std::string tokenizer_path_; - std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; std::unordered_map metadata_; std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> text_decoder_runner_; diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h index f5a8c04b085..69c22dd2530 100644 --- a/examples/models/llama/runner/static_attention_io_manager.h +++ b/examples/models/llama/runner/static_attention_io_manager.h @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ #include #include @@ -38,6 +44,11 @@ class StaticKVCache { reset(); } + StaticKVCache(const StaticKVCache& other) = delete; + StaticKVCache& operator=(const StaticKVCache& other) = delete; + StaticKVCache(StaticKVCache&& other) = delete; + StaticKVCache& operator=(StaticKVCache&& other) = delete; + ~StaticKVCache() { allocator_.deallocate(data_, data_size_); } @@ -200,6 +211,15 @@ class StaticAttentionMask { reset(); } + StaticAttentionMask(const StaticAttentionMask& other) = delete; + StaticAttentionMask& operator=(const StaticAttentionMask& other) = delete; + StaticAttentionMask(StaticAttentionMask&& other) = delete; + StaticAttentionMask& operator=(StaticAttentionMask&& other) = delete; + + ~StaticAttentionMask() { + allocator_.deallocate(data_, data_size_); + } + /** * Reset the mask to the state where the cache contains no valid data. 
*/ @@ -315,7 +335,7 @@ class StaticAttentionIOManager { input_pos_ += update_len; kCaches_.update(method, k_cache_output_indices, update_len); vCaches_.update(method, v_cache_output_indices, update_len); - for (auto it : attentionMasks_) { + for (auto& it : attentionMasks_) { it.second.updateCacheMask(update_len); } } @@ -324,7 +344,7 @@ class StaticAttentionIOManager { input_pos_ = 0; kCaches_.reset(); vCaches_.reset(); - for (auto it : attentionMasks_) { + for (auto& it : attentionMasks_) { it.second.reset(); } } diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 9f095b93970..60fc1f2b74d 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -3,9 +3,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def _get_operator_lib(aten = False): if aten: return ["//executorch/kernels/aten:generated_lib"] - elif runtime.is_oss: - # TODO(T183193812): delete this path after optimized-oss.yaml is no more. - return ["//executorch/configurations:optimized_native_cpu_ops_oss", "//executorch/extension/llm/custom_ops:custom_ops"] else: return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"] @@ -13,7 +10,7 @@ def get_qnn_dependency(): # buck build -c executorch.enable_qnn=true //executorch/examples/models/llama/runner:runner # Check if QNN is enabled before including the dependency if native.read_config("executorch", "enable_qnn", "false") == "true": - # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, + # //executorch/backends/qualcomm:qnn_executorch_backend doesn't work, # likely due to it's an empty library with dependency only return [ "//executorch/backends/qualcomm/runtime:runtime", @@ -51,7 +48,7 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/examples/models/llama/tokenizer:tiktoken", - "//executorch/extension/llm/tokenizer:bpe_tokenizer", + "//pytorch/tokenizers:llama2c_tokenizer", ] + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE) # Therefore enable it explicitly for now to avoid failing tests diff --git a/examples/models/llama/source_transformation/pre_quantization.py b/examples/models/llama/source_transformation/pre_quantization.py index b6540b7f3ee..d284512e712 100644 --- a/examples/models/llama/source_transformation/pre_quantization.py +++ b/examples/models/llama/source_transformation/pre_quantization.py @@ -44,7 +44,7 @@ def replacement_fn(child: torch.nn.Module) -> torch.nn.Module: # pyre-fixme[6]: For 2nd argument expected `int` but got `Union[Module, # Tensor]`. 
             child.out_features,
-            bias=False,
+            bias=child.bias is not None,
             device=child.weight.device,
             groupsize=group_size,
             precision=precision,
diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py
index 8923ab1fdec..36743bb3b79 100644
--- a/examples/models/llama/source_transformation/quantize.py
+++ b/examples/models/llama/source_transformation/quantize.py
@@ -14,12 +14,11 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer
-
 from executorch.extension.llm.export.builder import DType
 
 from sentencepiece import SentencePieceProcessor
 
+
 try:
     from fairseq2.nn.embedding import (
         Embedding as fsEmbedding,
@@ -38,7 +37,8 @@ def quantize(  # noqa C901
     model: torch.nn.Module,
     qmode: str,
-    activation_dtype: Optional[DType],
+    computation_dtype: Optional[DType] = None,
+    checkpoint_dtype: Optional[DType] = None,
     checkpoint_path: Optional[Path] = None,
     # following arguments only available when setting int4 or gptq quantization.
     group_size: Optional[int] = 128,
@@ -54,24 +54,33 @@ def quantize(  # noqa C901
 ) -> torch.nn.Module:
     """
     Quantizes a model by converting all weights to int8.
+
     Args:
-        model: A model to quantize.
-        qmode: quantization mode, e.g. int8, 8da4w, 8da4w-gptq
+        model: The model to quantize.
+        qmode: The quantization mode, e.g. int8, 8da4w, 8da4w-gptq.
+        computation_dtype: The dtype that ops are performed in (the resulting dtype of dequantization).
+            Also the dtype of the rest of the non-quantized components of the model.
+        checkpoint_dtype: The dtype of the checkpoint. This arg exists since it is more accurate to
+            quantize the weight in its original dtype.
+
     Returns:
         A quantized model.
     """
-    if activation_dtype is not None:
-        torch_dtype = activation_dtype.to_torch_dtype()
+    if computation_dtype:
+        computation_torch_dtype = computation_dtype.to_torch_dtype()
     else:
-        torch_dtype = torch.float16
+        computation_torch_dtype = torch.float32
 
-    assert checkpoint_path, "Need to specify a checkpoint"
-    # if checkpoint_path is None:
-    #     checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+    if not checkpoint_dtype:
+        checkpoint_torch_dtype = computation_torch_dtype
+    else:
+        checkpoint_torch_dtype = checkpoint_dtype.to_torch_dtype()
 
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
-        return WeightOnlyInt8QuantHandler(model).quantized_model()
+        return WeightOnlyInt8QuantHandler(
+            model, precision=checkpoint_torch_dtype
+        ).quantized_model()
     elif qmode.startswith("torchao:fpa"):
         pattern = r"torchao:fpa(\d+)w"
         matches = re.findall(pattern, qmode)
@@ -81,10 +90,12 @@ def quantize(  # noqa C901
         from torchao.experimental.quant_api import UIntxWeightOnlyLinearQuantizer
 
         with torch.no_grad():
+            # This quantize() currently does a model.to(self.precision), so it cannot
+            # decouple the computation and checkpoint dtypes.
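+            # Decoupled example (illustrative, mirroring the 8da4w path below): a
+            # bf16 checkpoint exported with a fp32 override quantizes against the
+            # bf16 weights (checkpoint_torch_dtype) but dequantizes and computes in
+            # fp32 (computation_torch_dtype); this MPS path instead collapses both.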
model = ( UIntxWeightOnlyLinearQuantizer( device="mps", - precision=torch.float32, + precision=computation_torch_dtype, groupsize=group_size, bitwidth=bitwidth, ) @@ -100,18 +111,24 @@ def quantize( # noqa C901 matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - _load_torchao_aten_lib(libname="libtorchao_ops_aten") - from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer - with torch.no_grad(): - model = Int8DynActIntxWeightLinearQuantizer( - device="cpu", - precision=torch.float32, - groupsize=group_size, - bitwidth=bitwidth, - has_weight_zeros=False, - ).quantize(model) + from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig + from torchao.quantization.granularity import PerGroup, PerRow + from torchao.quantization.quant_api import quantize_ + from torchao.utils import unwrap_tensor_subclass + with torch.no_grad(): + # Computation dtype is fixed to fp32 in the implementation of quantize_, so + # no way to decouple checkpoint and computation dtype. + quantize_( + model, + Int8DynamicActivationIntxWeightConfig( + weight_dtype=getattr(torch, f"int{bitwidth}"), + granularity=(PerRow() if group_size == 0 else PerGroup(group_size)), + has_weight_zeros=False, + ), + ) + model = unwrap_tensor_subclass(model) if verbose: print("quantized model:", model) return model @@ -121,9 +138,12 @@ def quantize( # noqa C901 raise Exception("For 8da4w quantization, group size must be specified.") from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer + # 1. Quantize in checkpoint dtype. model = Int8DynActInt4WeightQuantizer( - precision=torch_dtype, groupsize=group_size + precision=checkpoint_torch_dtype, groupsize=group_size ).quantize(model) + # 2. Set the computation dtype (what weights/acts dequantize to). + model = set_8da4w_computation_dtype(model, computation_torch_dtype) if verbose: print("quantized model:", model) @@ -151,6 +171,7 @@ def quantize( # noqa C901 from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer if tokenizer_path is None: + assert checkpoint_path is not None, "checkpoint_path must be specified" tokenizer_path = checkpoint_path.parent / "tokenizer.model" assert tokenizer_path.is_file(), tokenizer_path tokenizer = SentencePieceProcessor( # pyre-ignore[28] @@ -176,10 +197,12 @@ def quantize( # noqa C901 blocksize, percdamp, group_size, - ) + ) # TODO: separate computation and checkpoint dtype for GPTQ. model = gptq_quantizer.quantize(model, inputs) return model elif qmode == "vulkan_4w": + from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer + q_group_size = 256 if group_size is None else group_size model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model) @@ -187,9 +210,12 @@ def quantize( # noqa C901 # at the moment from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer + # 1. Quantize in checkpoint dtype. model = Int8DynActInt4WeightQuantizer( - precision=torch_dtype, groupsize=q_group_size + precision=checkpoint_torch_dtype, groupsize=q_group_size ).quantize(model) + # 2. Set the computation dtype (what weights/acts dequantize to). 
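+        # set_8da4w_computation_dtype (defined near the bottom of this file) walks
+        # the module tree and updates `precision` on every Int8DynActInt4WeightLinear,
+        # so step 1's quantization parameters stay in the checkpoint dtype while the
+        # dequantized weights/activations switch to the computation dtype.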
+        model = set_8da4w_computation_dtype(model, computation_torch_dtype)
 
         return model
     else:
@@ -345,6 +371,7 @@ def __init__(
         node_type: str = "*",
         bitwidth: Optional[int] = None,
         group_size: Optional[int] = None,
+        precision: torch.dtype = torch.float32,
     ):
         self.mod = mod
         self.group_size = group_size
@@ -353,6 +380,7 @@ def __init__(
             self.bitwidth = 8
         else:
             self.bitwidth = bitwidth
+        self.precision = precision
 
     @torch.no_grad()
     def create_quantized_state_dict(self) -> Dict:
@@ -388,7 +416,7 @@ def create_quantized_state_dict(self) -> Dict:
                 # print(f"expanded weight shape {input_weight.shape}")
                 weight, scales, _ = dynamically_quantize_per_channel(
-                    input_weight,
+                    input_weight.to(dtype=self.precision),
                     range_min,
                     range_max,
                     torch.int8,
@@ -573,6 +601,7 @@ def __init__(
         bitwidth: int = 8,
         group_size: Optional[int] = None,
         packed=False,
+        precision: Optional[torch.dtype] = None,
     ):
         if isinstance(packed, str):
             packed = packed == "True"
@@ -581,6 +610,8 @@ def __init__(
         self.group_size = group_size
         self.bitwidth = bitwidth
         self.packed = packed
+        # Dtype of the weights right before quantization.
+        self.precision = precision
         if (bitwidth not in [2, 4]) and packed:
             raise RuntimeError("pack only works with bitsize 2, 4")
 
@@ -611,7 +642,11 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                     f"quantize {fqn, mod} with group_size {self.group_size}, bitwidth {self.bitwidth}"
                 )
                 weight, scales, _ = dynamically_quantize_per_channel(
-                    mod.weight.float(),
+                    (
+                        mod.weight.to(dtype=self.precision)
+                        if self.precision
+                        else mod.weight
+                    ),
                     range_min,
                     range_max,
                     torch.int8,
@@ -747,22 +782,45 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
 
 ############################ Source Transform Start #######################
 
 
-def get_quant_embedding_transform(args):
+def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
     if args.embedding_quantize.startswith("torchao:"):
-        bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
+        from torchao.experimental.quant_api import (
+            EmbeddingQuantizer,
+            SharedEmbeddingQuantizer,
+        )
+        from torchao.quantization.granularity import PerGroup, PerRow
+
+        quant_args = args.embedding_quantize.split(":")[1].split(",")
+        if len(quant_args) == 2:
+            bitwidth, group_size = quant_args
+            has_weight_zeros = True
+        else:
+            bitwidth, group_size, has_weight_zeros = quant_args
+
+        if group_size in ["none", "None", "0"]:
+            group_size = 0
+
         group_size = int(group_size)
         bitwidth = int(bitwidth)
-        _load_torchao_aten_lib(libname="libtorchao_ops_aten")
-        from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer
+        # Parse the CLI string robustly; bool("False") would evaluate to True.
+        has_weight_zeros = has_weight_zeros in (True, "True", "true")
+        weight_dtype = getattr(torch, f"int{bitwidth}")
+        granularity = PerRow() if group_size == 0 else PerGroup(group_size)
 
         def _torchao_embedding_quantizer(model):
             with torch.no_grad():
-                model = IntxWeightEmbeddingQuantizer(
-                    device="cpu",
-                    precision=torch.float32,
-                    bitwidth=bitwidth,
-                    groupsize=group_size,
-                ).quantize(model)
+                if not args.use_shared_embedding:
+                    EmbeddingQuantizer(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                        use_fallback=False,
+                    ).quantize(model)
+                else:
+                    SharedEmbeddingQuantizer(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                    ).quantize(model)
             return model
 
         return _torchao_embedding_quantizer
 
@@ -773,16 +831,22 @@ def _torchao_embedding_quantizer(model):
     else:
         group_size = int(group_size)
         bitwidth = int(bitwidth)
+        torch_dtype = dtype_override.to_torch_dtype() if dtype_override else None
         return lambda model: EmbeddingQuantHandler(
             model,
             bitwidth=bitwidth,
             group_size=group_size,
             packed=(bitwidth in [2, 4]),
+            precision=torch_dtype,
         ).quantized_model()
 
 
-def get_quant_weight_transform(args, dtype_override, verbose):
-    # If these optional args are None, don't provide them to quantize()
+def get_quant_weight_transform(
+    args,
+    computation_dtype: Optional[DType] = None,
+    checkpoint_dtype: Optional[DType] = None,
+):
+    # If these optional args are None, don't provide them to quantize().
     quant_args_str = [
         "group_size",
         "calibration_tasks",
@@ -800,7 +864,8 @@ def get_quant_weight_transform(
         quantize,
         **quant_args,
         qmode=args.quantization_mode,
-        activation_dtype=dtype_override,
+        computation_dtype=computation_dtype,
+        checkpoint_dtype=checkpoint_dtype,
         checkpoint_path=(Path(path) if (path := args.checkpoint) is not None else None),
         tokenizer_path=(
             Path(path) if (path := args.tokenizer_path) is not None else None
@@ -827,4 +892,28 @@ def _load_torchao_aten_lib(libname):
     torch.ops.load_library(libs[0])
 
 
+# We want to compute the actual ops in the computation dtype, since the precision of the
+# quantized linear will initially be the dtype of the checkpoint.
+def set_8da4w_computation_dtype(
+    module: nn.Module, computation_dtype: torch.dtype
+) -> nn.Module:
+
+    from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
+
+    def _set_8da4w_computation_dtype(module: nn.Module, dtype: torch.dtype) -> None:
+        """
+        Recursively iterate through the module and set the precision attributes
+        of all Int8DynActInt4WeightLinears.
+        """
+        for _name, child in module.named_children():
+            if isinstance(child, Int8DynActInt4WeightLinear):
+                child.precision = dtype
+            else:
+                # Recursively apply to child modules
+                _set_8da4w_computation_dtype(child, dtype)
+
+    _set_8da4w_computation_dtype(module, computation_dtype)
+    return module
+
+
 ############################ Source Transform End #######################
diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py
index 023fc6800ff..e7138622ed9 100644
--- a/examples/models/llama/source_transformation/quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/quantized_kv_cache.py
@@ -205,9 +205,13 @@ def replace_kv_cache_with_quantized_kv_cache(module):
     # This is needed to ensure that custom ops are registered
     from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
 
-    logging.warning(
+    logging.info(
         "Replacing KVCache with QuantizedKVCache. This modifies the model in place."
     )
+    return _replace_kv_cache_with_quantized_kv_cache(module)
+
+
+def _replace_kv_cache_with_quantized_kv_cache(module):
     for name, child in module.named_children():
         if isinstance(child, KVCache) or isinstance(child, CustomKVCache):
             setattr(
@@ -220,7 +224,7 @@
                 ),
             )
         else:
-            replace_kv_cache_with_quantized_kv_cache(child)
+            _replace_kv_cache_with_quantized_kv_cache(child)
     return module
 
 
@@ -263,16 +267,20 @@ def update(
 
 
 def replace_kv_cache_with_custom_kv_cache(module):
-    r"""
+    """
     Replace KVCache with CustomKVCache. This modifies the model in place.
     At the moment custom kv cache only supports cache with
     shape [B, S, H, D] as opposed to [B, H, S, D]
     This is because the custom op treats second dim as sequence dim.
Future work: support [B, H, S, D] """ - logging.warning( + logging.info( "Replacing KVCache with CustomKVCache. This modifies the model in place." ) + return _replace_kv_cache_with_custom_kv_cache(module) + + +def _replace_kv_cache_with_custom_kv_cache(module): for name, child in module.named_children(): if isinstance(child, KVCache): cache_shape = child.k_cache.shape @@ -290,5 +298,5 @@ def replace_kv_cache_with_custom_kv_cache(module): ), ) else: - replace_kv_cache_with_custom_kv_cache(child) + _replace_kv_cache_with_custom_kv_cache(child) return module diff --git a/examples/models/llama/static_attention.py b/examples/models/llama/static_attention.py index 8b341a3aafd..a9dac59051a 100644 --- a/examples/models/llama/static_attention.py +++ b/examples/models/llama/static_attention.py @@ -47,19 +47,29 @@ def calculate_cache_key(layer_id: int, head_id: int) -> str: return f"l{layer_id},h{head_id}" @staticmethod - def apply_update(cache, update, transpose=False): + def apply_update(cache, update, pos, style, transpose=False): """ After inference, update the cache state for next iteration. The runtime needs to implement the same operation. """ - if transpose: - update_len = update.size(-1) - updated = torch.roll(cache, -update_len, -1) - updated[:, :, -update_len:] = update - else: - update_len = update.size(-2) - updated = torch.roll(cache, -update_len, -2) - updated[:, -update_len:, :] = update + if style == "shift_pointer": + if transpose: + update_len = update.size(-1) + updated = torch.roll(cache, -update_len, -1) + updated[:, :, -update_len:] = update + else: + update_len = update.size(-2) + updated = torch.roll(cache, -update_len, -2) + updated[:, -update_len:, :] = update + + if style == "smart_mask": + updated = torch.clone(cache) + if transpose: + update_len = update.size(-1) + updated[:, :, pos : pos + update_len] = update + else: + update_len = update.size(-2) + updated[:, pos : pos + update_len, :] = update return updated @@ -114,15 +124,68 @@ def update( return all_data, (out_k_cache, out_v_cache) -def _apply_rotary_embedding( - x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -) -> torch.Tensor: - x_r, x_i = x[..., ::2], x[..., 1::2] - x_out_r = x_r * freqs_cos - x_i * freqs_sin - x_out_i = x_r * freqs_sin + x_i * freqs_cos +class StaticAttentionMask: + def __init__(self, input_len, cache_len, style, mask_val=float("-inf")): + self.input_len = input_len + self.cache_len = cache_len + assert style in ("shift_pointer", "smart_mask") + self.style = style + self.mask_val = mask_val + self.unmasked_len = 0 + self.tensor = torch.zeros(1, input_len, input_len + cache_len) + self.reset() + + def reset(self): + self.unmasked_len = 0 + self.tensor[:, :, : self.cache_len] = self.mask_val + + def unmask(self, new_unmasked_len): + if new_unmasked_len <= 0: + return + + if self.style == "shift_pointer": + self.tensor[ + :, + :, + self.cache_len + - self.unmasked_len + - new_unmasked_len : self.cache_len + - self.unmasked_len, + ] = 0 + + if self.style == "smart_mask": + self.tensor[ + :, + :, + self.unmasked_len : self.unmasked_len + new_unmasked_len, + ] = 0 + + self.unmasked_len += new_unmasked_len + + +class _Rope(nn.Module): + def __init__(self, use_hf_rope): + super().__init__() + self.use_hf_rope = use_hf_rope - x_out = torch.cat([x_out_r, x_out_i], dim=-1) - return x_out + def forward( + self, x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor + ) -> torch.Tensor: + if self.use_hf_rope: + if len(freqs_cos.shape) == 2: + freqs_cos = freqs_cos.unsqueeze(0) 
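+            # HF-style cos/sin are precomputed as (seq_len, rotary_dim); add the
+            # leading batch dim that the elementwise formula below broadcasts over.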
+ if len(freqs_sin.shape) == 2: + freqs_sin = freqs_sin.unsqueeze(0) + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + x_rotated = torch.cat((-x2, x1), dim=-1) + return x * freqs_cos + x_rotated * freqs_sin + else: + x_r, x_i = x[..., ::2], x[..., 1::2] + x_out_r = x_r * freqs_cos - x_i * freqs_sin + x_out_i = x_r * freqs_sin + x_i * freqs_cos + x_out = torch.cat([x_out_r, x_out_i], dim=-1) + return x_out @register_attention("static") @@ -145,22 +208,26 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope): self.dim = config.dim self.head_dim = config.head_dim self.inv_scale = 1.0 / (float(self.head_dim) ** 0.5) + self.attention_qkv_bias = config.attention_qkv_bias + self.use_qk_norm = config.use_qk_norm + self.use_conv2d = False + assert not self.use_qk_norm, "QK norm not supported in static attention yet" self.wqs = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_heads) ] ) self.wks = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_kv_heads) ] ) self.wvs = nn.ModuleList( [ - nn.Linear(self.dim, self.head_dim, bias=False) + nn.Linear(self.dim, self.head_dim, bias=self.attention_qkv_bias) for _ in range(self.n_kv_heads) ] ) @@ -172,6 +239,7 @@ def __init__(self, config: ModelArgs, layer_id: int, rope: Rope): [StaticVCache(layer_id, i) for i in range(self.n_kv_heads)] ) self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False) + self.rope = _Rope(rope.params.use_hf_rope) def forward( self, @@ -188,12 +256,27 @@ def forward( in_cache_state = kwargs.get("in_cache_state") out_cache_state = kwargs.get("out_cache_state") + bsz, seq_len, dim = x.shape + if self.use_conv2d: + x = x.reshape(bsz, seq_len, 1, dim).transpose(1, 3) + new_qs = [self.wqs[i](x) for i in range(self.n_heads)] new_ks = [self.wks[i](x) for i in range(self.n_kv_heads)] new_vs = [self.wvs[i](x) for i in range(self.n_kv_heads)] - new_qs = [_apply_rotary_embedding(q, freqs_cos, freqs_sin) for q in new_qs] - new_ks = [_apply_rotary_embedding(k, freqs_cos, freqs_sin) for k in new_ks] + if self.use_conv2d: + + def from_conv2ds(ts): + return [ + t.reshape(bsz, self.head_dim, seq_len).transpose(1, 2) for t in ts + ] + + new_qs = from_conv2ds(new_qs) + new_ks = from_conv2ds(new_ks) + new_vs = from_conv2ds(new_vs) + + new_qs = [self.rope(q, freqs_cos, freqs_sin) for q in new_qs] + new_ks = [self.rope(k, freqs_cos, freqs_sin) for k in new_ks] all_ks = [] all_vs = [] for i in range(self.n_kv_heads): @@ -211,12 +294,19 @@ def forward( kv_idx = i // self.n_heads_per_kv_group attn = new_qs[i] @ all_ks[kv_idx].transpose(-2, -1) attn = attn * self.inv_scale - attn = attn + mask # pyre-ignore + attn = attn + mask attn = F.softmax(attn, dim=-1) heads.append(attn @ all_vs[kv_idx]) y = torch.cat(heads, dim=-1) - y = self.wo(y) + if self.use_conv2d: + y = ( + self.wo(y.reshape(bsz, seq_len, 1, -1).transpose(1, 3)) + .transpose(1, 3) + .reshape(bsz, seq_len, -1) + ) + else: + y = self.wo(y) return y, {"out_cache_state": out_cache_state} def load_weights_from_attention_mha(self, other: AttentionMHA): @@ -234,3 +324,44 @@ def load_weights_from_attention_mha(self, other: AttentionMHA): ) self.wo.weight.data.copy_(other.wo.weight) + + def linear_to_conv2d(self): + def transfer_weight(linear, conv2d): + conv2d.weight.data.copy_(linear.weight[:, :, None, None]) + return conv2d + + self.wqs = 
nn.ModuleList( + [ + transfer_weight( + linear, + nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias), + ) + for linear in self.wqs + ] + ) + self.wks = nn.ModuleList( + [ + transfer_weight( + linear, + nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias), + ) + for linear in self.wks + ] + ) + self.wvs = nn.ModuleList( + [ + transfer_weight( + linear, + nn.Conv2d(self.dim, self.head_dim, 1, bias=self.attention_qkv_bias), + ) + for linear in self.wvs + ] + ) + self.wo = transfer_weight( + self.wo, + nn.Conv2d( + self.n_heads * self.head_dim, self.dim, 1, bias=self.attention_qkv_bias + ), + ) + + self.use_conv2d = True diff --git a/examples/models/llama/tests/test_static_attention.py b/examples/models/llama/tests/test_static_attention.py index 401ba604cda..2f6f9639b55 100644 --- a/examples/models/llama/tests/test_static_attention.py +++ b/examples/models/llama/tests/test_static_attention.py @@ -7,6 +7,7 @@ from executorch.examples.models.llama.rope import Rope from executorch.examples.models.llama.static_attention import ( StaticAttention, + StaticAttentionMask, StaticKVCache, ) @@ -16,11 +17,46 @@ def setUp(self): torch.manual_seed(42) def test_without_cache(self): + def test(use_conv2d): + config = ModelArgs( + dim=64, + n_heads=4, + n_kv_heads=2, + max_seq_len=8, + ) + layer_id = 0 + rope = Rope(config) + attn_mha = AttentionMHA(config, layer_id, rope).eval() + static_attn = StaticAttention(config, layer_id, rope).eval() + static_attn.load_weights_from_attention_mha(attn_mha) + if use_conv2d: + static_attn.linear_to_conv2d() + + x = torch.rand(1, config.max_seq_len, config.dim) + freqs_cos, freqs_sin = rope.get_freqs(None, config.max_seq_len) + expected, _ = attn_mha(x, freqs_cos, freqs_sin) + mask = torch.triu( + torch.full((1, config.max_seq_len, config.max_seq_len), float("-inf")), + diagonal=1, + ) + y, _ = static_attn( + x, + freqs_cos, + freqs_sin, + mask=mask, + ) + self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + + test(True) + test(False) + + def test_hf_rope_without_cache(self): config = ModelArgs( dim=64, n_heads=4, n_kv_heads=2, max_seq_len=8, + use_hf_rope=True, ) layer_id = 0 rope = Rope(config) @@ -37,8 +73,8 @@ def test_without_cache(self): ) y, _ = static_attn( x, - freqs_cos, - freqs_sin, + freqs_cos.unsqueeze(0), + freqs_sin.unsqueeze(0), mask=mask, ) self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) @@ -63,48 +99,54 @@ def test_with_cache(self): n_chunks = 3 chunk_len = config.max_seq_len // n_chunks cache_len = config.max_seq_len - chunk_len - mask = torch.zeros(1, chunk_len, cache_len + chunk_len) - mask[:, :, :cache_len] = float("-inf") - mask[:, :, cache_len:] = torch.triu( - torch.full((1, chunk_len, chunk_len), float("-inf")), - diagonal=1, - ) - k_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for i in range(config.n_kv_heads) - } - v_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for i in range(config.n_kv_heads) - } - ys = [] - for i in range(n_chunks): - y_i, attn_update = static_attn( - x[:, i * chunk_len : (i + 1) * chunk_len, :], - freqs_cos[i * chunk_len : (i + 1) * chunk_len], - freqs_sin[i * chunk_len : (i + 1) * chunk_len], - mask=mask, - in_cache_state=(k_caches, v_caches), - out_cache_state=({}, {}), + + def test_with_style(style): + mask = StaticAttentionMask(chunk_len, cache_len, style=style) + mask.tensor[:, :, cache_len:] = torch.triu( + torch.full((1, chunk_len, 
chunk_len), float("-inf")), + diagonal=1, ) - ys.append(y_i) - mask[:, :, cache_len - chunk_len * (i + 1) : cache_len] = 0 - k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - for cache_id, update in k_cache_updates.items(): - k_caches[cache_id] = StaticKVCache.apply_update( - k_caches[cache_id], update + k_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim + ) + for i in range(config.n_kv_heads) + } + v_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) - for cache_id, update in v_cache_updates.items(): - v_caches[cache_id] = StaticKVCache.apply_update( - v_caches[cache_id], update + for i in range(config.n_kv_heads) + } + ys = [] + for i in range(n_chunks): + y_i, attn_update = static_attn( + x[:, i * chunk_len : (i + 1) * chunk_len, :], + freqs_cos[i * chunk_len : (i + 1) * chunk_len], + freqs_sin[i * chunk_len : (i + 1) * chunk_len], + mask=mask.tensor, + in_cache_state=(k_caches, v_caches), + out_cache_state=({}, {}), ) + ys.append(y_i) + mask.unmask(chunk_len) + k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - y = torch.cat(ys, dim=1) - self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + if i < n_chunks - 1: + for cache_id, update in k_cache_updates.items(): + k_caches[cache_id] = StaticKVCache.apply_update( + k_caches[cache_id], update, pos=chunk_len * i, style=style + ) + for cache_id, update in v_cache_updates.items(): + v_caches[cache_id] = StaticKVCache.apply_update( + v_caches[cache_id], update, pos=chunk_len * i, style=style + ) + + y = torch.cat(ys, dim=1) + self.assertTrue(torch.isclose(y, expected, rtol=1e-3).all()) + + test_with_style("shift_pointer") + test_with_style("smart_mask") def test_within_transformer(self): config = ModelArgs( @@ -133,48 +175,57 @@ def test_within_transformer(self): n_chunks = 3 chunk_len = config.max_seq_len // n_chunks cache_len = config.max_seq_len - chunk_len - mask = torch.zeros(1, chunk_len, cache_len + chunk_len) - mask[:, :, :cache_len] = float("-inf") - mask[:, :, cache_len:] = torch.triu( - torch.full((1, chunk_len, chunk_len), float("-inf")), - diagonal=1, - ) - k_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for layer_id in range(config.n_layers) - for i in range(config.n_kv_heads) - } - v_caches = { - StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( - 1, cache_len, config.head_dim - ) - for layer_id in range(config.n_layers) - for i in range(config.n_kv_heads) - } - ys = [] - for i in range(n_chunks): - y_i, attn_update = static_transformer( - x[:, i * chunk_len : (i + 1) * chunk_len], - attn_options=ForwardOptions( - mask=mask, - freqs_cos_override=freqs_cos[i * chunk_len : (i + 1) * chunk_len], - freqs_sin_override=freqs_sin[i * chunk_len : (i + 1) * chunk_len], - in_cache_state=(k_caches, v_caches), - out_cache_state=({}, {}), - ), + + def test_with_style(style): + mask = StaticAttentionMask(chunk_len, cache_len, style=style) + mask.tensor[:, :, cache_len:] = torch.triu( + torch.full((1, chunk_len, chunk_len), float("-inf")), + diagonal=1, ) - ys.append(y_i) - mask[:, :, cache_len - chunk_len * (i + 1) : cache_len] = 0 - k_cache_updates, v_cache_updates = attn_update["out_cache_state"] - for cache_id, update in k_cache_updates.items(): - k_caches[cache_id] = StaticKVCache.apply_update( - k_caches[cache_id], update + k_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, 
cache_len, config.head_dim + ) + for layer_id in range(config.n_layers) + for i in range(config.n_kv_heads) + } + v_caches = { + StaticKVCache.calculate_cache_key(layer_id, i): torch.zeros( + 1, cache_len, config.head_dim ) - for cache_id, update in v_cache_updates.items(): - v_caches[cache_id] = StaticKVCache.apply_update( - v_caches[cache_id], update + for layer_id in range(config.n_layers) + for i in range(config.n_kv_heads) + } + ys = [] + for i in range(n_chunks): + y_i, attn_update = static_transformer( + x[:, i * chunk_len : (i + 1) * chunk_len], + attn_options=ForwardOptions( + mask=mask.tensor, + freqs_cos_override=freqs_cos[ + i * chunk_len : (i + 1) * chunk_len + ], + freqs_sin_override=freqs_sin[ + i * chunk_len : (i + 1) * chunk_len + ], + in_cache_state=(k_caches, v_caches), + out_cache_state=({}, {}), + ), ) + ys.append(y_i) + mask.unmask(chunk_len) + k_cache_updates, v_cache_updates = attn_update["out_cache_state"] + if i < n_chunks - 1: + for cache_id, update in k_cache_updates.items(): + k_caches[cache_id] = StaticKVCache.apply_update( + k_caches[cache_id], update, pos=chunk_len * i, style=style + ) + for cache_id, update in v_cache_updates.items(): + v_caches[cache_id] = StaticKVCache.apply_update( + v_caches[cache_id], update, pos=chunk_len * i, style=style + ) + + self.assertTrue(torch.isclose(ys[-1], expected, rtol=1e-3).all()) - self.assertTrue(torch.isclose(ys[-1], expected, rtol=1e-3).all()) + test_with_style("shift_pointer") + test_with_style("smart_mask") diff --git a/examples/models/llama/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp index 74eacc1b5f0..f595de3c4e7 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -10,7 +10,7 @@ namespace example { -using ::executorch::extension::llm::Tiktoken; +using ::tokenizers::Tiktoken; namespace { static constexpr int32_t kSpecialTokensSize = 256; @@ -42,8 +42,23 @@ _get_default_special_tokens() { return special_tokens; } -static inline std::unique_ptr> -_get_multimodal_special_tokens() { +std::unique_ptr> _get_special_tokens(Version version) { + switch (version) { + case Version::Multimodal: + return get_multimodal_special_tokens(); + default: + return _get_default_special_tokens(); + } +} + +} // namespace + +std::unique_ptr get_tiktoken_for_llama(Version version) { + return std::make_unique( + _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); +} + +std::unique_ptr> get_multimodal_special_tokens() { auto special_tokens = std::make_unique>(std::vector{ "<|begin_of_text|>", @@ -72,20 +87,4 @@ _get_multimodal_special_tokens() { return special_tokens; } -std::unique_ptr> _get_special_tokens(Version version) { - switch (version) { - case Version::Multimodal: - return _get_multimodal_special_tokens(); - default: - return _get_default_special_tokens(); - } -} - -} // namespace - -std::unique_ptr get_tiktoken_for_llama(Version version) { - return std::make_unique( - _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); -} - } // namespace example diff --git a/examples/models/llama/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h index 6baa3f49cc6..a7f65eca29e 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.h +++ b/examples/models/llama/tokenizer/llama_tiktoken.h @@ -8,7 +8,7 @@ #pragma once -#include +#include namespace example { @@ -17,7 +17,9 @@ enum class Version { Multimodal, }; -std::unique_ptr<::executorch::extension::llm::Tiktoken> 
get_tiktoken_for_llama( +std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama( Version version = Version::Default); +std::unique_ptr> get_multimodal_special_tokens(); + } // namespace example diff --git a/examples/models/llama/tokenizer/targets.bzl b/examples/models/llama/tokenizer/targets.bzl index 40f8f29ac1e..704ebfeecb6 100644 --- a/examples/models/llama/tokenizer/targets.bzl +++ b/examples/models/llama/tokenizer/targets.bzl @@ -15,7 +15,8 @@ def define_common_targets(): "llama_tiktoken.h", ], exported_deps = [ - "//executorch/extension/llm/tokenizer:tiktoken", + "//pytorch/tokenizers:tiktoken", + "//executorch/extension/llm/tokenizer:tiktoken", # TODO: remove ], visibility = [ "@EXECUTORCH_CLIENTS", diff --git a/examples/models/llama/tokenizer/test/CMakeLists.txt b/examples/models/llama/tokenizer/test/CMakeLists.txt index 3ed57406b21..35aebf8d2a4 100644 --- a/examples/models/llama/tokenizer/test/CMakeLists.txt +++ b/examples/models/llama/tokenizer/test/CMakeLists.txt @@ -15,11 +15,11 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_tokenizer_test_srcs test_tiktoken.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/src/tiktoken.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp ) @@ -29,11 +29,11 @@ set(ABSL_PROPAGATE_CXX_STD ON) set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp ) add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2 + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2 ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) @@ -43,5 +43,6 @@ target_include_directories( tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/include + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp ) diff --git a/examples/models/llama/tokenizer/test/test_tiktoken.cpp b/examples/models/llama/tokenizer/test/test_tiktoken.cpp index 442da621748..cbb54f2cb3b 100644 --- a/examples/models/llama/tokenizer/test/test_tiktoken.cpp +++ b/examples/models/llama/tokenizer/test/test_tiktoken.cpp @@ -10,7 +10,7 @@ #include -#include +#include #include @@ -36,8 +36,8 @@ static std::string get_resource_path(const std::string& name) { class MultimodalTiktokenV5ExtensionTest : public Test { public: void SetUp() override { - executorch::runtime::runtime_init(); - tokenizer_ = get_tiktoken_for_llama(Version::Multimodal); + tokenizer_ = std::make_unique( + example::get_multimodal_special_tokens(), 0, 1); modelPath_ = get_resource_path("test_tiktoken_tokenizer.model"); } diff --git a/examples/models/llama3_2_vision/install_requirements.sh b/examples/models/llama3_2_vision/install_requirements.sh index 4d4a6f28624..7dad02caad7 100755 --- a/examples/models/llama3_2_vision/install_requirements.sh +++ b/examples/models/llama3_2_vision/install_requirements.sh @@ -5,7 +5,9 @@ # This source 
code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -NIGHTLY_VERSION="dev20250115" +set +ex + +NIGHTLY_VERSION="dev20250311" # Install torchtune nightly for model definitions. pip install --pre torchtune==0.6.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index f22b4471538..6003f3a000d 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -46,7 +46,7 @@ endif() set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -79,7 +79,7 @@ if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE) add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1) message("Buidling the runner without Torch, feeding a dummy image!") else() - find_package(Torch CONFIG REQUIRED) + find_package_torch() endif() # @@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llava_runner library add_subdirectory(runner) @@ -132,14 +124,12 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index 82c7aca09e0..64def112908 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -67,7 +67,6 @@ def export(self) -> "LlavaEdgeManager": dynamic_shapes=dynamic_shape, strict=False, ) - # pyre-ignore: Incompatible attribute type [8]: Attribute `pre_autograd_graph_module` declared in class `LLMEdgeManager` has type `Optional[GraphModule]` but is used as type `Module`. 
self.pre_autograd_graph_module = self.export_program.module() return self @@ -99,9 +98,19 @@ def forward(self, input_pos, embeddings): dtype_override = DType.fp32 parser = build_args_parser() args = parser.parse_args( - ["-X", "-qmode", "8da4w", "--group_size", "128", "--embedding-quantize", "4,32"] + [ + "-p", + "params.json", + "-X", + "-qmode", + "8da4w", + "--group_size", + "128", + "--embedding-quantize", + "4,32", + ] ) - quant_transform = get_quant_weight_transform(args, dtype_override, False) + quant_transform = get_quant_weight_transform(args, dtype_override) _, quantizers, _ = get_quantizer_and_quant_params(args) source_transforms = [] if llava.use_sdpa_with_kv_cache_op: diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 2d0c30a620e..c694bf87c66 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -20,8 +20,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) @@ -29,7 +29,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(_llava_runner__srcs "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp" "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp" - "${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp" + "${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp" ) # extension llm runner lib @@ -47,5 +47,6 @@ set(llava_runner_deps executorch extension_data_loader extension_llm_runner target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) target_include_directories( - llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT} + llava_runner INTERFACE ${_common_include_directories} + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) diff --git a/examples/models/llava/runner/llava_runner.cpp b/examples/models/llava/runner/llava_runner.cpp index b3c0cce5c33..d368f8fb1a4 100644 --- a/examples/models/llava/runner/llava_runner.cpp +++ b/examples/models/llava/runner/llava_runner.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -43,7 +43,7 @@ Error LlavaRunner::load() { stats_.model_load_start_ms = llm::time_in_ms(); // Load the tokenizer - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); // Load the text decoder runner @@ -90,7 +90,7 @@ Result LlavaRunner::prefill_prompt( int8_t bos, int8_t eos) { std::vector prompt_tokens = - ET_UNWRAP(tokenizer_->encode(prompt, bos, eos)); + ET_UNWRAP_TOKENIZER(tokenizer_->encode(prompt, bos, eos)); return text_prefiller_->prefill(prompt_tokens, start_pos); } diff --git a/examples/models/llava/runner/targets.bzl b/examples/models/llava/runner/targets.bzl index 63fcc9d3b33..074c92b35e3 100644 --- a/examples/models/llava/runner/targets.bzl +++ b/examples/models/llava/runner/targets.bzl @@ -14,7 +14,6 @@ def define_common_targets(): exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", "//executorch/extension/llm/runner:runner_lib", - "//executorch/extension/llm/tokenizer:bpe_tokenizer", "//executorch/extension/evalue_util:print_evalue", "//executorch/extension/module:module", "//executorch/extension/tensor:tensor", @@ -23,5 +22,6 @@ def 
define_common_targets(): "//executorch/runtime/core/exec_aten/util:tensor_util", "//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops", + "//pytorch/tokenizers:llama2c_tokenizer", ], ) diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl index 5efb099f06f..6f3a370acf4 100644 --- a/examples/models/llava/targets.bzl +++ b/examples/models/llava/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): "main.cpp", ], compiler_flags = ["-Wno-global-constructors"], - preprocessor_flags = [ - "-DET_USE_THREADPOOL", - ], deps = [ "//executorch/examples/models/llava/runner:runner", "//executorch/extension/evalue_util:print_evalue", diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh new file mode 100755 index 00000000000..ef915ca7eb2 --- /dev/null +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -x + +pip install -U moshi +pip install bitsandbytes soundfile +# Run llama2/install requirements for torchao deps +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +bash "$SCRIPT_DIR"/../../llama/install_requirements.sh diff --git a/examples/models/moshi/mimi/test_mimi.py b/examples/models/moshi/mimi/test_mimi.py new file mode 100644 index 00000000000..8160b5df79c --- /dev/null +++ b/examples/models/moshi/mimi/test_mimi.py @@ -0,0 +1,224 @@ +import io +import os +import random +import unittest + +import numpy as np +import requests +import torch +import torch.nn as nn +import torchaudio +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) +from executorch.devtools.backend_debug import print_delegation_info +from executorch.exir import to_edge_transform_and_lower +from executorch.runtime import Runtime + +from huggingface_hub import hf_hub_download +from moshi.models import loaders +from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +from torch.export import export, ExportedProgram +from torch.utils._pytree import tree_flatten + +proxies = { + "http": "http://fwdproxy:8080", + "https": "http://fwdproxy:8080", +} + + +def compute_sqnr(x: torch.Tensor, y: torch.Tensor) -> float: + assert x.shape == y.shape, "Tensor shapes do not match" + x = x.float() + y = y.float() + error = x - y + original_power = torch.mean(torch.pow(x, 2)) + error_power = torch.mean(torch.pow(error, 2)) + sqnr = 10 * torch.log10(original_power / error_power) + return sqnr.item() + + +def read_mp3_from_url(url): + try: + response = requests.get(url) + except: + # FB-only hack, need to use a forwarding proxy to get url + response = requests.get(url, proxies=proxies) + + response.raise_for_status() # Ensure request is successful + audio_stream = io.BytesIO(response.content) + waveform, sample_rate = torchaudio.load(audio_stream, format="mp3") + return waveform.numpy(), sample_rate + + +class TestMimiModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Setup once for all tests: Load model and prepare test data.""" + + # Get environment variables (if set), otherwise use default values + mimi_weight = 
os.getenv("MIMI_WEIGHT", None) + hf_repo = os.getenv("HF_REPO", loaders.DEFAULT_REPO) + device = "cuda" if torch.cuda.device_count() else "cpu" + + def seed_all(seed): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + random.seed(seed) + np.random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + seed_all(42424242) + + if mimi_weight is None: + try: + mimi_weight = hf_hub_download(hf_repo, loaders.MIMI_NAME) + except: + mimi_weight = hf_hub_download( + hf_repo, loaders.MIMI_NAME, proxies=proxies + ) + + cls.mimi = loaders.get_mimi(mimi_weight, device) + cls.device = device + cls.sample_pcm, cls.sample_sr = read_mp3_from_url( + "https://huggingface.co/lmz/moshi-swift/resolve/main/bria-24khz.mp3" + ) + + def test_mp3_loading(self): + """Ensure MP3 file loads correctly.""" + self.assertIsInstance(self.sample_pcm, np.ndarray) + self.assertGreater(self.sample_sr, 0) + + def test_encoding(self): + """Ensure encoding produces expected tensor shape.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_pcm = torch.tensor(self.sample_pcm, device=self.device) + sample_pcm = sample_pcm[None] + chunk = sample_pcm[..., 0:pcm_chunk_size] + encoded = self.mimi.encode(chunk) + self.assertIsInstance(encoded, torch.Tensor) + self.assertGreater(encoded.shape[-1], 0) + + def test_decoding(self): + """Ensure decoding produces expected output.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_pcm = torch.tensor(self.sample_pcm, device=self.device)[None] + chunk = sample_pcm[..., 0:pcm_chunk_size] + encoded = self.mimi.encode(chunk) + decoded = self.mimi.decode(encoded) + self.assertIsInstance(decoded, torch.Tensor) + + def test_streaming_encoding_decoding(self): + """Test streaming encoding and decoding consistency.""" + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + sample_rate = self.mimi.sample_rate + max_duration_sec = 10.0 + max_duration_len = int(sample_rate * max_duration_sec) + + sample_pcm = torch.tensor(self.sample_pcm, device=self.device) + if sample_pcm.shape[-1] > max_duration_len: + sample_pcm = sample_pcm[..., :max_duration_len] + sample_pcm = sample_pcm[None].to(device=self.device) + + all_codes = [] + for start_idx in range(0, sample_pcm.shape[-1], pcm_chunk_size): + end_idx = min(sample_pcm.shape[-1], start_idx + pcm_chunk_size) + chunk = sample_pcm[..., start_idx:end_idx] + codes = self.mimi.encode(chunk) + if codes.shape[-1]: + all_codes.append(codes) + + all_codes_th = torch.cat(all_codes, dim=-1) + + all_pcms = [] + with self.mimi.streaming(1): + for i in range(all_codes_th.shape[-1]): + codes = all_codes_th[..., i : i + 1] + pcm = self.mimi.decode(codes) + all_pcms.append(pcm) + all_pcms = torch.cat(all_pcms, dim=-1) + + pcm_ref = self.mimi.decode(all_codes_th) + self.assertTrue(torch.allclose(pcm_ref, all_pcms, atol=1e-5)) + + def test_exported_encoding(self): + """Ensure exported encoding model is consistent with reference output.""" + + class MimiEncode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.encode(x) + + mimi_encode = MimiEncode(self.mimi) + chunk = torch.tensor(self.sample_pcm, device=self.device)[None][ + ..., 0 : int(self.mimi.sample_rate / self.mimi.frame_rate) + ] + ref_encode_output = mimi_encode(chunk) + exported_encode = export(mimi_encode, (chunk,), strict=False) + 
ep_encode_output = exported_encode.module()(chunk) + self.assertTrue(torch.allclose(ep_encode_output, ref_encode_output, atol=1e-6)) + + def test_exported_decoder_xnnpack(self): + class MimiDecode(nn.Module): + def __init__(self, mimi: nn.Module): + super().__init__() + self.mimi_model = mimi + + def forward(self, x): + return self.mimi_model.decode(x) + + sample_pcm = torch.tensor(self.sample_pcm, device=self.device)[None] + pcm_chunk_size = int(self.mimi.sample_rate / self.mimi.frame_rate) + chunk = sample_pcm[..., 0:pcm_chunk_size] + input = self.mimi.encode(chunk) + + mimi_decode = MimiDecode(self.mimi) + exported_decode: ExportedProgram = export(mimi_decode, (input,), strict=False) + quantization_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + ) + quantizer = XNNPACKQuantizer() + quantizer.set_global(quantization_config) + m = exported_decode.module() + m = prepare_pt2e(m, quantizer) + m(input) + m = convert_pt2e(m) + print("quantized graph:") + print(m.graph) + # Export quantized module + exported_decode: ExportedProgram = export(m, (input,), strict=False) + # Lower + edge_manager = to_edge_transform_and_lower( + exported_decode, + partitioner=[XnnpackPartitioner()], + ) + print("delegate graph:") + print_delegation_info(edge_manager.exported_program().graph_module) + exec_prog = edge_manager.to_executorch() + output_file = "/tmp/mimi_decode.pte" + with open(output_file, "wb") as file: + exec_prog.write_to_file(file) + + eager_res = mimi_decode(input) + runtime = Runtime.get() + program = runtime.load_program(output_file) + method = program.load_method("forward") + flattened_x = tree_flatten(input)[0] + res = method.execute(flattened_x) + # Compare results + sqnr = compute_sqnr(eager_res, res[0]) + print(f"SQNR: {sqnr}") + torch.testing.assert_close(eager_res, res[0], atol=1e-3, rtol=1e-3) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt index e1ffd0da055..5e9cad0d3de 100644 --- a/examples/models/phi-3-mini/CMakeLists.txt +++ b/examples/models/phi-3-mini/CMakeLists.txt @@ -41,11 +41,12 @@ add_executable( phi_3_mini_runner main.cpp runner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizer/bpe_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp ) target_include_directories( phi_3_mini_runner PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src + ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include ) target_link_libraries( phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp index 1163a35d66b..15f76e9522c 100644 --- a/examples/models/phi-3-mini/runner.cpp +++ b/examples/models/phi-3-mini/runner.cpp @@ -11,15 +11,15 @@ #include #include -#include #include #include +#include using executorch::aten::ScalarType; using executorch::extension::Module; -using executorch::extension::llm::BPETokenizer; using executorch::extension::llm::Sampler; using executorch::runtime::Error; +using tokenizers::Llama2cTokenizer; namespace example { @@ -32,14 +32,14 @@ Runner::Runner( const std::string& tokenizer_path, const float temperature) : module_(std::make_unique(model_path, Module::LoadMode::File)), - tokenizer_(std::make_unique()), + tokenizer_(std::make_unique()), 
  sampler_(std::make_unique(
          VOCABULARY_SIZE,
          temperature,
          SAMPLER_TOP,
          static_cast(std::time(nullptr)))) {
  ET_CHECK_MSG(
-      tokenizer_->load(tokenizer_path) == Error::Ok,
+      tokenizer_->load(tokenizer_path) == tokenizers::Error::Ok,
      "Failed to load tokenizer at %s",
      tokenizer_path.c_str());
  ET_LOG(
@@ -52,7 +52,9 @@ Runner::Runner(
 void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
  auto encode_res = tokenizer_->encode(prompt, 0, 0);
  ET_CHECK_MSG(
-      encode_res.error() == Error::Ok, "Failed to encode %s", prompt.c_str());
+      encode_res.error() == tokenizers::Error::Ok,
+      "Failed to encode %s",
+      prompt.c_str());
  auto input_tokens = encode_res.get();
  auto prev_token = input_tokens.back();
  auto current_token = prefill(input_tokens);
diff --git a/examples/models/phi-3-mini/runner.h b/examples/models/phi-3-mini/runner.h
index 2048acdab27..2f0042a57ea 100644
--- a/examples/models/phi-3-mini/runner.h
+++ b/examples/models/phi-3-mini/runner.h
@@ -15,9 +15,9 @@
 #include
 #include
-#include
 #include
 #include
+#include

 namespace example {

@@ -43,7 +43,7 @@ class Runner {
  uint64_t run_model_step(uint64_t token);

  std::unique_ptr module_;
-  std::unique_ptr tokenizer_;
+  std::unique_ptr tokenizer_;
  std::unique_ptr sampler_;
 };
diff --git a/examples/models/phi-4-mini/README.md b/examples/models/phi-4-mini/README.md
new file mode 100644
index 00000000000..a23e4f49638
--- /dev/null
+++ b/examples/models/phi-4-mini/README.md
@@ -0,0 +1,63 @@
+## Summary
+Phi-4-mini Instruct (3.8B) is a newly released version of the popular Phi-4 model developed by Microsoft.
+
+## Instructions
+
+Phi-4-mini uses the same example code as Llama, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details.
+
+All commands for exporting and running Llama on various backends should also be applicable to Phi-4-mini, by swapping the following args:
+```
+--model phi_4_mini
+--params examples/models/phi-4-mini/config.json
+--checkpoint 
+```
+
+### Generate the Checkpoint
+The original checkpoint can be obtained from HuggingFace:
+```
+huggingface-cli download microsoft/Phi-4-mini-instruct
+```
+
+We then convert it to Meta's checkpoint format:
+```
+python examples/models/phi-4-mini/convert_weights.py 
+```
+
+### Example export and run
+Here is a basic example of exporting and running Phi-4-mini; please refer to the [Llama README page](../llama/README.md) for more advanced usage.
+
+Export to XNNPack, no quantization:
+```
+# No quantization
+# Set these paths to point to the downloaded files
+PHI_CHECKPOINT=path/to/checkpoint.pth
+
+python -m examples.models.llama.export_llama \
+  --model phi_4_mini \
+  --checkpoint "${PHI_CHECKPOINT:?}" \
+  --params examples/models/phi-4-mini/config.json \
+  -kv \
+  --use_sdpa_with_kv_cache \
+  -d fp32 \
+  -X \
+  --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+  --output_name="phi-4-mini.pte" \
+  --verbose
+```
+
+Run using the executor runner:
+```
+# Currently a work in progress, just need to enable HuggingFace json tokenizer in C++.
+# In the meantime, can run with an example Python runner with pybindings:
+
+python -m examples.models.llama.runner.native \
+  --model phi_4_mini \
+  --pte  \
+  -kv \
+  --tokenizer /tokenizer.json \
+  --tokenizer_config /tokenizer_config.json \
+  --prompt "What is in a California roll?" \
+  --params examples/models/phi-4-mini/config.json \
+  --max_len 64 \
+  --temperature 0
+```
diff --git a/examples/models/phi-4-mini/__init__.py b/examples/models/phi-4-mini/__init__.py
new file mode 100644
index 00000000000..056f2c26314
--- /dev/null
+++ b/examples/models/phi-4-mini/__init__.py
@@ -0,0 +1,14 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class Phi4MiniModel(Llama2Model):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+__all__ = [
+    "Phi4MiniModel",
+]
diff --git a/examples/models/phi-4-mini/config.json b/examples/models/phi-4-mini/config.json
new file mode 100644
index 00000000000..edce93e59fa
--- /dev/null
+++ b/examples/models/phi-4-mini/config.json
@@ -0,0 +1,15 @@
+{
+  "dim": 3072,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 8192,
+  "n_heads": 24,
+  "n_kv_heads": 8,
+  "n_layers": 32,
+  "norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 200064,
+  "use_hf_rope": true,
+  "partial_rotary_factor": 0.75,
+  "attention_qkv_bias": false
+}
diff --git a/examples/models/phi-4-mini/convert_weights.py b/examples/models/phi-4-mini/convert_weights.py
new file mode 100644
index 00000000000..b75f359d772
--- /dev/null
+++ b/examples/models/phi-4-mini/convert_weights.py
@@ -0,0 +1,88 @@
+import argparse
+from typing import Dict
+
+import torch
+
+from torchtune.models.convert_weights import get_mapped_key
+
+from torchtune.training import FullModelHFCheckpointer
+
+
+# Standard _FROM_META weight mapping of Meta weights to TorchTune.
+_PHI_4_FROM_META = {
+    "tok_embeddings.weight": "tok_embeddings.weight",
+    "norm.weight": "norm.scale",
+    "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight",
+    "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight",
+    "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight",
+    "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight",
+    "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale",
+    "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale",
+    "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight",
+    "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight",
+    "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight",
+}
+
+
+def phi_4_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from torchtune's format to Meta's format. This function
+    doesn't handle any sharding or splitting of state dicts. It follows the
+    state_dict IN -> state_dict OUT pattern.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _PHI_4_FROM_META.items()}
+
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, inverted_mapping_dict)
+        converted_state_dict[new_key] = value
+
+    # Input and output embeddings are tied.
+    converted_state_dict["output.weight"] = converted_state_dict[
+        "tok_embeddings.weight"
+    ]
+
+    return converted_state_dict
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Phi-4-mini weights to Meta format."
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Path to directory containing checkpoint files",
+    )
+    parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+    args = parser.parse_args()
+
+    checkpointer = FullModelHFCheckpointer(
+        checkpoint_dir=args.input_dir,
+        checkpoint_files=[
+            "model-00001-of-00002.safetensors",
+            "model-00002-of-00002.safetensors",
+        ],
+        output_dir=".",
+        model_type="PHI4",
+    )
+
+    print("Loading checkpoint...")
+    sd = checkpointer.load_checkpoint()
+
+    print("Converting checkpoint...")
+    sd = phi_4_tune_to_meta(sd["model"])
+
+    torch.save(sd, args.output)
+    print(f"Checkpoint saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/qwen2_5/1_5b_config.json b/examples/models/qwen2_5/1_5b_config.json
new file mode 100644
index 00000000000..64daca5a7cd
--- /dev/null
+++ b/examples/models/qwen2_5/1_5b_config.json
@@ -0,0 +1,14 @@
+{
+  "dim": 1536,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 8960,
+  "n_heads": 12,
+  "n_kv_heads": 2,
+  "n_layers": 28,
+  "norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 151936,
+  "use_hf_rope": true,
+  "attention_qkv_bias": true
+}
diff --git a/examples/models/qwen2_5/README.md b/examples/models/qwen2_5/README.md
new file mode 100644
index 00000000000..9bf791a35ed
--- /dev/null
+++ b/examples/models/qwen2_5/README.md
@@ -0,0 +1,63 @@
+## Summary
+Qwen 2.5 is the latest iteration of the Qwen series of large language models (LLMs) developed by Alibaba. At the moment, only the 1.5b version is supported, with plans to add the 0.5b and 3b versions in the future.
+
+## Instructions
+
+Qwen 2.5 uses the same example code as Llama, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details.
+
+All commands for exporting and running Llama on various backends should also be applicable to Qwen 2.5, by swapping the following args:
+```
+--model qwen2_5
+--params examples/models/qwen2_5/1_5b_config.json
+--checkpoint 
+```
+
+### Generate the Checkpoint
+The original checkpoint can be obtained from HuggingFace:
+```
+huggingface-cli download Qwen/Qwen2.5-1.5B
+```
+
+We then convert it to Meta's checkpoint format:
+```
+python examples/models/qwen2_5/convert_weights.py 
+```
+
+### Example export and run
+Here is a basic example of exporting and running Qwen 2.5; please refer to the [Llama README page](../llama/README.md) for more advanced usage.
+
+Export to XNNPack, no quantization:
+```
+# No quantization
+# Set these paths to point to the downloaded files
+QWEN_CHECKPOINT=path/to/checkpoint.pth
+
+python -m examples.models.llama.export_llama \
+  --model "qwen2_5" \
+  --checkpoint "${QWEN_CHECKPOINT:?}" \
+  --params examples/models/qwen2_5/1_5b_config.json \
+  -kv \
+  --use_sdpa_with_kv_cache \
+  -d fp32 \
+  -X \
+  --metadata '{"get_bos_id":151643, "get_eos_ids":[151643]}' \
+  --output_name="qwen2_5-1_5b.pte" \
+  --verbose
+```
+
+Run using the executor runner:
+```
+# Currently a work in progress, just need to enable HuggingFace json tokenizer in C++.
+# In the meantime, can run with an example Python runner with pybindings:
+
+python -m examples.models.llama.runner.native \
+  --model qwen2_5 \
+  --pte  \
+  -kv \
+  --tokenizer /tokenizer.json \
+  --tokenizer_config /tokenizer_config.json \
+  --prompt "Who is the founder of Meta?" \
+  --params examples/models/qwen2_5/1_5b_config.json \
+  --max_len 64 \
+  --temperature 0
+```
diff --git a/examples/models/qwen2_5/__init__.py b/examples/models/qwen2_5/__init__.py
new file mode 100644
index 00000000000..03fc5aa2b30
--- /dev/null
+++ b/examples/models/qwen2_5/__init__.py
@@ -0,0 +1,14 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class Qwen2_5Model(Llama2Model):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+__all__ = [
+    "Qwen2_5Model",
+]
diff --git a/examples/models/qwen2_5/convert_weights.py b/examples/models/qwen2_5/convert_weights.py
new file mode 100644
index 00000000000..9aada5b3e90
--- /dev/null
+++ b/examples/models/qwen2_5/convert_weights.py
@@ -0,0 +1,88 @@
+import argparse
+from typing import Dict
+
+import torch
+
+from torchtune.models.convert_weights import get_mapped_key
+
+from torchtune.training import FullModelHFCheckpointer
+
+# Standard _FROM_META weight mapping of Meta weights to TorchTune + additional bias weight mappings.
+_QWEN_2_FROM_META = {
+    "tok_embeddings.weight": "tok_embeddings.weight",
+    "norm.weight": "norm.scale",
+    "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight",
+    "layers.{}.attention.wk.bias": "layers.{}.attn.k_proj.bias",
+    "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight",
+    "layers.{}.attention.wq.bias": "layers.{}.attn.q_proj.bias",
+    "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight",
+    "layers.{}.attention.wv.bias": "layers.{}.attn.v_proj.bias",
+    "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight",
+    "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale",
+    "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale",
+    "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight",
+    "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight",
+    "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight",
+}
+
+
+def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from torchtune's format to Meta's format. This function
+    doesn't handle any sharding or splitting of state dicts. It follows the
+    state_dict IN -> state_dict OUT pattern.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _QWEN_2_FROM_META.items()}
+
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, inverted_mapping_dict)
+        converted_state_dict[new_key] = value
+
+    # 0.5b and 1.5b models share the same weights for tok_embeddings and output embeddings, see https://github.com/QwenLM/Qwen2.5/issues/733.
+    converted_state_dict["output.weight"] = converted_state_dict[
+        "tok_embeddings.weight"
+    ]
+
+    return converted_state_dict
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Qwen2 weights to Meta format."
+    )
+    parser.add_argument(
+        "input_dir",
+        type=str,
+        help="Path to directory containing checkpoint files",
+    )
+    parser.add_argument("output", type=str, help="Path to the output checkpoint")
+
+    args = parser.parse_args()
+
+    # We don't strictly need the TorchTune checkpointer here; the checkpoint files could be aggregated manually instead.
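+    # (For instance, each *.safetensors shard could be loaded with
+    # safetensors.torch.load_file and the resulting dicts merged; the
+    # checkpointer is used here purely for convenience.)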
+    checkpointer = FullModelHFCheckpointer(
+        checkpoint_dir=args.input_dir,
+        checkpoint_files=["model.safetensors"],
+        output_dir=".",
+        model_type="QWEN2",
+    )
+
+    print("Loading checkpoint...")
+    sd = checkpointer.load_checkpoint()
+
+    print("Converting checkpoint...")
+    sd = qwen_2_tune_to_meta(sd["model"])
+
+    torch.save(sd, args.output)
+    print(f"Checkpoint saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/smollm2/135M_config.json b/examples/models/smollm2/135M_config.json
new file mode 100644
index 00000000000..604c7e94ab5
--- /dev/null
+++ b/examples/models/smollm2/135M_config.json
@@ -0,0 +1,14 @@
+{
+  "dim": 576,
+  "ffn_dim_multiplier": 1,
+  "hidden_dim": 1536,
+  "n_heads": 9,
+  "n_kv_heads": 3,
+  "n_layers": 30,
+  "norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "use_scaled_rope": false,
+  "vocab_size": 49152,
+  "use_hf_rope": false,
+  "attention_qkv_bias": false
+}
diff --git a/examples/models/smollm2/__init__.py b/examples/models/smollm2/__init__.py
new file mode 100644
index 00000000000..3d01bf9eb42
--- /dev/null
+++ b/examples/models/smollm2/__init__.py
@@ -0,0 +1,14 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.examples.models.llama.model import Llama2Model
+
+
+class SmolLM2Model(Llama2Model):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+__all__ = [
+    "SmolLM2Model",
+]
diff --git a/examples/models/smollm2/convert_weights.py b/examples/models/smollm2/convert_weights.py
new file mode 100644
index 00000000000..db80bd47b8c
--- /dev/null
+++ b/examples/models/smollm2/convert_weights.py
@@ -0,0 +1,80 @@
+import argparse
+from typing import Dict
+
+import torch
+
+from torchtune.models.convert_weights import get_mapped_key
+
+from torchtune.training import FullModelHFCheckpointer
+
+# Standard _FROM_META weight mapping of Meta weights to TorchTune.
+_SMOLLM_FROM_META = {
+    "tok_embeddings.weight": "tok_embeddings.weight",
+    "norm.weight": "norm.scale",
+    "output.weight": "output.weight",
+    "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight",
+    "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight",
+    "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight",
+    "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight",
+    "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale",
+    "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale",
+    "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight",
+    "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight",
+    "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight",
+}
+
+
+def smollm_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from torchtune's format to Meta's format. This function
+    doesn't handle any sharding or splitting of state dicts. It follows the
+    state_dict IN -> state_dict OUT pattern.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _SMOLLM_FROM_META.items()}
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, inverted_mapping_dict)
+        converted_state_dict[new_key] = value
+
+    return converted_state_dict
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert SmolLM weights to Meta format."
+ ) + parser.add_argument( + "input_dir", + type=str, + help="Path to directory containing checkpoint files", + ) + parser.add_argument("output", type=str, help="Path to the output checkpoint") + + args = parser.parse_args() + + # Don't necessarily need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves. + checkpointer = FullModelHFCheckpointer( + checkpoint_dir=args.input_dir, + checkpoint_files=["model.safetensors"], + output_dir=".", + model_type="LLAMA", + ) + + print("Loading checkpoint...") + sd = checkpointer.load_checkpoint() + + print("Converting checkpoint...") + sd = smollm_tune_to_meta(sd["model"]) + + torch.save(sd, args.output) + print(f"Checkpoint saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/examples/portable/custom_ops/CMakeLists.txt b/examples/portable/custom_ops/CMakeLists.txt index 02736cca964..5a9a9a11fe6 100644 --- a/examples/portable/custom_ops/CMakeLists.txt +++ b/examples/portable/custom_ops/CMakeLists.txt @@ -27,8 +27,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/examples/portable/custom_ops/test_custom_ops.sh b/examples/portable/custom_ops/test_custom_ops.sh index 6d83de07d3a..5d21d393686 100644 --- a/examples/portable/custom_ops/test_custom_ops.sh +++ b/examples/portable/custom_ops/test_custom_ops.sh @@ -53,8 +53,7 @@ get_shared_lib_ext() { test_cmake_custom_op_2() { local model_name='custom_ops_2' - SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="$PWD/cmake-out/lib/cmake/ExecuTorch;${SITE_PACKAGES}/torch" + CMAKE_PREFIX_PATH="$PWD/cmake-out/lib/cmake/ExecuTorch" local example_dir=examples/portable/custom_ops local build_dir=cmake-out/${example_dir} diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index f7702fae3de..7c75c39f0a9 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -30,11 +30,17 @@ #include #include #include +#include #include #ifdef ET_EVENT_TRACER_ENABLED #include #endif // ET_EVENT_TRACER_ENABLED +#if defined(ET_USE_THREADPOOL) +#include +#include +#endif + static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB static uint8_t temp_allocator_pool[1024U * 1024U]; @@ -47,6 +53,10 @@ DEFINE_uint32(num_executions, 1, "Number of times to run the model."); #ifdef ET_EVENT_TRACER_ENABLED DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path."); #endif // ET_EVENT_TRACER_ENABLED +DEFINE_int32( + cpu_threads, + -1, + "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); using executorch::extension::FileDataLoader; using executorch::runtime::Error; @@ -124,6 +134,18 @@ int main(int argc, char** argv) { return 1; } +#if defined(ET_USE_THREADPOOL) + auto cpu_threads = FLAGS_cpu_threads; + uint32_t num_performant_cores = cpu_threads == -1 + ? 
::executorch::extension::cpuinfo::get_num_performant_cores() + : static_cast(cpu_threads); + ET_LOG( + Info, "Resetting threadpool with num threads = %d", num_performant_cores); + if (num_performant_cores > 0) { + ::executorch::extension::threadpool::get_threadpool() + ->_unsafe_reset_threadpool(num_performant_cores); + } +#endif // ET_USE_THREADPOOL // Create a loader to get the data of the program file. There are other // DataLoaders that use mmap() or point to data that's already in memory, and // users can create their own DataLoaders to load from arbitrary sources. @@ -228,29 +250,43 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); - // Allocate input tensors and set all of their elements to 1. The `inputs` - // variable owns the allocated memory and must live past the last call to - // `execute()`. - auto inputs = executorch::extension::prepare_input_tensors(*method); - ET_CHECK_MSG( - inputs.ok(), - "Could not prepare inputs: 0x%" PRIx32, - (uint32_t)inputs.error()); - ET_LOG(Info, "Inputs prepared."); - + et_timestamp_t time_spent_executing = 0; // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { + ET_LOG(Debug, "Preparing inputs."); + // Allocate input tensors and set all of their elements to 1. The `inputs` + // variable owns the allocated memory and must live past the last call to + // `execute()`. + // + // NOTE: we have to re-prepare input tensors on every execution + // because inputs whose space gets reused by memory planning (if + // any such inputs exist) will not be preserved for the next + // execution. + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Debug, "Inputs prepared."); + + const et_timestamp_t before_execute = et_pal_current_ticks(); Error status = method->execute(); + const et_timestamp_t after_execute = et_pal_current_ticks(); + time_spent_executing += after_execute - before_execute; ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, (uint32_t)status); } + const auto tick_ratio = et_pal_ticks_to_ns_multiplier(); + constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000; ET_LOG( Info, - "Model executed successfully %" PRIu32 " time(s).", - FLAGS_num_executions); + "Model executed successfully %" PRIu32 " time(s) in %f ms.", + FLAGS_num_executions, + static_cast(time_spent_executing) * tick_ratio.numerator / + tick_ratio.denominator / NANOSECONDS_PER_MILLISECOND); // Print the outputs. std::vector outputs(method->outputs_size()); diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl index 9cddaa4ed77..3d29472300f 100644 --- a/examples/portable/executor_runner/targets.bzl +++ b/examples/portable/executor_runner/targets.bzl @@ -1,4 +1,5 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") +load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -27,6 +28,26 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "executor_runner_lib_with_threadpool", + srcs = ["executor_runner.cpp"], + deps = [ + "//executorch/runtime/executor:program", + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/evalue_util:print_evalue", + "//executorch/extension/runner_util:inputs", + "//executorch/extension/threadpool:cpuinfo_utils", + "//executorch/extension/threadpool:threadpool", + ], + external_deps = [ + "gflags", + ], + define_static_target = True, + visibility = [ + "//executorch/examples/...", + ], + ) + register_custom_op = native.read_config("executorch", "register_custom_op", "0") register_quantized_ops = native.read_config("executorch", "register_quantized_ops", "0") @@ -52,3 +73,36 @@ def define_common_targets(): define_static_target = True, **get_oss_build_kwargs() ) + + executorch_generated_lib( + name = "generated_op_lib_for_runner", + deps = [ + "//executorch/kernels/optimized:optimized_operators", + "//executorch/kernels/optimized:optimized_oplist", + "//executorch/kernels/portable:executorch_aten_ops", + "//executorch/kernels/portable:executorch_custom_ops", + "//executorch/kernels/portable:operators", + ], + custom_ops_aten_kernel_deps = [ + "//executorch/kernels/portable:operators_aten", + ], + functions_yaml_target = "//executorch/kernels/optimized:optimized.yaml", + custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", + fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", + define_static_targets = True, + ) + + # Test driver for models, should have all fast CPU kernels + # (XNNPACK, custom SDPA, etc.) available. + runtime.cxx_binary( + name = "executor_runner_opt", + srcs = [], + deps = [ + ":executor_runner_lib_with_threadpool", + ":generated_op_lib_for_runner", + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/configurations:executor_cpu_optimized", + "//executorch/extension/llm/custom_ops:custom_ops", + "//executorch/kernels/quantized:generated_lib", + ], + ) diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 55969f937ee..7b2c43b3f46 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -15,8 +15,8 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif()

-include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

 if(NOT PYTHON_EXECUTABLE)
  resolve_python_executable()
@@ -63,7 +63,10 @@ target_compile_options(
  full_portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED
 )
 target_include_directories(
-  full_portable_ops_lib PUBLIC ${_common_include_directories}
+  full_portable_ops_lib
+  PUBLIC
+  ${_common_include_directories}
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/include
 )

 # find RE2 for tokenizer
@@ -72,11 +75,11 @@
 set(ABSL_PROPAGATE_CXX_STD ON)
 set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/abseil-cpp
  ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
 )
 add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/re2
  ${CMAKE_CURRENT_BINARY_DIR}/re2
 )
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 439278cb424..cd468eebb26 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -8,11 +8,16 @@ This file provides you the instructions to run LLAMA model with different parameters.
 We offer the following modes to execute the model:

-Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for encoding the user's prompt.
-
 KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt.

-Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens.
+Hybrid Mode: Hybrid mode leverages the strengths of both the AR-N model and KV cache modes to optimize token generation speed. Initially, it uses the AR-N model to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens.
+  - AR-N model: The auto-regression (AR) length determines the number of tokens consumed and the number of logits produced per step. In hybrid mode, this model serves as the prompt processor: it consumes the prompt and generates the key-value (KV) cache.
+  - Prompt processing with AR-N model:
+
+ [Figure: Prompt Processing With AR-N Model (assets/PromptProcessingWithARN.png)]
Prompt processing is done in a for-loop: an N-token block is taken and the KV cache is updated for that block, repeating until all tokens are consumed, with the last block padded as needed. For flexibility, the AR-N model can handle any input length less than the maximum sequence length. As a result, the time to first token (TTFT) depends on the actual prompt length (the number of blocks processed) rather than always on the full sequence length.
+
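+A minimal sketch of this loop (hypothetical helper names; the actual implementation lives in llama.py):
+```python
+def process_prompt(tokens, ar_len, run_ar_n_block, pad_id=0):
+    # Pad the prompt so its length is a multiple of the AR length N.
+    padded = tokens + [pad_id] * ((-len(tokens)) % ar_len)
+    logits = None
+    for start in range(0, len(padded), ar_len):
+        # Each call consumes one N-token block and updates the KV cache.
+        logits = run_ar_n_block(padded[start : start + ar_len])
+    return logits  # logits of the final block predict the next token
+```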
+
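Conceptually, the AR-N prompt processor boils down to the loop below. This is a minimal sketch: `model`, its input/output layout, and the zero-padding token are illustrative assumptions, not the exported module's actual interface.

```python
import torch

def process_prompt_ar_n(model, prompt_tokens, ar_len):
    """Consume the prompt in AR-N blocks of `ar_len` tokens, updating the
    KV cache after each block; the last block may be padded."""
    logits, kv_cache = None, None
    for start in range(0, len(prompt_tokens), ar_len):
        block = prompt_tokens[start : start + ar_len]
        if len(block) < ar_len:
            block = block + [0] * (ar_len - len(block))  # pad the final block
        logits, kv_cache = model(torch.tensor([block]), kv_cache)
    return logits, kv_cache
```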
## Instructions @@ -50,13 +55,13 @@ At the end of this step, users should have the following files ready: `consolida ### Step3: Run default examples using hybrid mode. #### LLAMA2 ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "Once upon a time" +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "Once upon a time" ``` #### LLAMA3.2 Default example using hybrid mode. ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" ``` ### KV Cache update mechanism @@ -109,16 +114,16 @@ We have two distinct mechanisms for updating the key-value (KV) cache, which can ### Additional Configs when running the script If you would like to compile the model only, we have provided the flag `--compile_only`. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --compile_only +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --compile_only ``` On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` You can select the KV Cache update mechanism at runtime by setting the `KV_UPDATER` variable to either "shift_pointer" or "smart_mask". 
By default, it is set to "smart_mask". `KV_UPDATER` = "shift_pointer" ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --kv_updator ${KV_UPDATER} +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --kv_updator ${KV_UPDATER} ``` diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS index 3ef82293e03..e4bad10a234 100644 --- a/examples/qualcomm/oss_scripts/llama/TARGETS +++ b/examples/qualcomm/oss_scripts/llama/TARGETS @@ -19,6 +19,7 @@ python_library( name = "llama_lib", srcs = ["llama.py"], deps = [ + "//executorch/examples/models/llama:source_transformation", "//caffe2:torch", "//executorch/backends/qualcomm/partition:partition", "//executorch/backends/qualcomm/quantizer:quantizer", @@ -35,23 +36,12 @@ python_library( python_binary( name = "llama", - srcs = ["llama.py"], main_function = "executorch.examples.qualcomm.oss_scripts.llama.llama.main", preload_deps = [ "//executorch/extension/llm/custom_ops:model_sharding_py", ], deps = [ - "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", - "//caffe2:torch", - "//executorch/extension/pybindings:aten_lib", - "//executorch/backends/qualcomm/partition:partition", - "//executorch/backends/qualcomm/quantizer:quantizer", - "//executorch/devtools/backend_debug:delegation_info", - "//executorch/devtools:lib", - "//executorch/examples/models:models", - "//executorch/examples/qualcomm:utils", - "//executorch/extension/export_util:export_util", - "//executorch/extension/llm/export:export_lib", + ":llama_lib", ], ) diff --git a/examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png b/examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png new file mode 100644 index 00000000000..228b846f7c3 Binary files /dev/null and b/examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png differ diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index ab27714ae1f..a999270c15b 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -89,32 +89,38 @@ logging.getLogger().setLevel(logging.INFO) -def smart_mask_updator(atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches): - for i, k_cache in enumerate(k_caches): - k_cache[:, :, pos] = new_k_caches[i][:, :, 0] +def smart_mask_updater( + ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches +): + # Update the KV cache input for the next inference when the position exceeds the autoregressive length. 
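For intuition, the two update mechanisms can be contrasted on a toy 1-D cache; the real tensors are shaped `[batch, head_dim, seq]` for K and `[batch, seq, head_dim]` for V, so this is an illustration of the idea rather than the exact logic of the updaters.

```python
import torch

cache = torch.zeros(8)          # toy KV cache with 8 slots
new_entry = torch.tensor(5.0)   # freshly computed K/V value
pos = 3                         # current write position

# Smart mask: write in place; the attention mask is opened at this slot,
# so nothing else in the buffer has to move.
smart = cache.clone()
smart[pos] = new_entry

# Shift pointer: drop the oldest slot and append the newest, keeping the
# valid window at the back of the buffer.
shifted = torch.cat([cache[1:], new_entry.reshape(1)])
```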
+ if pos >= ar_len: + for i, k_cache in enumerate(k_caches): + k_cache[:, :, pos - ar_len] = new_k_caches[i][:, :, 0] - for i, v_cache in enumerate(v_caches): - v_cache[:, pos, :] = new_v_caches[i] + for i, v_cache in enumerate(v_caches): + v_cache[:, pos - ar_len, :] = new_v_caches[i][:, 0, :] + atten_mask[:, :, pos - ar_len] = 0 - atten_mask[0][pos] = 0 pos += 1 return (atten_mask, pos, k_caches, v_caches) -def shift_pointer_updator( - atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches +def shift_pointer_updater( + ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches ): - k_caches = [ - torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1) - for i, k_cache in enumerate(k_caches) - ] - v_caches = [ - torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1) - for i, v_cache in enumerate(v_caches) - ] + # Update the KV cache input for the next inference when the position exceeds the autoregressive length. + if pos >= ar_len: + k_caches = [ + torch.cat([k_cache[:, :, 1:], new_k_caches[i][:, :, :1]], dim=-1) + for i, k_cache in enumerate(k_caches) + ] + v_caches = [ + torch.cat([v_cache[:, 1:, :], new_v_caches[i][:, :1, :]], dim=1) + for i, v_cache in enumerate(v_caches) + ] + atten_mask[:, :, -pos - 1] = 0 pos += 1 - atten_mask[0][-pos - 1] = 0 return (atten_mask, pos, k_caches, v_caches) @@ -123,15 +129,15 @@ def _kv_calibrate( user_prompts, module: torch.fx.GraphModule, tokenizer, + ar_len=1, max_seq_len=512, - updator=smart_mask_updator, + updater=smart_mask_updater, use_i64_token=False, ): _, atten_mask, _, k_caches, v_caches = example_inputs # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int32) - max_cache_len = max_seq_len - 1 + all_pos = torch.arange(0, max_seq_len, 1, dtype=torch.int32).unsqueeze(0) token_list = [] # Llama2 tokenizer has no special tokens @@ -144,21 +150,50 @@ def _kv_calibrate( else: raise RuntimeError("Unkown tokenizer") + pos = len(token_list) if len(token_list) < ar_len else ar_len + dtype = torch.int64 if use_i64_token else torch.int32 + with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: - dtype = torch.int64 if use_i64_token else torch.int32 - token = torch.full((1, 1), token_list[pos], dtype=dtype) + while token_list[-1] != tokenizer.eos_id and pos < max_seq_len: + tmp_token_list = torch.tensor( + token_list[pos - ar_len : pos], dtype=dtype + ).reshape(1, -1) + tmp_pos = all_pos[:, pos - ar_len : pos] + tmp_atten_mask = atten_mask + if pos < ar_len: + tmp_token_list = torch.cat( + [ + torch.zeros((1, ar_len - pos), dtype=dtype), + torch.tensor(token_list, dtype=dtype).reshape(1, -1), + ], + dim=1, + ) + tmp_pos = torch.cat( + [ + torch.zeros((1, ar_len - pos), dtype=torch.int32), + all_pos[:, :pos], + ], + dim=1, + ) + tmp_atten_mask = torch.cat( + [ + torch.ones(1, ar_len, max_seq_len - pos) * -255.0, + atten_mask[:, :, -pos:], + ], + dim=-1, + ) + logits, new_k_caches, new_v_caches = module( - token, - atten_mask, - torch.full((1, 1), pos), + tmp_token_list, + tmp_atten_mask, + tmp_pos, *k_caches, *v_caches, ) - atten_mask, pos, k_caches, v_caches = updator( - atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches + atten_mask, pos, k_caches, v_caches = updater( + ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches ) - if pos >= len(token_list): + if pos > len(token_list): token_list.append(torch.argmax(logits[:, -1], dim=-1).item()) print(f"kv calibration data:\n{tokenizer.decode(token_list)}") @@ -173,7 +208,6 
@@ def _prefill_calibrate( use_i64_token=False, ): _, atten_mask = example_inputs - max_cache_len = max_seq_len - 1 # TODO: change criteria & support batch inputs if necessary @@ -192,20 +226,24 @@ def _prefill_calibrate( dtype = torch.int64 if use_i64_token else torch.int32 with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_seq_len: tmp_token_list = torch.tensor(token_list, dtype=dtype).reshape(1, -1) - if pos < max_cache_len: + if pos < max_seq_len: tmp_token_list = torch.cat( [ tmp_token_list, - torch.zeros((1, max_cache_len - pos), dtype=dtype), + torch.zeros((1, max_seq_len - pos), dtype=dtype), ], dim=1, ) - logits, new_k_caches, new_v_caches = module( + results = module( tmp_token_list, atten_mask, ) + if len(results) == 3: + logits, new_k_caches, new_v_caches = results + elif len(results) == 1: + logits = results token_list.append(torch.argmax(logits[:, pos - 1], dim=-1).item()) pos += 1 @@ -217,8 +255,9 @@ def calibrate( user_prompts, module: torch.fx.GraphModule, tokenizer, + ar_len=1, max_seq_len=512, - kv_updator=smart_mask_updator, + kv_updater=smart_mask_updater, use_i64_token=False, ): if len(example_inputs) == 2: @@ -236,8 +275,9 @@ def calibrate( user_prompts, module, tokenizer, + ar_len, max_seq_len, - updator=kv_updator, + updater=kv_updater, use_i64_token=use_i64_token, ) else: @@ -268,56 +308,36 @@ def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type): # shape of k caches and v caches kv_cache_shape = { - # single head, kv mode input + # single head, kv input (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"]), (self.llama_meta["get_max_seq_len"], self.llama_meta["get_head_dim"]), - # single head, kv mode output - (self.llama_meta["get_head_dim"], 1), - (1, self.llama_meta["get_head_dim"]), - # single head, bert mode - (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"] - 1), - (self.llama_meta["get_max_seq_len"] - 1, self.llama_meta["get_head_dim"]), + # single head, kv output + (self.llama_meta["get_head_dim"], self.llama_meta["get_ar_len"]), + (self.llama_meta["get_ar_len"], self.llama_meta["get_head_dim"]), } io_shape = { - # kv mode + # logit output ( self.llama_meta["get_max_batch_size"], - 1, - self.llama_meta["get_vocab_size"], - ), - # bert mode - ( - self.llama_meta["get_max_batch_size"], - self.llama_meta["get_max_seq_len"] - 1, + self.llama_meta["get_ar_len"], self.llama_meta["get_vocab_size"], ), } atten_mask_shape = { - # kv mode - (self.llama_meta["get_max_batch_size"], self.llama_meta["get_max_seq_len"]), - # bert mode ( - self.llama_meta["get_max_seq_len"] - 1, - self.llama_meta["get_max_seq_len"] - 1, + self.llama_meta["get_max_batch_size"], + self.llama_meta["get_ar_len"], + self.llama_meta["get_max_seq_len"], ), } freq_shape = { - # kv mode - (1, self.llama_meta["get_head_dim"] // 2), - # bert mode - ( - self.llama_meta["get_max_seq_len"] - 1, - self.llama_meta["get_head_dim"] // 2, - ), + (self.llama_meta["get_ar_len"], self.llama_meta["get_head_dim"] // 2), } freq_op = { - # kv mode exir_ops.edge.aten.select.int, - # bert mode - exir_ops.edge.aten.slice_copy.Tensor, } for n in gm.graph.nodes: @@ -376,8 +396,9 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): args.prompt, fx_graph_module, tokenizer=tokenizer, + ar_len=self.llama_meta["get_ar_len"], max_seq_len=self.llama_meta["get_max_seq_len"], - kv_updator=args.kv_updator, + kv_updater=args.kv_updater, 
use_i64_token=args.embedding_quantize is not None, ) @@ -467,12 +488,14 @@ def compile(args, pte_filename, tokenizer): kv_config = ModelArgs(**json.load(f)) # TODO: support batch inputs if necessary kv_config.max_batch_size = 1 - kv_config.max_seq_len = args.kv_seq_len + kv_config.max_seq_len = args.max_seq_len kv_config.use_kv_cache = True prefill_config = copy.copy(kv_config) - prefill_config.max_seq_len = args.prefill_seq_len - prefill_config.use_kv_cache = False + prefill_config.max_seq_len = args.max_seq_len + prefill_config.use_kv_cache = ( + False if args.max_seq_len == args.prefill_ar_len else True + ) state_dict = torch.load( args.checkpoint, weights_only=True, map_location="cpu", mmap=True @@ -484,27 +507,29 @@ def compile(args, pte_filename, tokenizer): if args.model_mode == "kv": llama_instance_list.append( LlamaModel( - kv_config, output_new_cache_only=True, use_i64_token=use_i64_token - ) - ) - elif args.model_mode == "prefill": - llama_instance_list.append( - LlamaModel( - prefill_config, - output_new_cache_only=False, + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, use_i64_token=use_i64_token, ) ) elif args.model_mode == "hybrid": llama_instance_list.append( LlamaModel( - kv_config, output_new_cache_only=True, use_i64_token=use_i64_token + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=use_i64_token, ) ) llama_instance_list.append( LlamaModel( prefill_config, - output_new_cache_only=False, + ar_len=args.prefill_ar_len, + output_new_cache_only=True, + output_cache=True, use_i64_token=use_i64_token, ) ) @@ -514,6 +539,28 @@ def compile(args, pte_filename, tokenizer): if "model" in state_dict: state_dict = state_dict["model"] + # Convert the checkpoint to the HuggingFace weight layout to improve RoPE performance on the HTP backend.
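The permutation converts Meta-style interleaved rotary pairs into the half-split layout used by HuggingFace checkpoints. A small index-level check of the effect on one head, with a toy `head_dim` (illustrative only):

```python
import torch

# With head_dim = 4, Meta checkpoints store one head's rotary rows
# interleaved as [r0, i0, r1, i1]; the two-half RoPE expects [r0, r1, i0, i1].
head_dim = 4
rows = torch.arange(head_dim)                      # [0, 1, 2, 3]
half_split = rows.view(head_dim // 2, 2).t().reshape(-1)
print(half_split.tolist())  # [0, 2, 1, 3], i.e. [r0, r1, i0, i1]
```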
+ def permute(w, heads): + dim_0 = w.size(0) + dim_1 = w.size(1) + return ( + w.view(heads, dim_0 // heads // 2, 2, dim_1) + .transpose(1, 2) + .reshape(dim_0, dim_1) + ) + + n_heads = llama_instance_list[0].n_heads + n_kv_heads = llama_instance_list[0].n_kv_heads + n_layers = llama_instance_list[0].n_layers + + for layer_i in range(n_layers): + state_dict[f"layers.{layer_i}.attention.wq.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wq.weight"], n_heads + ) + state_dict[f"layers.{layer_i}.attention.wk.weight"] = permute( + state_dict[f"layers.{layer_i}.attention.wk.weight"], n_kv_heads + ) + for llama_instance in llama_instance_list: llama_instance.load_state_dict( state_dict, @@ -606,7 +653,7 @@ def compile(args, pte_filename, tokenizer): start_lowering_ts = time.time() quant_attrs = None - if args.model_mode in ["kv", "prefill"]: + if args.model_mode in ["kv"]: llama_instance_list[0].lowering_modules( args.artifact, fixed_point_type, @@ -783,12 +830,10 @@ def compile(args, pte_filename, tokenizer): def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - if args.model_mode == "prefill": + if args.model_mode == "kv": eval_mode = 0 - elif args.model_mode == "kv": - eval_mode = 1 elif args.model_mode == "hybrid": - eval_mode = 2 + eval_mode = 1 else: raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") @@ -807,7 +852,7 @@ def post_process(): with open(f"{args.artifact}/outputs/outputs.txt", "r") as f: outputs.append(f.read()) - seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len + seq_len = args.max_seq_len runner_args = " ".join( [ f'--prompt "{args.prompt}"', @@ -820,13 +865,14 @@ def post_process(): ) runner_cmd = "" + performance_output_path = "outputs/inference_speed.txt" if args.enable_x86_64: # x86 emulator is intended for CI and not performance. Check only the first few tokens. 
seq_len = min(seq_len, 16) - if args.kv_updator == smart_mask_updator: + if args.kv_updater == smart_mask_updater: logging.warning( - "x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer" + "x86 only supports ShiftPointer; overwriting kv_updater to ShiftPointer" ) qnn_sdk = os.getenv("QNN_SDK_ROOT") @@ -839,7 +885,8 @@ def post_process(): f"--model_path {pte_path}", f"--seq_len {seq_len}", f"--output_path {args.artifact}/outputs/outputs.txt", - f"--kv_updator ShiftPointer", + f"--performance_output_path {performance_output_path}", + f"--kv_updater ShiftPointer", runner_args, ] ) @@ -859,7 +906,8 @@ def post_process(): f"--model_path {pte_filename}.pte", f"--seq_len {seq_len}", "--output_path outputs/outputs.txt", - f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}", + f"--performance_output_path {performance_output_path}", + f"--kv_updater {'SmartMask' if args.kv_updater == smart_mask_updater else 'ShiftPointer'}", runner_args, ] ) @@ -881,6 +929,10 @@ def post_process(): adb.pull(output_path=args.artifact, callback=post_process) if args.ip and args.port != -1: + inference_speed = 0 + with open(f"{args.artifact}/{performance_output_path}", "r") as f: + inference_speed = float(f.read()) + pte_size = os.path.getsize(pte_path) with Client((args.ip, args.port)) as conn: conn.send( @@ -888,6 +940,7 @@ def post_process(): { "result": outputs, "pte_size": pte_size, + "inference_speed": inference_speed, } ) ) @@ -993,28 +1046,28 @@ def _build_parser(): parser.add_argument( "--model_mode", - help="Export and inference prefill mode, kv mode or hybrid mode", + help="Export and run inference in kv mode or hybrid mode", default="kv", - choices=["prefill", "kv", "hybrid"], + choices=["kv", "hybrid"], type=str, ) parser.add_argument( - "--prefill_seq_len", - help="Ouput sequence length for llama. Use this option for prefill or hybrid mode", - default=32, + "--max_seq_len", + help="Maximum number of tokens the model can process and attend to at once when generating predictions/responses.", + default=512, type=int, ) parser.add_argument( - "--kv_seq_len", - help="Ouput sequence length for llama. Use this option for kv or hybrid mode", - default=512, + "--prefill_ar_len", + help="The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce.
Use this option to process the prompt and generate the key-value (KV) cache, which serves as the prompt processor in hybrid mode.", + default=32, type=int, ) parser.add_argument( - "--kv_updator", + "--kv_updater", help="Choose how to update kv cache during runtime", choices=["smart_mask", "shift_pointer"], default="smart_mask", @@ -1034,21 +1087,16 @@ def _build_parser(): return parser -def main(args) -> None: - parser = _build_parser() - - args = parser.parse_args(args) +def export_llama(args) -> None: if args.compile_only and args.pre_gen_pte: exit("Cannot set both compile_only and pre_gen_pte as true") if args.model_mode == "kv": pte_filename = "kv_llama_qnn" - elif args.model_mode == "prefill": - pte_filename = "prefill_llama_qnn" elif args.model_mode == "hybrid": assert ( - args.kv_seq_len >= args.prefill_seq_len - ), "Please ensure kv_seq_len is >= prefill_seq_len" + args.max_seq_len >= args.prefill_ar_len + ), "Please ensure max_seq_len is >= prefill_ar_len" pte_filename = "hybrid_llama_qnn" else: raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") @@ -1071,13 +1119,13 @@ def main(args) -> None: else: raise RuntimeError(f"Unknown llama_model: {args.llama_model}.") - if args.kv_updator == "smart_mask": + if args.kv_updater == "smart_mask": args.shared_buffer = True - args.kv_updator = smart_mask_updator - elif args.kv_updator == "shift_pointer": - args.kv_updator = shift_pointer_updator + args.kv_updater = smart_mask_updater + elif args.kv_updater == "shift_pointer": + args.kv_updater = shift_pointer_updater else: - exit(f"Using an unkown kv update {args.kv_updator}") + exit(f"Using an unknown kv updater {args.kv_updater}") if args.pre_gen_pte: quant_attrs = json.load( @@ -1138,6 +1186,12 @@ def main(args) -> None: raise Exception(e) +def main(): + parser = _build_parser() + args = parser.parse_args() + export_llama(args) + + # flake8: noqa: C901 if __name__ == "__main__": - main(sys.argv[1:]) + main() diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py index 253abc9578c..f7893792e00 100755 --- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py @@ -19,12 +19,14 @@ def apply_rotary_emb_single( x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor ) -> torch.Tensor: - x_r, x_i = x[..., ::2], x[..., 1::2] - - # brodcast for batch_prefill mode input x + # The HuggingFace implementation of RoPE processes the query and key in two halves instead of interleaving them. + # The main difference is the stride of the StrideSlice op: the interleaved layout requires a stride of two, which is not friendly to the HTP backend.
+ # Ref: https://github.com/huggingface/transformers/issues/25199 + x_r, x_i = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + # broadcast for batch_prefill mode input x if x.dim() == 4: - freqs_cos = freqs_cos[None, :, None, :] - freqs_sin = freqs_sin[None, :, None, :] + freqs_cos = freqs_cos[None, None, :, :] + freqs_sin = freqs_sin[None, None, :, :] x_out_r = x_r * freqs_cos - x_i * freqs_sin x_out_i = x_r * freqs_sin + x_i * freqs_cos @@ -37,7 +39,7 @@ def __init__(self, config: ModelArgs, output_new_cache_only=False): super().__init__() self.dim = config.dim self.n_heads = config.n_heads - self.head_dim = config.dim // config.n_heads + self.head_dim = config.head_dim self.n_kv_heads = config.n_kv_heads self.num_key_value_groups = config.n_heads // self.n_kv_heads self.max_seq_len = config.max_seq_len @@ -104,25 +106,33 @@ def forward_sha( v_caches: Optional[List[torch.Tensor]] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, seq_len, _ = hidden_states.shape + # In the HTP backend, the input axis order for the convolution operation is + # more efficient with [1, 1, seq_len, dim] compared to [1, seq_len, 1, dim]. hidden_states = torch.reshape( hidden_states, (bsz, seq_len, 1, self.dim) ).transpose(1, 3) q = [ - wq_sha(hidden_states).reshape(bsz, self.head_dim, seq_len).transpose(1, 2) + wq_sha(hidden_states) + .permute(0, 2, 3, 1) + .reshape(bsz, seq_len, self.head_dim) for wq_sha in self.wq_sha ] k = [ - wk_sha(hidden_states).reshape(bsz, self.head_dim, seq_len).transpose(1, 2) + wk_sha(hidden_states) + .permute(0, 2, 3, 1) + .reshape(bsz, seq_len, self.head_dim) for wk_sha in self.wk_sha ] v = [ - wv_sha(hidden_states).reshape(bsz, self.head_dim, seq_len).transpose(1, 2) + wv_sha(hidden_states) + .permute(0, 2, 3, 1) + .reshape(bsz, seq_len, self.head_dim) for wv_sha in self.wv_sha ] for i in range(len(q)): q[i] = apply_rotary_emb_single(q[i], freqs_cos, freqs_sin) for i in range(len(k)): - k[i] = apply_rotary_emb_single(k[i], freqs_cos, freqs_sin).permute(0, 2, 1) + k[i] = apply_rotary_emb_single(k[i], freqs_cos, freqs_sin).transpose(1, 2) output_y = [] kh, vh = [], [] @@ -153,10 +163,7 @@ def forward_sha( y = y.reshape(bsz, seq_len, -1) if self.output_new_cache_only: - if k_caches and v_caches: - return y, k, v - # batch_prefill mode. 
Consider to remove, it's not really used - return y, k[-1], v[-1] + return y, k, v return y, kh, vh @@ -252,10 +259,10 @@ def prepare_feedfoward_conv(self): def forward_feedfoward_conv(self, x): bsz, _, _ = x.size() - x = torch.reshape(x, (bsz, -1, self.dim, 1)) - x = x.transpose(1, 2) # Transpose right before and after Conv + x = torch.reshape(x, (bsz, -1, 1, self.dim)) + x = x.transpose(1, 3) # Transpose right before and after Conv x = self.w2_conv(F.silu(self.w1_conv(x)) * self.w3_conv(x)) - x = x.transpose(1, 2) + x = x.transpose(1, 3) x = torch.reshape(x, (bsz, -1, self.dim)) return x @@ -298,11 +305,16 @@ def forward( class LlamaModel(nn.Module): def __init__( - self, config: ModelArgs, output_new_cache_only=True, use_i64_token=False + self, + config: ModelArgs, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=False, ): super().__init__() self.dim = config.dim - self.head_dim = config.dim // config.n_heads + self.head_dim = config.head_dim self.max_batch_size = config.max_batch_size self.max_seq_len = config.max_seq_len self.n_heads = config.n_heads @@ -311,8 +323,10 @@ def __init__( self.vocab_size = config.vocab_size self.rope_freq_base = config.rope_freq_base self.use_kv_cache = config.use_kv_cache + self.ar_len = ar_len self.output_new_cache_only = output_new_cache_only self.use_i64_token = use_i64_token + self.output_cache = output_cache self.layers = nn.ModuleList( [ @@ -324,9 +338,11 @@ def __init__( self.output = nn.Linear(config.dim, config.vocab_size, bias=False) self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim) freqs_cos, freqs_sin = precompute_freqs_cis( - config.dim // config.n_heads, + config.head_dim, config.max_seq_len, config.rope_freq_base, + config.use_scaled_rope, + config.rope_scale_factor, ) self.register_buffer("freqs_cos", freqs_cos, persistent=False) self.register_buffer("freqs_sin", freqs_sin, persistent=False) @@ -359,10 +375,10 @@ def forward( output_v_cache = [] # following tensors should be invariant across batches freqs_cos = ( - self.freqs_cos[input_pos][0] if self.use_kv_cache else self.freqs_cos[:-1] + self.freqs_cos[input_pos][0] if self.use_kv_cache else self.freqs_cos ) freqs_sin = ( - self.freqs_sin[input_pos][0] if self.use_kv_cache else self.freqs_sin[:-1] + self.freqs_sin[input_pos][0] if self.use_kv_cache else self.freqs_sin ) hidden_states = self.tok_embeddings(tokens) @@ -388,19 +404,36 @@ def forward( hidden_states = self.norm(hidden_states) logits = self.output(hidden_states) - return logits, output_k_cache, output_v_cache + if self.output_cache: + return logits, output_k_cache, output_v_cache + return logits def get_example_inputs(self, use_kv_cache=True): dtype = torch.int64 if self.use_i64_token else torch.int32 - if use_kv_cache: - tokens = torch.randint( - self.vocab_size, (self.max_batch_size, 1), dtype=dtype - ) + tokens = torch.randint( + self.vocab_size, (self.max_batch_size, self.ar_len), dtype=dtype + ) - pos_ids = torch.zeros((self.max_batch_size, 1), dtype=torch.int32) + atten_mask = torch.full((self.ar_len, self.ar_len), torch.tensor(-255.0)) + mask_cond = torch.arange(atten_mask.size(-1)) + atten_mask.masked_fill_( + mask_cond < (mask_cond + 1).view(atten_mask.size(-1), 1), 0 + ) + if self.max_seq_len != self.ar_len: + atten_mask = torch.cat( + [ + torch.ones(self.ar_len, self.max_seq_len - self.ar_len) * -255.0, + atten_mask, + ], + dim=-1, + ) + atten_mask = atten_mask[None, :, :].expand( + self.max_batch_size, self.ar_len, self.max_seq_len + ) + if use_kv_cache: + pos_ids = 
torch.zeros((self.max_batch_size, self.ar_len), dtype=torch.int32) k_cache, v_cache = [], [] - atten_mask = torch.full((self.max_batch_size, self.max_seq_len), -255.0) - atten_mask[:, -1] = 0 + for _ in range(self.n_layers): for _ in range(self.n_kv_heads): # transpose first to decrease the runtime efforts @@ -408,13 +441,13 @@ def get_example_inputs(self, use_kv_cache=True): torch.zeros( self.max_batch_size, self.head_dim, - self.max_seq_len - 1, + self.max_seq_len - self.ar_len, ) ) v_cache.append( torch.zeros( self.max_batch_size, - self.max_seq_len - 1, + self.max_seq_len - self.ar_len, self.head_dim, ) ) @@ -426,10 +459,6 @@ def get_example_inputs(self, use_kv_cache=True): v_cache, ) - max_promp = self.max_seq_len - 1 - tokens = torch.arange(0, max_promp, 1, dtype=dtype).unsqueeze(0) - atten_mask = torch.triu(torch.rand((max_promp, max_promp)), 1) - atten_mask[atten_mask != 0] = -255 return ( tokens, atten_mask, @@ -438,10 +467,11 @@ def get_metadata(self): # TODO: modify this when enabling LLAMA 7B return { + "get_ar_len": self.ar_len, "get_bos_id": 1, "get_eos_id": 2, "get_dim": self.dim, - "get_head_dim": self.dim // self.n_heads, + "get_head_dim": self.head_dim, "get_max_batch_size": self.max_batch_size, "get_max_seq_len": self.max_seq_len, "get_n_bos": 1, diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index 1bc90a11f9d..d7f0d85156c 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -25,11 +25,14 @@ DEFINE_string( model_path, "kv_llama_qnn.pte", "Model serialized in flatbuffer format."); - DEFINE_string( output_path, "outputs.txt", "Executorch inference data output path."); +DEFINE_string( + performance_output_path, + "inference_speed.txt", + "Records inference speed. For CI purposes."); DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); DEFINE_string( @@ -44,17 +47,17 @@ DEFINE_int32( seq_len, 128, "Total number of tokens to generate (prompt + output)."); - DEFINE_int32( eval_mode, 1, - "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); + "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)"); DEFINE_double(logits_scale, 0.0, "Logits scale"); DEFINE_int32(logits_offset, 0, "Logits offset"); DEFINE_string( - kv_updator, - "How to update kv cache.
Choose between SmartMask and ShiftPointer", - "SmartMask"); + kv_updater, + "SmartMask", + "How to update kv cache. Choose between SmartMask and ShiftPointer"); +DEFINE_int32(num_iters, 1, "Total number of iterations to run."); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -63,11 +66,13 @@ int main(int argc, char** argv) { example::Runner runner( {FLAGS_model_path}, FLAGS_tokenizer_path.c_str(), + FLAGS_performance_output_path.c_str(), FLAGS_logits_scale, FLAGS_logits_offset, FLAGS_temperature, FLAGS_eval_mode, - FLAGS_kv_updator); + FLAGS_kv_updater, + FLAGS_num_iters); std::vector<char> buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 chars std::ofstream fout(FLAGS_output_path.c_str()); @@ -77,11 +82,13 @@ int main(int argc, char** argv) { } }; // generate tokens & store inference output - runner.generate( - FLAGS_seq_len, - FLAGS_prompt.c_str(), - FLAGS_system_prompt.c_str(), - callback); + for (int i = 0; i < FLAGS_num_iters; i++) { + runner.generate( + FLAGS_seq_len, + FLAGS_prompt.c_str(), + FLAGS_system_prompt.c_str(), + callback); + } fout.write(buf.data(), buf.size()); fout.close(); return 0; diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp index badaea0ca73..ce7baefa080 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp @@ -54,7 +54,10 @@ std::vector IoMgrBase::get_output_tensors( ShiftPointerIoMgr::ShiftPointerIoMgr( std::vector>& modules, + int32_t context_len, + int32_t prefill_ar_len, int32_t prefill_cache_len, + int32_t kv_ar_len, int32_t kv_cache_len, int32_t vocab_size, int32_t num_layers, @@ -66,7 +69,10 @@ ShiftPointerIoMgr::ShiftPointerIoMgr( const bool use_int64_token) : IoMgrBase(modules), shard_layers_({num_layers}), + context_len_(context_len), + kv_ar_len_(kv_ar_len), kv_cache_len_(kv_cache_len), + prefill_ar_len_(prefill_ar_len), prefill_cache_len_(prefill_cache_len), vocab_size_(vocab_size), num_layers_(num_layers), @@ -75,7 +81,8 @@ ShiftPointerIoMgr::ShiftPointerIoMgr( eval_mode_(eval_mode), prefill_forward_name_(prefill_forward_name), kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { + use_int64_token_(use_int64_token), + is_bert_(prefill_cache_len_ == 0) { if (!prefill_forward_name_.empty()) { input_tensors_[prefill_forward_name_] = std::vector>(modules.size()); @@ -113,15 +120,14 @@ void ShiftPointerIoMgr::init_io() { IO* ptr = static_cast<IO*>(data_ptr_.get()); std::memset(ptr, 0, sizeof(IO)); - int32_t max_cache_len = std::max(kv_cache_len_, prefill_cache_len_); - int32_t k_in_size = (head_dim_ + 1) * max_cache_len; - int32_t v_cache_size = (num_heads_ + 1) * max_cache_len * head_dim_; - int32_t k_cache_out_size = num_heads_ * head_dim_; - if (eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kPrefill) { - k_cache_out_size *= prefill_cache_len_; - } + int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); + int32_t k_in_size = (head_dim_ + 1) * kv_cache_len_; + // Use context length to prevent exceeding the range when the AR-N model + // updates the last block in hybrid mode. + int32_t v_cache_size = (num_heads_ + 1) * context_len_ * head_dim_; + int32_t k_cache_out_size = num_heads_ * max_ar_len * head_dim_; - // Init kv vector shape, general enough to be shared across all 3 modes. + // Init kv vector shape, general enough to be shared across all modes.
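As a back-of-the-envelope check of these sizes, with toy dimensions (all values illustrative, not the script's defaults):

```python
context_len = 128                # exported model's full context
prefill_ar_len, kv_ar_len = 32, 1
kv_cache_len = context_len - kv_ar_len        # 127
head_dim, num_heads = 64, 8

max_ar_len = max(kv_ar_len, prefill_ar_len)   # 32
k_in_size = (head_dim + 1) * kv_cache_len     # 8255 elements
# V is sized by context_len rather than kv_cache_len so the AR-N model can
# write its final block in hybrid mode without running past the buffer.
v_cache_size = (num_heads + 1) * context_len * head_dim   # 73728 elements
k_cache_out_size = num_heads * max_ar_len * head_dim      # 16384 elements
```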
ptr->k_cache_out.reserve(num_layers_); ptr->v_cache.reserve(num_layers_); for (int layer = 0; layer < num_layers_; layer++) { @@ -130,14 +136,15 @@ void ShiftPointerIoMgr::init_io() { } auto init_prefill = [&]() { - ptr->prefill_input_toks.resize(prefill_cache_len_); - ptr->prefill_atten_mask.resize(prefill_cache_len_ * prefill_cache_len_); - ptr->prefill_logits.resize(prefill_cache_len_ * vocab_size_); + ptr->prefill_input_toks.resize(prefill_ar_len_, 0); + ptr->prefill_input_pos.resize(prefill_ar_len_, 0); + ptr->prefill_attention_mask.resize((prefill_ar_len_ * context_len_), 0); + ptr->prefill_logits.resize(prefill_ar_len_ * vocab_size_); }; auto init_kv = [&]() { - ptr->kv_logits.resize(vocab_size_); - ptr->kv_attention_mask.resize((kv_cache_len_ + 1), 0); + ptr->kv_logits.resize(kv_ar_len_ * vocab_size_); + ptr->kv_attention_mask.resize((kv_ar_len_ * context_len_), 0); ptr->k_cache.reserve(num_layers_); for (int layer = 0; layer < num_layers_; layer++) { ptr->k_cache.emplace_back(); @@ -149,9 +156,6 @@ void ShiftPointerIoMgr::init_io() { }; switch (eval_mode_) { - case EvalMode::kPrefill: - init_prefill(); - break; case EvalMode::kKVCached: init_kv(); break; @@ -164,6 +168,54 @@ void ShiftPointerIoMgr::init_io() { } } +void ShiftPointerIoMgr::reset_io( + const std::vector>& prefill_methods_meta, + const std::vector< + executorch::runtime::Result>& + kv_methods_meta) { + IO* ptr = static_cast(data_ptr_.get()); + std::fill(ptr->prefill_input_pos.begin(), ptr->prefill_input_pos.end(), 0); + ptr->kv_input_pos = 0; + std::fill( + ptr->prefill_attention_mask.begin(), + ptr->prefill_attention_mask.end(), + 0); + std::fill(ptr->kv_attention_mask.begin(), ptr->kv_attention_mask.end(), 0); + + input_tensors_[kv_forward_name_].clear(); + input_tensors_[kv_forward_name_].resize(modules_.size()); + output_tensors_[kv_forward_name_].clear(); + output_tensors_[kv_forward_name_].resize(modules_.size()); + + k_cache_in_[kv_forward_name_].clear(); + v_cache_in_[kv_forward_name_].clear(); + k_cache_out_[kv_forward_name_].clear(); + v_cache_out_[kv_forward_name_].clear(); + + input_tensors_[prefill_forward_name_].clear(); + input_tensors_[prefill_forward_name_].resize(modules_.size()); + output_tensors_[prefill_forward_name_].clear(); + output_tensors_[prefill_forward_name_].resize(modules_.size()); + + k_cache_in_[prefill_forward_name_].clear(); + v_cache_in_[prefill_forward_name_].clear(); + k_cache_out_[prefill_forward_name_].clear(); + v_cache_out_[prefill_forward_name_].clear(); + + switch (eval_mode_) { + case EvalMode::kKVCached: + prepare_kv_io(kv_methods_meta); + break; + case EvalMode::kHybrid: + prepare_prefill_io(prefill_methods_meta); + prepare_kv_io(kv_methods_meta); + break; + default: + ET_CHECK_MSG(false, "unsupported mode"); + break; + } +} void ShiftPointerIoMgr::prepare_kv_io( const std::vector>& methods_meta) { for (int i = 0; i < modules_.size(); ++i) { @@ -177,37 +229,38 @@ void ShiftPointerIoMgr::prepare_kv_io( IO* ptr = static_cast(data_ptr_.get()); // [I]: input_tokens - Result input_tok = methods_meta[0]->input_tensor_meta(0); - input_tok_ = std::make_unique( - input_tok->scalar_type(), - input_tok->sizes().size(), - const_cast(input_tok->sizes().data()), - &ptr->input_tok, - const_cast(input_tok->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); + Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); + kv_input_toks_ = std::make_unique( + kv_input_toks->scalar_type(), + kv_input_toks->sizes().size(), + 
const_cast(kv_input_toks->sizes().data()), + &ptr->kv_input_toks, + const_cast(kv_input_toks->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); // [I]: atten_mask - Result atten_mask = methods_meta[0]->input_tensor_meta(1); - attention_mask_ = std::make_unique( - atten_mask->scalar_type(), - atten_mask->sizes().size(), - const_cast(atten_mask->sizes().data()), + Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); + kv_attention_mask_ = std::make_unique( + kv_attention_mask->scalar_type(), + kv_attention_mask->sizes().size(), + const_cast(kv_attention_mask->sizes().data()), ptr->kv_attention_mask.data(), - const_cast(atten_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); + const_cast( + kv_attention_mask->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); // [I]: input_pos - Result input_pos = methods_meta[0]->input_tensor_meta(2); - input_pos_ = std::make_unique( - input_pos->scalar_type(), - input_pos->sizes().size(), - const_cast(input_pos->sizes().data()), - &ptr->input_pos, - const_cast(input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); + Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); + kv_input_pos_ = std::make_unique( + kv_input_pos->scalar_type(), + kv_input_pos->sizes().size(), + const_cast(kv_input_pos->sizes().data()), + &ptr->kv_input_pos, + const_cast(kv_input_pos->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); // [I] kv_cache - int index = 3; // bypass input_tokens, input_pos, atten_mask + int index = 3; // bypass input_tokens, atten_mask, input_pos for (int offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; shard_index < modules_.size(); offset += shard_layers_[shard_index], shard_index++) { @@ -304,7 +357,7 @@ void ShiftPointerIoMgr::prepare_prefill_io( IO* ptr = static_cast(data_ptr_.get()); - // [I]: pre_input_tokens + // [I]: prefill_input_tokens Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); prefill_input_toks_ = std::make_unique( prefill_input_toks->scalar_type(), @@ -314,25 +367,81 @@ void ShiftPointerIoMgr::prepare_prefill_io( const_cast( prefill_input_toks->dim_order().data())); input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - // [I]: prefill_attn_mask - for (int i = 0; i < prefill_cache_len_; ++i) { - for (int j = 0; j < prefill_cache_len_; ++j) { - if (i < j) { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 0; - } else { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 65535; + // [I]: prefill_attention_mask + for (int i = 0; i < prefill_ar_len_; ++i) { + for (int j = 0, + offset = i * context_len_ + (context_len_ - prefill_ar_len_); + j < prefill_ar_len_; + ++j) { + if (i >= j) { + ptr->prefill_attention_mask[j + offset] = 65535; } } } - Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); - prefill_attn_mask_ = std::make_unique( - prefill_atten_mask->scalar_type(), - prefill_atten_mask->sizes().size(), - const_cast(prefill_atten_mask->sizes().data()), - ptr->prefill_atten_mask.data(), + Result prefill_attention_mask = + methods_meta[0]->input_tensor_meta(1); + prefill_attention_mask_ = std::make_unique( + prefill_attention_mask->scalar_type(), + prefill_attention_mask->sizes().size(), + const_cast( + prefill_attention_mask->sizes().data()), + ptr->prefill_attention_mask.data(), const_cast( - 
prefill_atten_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); + prefill_attention_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_attention_mask_.get()); + + if (!is_bert_) { + // [I]: prefill_input_pos + Result prefill_input_pos = + methods_meta[0]->input_tensor_meta(2); + prefill_input_pos_ = std::make_unique( + prefill_input_pos->scalar_type(), + prefill_input_pos->sizes().size(), + const_cast(prefill_input_pos->sizes().data()), + ptr->prefill_input_pos.data(), + const_cast( + prefill_input_pos->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_input_pos_.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + // Add prefill offset to align the v_out pointer with the decode model. + for (int offset = 0, + shard_index = 0, + v_stride = kv_cache_len_ * head_dim_, + prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; + shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_[prefill_forward_name_] + : v_cache_in_[prefill_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? static_cast(ptr->k_cache[layer + offset][head].data()) + : static_cast( + ptr->v_cache[layer + offset].data() + head * v_stride + + prefill_offset); + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } + } // [O]: logits int logit_index = 0; Result logits = @@ -348,18 +457,11 @@ void ShiftPointerIoMgr::prepare_prefill_io( // [O] kv_cache int index = 1; - // prefill_k_stride should be equal to prefill_v_stride in prefill mode. // In hybrid mode, we use kv mode cache len for v stride since we want to // update prefill's result onto kv modes input. - int32_t prefill_k_stride = prefill_cache_len_ * head_dim_; - int32_t prefill_v_stride = - std::max(prefill_cache_len_, kv_cache_len_) * head_dim_; + int32_t prefill_k_stride = prefill_ar_len_ * head_dim_; + int32_t prefill_v_stride = kv_cache_len_ * head_dim_; - if (eval_mode_ == EvalMode::kPrefill) { - ET_CHECK_MSG( - prefill_k_stride == prefill_v_stride, - "prefill_k_stride should be equal to prefill_v_stride"); - } for (int offset = 0, shard_index = 0; shard_index < modules_.size(); offset += shard_layers_[shard_index], shard_index++) { for (int cache_group = 0; cache_group < 2; ++cache_group) { @@ -397,13 +499,11 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( int64_t pos, std::vector>& output_tensors) { ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); - ET_CHECK_MSG( - prefill_cache_len_ != 0, "prefill_cache_len_ should not equal to 0"); IO* ptr = static_cast(data_ptr_.get()); - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->input_pos = static_cast(pos); + ptr->kv_input_pos = static_cast(pos); // If prompt len is 30, prefill will handle to pos = 30. // At this point, pos should be 31. 
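A short walk-through of that handoff using the Python-side mask convention (0 = visible, -255 = masked; sizes illustrative):

```python
import torch

prompt_len = 30
pos = prompt_len + 1     # prefill consumed 30 tokens; decode resumes at 31

kv_cache_len = 127       # toy decode-side cache length (kv_ar_len = 1)
mask = torch.full((kv_cache_len + 1,), -255.0)
# One slot is opened per processed position; in the shift-pointer layout
# the valid window sits at the back of the buffer.
mask[-(pos + 1):] = 0.0
```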
for (int i = 0; i < pos + 1; i++) { @@ -435,17 +535,29 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( } } + // Update k_cache std::vector>& k_cache_in = k_cache_in_[kv_forward_name_]; std::vector>& k_cache_out = k_cache_out_[prefill_forward_name_]; + // copy from last to prevent from overwriting values + size_t copied_size = pos * sizeof(uint8_t); for (int i = 0; i < k_cache_in.size(); ++i) { uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (int k = 0, k_stride = j * prefill_cache_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; + if (is_bert_) { + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; + ++j, offset += kv_cache_len_) { + for (int k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + } else { + for (int j = head_dim_; j > -1; --j) { + memcpy( + ptr_in + j * kv_cache_len_, + ptr_in + j * prefill_cache_len_, + copied_size); } } k_cache_in[i]->set_data(ptr_in + pos); @@ -458,10 +570,10 @@ void ShiftPointerIoMgr::update_kv_io( std::vector>& output_tensors) { IO* ptr = static_cast(data_ptr_.get()); // update input_tok - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? cur_token : static_cast(cur_token); // update position_ids - ptr->input_pos = static_cast(pos); + ptr->kv_input_pos = static_cast(pos); // update causal mask for next token ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; @@ -505,47 +617,101 @@ void ShiftPointerIoMgr::update_prefill_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) { + (void)cur_token; (void)output_tensors; - IO* ptr = static_cast(data_ptr_.get()); - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
- if (use_int64_token_) { - ptr->prefill_input_toks[pos] = cur_token; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[pos] = static_cast(cur_token); + + if (!is_bert_) { + // update v_cache + auto& v_cache_in = v_cache_in_[prefill_forward_name_]; + auto& v_cache_out = v_cache_out_[prefill_forward_name_]; + for (int i = 0; i < v_cache_in.size(); i++) { + v_cache_in[i]->set_data( + v_cache_in[i]->mutable_data() + prefill_ar_len_ * head_dim_); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + + prefill_ar_len_ * head_dim_); + } + + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + prefill_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + auto& k_cache_in = k_cache_in_[prefill_forward_name_]; + auto& k_cache_out = k_cache_out_[prefill_forward_name_]; + // update k_cache by single thread, this part is cpu cache sensitive + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = prefill_cache_len_; j < head_dim_; + ++j, offset += prefill_cache_len_) { + for (int k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; + k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + k_cache_in[i]->set_data(ptr_in + prefill_ar_len_); + } } } void ShiftPointerIoMgr::fill_prefill_toks( + int64_t start_pos, std::vector& prompt_tokens) { IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prompt_tokens.size(); i++) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. - if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[i] = static_cast(prompt_tokens[i]); + for (int i = 0; i < prefill_ar_len_; i++) { + if (!is_bert_) { + ptr->prefill_input_pos[i] = start_pos + i; + } + + if (start_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (use_int64_token_) { + ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; + } else { + int32_t* prefill_input_toks_ptr = + reinterpret_cast(ptr->prefill_input_toks.data()); + prefill_input_toks_ptr[i] = + static_cast(prompt_tokens[start_pos + i]); + } + } + if (start_pos >= prefill_ar_len_) { + for (int j = 0, + offset = i * context_len_ + + (context_len_ - prefill_ar_len_ - start_pos); + j < prefill_ar_len_; + ++j) { + ptr->prefill_attention_mask[offset + j] = 65535; + } } } } void ShiftPointerIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { IO* ptr = static_cast(get_mutable_ptr()); - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? 
cur_token : static_cast(cur_token); + ptr->kv_input_pos = static_cast(pos); + ; ptr->kv_attention_mask[kv_cache_len_] = 65535; } SmartMaskIoMgr::SmartMaskIoMgr( std::vector>& modules, + int32_t context_len, + int32_t prefill_ar_len, int32_t prefill_cache_len, + int32_t kv_ar_len, int32_t kv_cache_len, int32_t vocab_size, int32_t num_layers, @@ -557,7 +723,10 @@ SmartMaskIoMgr::SmartMaskIoMgr( const bool use_int64_token) : IoMgrBase(modules), shard_layers_({num_layers}), + context_len_(context_len), + kv_ar_len_(kv_ar_len), kv_cache_len_(kv_cache_len), + prefill_ar_len_(prefill_ar_len), prefill_cache_len_(prefill_cache_len), vocab_size_(vocab_size), num_layers_(num_layers), @@ -566,12 +735,17 @@ SmartMaskIoMgr::SmartMaskIoMgr( eval_mode_(eval_mode), prefill_forward_name_(prefill_forward_name), kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { + use_int64_token_(use_int64_token), + is_bert_(prefill_cache_len == 0) { if (!prefill_forward_name_.empty()) { input_tensors_[prefill_forward_name_] = std::vector>(modules.size()); output_tensors_[prefill_forward_name_] = std::vector>(modules.size()); + k_cache_in_[prefill_forward_name_] = + std::vector>(); + v_cache_in_[prefill_forward_name_] = + std::vector>(); k_cache_out_[prefill_forward_name_] = std::vector>(); v_cache_out_[prefill_forward_name_] = @@ -597,20 +771,20 @@ SmartMaskIoMgr::SmartMaskIoMgr( } std::unordered_map SmartMaskIoMgr::get_io_elements() { - size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); - size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * cache_len; - size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_; + int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); + size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * kv_cache_len_; + size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_ * max_ar_len; return std::unordered_map{ - {"input_tok_ele", 1}, - {"input_pos_ele", 1}, + {"kv_input_toks_ele", kv_ar_len_}, + {"kv_input_pos_ele", kv_ar_len_}, {"cache_in_ele", cache_in_ele}, {"cache_out_ele", cache_out_ele}, - // 1 for the input prompt - {"atten_mask_ele", cache_len + 1}, - {"kv_logits_ele", vocab_size_}, - {"prefill_input_toks_ele", prefill_cache_len_}, - {"prefill_atten_mask_ele", prefill_cache_len_ * prefill_cache_len_}, - {"prefill_logits_ele", prefill_cache_len_ * vocab_size_}}; + {"kv_attention_mask_ele", kv_ar_len_ * context_len_}, + {"kv_logits_ele", kv_ar_len_ * vocab_size_}, + {"prefill_input_toks_ele", prefill_ar_len_}, + {"prefill_input_pos_ele", prefill_ar_len_}, + {"prefill_attention_mask_ele", prefill_ar_len_ * context_len_}, + {"prefill_logits_ele", prefill_ar_len_ * vocab_size_}}; } std::unordered_map SmartMaskIoMgr::get_io_bytes() { @@ -623,21 +797,23 @@ std::unordered_map SmartMaskIoMgr::get_io_bytes() { byte % static_cast(alignment)); }; return std::unordered_map{ - {"input_tok_bytes", - align(element_map["input_tok_ele"] * sizeof(int32_t))}, - {"input_pos_bytes", - align(element_map["input_pos_ele"] * sizeof(int32_t))}, + {"kv_input_toks_bytes", + align(element_map["kv_input_toks_ele"] * sizeof(int32_t))}, + {"kv_input_pos_bytes", + align(element_map["kv_input_pos_ele"] * sizeof(int32_t))}, {"cache_in_bytes", align(element_map["cache_in_ele"] * sizeof(uint8_t))}, {"cache_out_bytes", align(element_map["cache_out_ele"] * sizeof(uint8_t))}, - {"atten_mask_bytes", - align(element_map["atten_mask_ele"] * sizeof(uint16_t))}, + {"kv_attention_mask_bytes", + align(element_map["kv_attention_mask_ele"] * sizeof(uint16_t))}, {"kv_logits_bytes", 
align(element_map["kv_logits_ele"] * sizeof(uint16_t))}, {"prefill_input_toks_bytes", align(element_map["prefill_input_toks_ele"] * sizeof(int32_t))}, - {"prefill_atten_mask_bytes", - align(element_map["prefill_atten_mask_ele"] * sizeof(uint16_t))}, + {"prefill_input_pos_bytes", + align(element_map["prefill_input_pos_ele"] * sizeof(int32_t))}, + {"prefill_attention_mask_bytes", + align(element_map["prefill_attention_mask_ele"] * sizeof(uint16_t))}, {"prefill_logits_bytes", align(element_map["prefill_logits_ele"] * sizeof(uint16_t))}}; } @@ -654,10 +830,10 @@ void SmartMaskIoMgr::IO::init_io_ptrs( for (const auto& iter : io_bytes_map) { std::string key = iter.first; size_t size = iter.second; - if (key == "input_tok_bytes") { - input_tok = reinterpret_cast(cur_ptr); - } else if (key == "input_pos_bytes") { - input_pos = reinterpret_cast(cur_ptr); + if (key == "kv_input_toks_bytes") { + kv_input_toks = reinterpret_cast(cur_ptr); + } else if (key == "kv_input_pos_bytes") { + kv_input_pos = reinterpret_cast(cur_ptr); } else if (key == "cache_in_bytes" || key == "cache_out_bytes") { auto& k_cache_ref = (key == "cache_in_bytes") ? k_cache : k_cache_out; auto& v_cache_ref = (key == "cache_in_bytes") ? v_cache : v_cache_out; @@ -679,14 +855,16 @@ void SmartMaskIoMgr::IO::init_io_ptrs( } } continue; - } else if (key == "atten_mask_bytes") { + } else if (key == "kv_attention_mask_bytes") { kv_attention_mask = reinterpret_cast(cur_ptr); } else if (key == "kv_logits_bytes") { kv_logits = reinterpret_cast(cur_ptr); } else if (key == "prefill_input_toks_bytes") { prefill_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "prefill_atten_mask_bytes") { - prefill_atten_mask = reinterpret_cast(cur_ptr); + } else if (key == "prefill_input_pos_bytes") { + prefill_input_pos = reinterpret_cast(cur_ptr); + } else if (key == "prefill_attention_mask_bytes") { + prefill_attention_mask = reinterpret_cast(cur_ptr); } else if (key == "prefill_logits_bytes") { prefill_logits = reinterpret_cast(cur_ptr); } else { @@ -720,15 +898,10 @@ void SmartMaskIoMgr::init_io() { std::unordered_map io_bytes_map = get_io_bytes(); switch (eval_mode_) { - case EvalMode::kPrefill: - io_bytes_map.erase("input_tok_bytes"); - io_bytes_map.erase("input_pos_bytes"); - io_bytes_map.erase("atten_mask_bytes"); - io_bytes_map.erase("kv_logits_bytes"); - break; case EvalMode::kKVCached: io_bytes_map.erase("prefill_input_toks_bytes"); - io_bytes_map.erase("prefill_atten_mask_bytes"); + io_bytes_map.erase("prefill_input_pos_bytes"); + io_bytes_map.erase("prefill_attention_mask_bytes"); io_bytes_map.erase("prefill_logits_bytes"); break; case EvalMode::kHybrid: @@ -760,6 +933,22 @@ void SmartMaskIoMgr::init_io() { ptr->init_io_ptrs(shared_ptr, io_bytes_map); } +void SmartMaskIoMgr::reset_io( + const std::vector>& prefill_methods_meta, + const std::vector< + executorch::runtime::Result>& + kv_methods_meta) { + IO* ptr = static_cast(data_ptr_.get()); + int32_t prefill_attn_size = prefill_ar_len_ * context_len_; + int32_t kv_attn_size = kv_ar_len_ * context_len_; + std::fill( + ptr->prefill_attention_mask, + ptr->prefill_attention_mask + prefill_attn_size, + 0); + std::fill(ptr->kv_attention_mask, ptr->kv_attention_mask + kv_attn_size, 0); +} + void SmartMaskIoMgr::prepare_kv_io( const std::vector>& methods_meta) { for (int i = 0; i < modules_.size(); ++i) { @@ -774,53 +963,55 @@ void SmartMaskIoMgr::prepare_kv_io( std::unordered_map io_bytes_map = get_io_bytes(); // [I]: input_tokens - Result input_tok = methods_meta[0]->input_tensor_meta(0); 
void SmartMaskIoMgr::prepare_kv_io(
    const std::vector<Result<MethodMeta>>& methods_meta) {
  for (int i = 0; i < modules_.size(); ++i) {
@@ -774,53 +963,55 @@ void SmartMaskIoMgr::prepare_kv_io(
  std::unordered_map<std::string, size_t> io_bytes_map = get_io_bytes();

  // [I]: input_tokens
- Result<TensorInfo> input_tok = methods_meta[0]->input_tensor_meta(0);
- input_tok_ = std::make_unique<TensorImpl>(
-     input_tok->scalar_type(),
-     input_tok->sizes().size(),
-     const_cast<TensorImpl::SizesType*>(input_tok->sizes().data()),
-     ptr->input_tok,
-     const_cast<TensorImpl::DimOrderType*>(input_tok->dim_order().data()));
- input_tensors_[kv_forward_name_][0].push_back(input_tok_.get());
+ Result<TensorInfo> kv_input_toks = methods_meta[0]->input_tensor_meta(0);
+ kv_input_toks_ = std::make_unique<TensorImpl>(
+     kv_input_toks->scalar_type(),
+     kv_input_toks->sizes().size(),
+     const_cast<TensorImpl::SizesType*>(kv_input_toks->sizes().data()),
+     ptr->kv_input_toks,
+     const_cast<TensorImpl::DimOrderType*>(kv_input_toks->dim_order().data()));
+ input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get());
  ptr->add_custom_mem_info(
-     ptr->input_tok,
-     io_bytes_map["input_tok_bytes"],
-     input_tok->scalar_type(),
-     input_tok.get());
+     ptr->kv_input_toks,
+     io_bytes_map["kv_input_toks_bytes"],
+     kv_input_toks->scalar_type(),
+     kv_input_toks.get());

  // [I]: atten_mask
- Result<TensorInfo> atten_mask = methods_meta[0]->input_tensor_meta(1);
- attention_mask_ = std::make_unique<TensorImpl>(
-     atten_mask->scalar_type(),
-     atten_mask->sizes().size(),
-     const_cast<TensorImpl::SizesType*>(atten_mask->sizes().data()),
+ std::fill_n(ptr->kv_attention_mask, kv_ar_len_ * context_len_, 0);
+ Result<TensorInfo> kv_attention_mask = methods_meta[0]->input_tensor_meta(1);
+ kv_attention_mask_ = std::make_unique<TensorImpl>(
+     kv_attention_mask->scalar_type(),
+     kv_attention_mask->sizes().size(),
+     const_cast<TensorImpl::SizesType*>(kv_attention_mask->sizes().data()),
      ptr->kv_attention_mask,
-     const_cast<TensorImpl::DimOrderType*>(atten_mask->dim_order().data()));
- input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get());
+     const_cast<TensorImpl::DimOrderType*>(
+         kv_attention_mask->dim_order().data()));
+ input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get());
  ptr->add_custom_mem_info(
      ptr->kv_attention_mask,
-     io_bytes_map["atten_mask_bytes"],
-     atten_mask->scalar_type(),
-     atten_mask.get());
+     io_bytes_map["kv_attention_mask_bytes"],
+     kv_attention_mask->scalar_type(),
+     kv_attention_mask.get());

  // [I]: input_pos
- Result<TensorInfo> input_pos = methods_meta[0]->input_tensor_meta(2);
- input_pos_ = std::make_unique<TensorImpl>(
-     input_pos->scalar_type(),
-     input_pos->sizes().size(),
-     const_cast<TensorImpl::SizesType*>(input_pos->sizes().data()),
-     ptr->input_pos,
-     const_cast<TensorImpl::DimOrderType*>(input_pos->dim_order().data()));
- input_tensors_[kv_forward_name_][0].push_back(input_pos_.get());
+ Result<TensorInfo> kv_input_pos = methods_meta[0]->input_tensor_meta(2);
+ kv_input_pos_ = std::make_unique<TensorImpl>(
+     kv_input_pos->scalar_type(),
+     kv_input_pos->sizes().size(),
+     const_cast<TensorImpl::SizesType*>(kv_input_pos->sizes().data()),
+     ptr->kv_input_pos,
+     const_cast<TensorImpl::DimOrderType*>(kv_input_pos->dim_order().data()));
+ input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get());
  ptr->add_custom_mem_info(
-     ptr->input_pos,
-     io_bytes_map["input_pos_bytes"],
-     input_pos->scalar_type(),
-     input_pos.get());
+     ptr->kv_input_pos,
+     io_bytes_map["kv_input_pos_bytes"],
+     kv_input_pos->scalar_type(),
+     kv_input_pos.get());

  // [I] kv_cache
  size_t layered_head_count = num_layers_ * num_heads_;
- int index = 3; // bypass input_tokens, input_pos, atten_mask
+ int index = 3; // bypass input_tokens, atten_mask, input_pos
  for (int offset = 0, shard_index = 0; shard_index < modules_.size();
       offset += shard_layers_[shard_index], shard_index++) {
    for (int cache_group = 0; cache_group < 2; ++cache_group) {
@@ -913,12 +1104,11 @@ void SmartMaskIoMgr::update_kv_io(
    int64_t pos,
    std::vector<std::vector<Tensor>>& output_tensors) {
  IO* ptr = static_cast<IO*>(data_ptr_.get());
- size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_);
  // update input_tok
- *ptr->input_tok =
+ *ptr->kv_input_toks =
      use_int64_token_ ? cur_token : static_cast<int32_t>(cur_token);
  // update position_ids
- *ptr->input_pos = static_cast<int32_t>(pos);
+ *ptr->kv_input_pos = static_cast<int32_t>(pos);
  // update smart mask for previous cache
  ptr->kv_attention_mask[pos] = 65535;
@@ -937,7 +1127,8 @@
  for (int i = 0; i < k_cache_in.size(); ++i) {
    uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>() + pos;
    const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
-   for (size_t j = 0, offset = 0; j < head_dim_; ++j, offset += cache_len) {
+   for (size_t j = 0, offset = 0; j < head_dim_;
+        ++j, offset += kv_cache_len_) {
      ptr_in[offset] = ptr_out[j];
    }
  }
@@ -958,7 +1149,6 @@ void SmartMaskIoMgr::prepare_prefill_io(
  IO* ptr = static_cast<IO*>(data_ptr_.get());
  std::unordered_map<std::string, size_t> io_bytes_map = get_io_bytes();

- int32_t cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1];
  // [I]: pre_input_tokens
  Result<TensorInfo> prefill_input_toks = methods_meta[0]->input_tensor_meta(0);
  prefill_input_toks_ = std::make_unique<TensorImpl>(
@@ -975,30 +1165,92 @@ void SmartMaskIoMgr::prepare_prefill_io(
      executorch::aten::ScalarType::Int,
      prefill_input_toks.get());

- // [I]: prefill_attn_mask
- for (int i = 0; i < cache_len; ++i) {
-   for (int j = 0; j < cache_len; ++j) {
+ // [I]: prefill_attention_mask
+ for (int i = 0; i < prefill_ar_len_; ++i) {
+   for (int j = 0,
+            offset = i * context_len_ + (context_len_ - prefill_ar_len_);
+        j < prefill_ar_len_;
+        ++j) {
      if (i < j) {
-       ptr->prefill_atten_mask[i * cache_len + j] = 0;
+       ptr->prefill_attention_mask[j + offset] = 0;
      } else {
-       ptr->prefill_atten_mask[i * cache_len + j] = 65535;
+       ptr->prefill_attention_mask[j + offset] = 65535;
      }
    }
  }
- Result<TensorInfo> prefill_atten_mask = methods_meta[0]->input_tensor_meta(1);
- prefill_attn_mask_ = std::make_unique<TensorImpl>(
-     prefill_atten_mask->scalar_type(),
-     prefill_atten_mask->sizes().size(),
-     const_cast<TensorImpl::SizesType*>(prefill_atten_mask->sizes().data()),
-     ptr->prefill_atten_mask,
+ Result<TensorInfo> prefill_attention_mask =
+     methods_meta[0]->input_tensor_meta(1);
+ prefill_attention_mask_ = std::make_unique<TensorImpl>(
+     prefill_attention_mask->scalar_type(),
+     prefill_attention_mask->sizes().size(),
+     const_cast<TensorImpl::SizesType*>(
+         prefill_attention_mask->sizes().data()),
+     ptr->prefill_attention_mask,
      const_cast<TensorImpl::DimOrderType*>(
-         prefill_atten_mask->dim_order().data()));
- input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get());
+         prefill_attention_mask->dim_order().data()));
+ input_tensors_[prefill_forward_name_][0].push_back(
+     prefill_attention_mask_.get());
  ptr->add_custom_mem_info(
-     ptr->prefill_atten_mask,
-     io_bytes_map["prefill_atten_mask_bytes"],
+     ptr->prefill_attention_mask,
+     io_bytes_map["prefill_attention_mask_bytes"],
      executorch::aten::ScalarType::Bits16,
-     prefill_atten_mask.get());
+     prefill_attention_mask.get());
+
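[Note: the offset arithmetic in the mask loop above is easy to misread, so here is a hedged Python sketch of the resulting layout. Each of the prefill_ar_len query rows is context_len wide, the AR window occupies the rightmost prefill_ar_len columns, and rows open causally; 65535 marks an attendable position and 0 a masked one, as in the diff.]

    def build_prefill_mask(prefill_ar_len: int, context_len: int) -> list:
        mask = [0] * (prefill_ar_len * context_len)
        base = context_len - prefill_ar_len  # AR window sits at the right edge
        for i in range(prefill_ar_len):      # query row
            for j in range(i + 1):           # causal: keys j <= i stay visible
                mask[i * context_len + base + j] = 65535
        return mask

    # ar_len=4, context_len=8: row i has i+1 open slots at the right edge
    rows = build_prefill_mask(4, 8)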
+  if (!is_bert_) {
+    // [I]: prefill_input_pos
+    Result<TensorInfo> prefill_input_pos =
+        methods_meta[0]->input_tensor_meta(2);
+    prefill_input_pos_ = std::make_unique<TensorImpl>(
+        prefill_input_pos->scalar_type(),
+        prefill_input_pos->sizes().size(),
+        const_cast<TensorImpl::SizesType*>(prefill_input_pos->sizes().data()),
+        ptr->prefill_input_pos,
+        const_cast<TensorImpl::DimOrderType*>(
+            prefill_input_pos->dim_order().data()));
+    input_tensors_[prefill_forward_name_][0].push_back(
+        prefill_input_pos_.get());
+    ptr->add_custom_mem_info(
+        ptr->prefill_input_pos,
+        io_bytes_map["prefill_input_pos_bytes"],
+        prefill_input_pos->scalar_type(),
+        prefill_input_pos.get());
+
+    // [I] kv_cache
+    size_t layered_head_count = num_layers_ * num_heads_;
+    int index = 3; // bypass input_tokens, atten_mask, input_pos
+    for (int offset = 0, shard_index = 0; shard_index < modules_.size();
+         offset += shard_layers_[shard_index], shard_index++) {
+      for (int cache_group = 0; cache_group < 2; ++cache_group) {
+        for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) {
+          for (int head = 0; head < num_heads_; ++head, ++index) {
+            Result<TensorInfo> kv_cache =
+                methods_meta[shard_index]->input_tensor_meta(index);
+            std::vector<std::unique_ptr<TensorImpl>>& cache =
+                (cache_group == 0 ? k_cache_in_[prefill_forward_name_]
+                                  : v_cache_in_[prefill_forward_name_]);
+            uint8_t* cache_ptr = (cache_group == 0)
+                ? ptr->k_cache[layer + offset][head]
+                : ptr->v_cache[layer + offset][head];
+
+            cache.emplace_back(std::make_unique<TensorImpl>(
+                kv_cache->scalar_type(),
+                kv_cache->sizes().size(),
+                const_cast<TensorImpl::SizesType*>(kv_cache->sizes().data()),
+                cache_ptr,
+                const_cast<TensorImpl::DimOrderType*>(
+                    kv_cache->dim_order().data())));
+            ptr->add_custom_mem_info(
+                cache_ptr,
+                io_bytes_map["cache_in_bytes"] / layered_head_count,
+                kv_cache->scalar_type(),
+                kv_cache.get());
+            input_tensors_[prefill_forward_name_][shard_index].push_back(
+                cache.back().get());
+          }
+        }
+      }
+    }
+  }

  // [O]: logits
  int logit_index = 0;
@@ -1031,8 +1283,8 @@ void SmartMaskIoMgr::prepare_prefill_io(
          (cache_group == 0 ? k_cache_out_[prefill_forward_name_]
                            : v_cache_out_[prefill_forward_name_]);
          void* cache_ptr = (cache_group == 0)
-             ? ptr->k_cache[layer + offset][head]
-             : ptr->v_cache[layer + offset][head];
+             ? ptr->k_cache_out[layer + offset][head]
+             : ptr->v_cache_out[layer + offset][head];
          cache.emplace_back(std::make_unique<TensorImpl>(
              kv_cache->scalar_type(),
              kv_cache->sizes().size(),
@@ -1042,7 +1294,7 @@ void SmartMaskIoMgr::prepare_prefill_io(
                  kv_cache->dim_order().data())));
          ptr->add_custom_mem_info(
              cache_ptr,
-             io_bytes_map["cache_in_bytes"] / layered_head_count,
+             io_bytes_map["cache_out_bytes"] / layered_head_count,
              executorch::aten::ScalarType::Byte,
              kv_cache.get());
          output_tensors_[prefill_forward_name_][shard_index].push_back(
@@ -1059,24 +1311,50 @@ void SmartMaskIoMgr::update_prefill_to_kv_io(
    std::vector<std::vector<Tensor>>& output_tensors) {
  IO* ptr = static_cast<IO*>(data_ptr_.get());

- *ptr->input_tok =
+ *ptr->kv_input_toks =
      use_int64_token_ ? cur_token : static_cast<int32_t>(cur_token);
- *ptr->input_pos = static_cast<int32_t>(pos);
+ *ptr->kv_input_pos = static_cast<int32_t>(pos);
  // pos means the cur_token pos
  for (int i = 0; i < pos; i++) {
    ptr->kv_attention_mask[i] = 65535;
  }

- // Update K is enough, copy from last to prevent from overwriting values
- size_t copied_size = prefill_cache_len_ * sizeof(uint8_t);
- for (int l = 0; l < num_layers_; l++) {
-   for (int h = 0; h < num_heads_; h++) {
-     uint8_t* k_cache = ptr->k_cache[l][h];
-     for (int hd = head_dim_ - 1; hd > -1; hd--) {
-       memcpy(
-           k_cache + (kv_cache_len_ * hd),
-           k_cache + (prefill_cache_len_ * hd),
-           copied_size);
+ if (is_bert_) {
+   // update v_cache
+   auto& v_cache_in = v_cache_in_[kv_forward_name_];
+   auto& v_cache_out = v_cache_out_[prefill_forward_name_];
+   // update v_cache by single thread, this part is cpu cache sensitive
+   size_t copied_size = kv_cache_len_ * head_dim_ * sizeof(uint8_t);
+   for (int i = 0; i < v_cache_in.size(); ++i) {
+     uint8_t* ptr_in = v_cache_in[i]->mutable_data<uint8_t>();
+     const uint8_t* ptr_out = v_cache_out[i]->data<uint8_t>();
+     memcpy(ptr_in, ptr_out, copied_size);
+   }
+
+   auto& k_cache_in = k_cache_in_[kv_forward_name_];
+   auto& k_cache_out = k_cache_out_[prefill_forward_name_];
+   for (int i = 0; i < k_cache_in.size(); ++i) {
+     uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>();
+     const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
+     for (size_t j = 0, offset = 0; j < head_dim_;
+          ++j, offset += kv_cache_len_) {
+       for (size_t k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) {
+         ptr_in[offset + k] = ptr_out[k_stride + k];
+       }
+     }
+   }
+ } else {
+   // Update K is enough, copy from last to prevent from overwriting values
+   size_t copied_size = pos * sizeof(uint8_t);
+   for (int l = 0; l < num_layers_; l++) {
+     for (int h = 0; h < num_heads_; h++) {
+       uint8_t* k_cache = ptr->k_cache[l][h];
+       for (int hd = head_dim_ - 1; hd > -1; hd--) {
+         memcpy(
+             k_cache + (kv_cache_len_ * hd),
+             k_cache + (prefill_cache_len_ * hd),
+             copied_size);
+       }
      }
    }
  }
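[Note: a hedged Python rendering of the is_bert_ K-cache handoff above, since the double stride is the subtle part. K caches are stored transposed as [head_dim, cache_len] rows, the prefill output rows are prefill_ar_len wide, and only the first pos entries of each row are valid. The flat-list layout and the assumption that pos never exceeds the prefill output width are mine, not guarantees from the diff.]

    def copy_k_rows(k_out, k_in, head_dim, prefill_ar_len, kv_cache_len, pos):
        # k_out: flat [head_dim * prefill_ar_len] prefill result
        # k_in:  flat [head_dim * kv_cache_len] decode-side cache
        for j in range(head_dim):
            src = j * prefill_ar_len   # k_stride in the C++ loop
            dst = j * kv_cache_len     # offset in the C++ loop
            k_in[dst : dst + pos] = k_out[src : src + pos]

    k_in = [0] * (2 * 16)        # head_dim=2, kv_cache_len=16
    k_out = list(range(2 * 4))   # head_dim=2, prefill_ar_len=4
    copy_k_rows(k_out, k_in, 2, 4, 16, 3)  # 3 valid prompt positions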
@@ -1087,38 +1365,71 @@ void SmartMaskIoMgr::update_prefill_io(
    int64_t pos,
    std::vector<std::vector<Tensor>>& output_tensors) {
  (void)output_tensors;
- IO* ptr = static_cast<IO*>(data_ptr_.get());
- // Support CPU 4-bit embedding, which requires int64 input.
- // However, for QNN embedding, only int32 input is needed.
- // Therefore, we need to cast to the correct type to write the data.
- if (use_int64_token_) {
-   ptr->prefill_input_toks[pos] = cur_token;
- } else {
-   int32_t* prefill_input_toks_ptr =
-       reinterpret_cast<int32_t*>(ptr->prefill_input_toks);
-   prefill_input_toks_ptr[pos] = static_cast<int32_t>(cur_token);
+
+ if (!is_bert_) {
+   // update v_cache
+   auto& v_cache_in = v_cache_in_[prefill_forward_name_];
+   auto& v_cache_out = v_cache_out_[prefill_forward_name_];
+   // update v_cache by single thread, this part is cpu cache sensitive
+   size_t copied_size = prefill_ar_len_ * head_dim_ * sizeof(uint8_t);
+   for (int i = 0; i < v_cache_in.size(); ++i) {
+     uint8_t* ptr_in =
+         v_cache_in[i]->mutable_data<uint8_t>() + pos * head_dim_;
+     const uint8_t* ptr_out = v_cache_out[i]->data<uint8_t>();
+     memcpy(ptr_in, ptr_out, copied_size);
+   }
+
+   auto& k_cache_in = k_cache_in_[prefill_forward_name_];
+   auto& k_cache_out = k_cache_out_[prefill_forward_name_];
+   for (int i = 0; i < k_cache_in.size(); ++i) {
+     uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>();
+     const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
+     for (size_t j = 0, offset = pos; j < head_dim_;
+          ++j, offset += prefill_cache_len_) {
+       for (size_t k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_;
+            k++) {
+         ptr_in[offset + k] = ptr_out[k_stride + k];
+       }
+     }
+   }
  }
}

-void SmartMaskIoMgr::fill_prefill_toks(std::vector<uint64_t>& prompt_tokens) {
+void SmartMaskIoMgr::fill_prefill_toks(
+    int64_t start_pos,
+    std::vector<uint64_t>& prompt_tokens) {
  IO* ptr = static_cast<IO*>(get_mutable_ptr());
- for (int i = 0; i < prompt_tokens.size(); i++) {
-   // Support CPU 4-bit embedding, which requires int64 input.
-   // However, for QNN embedding, only int32 input is needed.
-   // Therefore, we need to cast to the correct type to write the data.
-   if (use_int64_token_) {
-     ptr->prefill_input_toks[i] = prompt_tokens[i];
-   } else {
-     int32_t* prefill_input_toks_ptr =
-         reinterpret_cast<int32_t*>(ptr->prefill_input_toks);
-     prefill_input_toks_ptr[i] = static_cast<int32_t>(prompt_tokens[i]);
+ for (int i = 0; i < prefill_ar_len_; i++) {
+   if (!is_bert_) {
+     ptr->prefill_input_pos[i] = start_pos + i;
+   }
+
+   if (start_pos + i < prompt_tokens.size()) {
+     // Support CPU 4-bit embedding, which requires int64 input.
+     // However, for QNN embedding, only int32 input is needed.
+     // Therefore, we need to cast to the correct type to write the data.
+     if (use_int64_token_) {
+       ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i];
+     } else {
+       int32_t* prefill_input_toks_ptr =
+           reinterpret_cast<int32_t*>(ptr->prefill_input_toks);
+       prefill_input_toks_ptr[i] =
+           static_cast<int32_t>(prompt_tokens[start_pos + i]);
+     }
+   }
+   if (start_pos >= prefill_ar_len_) {
+     for (int j = 0, offset = i * context_len_ + (start_pos - prefill_ar_len_);
+          j < prefill_ar_len_;
+          ++j) {
+       ptr->prefill_attention_mask[offset + j] = 65535;
+     }
    }
  }
}

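[Note: the chunked fill above is the per-iteration counterpart of the mask initialization sketched earlier. A Python sketch with illustrative names, mirroring the C++ index math:]

    def fill_chunk(start_pos, prompt, ar_len, context_len, toks, pos_ids, mask):
        for i in range(ar_len):
            pos_ids[i] = start_pos + i            # skipped for BERT-style models
            if start_pos + i < len(prompt):
                toks[i] = prompt[start_pos + i]   # pad slots keep stale values
            if start_pos >= ar_len:               # every chunk after the first
                off = i * context_len + (start_pos - ar_len)
                for j in range(ar_len):
                    mask[off + j] = 65535         # attend to the previous chunk

    toks, pos_ids, mask = [0] * 4, [0] * 4, [0] * (4 * 8)
    fill_chunk(4, list(range(6)), 4, 8, toks, pos_ids, mask)  # second AR-4 pass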
void SmartMaskIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) {
  IO* ptr = static_cast<IO*>(get_mutable_ptr());
- *ptr->input_tok =
+ *ptr->kv_input_toks =
      use_int64_token_ ? cur_token : static_cast<int32_t>(cur_token);
  ptr->kv_attention_mask[kv_cache_len_] = 65535;
}
diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h
index 3a59ab6924e..03808ede3bf 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h
+++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h
@@ -23,8 +23,7 @@ namespace example {

enum EvalMode {
- kPrefill = 0,
- kKVCached,
+ kKVCached = 0,
  kHybrid,
  kUnsupported,
};
@@ -34,6 +33,12 @@ class IoMgrBase {
      std::vector<std::shared_ptr<executorch::extension::Module>>& modules);
  virtual ~IoMgrBase();
  virtual void init_io() = 0;
+ virtual void reset_io(
+     const std::vector<executorch::runtime::Result<
+         executorch::runtime::MethodMeta>>& prefill_methods_meta,
+     const std::vector<
+         executorch::runtime::Result<executorch::runtime::MethodMeta>>&
+         kv_methods_meta) = 0;
  virtual void prepare_prefill_io(
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) = 0;
@@ -42,7 +47,9 @@ class IoMgrBase {
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) = 0;
- virtual void fill_prefill_toks(std::vector<uint64_t>& prompt_tokens) = 0;
+ virtual void fill_prefill_toks(
+     int64_t start_pos,
+     std::vector<uint64_t>& prompt_tokens) = 0;
  virtual void fill_kv_tok_mask(int64_t pos, int64_t cur_token) = 0;
  virtual void update_prefill_to_kv_io(
      int64_t cur_token,
@@ -81,7 +88,10 @@ class ShiftPointerIoMgr : public IoMgrBase {
 public:
  ShiftPointerIoMgr(
      std::vector<std::shared_ptr<executorch::extension::Module>>& modules,
+     int32_t context_len,
+     int32_t prefill_ar_len,
      int32_t prefill_cache_len,
+     int32_t kv_ar_len,
      int32_t kv_cache_len,
      int32_t vocab_size,
      int32_t num_layers,
@@ -93,6 +103,12 @@ class ShiftPointerIoMgr : public IoMgrBase {
      const bool use_int64_token);

  void init_io() override;
+ void reset_io(
+     const std::vector<executorch::runtime::Result<
+         executorch::runtime::MethodMeta>>& prefill_methods_meta,
+     const std::vector<
+         executorch::runtime::Result<executorch::runtime::MethodMeta>>&
+         kv_methods_meta) override;
  void prepare_prefill_io(
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) override;
@@ -101,7 +117,9 @@ class ShiftPointerIoMgr : public IoMgrBase {
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) override;
- void fill_prefill_toks(std::vector<uint64_t>& prompt_tokens) override;
+ void fill_prefill_toks(
+     int64_t start_pos,
+     std::vector<uint64_t>& prompt_tokens) override;
  void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override;
  void update_prefill_to_kv_io(
      int64_t cur_token,
@@ -119,25 +137,26 @@ class ShiftPointerIoMgr : public IoMgrBase {
      std::vector<std::vector<executorch::aten::Tensor>>& output_tensors)
      override;

  struct IO {
-   int64_t input_tok;
-   int32_t input_pos;
+   int64_t kv_input_toks;
+   int32_t kv_input_pos;
    std::vector<std::vector<std::vector<uint8_t>>> k_cache;
    std::vector<std::vector<uint8_t>> v_cache;
    std::vector<std::vector<uint8_t>> k_cache_out;
    std::vector<uint16_t> kv_attention_mask;
    std::vector<uint16_t> kv_logits;
    std::vector<int64_t> prefill_input_toks;
-   std::vector<uint16_t> prefill_atten_mask;
+   std::vector<int32_t> prefill_input_pos;
+   std::vector<uint16_t> prefill_attention_mask;
    std::vector<uint16_t> prefill_logits;
  };

 private:
- std::unique_ptr<executorch::aten::TensorImpl> input_tok_;
- std::unique_ptr<executorch::aten::TensorImpl> input_pos_;
- std::unique_ptr<executorch::aten::TensorImpl> hidden_state_;
- std::unique_ptr<executorch::aten::TensorImpl> attention_mask_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_input_toks_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_input_pos_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_attention_mask_;
  std::unique_ptr<executorch::aten::TensorImpl> prefill_input_toks_;
- std::unique_ptr<executorch::aten::TensorImpl> prefill_attn_mask_;
+ std::unique_ptr<executorch::aten::TensorImpl> prefill_input_pos_;
+ std::unique_ptr<executorch::aten::TensorImpl> prefill_attention_mask_;
  std::unique_ptr<executorch::aten::TensorImpl> prefill_logits_;
  std::unordered_map<
      std::string,
@@ -157,7 +176,10 @@ class ShiftPointerIoMgr : public IoMgrBase {
      v_cache_out_;
  std::unique_ptr<executorch::aten::TensorImpl> kv_logits_;
  std::vector<int> shard_layers_;
+ int32_t context_len_{0};
+ int32_t kv_ar_len_{0};
  int32_t kv_cache_len_{0};
+ int32_t prefill_ar_len_{0};
  int32_t prefill_cache_len_{0};
  int32_t vocab_size_;
  int32_t num_layers_;
@@ -167,13 +189,17 @@ class ShiftPointerIoMgr : public IoMgrBase {
  std::string prefill_forward_name_;
  std::string kv_forward_name_;
  const bool use_int64_token_{false};
+ const bool is_bert_{false};
};

class SmartMaskIoMgr : public IoMgrBase {
 public:
  SmartMaskIoMgr(
      std::vector<std::shared_ptr<executorch::extension::Module>>& modules,
+     int32_t context_len,
+     int32_t prefill_ar_len,
      int32_t prefill_cache_len,
+     int32_t kv_ar_len,
      int32_t kv_cache_len,
      int32_t vocab_size,
      int32_t num_layers,
@@ -185,6 +211,12 @@ class SmartMaskIoMgr : public IoMgrBase {
      const bool use_int64_token);

  void init_io() override;
+ void reset_io(
+     const std::vector<executorch::runtime::Result<
+         executorch::runtime::MethodMeta>>& prefill_methods_meta,
+     const std::vector<
+         executorch::runtime::Result<executorch::runtime::MethodMeta>>&
+         kv_methods_meta) override;
  void prepare_prefill_io(
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) override;
@@ -193,7 +225,9 @@ class SmartMaskIoMgr : public IoMgrBase {
      const std::vector<
          executorch::runtime::Result<executorch::runtime::MethodMeta>>&
          methods_meta) override;
- void fill_prefill_toks(std::vector<uint64_t>& prompt_tokens) override;
+ void fill_prefill_toks(
+     int64_t start_pos,
+     std::vector<uint64_t>& prompt_tokens) override;
  void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override;
  void update_prefill_to_kv_io(
      int64_t cur_token,
@@ -216,22 +250,24 @@ class SmartMaskIoMgr : public IoMgrBase {

  struct IO {
    void* shared_buffer_base;
-   int64_t* input_tok;
-   int32_t* input_pos;
+   int64_t* kv_input_toks;
+   int32_t* kv_input_pos;
    // layer -> head -> head_dim * seq_len
    std::vector<std::vector<uint8_t*>> k_cache;
    std::vector<std::vector<uint8_t*>> v_cache;
    // layer -> head -> head_dim
    std::vector<std::vector<uint8_t*>> k_cache_out;
    std::vector<std::vector<uint8_t*>> v_cache_out;
-   // max_seq_len
+   // kv_ar_len_ * context_len_
    uint16_t* kv_attention_mask;
-   // vocab_size
+   // kv_ar_len_ * vocab_size
    uint16_t* kv_logits;
+   // prefill_ar_len_
    int64_t* prefill_input_toks;
-   // prefill_cache_len_ ^ 2
-   uint16_t* prefill_atten_mask;
-   // vocab_size * prefill_cache_len_
+   int32_t* prefill_input_pos;
+   // prefill_ar_len_ * context_len_
+   uint16_t* prefill_attention_mask;
+   // vocab_size * prefill_ar_len_
    uint16_t* prefill_logits;

    size_t num_layers_;
@@ -252,12 +288,12 @@ class SmartMaskIoMgr : public IoMgrBase {

 private:
- std::unique_ptr<executorch::aten::TensorImpl> input_tok_;
- std::unique_ptr<executorch::aten::TensorImpl> input_pos_;
- std::unique_ptr<executorch::aten::TensorImpl> hidden_state_;
- std::unique_ptr<executorch::aten::TensorImpl> attention_mask_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_input_toks_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_input_pos_;
+ std::unique_ptr<executorch::aten::TensorImpl> kv_attention_mask_;
  std::unique_ptr<executorch::aten::TensorImpl> prefill_input_toks_;
- std::unique_ptr<executorch::aten::TensorImpl> prefill_attn_mask_;
+ std::unique_ptr<executorch::aten::TensorImpl> prefill_input_pos_;
+ std::unique_ptr<executorch::aten::TensorImpl> prefill_attention_mask_;
  std::unique_ptr<executorch::aten::TensorImpl> prefill_logits_;
  std::unordered_map<
      std::string,
@@ -277,7 +313,10 @@ class SmartMaskIoMgr : public IoMgrBase {
      v_cache_out_;
  std::unique_ptr<executorch::aten::TensorImpl> kv_logits_;
  std::vector<int> shard_layers_;
+ int32_t context_len_{0};
+ int32_t kv_ar_len_{0};
  int32_t kv_cache_len_{0};
+ int32_t prefill_ar_len_{0};
  int32_t prefill_cache_len_{0};
  int32_t vocab_size_;
  int32_t num_layers_;
@@ -287,6 +326,9 @@ class SmartMaskIoMgr : public IoMgrBase {
  std::string prefill_forward_name_;
  std::string kv_forward_name_;
  const bool use_int64_token_{false};
+ // If the cache length is zero, it indicates a BERT model, which does not use
+ // position ids or KV cache inputs.
+ const bool is_bert_{false}; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 4b45863147e..dafc911a172 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -13,11 +13,13 @@ #include #include #include -#include #include #include #include +#include + #include +#include #include using executorch::aten::Tensor; @@ -33,26 +35,32 @@ namespace example { namespace { static constexpr auto kTopp = 0.9f; -void printReport(const Runner::Stats& stats); +void printReport( + const Runner::Stats& stats, + const std::string& performance_output_path); std::string statsToJsonString(const Runner::Stats& stats); } // namespace Runner::Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& performance_output_path, const float logits_scale, const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updator) + const std::string& kv_updater, + const int num_iters) : n_bos_(1), n_eos_(1), tokenizer_path_(tokenizer_path), + performance_output_path_(performance_output_path), logits_scale_(logits_scale), logits_offset_(logits_offset), temperature_(temperature), eval_mode_(static_cast(eval_mode)), - kv_updator_(kv_updator) { + kv_updater_(kv_updater), + num_iters_(num_iters) { for (size_t i = 0; i < models_path.size(); ++i) { modules_.push_back(std::make_shared( models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); @@ -76,10 +84,6 @@ Error Runner::load() { } switch (eval_mode_) { - case EvalMode::kPrefill: - prefill_forward_name_ = "forward"; - method_names_.emplace_back(prefill_forward_name_); - break; case EvalMode::kKVCached: kv_forward_name_ = "forward"; method_names_.emplace_back(kv_forward_name_); @@ -105,17 +109,22 @@ Error Runner::load() { } if (!prefill_forward_name_.empty()) { - // Use input tokens length to retrieve prefill cache len - // Cache len equals to prefill model seq_len - 1 - prefill_cache_len_ = get_methods_meta(prefill_forward_name_)[0] - ->input_tensor_meta(0) - ->sizes()[1]; + // Use attention mask length to retrieve prefill_ar_len and context length + // Prefill cache length equals to context_len - prefill_ar_len + auto atten_mask_meta = + get_methods_meta(prefill_forward_name_)[0]->input_tensor_meta(1); + prefill_ar_len_ = atten_mask_meta->sizes()[1]; + context_len_ = atten_mask_meta->sizes()[2]; + prefill_cache_len_ = context_len_ - prefill_ar_len_; } if (!kv_forward_name_.empty()) { - // Use k cache length to retirieve kv cache len - // Cache len equals to kv model seq_len - 1 - kv_cache_len_ = - get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(3)->sizes()[2]; + // Use attention mask length to retrieve kv ar len and context length + // Cache len equals to kv model context_len - kv_ar_len + auto atten_mask_meta = + get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(1); + kv_ar_len_ = atten_mask_meta->sizes()[1]; + context_len_ = atten_mask_meta->sizes()[2]; + kv_cache_len_ = context_len_ - kv_ar_len_; } // retrieve any method meta, can be either prefill or kv @@ -129,10 +138,13 @@ Error Runner::load() { executorch::aten::ScalarType::Long; ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); - if (kv_updator_ == "SmartMask") { + if (kv_updater_ == "SmartMask") { io_mgr_ = std::make_unique( modules_, + context_len_, + prefill_ar_len_, prefill_cache_len_, + kv_ar_len_, kv_cache_len_, vocab_size_, 
num_layers, @@ -142,10 +154,13 @@ Error Runner::load() { prefill_forward_name_, kv_forward_name_, use_int64_token_); - } else if (kv_updator_ == "ShiftPointer") { + } else if (kv_updater_ == "ShiftPointer") { io_mgr_ = std::make_unique( modules_, + context_len_, + prefill_ar_len_, prefill_cache_len_, + kv_ar_len_, kv_cache_len_, vocab_size_, num_layers, @@ -156,16 +171,13 @@ Error Runner::load() { kv_forward_name_, use_int64_token_); } else { - ET_LOG(Error, "Using an unknown updator %s", kv_updator_.c_str()); + ET_LOG(Error, "Using an unknown updater %s", kv_updater_.c_str()); } ET_LOG(Info, "creating io_memory"); // prepare io io_mgr_->init_io(); switch (eval_mode_) { - case EvalMode::kPrefill: - io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); - break; case EvalMode::kKVCached: io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); break; @@ -180,19 +192,19 @@ Error Runner::load() { // llama3 tokenizer tokenizer_ = example::get_tiktoken_for_llama(); - Error err = tokenizer_->load(tokenizer_path_); - if (err == Error::InvalidArgument) { + auto err = tokenizer_->load(tokenizer_path_); + if (err != tokenizers::Error::Ok) { ET_LOG( Info, "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer", tokenizer_path_.c_str()); tokenizer_.reset(); // llama2 tokenizer - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); err = tokenizer_->load(tokenizer_path_); llama_version_ = LlamaVersion::kLlama2; ET_CHECK_MSG( - err == Error::Ok, + err == tokenizers::Error::Ok, "failed to load tokenizer %s", tokenizer_path_.c_str()); } else { @@ -271,7 +283,7 @@ Error Runner::generate( std::unordered_map>> input_tensors, output_tensors; std::unordered_map>> inputs; - if (!is_loaded()) { + if (!is_loaded() || (num_iters_ > 1)) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); for (auto method_name : method_names_) { @@ -323,70 +335,54 @@ Error Runner::generate( break; } - int max_seq_len = std::max(prefill_cache_len_, kv_cache_len_) + 1; - seq_len = (seq_len > 0 && seq_len <= max_seq_len) ? seq_len : max_seq_len; - Result> encode_res = + seq_len = (seq_len > 0 && seq_len <= context_len_) ? 
seq_len : context_len_; + tokenizers::Result> encode_res = tokenizer_->encode(prompt_, n_bos_, 0); - ET_CHECK_OK_OR_RETURN_ERROR( + ET_CHECK_TK_OK_OR_RETURN_ERROR( encode_res.error(), "failed to encode prompt %s", prompt_.c_str()); std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); - ET_CHECK_MSG(num_prompt_tokens < max_seq_len, "max seq length exceeded"); ET_CHECK_MSG( num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); - if (eval_mode_ == EvalMode::kHybrid) { - int prefill_seq_len = get_methods_meta(prefill_forward_name_)[0] - ->input_tensor_meta(0) - ->sizes()[1] + - 1; - ET_CHECK_MSG( - num_prompt_tokens < prefill_seq_len, - "For hybrid mode, please ensure prompt length(%d) is less than prefill's seq_len(%d)", - num_prompt_tokens, - prefill_seq_len); - } int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; if (token_callback) { token_callback(prompt_); } auto prefill_execute = [&](const std::string& method_name) { - io_mgr_->fill_prefill_toks(prompt_tokens); + int num_iters = 1 + ((num_prompt_tokens - 1) / prefill_ar_len_); + ET_LOG( + Info, + "Prompt Processor: total %d tokens (AR-%d * %d iters)", + num_prompt_tokens, + prefill_ar_len_, + num_iters); - pos = num_prompt_tokens - 1; - cur_token = prompt_tokens[pos]; - while (pos < seq_len - 1) { - // inference + for (int i = 0; i < num_iters; i++) { + io_mgr_->fill_prefill_toks(pos, prompt_tokens); run_model_step(method_name, inputs[method_name]); - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken(logits_tensor, pos); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - io_mgr_->update_prefill_io(cur_token, ++pos, output_tensors[method_name]); - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - stats_.prompt_eval_end_ms = time_in_ms(); - } - - if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) { - ET_LOG(Info, "\nReached to the end of generation"); - break; - } - // prefill model inferences once for prompt in the hybrid mode - if (eval_mode_ == EvalMode::kHybrid) { - break; - } + io_mgr_->update_prefill_io(cur_token, pos, output_tensors[method_name]); + pos += prefill_ar_len_; + } + Tensor& logits_tensor = output_tensors[method_name].back()[0]; + prev_token = prompt_tokens[num_prompt_tokens - 1]; + long sample_start_time_ms = time_in_ms(); + cur_token = logitsToToken( + logits_tensor, + (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; + + auto piece_res = tokenizer_->decode(prev_token, cur_token); + ET_CHECK(piece_res.ok()); + if (token_callback) { + token_callback(piece_res.get().c_str()); } + + pos = num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); }; auto kv_execute = [&](const std::string& method_name) { @@ -428,9 +424,6 @@ Error Runner::generate( }; switch (eval_mode_) { - case EvalMode::kPrefill: - prefill_execute(prefill_forward_name_); - break; case EvalMode::kKVCached: kv_execute(kv_forward_name_); break; @@ -451,16 +444,21 @@ Error Runner::generate( stats_.num_prompt_tokens = num_prompt_tokens; stats_.num_generated_tokens = pos - num_prompt_tokens; - 
printReport(stats_); + printReport(stats_, performance_output_path_); if (stats_callback) { stats_callback(stats_); } - + io_mgr_->reset_io( + get_methods_meta(prefill_forward_name_), + get_methods_meta(kv_forward_name_)); + prompt_.clear(); return Error::Ok; } namespace { -void printReport(const Runner::Stats& stats) { +void printReport( + const Runner::Stats& stats, + const std::string& performance_output_path) { printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); ET_LOG( @@ -518,6 +516,20 @@ void printReport(const Runner::Stats& stats) { stats.num_generated_tokens, (double)stats.aggregate_sampling_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND); + + // For now, we just print the total inference time for CI, can save more info + // in future if needed. + + std::ofstream outfile(performance_output_path.c_str()); + if (outfile.is_open()) { + double num_tok = (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND; + outfile << num_tok; + outfile.close(); + } else { + ET_CHECK_MSG(false, "Error saving the inference speed file"); + } } std::string statsToJsonString(const Runner::Stats& stats) { diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index b6ba1360bff..e693bcd7077 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include namespace example { @@ -29,11 +29,13 @@ class Runner { explicit Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& performance_output_path_, const float logits_scale, const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updator); + const std::string& kv_updater, + const int num_iters); struct Stats { // Scaling factor for timestamps - in this case, we use ms. 
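[Note: the prompt-processor loop in prefill_execute above compresses two pieces of arithmetic: ceil division over fixed AR-N chunks, and locating the last prompt token's logit row inside the final chunk. A self-checking Python sketch, with names mirroring the C++:]

    def prefill_schedule(num_prompt_tokens: int, prefill_ar_len: int):
        # ceil division: fixed-size AR-N passes needed to cover the prompt
        num_iters = 1 + (num_prompt_tokens - 1) // prefill_ar_len
        # slot of the last prompt token inside the final chunk; its logit row
        # seeds the first generated token
        last_logit_row = (num_prompt_tokens + prefill_ar_len - 1) % prefill_ar_len
        return num_iters, last_logit_row

    assert prefill_schedule(5, 4) == (2, 0)  # chunks [0..3], [4..7]; token 4 -> row 0
    assert prefill_schedule(4, 4) == (1, 3)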
@@ -89,7 +91,10 @@ class Runner { std::string prompt_; // metadata + int32_t context_len_{0}; + int32_t prefill_ar_len_{0}; int32_t prefill_cache_len_{0}; + int32_t kv_ar_len_{0}; int32_t kv_cache_len_{0}; int32_t vocab_size_; int32_t bos_id_; @@ -98,10 +103,11 @@ class Runner { const int32_t n_eos_; std::vector> modules_; std::string tokenizer_path_; + std::string performance_output_path_; float logits_scale_; int32_t logits_offset_; float temperature_; - std::unique_ptr tokenizer_; + std::unique_ptr tokenizer_; std::unique_ptr sampler_; Stats stats_; std::unique_ptr io_mgr_; @@ -111,7 +117,8 @@ class Runner { std::string kv_forward_name_; std::vector method_names_; LlamaVersion llama_version_; - std::string kv_updator_; + std::string kv_updater_; + int num_iters_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/targets.bzl b/examples/qualcomm/oss_scripts/llama/targets.bzl index 9780da0369d..c3f7e7fbbda 100644 --- a/examples/qualcomm/oss_scripts/llama/targets.bzl +++ b/examples/qualcomm/oss_scripts/llama/targets.bzl @@ -19,15 +19,16 @@ def define_common_targets(): deps = [ "//executorch/extension/llm/runner:stats", "//executorch/extension/tensor:tensor", + "//executorch/kernels/quantized:generated_lib", "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], exported_deps = [ "//executorch/extension/module:module", "//executorch/extension/llm/sampler:sampler", "//executorch/examples/models/llama/tokenizer:tiktoken", - "//executorch/extension/llm/tokenizer:bpe_tokenizer", "//executorch/extension/evalue_util:print_evalue", "//executorch/backends/qualcomm/runtime:runtime", + "//pytorch/tokenizers:llama2c_tokenizer", ], external_deps = [ "gflags", diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index def35cb3aa7..f96d0169809 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -27,7 +27,7 @@ list(PREPEND _qaihub_llama2_7b_runner__srcs # build qaihub llama2 7b runner add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs}) target_include_directories( - qaihub_llama2_7b_runner PUBLIC ${_common_include_directories} + qaihub_llama2_7b_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include ) target_link_libraries( qaihub_llama2_7b_runner @@ -58,7 +58,7 @@ list(APPEND _common_compile_options -DQAIHUB_LLAMA3_RUNNER) list( APPEND _qaihub_llama3_8b_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp ) list( APPEND @@ -69,7 +69,7 @@ list( # build qaihub llama3 8b runner add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs}) target_include_directories( - qaihub_llama3_8b_runner PUBLIC ${_common_include_directories} + qaihub_llama3_8b_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include ) target_link_libraries( diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index 4bddb32b53e..06ea324ef6f 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -12,7 +12,7 @@ #if defined(QAIHUB_LLAMA3_RUNNER) #include #else -#include +#include #endif #include #include @@ 
-79,7 +79,7 @@ Runner::Runner( eos_id_.insert(tokenizer_->encode("<|eot_id|>", 0, 0).get()[0]); version_ = LlamaVersion::kLlama3; #else - tokenizer_ = std::make_unique(); + tokenizer_ = std::make_unique(); tokenizer_->load(tokenizer_path_); version_ = LlamaVersion::kLlama2; #endif @@ -231,9 +231,9 @@ Error Runner::generate( break; } - Result> encode_res = + tokenizers::Result> encode_res = tokenizer_->encode(post_process_prompt, n_bos_, 0); - ET_CHECK_OK_OR_RETURN_ERROR( + ET_CHECK_TK_OK_OR_RETURN_ERROR( encode_res.error(), "failed to encode prompt %s", post_process_prompt.c_str()); diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h index be9af7e2275..9672d6a3586 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.h +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include namespace example { @@ -101,7 +101,7 @@ class Runner { std::vector method_names_; std::string tokenizer_path_; float temperature_; - std::unique_ptr tokenizer_; + std::unique_ptr tokenizer_; std::unique_ptr sampler_; Stats stats_; std::unique_ptr io_mem_; diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py index 4ecdaf3583f..47a489f6d52 100755 --- a/examples/qualcomm/scripts/mobilebert_fine_tune.py +++ b/examples/qualcomm/scripts/mobilebert_fine_tune.py @@ -169,7 +169,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size): dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train) dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val) - epochs = 5 + epochs = args.num_epochs dataloader_train = DataLoader( dataset_train, sampler=RandomSampler(dataset_train), @@ -366,6 +366,13 @@ def calibrator(gm): type=str, ) + parser.add_argument( + "--num_epochs", + help="If no pretrained weights are provided, set number of epochs to train the model", + default=5, + type=int, + ) + parser.add_argument( "-F", "--use_fp16", diff --git a/examples/qualcomm/scripts/wav2letter.py b/examples/qualcomm/scripts/wav2letter.py index e377c6d7e90..7f30d1865b8 100644 --- a/examples/qualcomm/scripts/wav2letter.py +++ b/examples/qualcomm/scripts/wav2letter.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. import json +import logging import os import sys from multiprocessing.connection import Client @@ -111,7 +112,12 @@ def main(args): # target labels " abcdefghijklmnopqrstuvwxyz'*" instance.vocab_size = 29 model = instance.get_eager_model().eval() - model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True)) + if args.pretrained_weight: + model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True)) + else: + logging.warning( + "It is strongly recommended to provide pretrained weights, otherwise accuracy will be bad. This option is here mainly for CI purpose to ensure compile is successful." + ) # convert conv1d to conv2d in nn.Module level will only introduce 2 permute # nodes around input & output, which is more quantization friendly. @@ -128,9 +134,15 @@ def main(args): # retrieve dataset, will take some time to download data_num = 100 - inputs, targets, input_list = get_dataset( - data_size=data_num, artifact_dir=args.artifact - ) + if args.compile_only: + inputs = [(torch.rand(1, 1, 700, 1),)] + logging.warning( + "With compile_only, accuracy will be bad due to insufficient datasets for quantization." 
+ ) + else: + inputs, targets, input_list = get_dataset( + data_size=data_num, artifact_dir=args.artifact + ) pte_filename = "w2l_qnn" build_executorch_binary( model, @@ -212,7 +224,7 @@ def main(args): ), default=None, type=str, - required=True, + required=False, ) args = parser.parse_args() diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 5ecce30078e..29f5e96dcd0 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -205,6 +205,7 @@ def execute(self, custom_runner_cmd=None, method_index=0): qnn_executor_runner_cmds = " ".join( [ f"cd {self.workspace} &&", + "chmod +x ./qnn_executor_runner &&", f"./qnn_executor_runner {qnn_executor_runner_args}", ] ) @@ -272,6 +273,7 @@ def qat_train(ori_model, captured_model, quantizer, dataset): def make_quantizer( quant_dtype: Optional[QuantDtype] = QuantDtype.use_8a8w, custom_annotations=(), + per_block_conv=False, per_channel_conv=True, per_channel_linear=False, act_observer=MovingAverageMinMaxObserver, @@ -279,6 +281,7 @@ def make_quantizer( ): quantizer = QnnQuantizer() quantizer.add_custom_quant_annotations(custom_annotations) + quantizer.set_per_block_conv_quant(per_block_conv) quantizer.set_per_channel_conv_quant(per_channel_conv) quantizer.set_per_channel_linear_quant(per_channel_linear) quantizer.set_quant_config(quant_dtype, is_qat, act_observer) diff --git a/examples/selective_build/CMakeLists.txt b/examples/selective_build/CMakeLists.txt index c2ce3f09e7a..db570bb98c3 100644 --- a/examples/selective_build/CMakeLists.txt +++ b/examples/selective_build/CMakeLists.txt @@ -21,8 +21,8 @@ project(selective_build_example) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/extension/parallel/test/TARGETS b/examples/xnnpack/executor_runner/TARGETS similarity index 100% rename from extension/parallel/test/TARGETS rename to examples/xnnpack/executor_runner/TARGETS diff --git a/examples/xnnpack/executor_runner/targets.bzl b/examples/xnnpack/executor_runner/targets.bzl new file mode 100644 index 00000000000..f9c333d5b47 --- /dev/null +++ b/examples/xnnpack/executor_runner/targets.bzl @@ -0,0 +1,20 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # executor_runner for XNNPACK Backend and portable kernels. 
+ runtime.cxx_binary( + name = "xnn_executor_runner", + deps = [ + "//executorch/examples/portable/executor_runner:executor_runner_lib", + "//executorch/backends/xnnpack:xnnpack_backend", + "//executorch/kernels/portable:generated_lib", + ], + define_static_target = True, + **get_oss_build_kwargs() + ) diff --git a/examples/xnnpack/quantization/test_quantize.sh b/examples/xnnpack/quantization/test_quantize.sh index d439fde6cbc..1f50667c788 100644 --- a/examples/xnnpack/quantization/test_quantize.sh +++ b/examples/xnnpack/quantization/test_quantize.sh @@ -44,8 +44,6 @@ test_buck2_quantization() { test_cmake_quantization() { echo "Building quantized ops shared library" - SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch" clean_executorch_install_folders @@ -56,7 +54,6 @@ test_cmake_quantization() { -DEXECUTORCH_BUILD_XNNPACK="$EXECUTORCH_BUILD_XNNPACK" \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \ - -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) cmake --build cmake-out -j4 diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS index cc6f16d78d8..6671bf00334 100644 --- a/exir/_serialize/TARGETS +++ b/exir/_serialize/TARGETS @@ -32,6 +32,7 @@ runtime.python_library( "_cord.py", "_dataclass.py", "_flatbuffer.py", + "_named_data_store.py", "_program.py", "_serialize.py", "data_serializer.py", diff --git a/exir/_serialize/_named_data_store.py b/exir/_serialize/_named_data_store.py new file mode 100644 index 00000000000..2c2d975937e --- /dev/null +++ b/exir/_serialize/_named_data_store.py @@ -0,0 +1,210 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import hashlib +import math +from dataclasses import dataclass + +# from dataclasses import dataclass +from typing import Dict, List, Optional + + +@dataclass +class BufferEntry: + """A class to hold the buffer entries for serialization. + + Attributes: + buffer: The buffer bytes. + alignment: The alignment of the buffer. + """ + + buffer: bytes + alignment: int + + +@dataclass +class NamedDataStoreOutput: + """ + Holds named data for serialization. + + Attributes: + buffers: A list of unique buffer entries. + pte_data: Contains data that is stored inside the PTE file. A mapping from + {key: buffer_index}. + external_data: Contains data that is stored external to the PTE. A mapping + from {filename: {key: buffer_index}}. + """ + + buffers: List[BufferEntry] + pte_data: Dict[str, int] + external_data: Dict[str, Dict[str, int]] + + +class NamedDataStore: + """ + NamedDataStore manages the data that delegates want to share. Backends add + bytes to the store under a unique key. These bytes can be retrieved at + runtime using the same key with the NamedDataMap. + + Note: + - Keys are unique in the data store, regardless of whether they are stored + in the PTE or externally. + - Multiple keys can point to the same buffer entry. + - The same data can be added multiple times and all keys will point to one + buffer. If a duplicate blob is added with a different alignment, the + lcm of the current and new alignment is taken for that blob. + """ + + # List of unique blobs. + buffers: List[BufferEntry] + # Named data stored inside the PTE file. Map of {key: buffer_index}. 
+    pte_data: Dict[str, int]
+    # Named data stored outside of the PTE file.
+    # Map of {filename: {key: buffer_index}}.
+    external_data: Dict[str, Dict[str, int]]
+
+    # Cache of the data hash for deduplication.
+    # Use a hash instead of the data as a key because a sha256 collision is
+    # unlikely, and the data may be large.
+    data_hash_to_buffer_idx: Dict[bytes, int]
+    # Cache of the key to buffer idx to ensure uniqueness.
+    # If a key is added multiple times, check the buffer idx to ensure that the
+    # data is identical too.
+    key_to_buffer_idx: Dict[str, int]
+
+    def __init__(self) -> None:
+        """
+        Initializes a new NamedDataStore.
+        """
+        self.buffers = []
+        self.pte_data = {}
+        self.external_data = {}
+
+        self.data_hash_to_buffer_idx = {}
+        self.key_to_buffer_idx = {}
+
+    def _add_named_data_to_map(
+        self,
+        key: str,
+        data: bytes,
+        alignment: int,
+        local_key_to_buffer_idx: Dict[str, int],
+    ) -> None:
+        """
+        Add data to a map and update the alignment. Ensure that the key-data
+        pair is unique.
+        - If the key exists, the data must be identical.
+        - If multiple unique keys exist for the same data, those keys should
+            point to the same buffer.
+
+        Args:
+            key (str): key associated with the data.
+            data (bytes): Bytes being requested to be serialized.
+            alignment (int): alignment for bytes to be serialized with.
+            local_key_to_buffer_idx (Dict[str, int]): map to add the data to.
+        Raises:
+            ValueError: when the key exists in the store, and corresponding data
+                is different.
+        """
+        # Get data hash.
+        hashed = hashlib.sha256(data).digest()
+
+        # Check if the key exists.
+        buffer_idx = self.key_to_buffer_idx.get(key, -1)
+        if buffer_idx != -1:
+            # If the key exists, the corresponding data must be identical.
+            if self.data_hash_to_buffer_idx.get(hashed, -1) != buffer_idx:
+                raise ValueError(
+                    f"Duplicate key {key} with different data. "
+                    f"Existing data: {self.buffers[buffer_idx].buffer}. "
+                    f"New data: {data}."
+                )
+            self.buffers[buffer_idx].alignment = math.lcm(
+                self.buffers[buffer_idx].alignment, alignment
+            )
+        else:
+            # Key doesn't exist; check if the data exists.
+            buffer_idx = self.data_hash_to_buffer_idx.get(hashed, -1)
+            if buffer_idx != -1:
+                # The data exists; update the alignment.
+                self.buffers[buffer_idx].alignment = math.lcm(
+                    self.buffers[buffer_idx].alignment, alignment
+                )
+            else:
+                # The data doesn't exist; add it to the data store.
+                buffer_idx = len(self.buffers)
+                self.buffers.append(BufferEntry(data, alignment))
+                self.data_hash_to_buffer_idx[hashed] = buffer_idx
+
+        # Add key to the map and the key cache.
+        local_key_to_buffer_idx[key] = buffer_idx
+        self.key_to_buffer_idx[key] = buffer_idx
+
+    def add_named_data(
+        self,
+        key: str,
+        data: bytes,
+        alignment: Optional[int] = 1,
+        external_tag: Optional[str] = None,
+    ) -> None:
+        """
+        Adds a named blob to the NamedDataStore.
+        Args:
+            key (str): key associated with the data.
+            data (bytes): Bytes being requested to be serialized.
+            alignment (int): alignment for bytes to be serialized with.
+            external_tag (Optional[str]): the external filename that this data
+                is saved to.
+        Raises:
+            ValueError: when the key exists in the store, and corresponding data
+                is different.
+        """
+
+        # Set default alignment.
+ if alignment is None: + alignment = 1 + if alignment <= 0: + raise ValueError(f"Alignment must be greater than 0, received {alignment}.") + + if external_tag is None: + self._add_named_data_to_map(key, data, alignment, self.pte_data) + else: + self._add_named_data_to_map( + key, data, alignment, self.external_data.setdefault(external_tag, {}) + ) + + def get_named_data_store_output(self) -> NamedDataStoreOutput: + # Clean up empty maps inside self.external_data + self.external_data = {k: v for k, v in self.external_data.items() if len(v) > 0} + return NamedDataStoreOutput(self.buffers, self.pte_data, self.external_data) + + def merge_named_data_store(self, other: NamedDataStoreOutput) -> None: + """ + Merge another NamedDataStore into this one. + Args: + other (NamedDataStore): the other NamedDataStore to merge. + Raises: + ValueError: when the key exists in both stores, and corresponding + data is different between them. + """ + # Merge the pte_data. + for key, buffer_idx in other.pte_data.items(): + self.add_named_data( + key, + other.buffers[buffer_idx].buffer, + other.buffers[buffer_idx].alignment, + ) + + # Merge the external_data. + for filename, key_to_buffer_idx in other.external_data.items(): + for key, buffer_idx in key_to_buffer_idx.items(): + self.add_named_data( + key, + other.buffers[buffer_idx].buffer, + other.buffers[buffer_idx].alignment, + external_tag=filename, + ) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 7656ea3f363..0994156ae50 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -8,10 +8,11 @@ import copy import json +import math import re from dataclasses import dataclass -from typing import ClassVar, List, Literal, Optional, Tuple +from typing import ClassVar, Dict, List, Literal, Optional, Tuple from executorch.exir._serialize._cord import Cord from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass @@ -20,6 +21,10 @@ _program_flatbuffer_to_json, _program_json_to_flatbuffer, ) +from executorch.exir._serialize._named_data_store import ( + BufferEntry, + NamedDataStoreOutput, +) from executorch.exir._serialize.padding import aligned_size, pad_to, padding_required @@ -29,6 +34,7 @@ Buffer, DataLocation, DataSegment, + NamedData, Program, SubsegmentOffsets, ) @@ -41,6 +47,24 @@ _HEADER_BYTEORDER: Literal["little"] = "little" +@dataclass +class AlignedData: + """ + Holds data that should be aligned, for serialization. + + Attributes: + data: The data to serialize, as a cord. + alignment: The alignment required for the data. + """ + + data: Cord + alignment: int + + def __init__(self, data: Cord, alignment: Optional[int] = None) -> None: + self.data = data + self.alignment = alignment or 1 + + def _program_to_json(program: Program) -> str: """Returns the JSON representation of the given Program.""" return json.dumps(program, cls=_DataclassEncoder) @@ -213,7 +237,7 @@ def _get_extended_header(program_data: bytes) -> Optional[_ExtendedHeader]: def _extract_delegate_segments( program: Program, - segments: List[Cord], + segments: List[AlignedData], ) -> None: """Extracts the delegate segments inlined in the program into a list of buffers. The program is modified in-place to remove the delegate data. 
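[Note: to make the store's contract concrete, here is a hypothetical usage sketch of the NamedDataStore defined above. The byte values are invented, but the dedup, lcm-alignment, and duplicate-key behaviors are exactly the ones documented in the class and exercised by its tests below.]

    from executorch.exir._serialize._named_data_store import NamedDataStore

    store = NamedDataStore()
    store.add_named_data("w1", b"\x01\x02", alignment=3)
    store.add_named_data("w2", b"\x01\x02", alignment=4)  # same bytes, new key
    out = store.get_named_data_store_output()
    assert len(out.buffers) == 1               # deduplicated via sha256
    assert out.buffers[0].alignment == 12      # lcm(3, 4)
    assert out.pte_data == {"w1": 0, "w2": 0}  # both keys -> one buffer
    try:
        store.add_named_data("w1", b"\xff")    # same key, different bytes
    except ValueError:
        pass                                   # rejected, as documented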
@@ -253,7 +277,7 @@ def _extract_delegate_segments( segment_index = segment_index_map.get(inline.data) if segment_index is None: segment_index = len(segments) - segments.append(Cord(inline.data)) + segments.append(AlignedData(Cord(inline.data))) segment_index_map[inline.data] = segment_index delegate.processed = BackendDelegateDataReference( location=DataLocation.SEGMENT, @@ -316,6 +340,44 @@ def _extract_constant_segment( return constant_segment_data, constant_segment_offsets +def _extract_named_data( + program: Program, + segments: List[AlignedData], + buffers: List[BufferEntry], + name_to_buffer_idx: Dict[str, int], +) -> None: + """Modifies the program in-place to add references to the named data + segments. + + Args: + program: The program to extract segments from. Modified in-place. + segments: A list of buffers to append extracted segments to. Modified in-place. + buffers: A list of unique buffers and the information required to + serialize them. Not modified. + name_to_buffer_idx: A map from the name of a blob to the index in buffers. + Not modified. + """ + if program.named_data is not None and len(program.named_data) > 0: + raise ValueError("Program already has named data.") + + # Map from buffer_idx to segment_idx. + segment_index_map: Dict[int, int] = {} + + named_data: List[NamedData] = [] + for name, buffer_idx in name_to_buffer_idx.items(): + segment_index = segment_index_map.get(buffer_idx, None) + if segment_index is None: + segment_index = len(segments) + segment_index_map[buffer_idx] = segment_index + segments.append( + AlignedData( + Cord(buffers[buffer_idx].buffer), buffers[buffer_idx].alignment + ) + ) + named_data.append(NamedData(key=name, segment_index=segment_index)) + program.named_data = named_data + + def serialize_pte_binary( program: Program, *, @@ -324,6 +386,7 @@ def serialize_pte_binary( segment_alignment: int = 128, constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, + named_data: Optional[NamedDataStoreOutput] = None, ) -> Cord: """Returns the runtime binary representation of the given Program. @@ -343,6 +406,8 @@ def serialize_pte_binary( delegate_alignment: If provided, the minimum alignment of delegate data in the program. Must be a power of 2. If not provided, uses the value in the schema file. + named_data: If provided, named blobs to be stored in segments + after the PTE file. Returns: The serialized form of the Program, ready for execution by the runtime. """ @@ -355,8 +420,9 @@ def serialize_pte_binary( # copy, reusing the actual data blobs. program = copy.deepcopy(program) - # Store extracted segment data; this may be constant data or delegate data. - segments: List[Cord] = [] + # Store extracted segment data, with any buffer-specific alignment. + # This may be constant data, delegate data or named data. + segments: List[AlignedData] = [] constant_segment_data, constant_segment_offsets = _extract_constant_segment( program.constant_buffer, tensor_alignment=constant_tensor_alignment @@ -374,7 +440,7 @@ def serialize_pte_binary( # Clear the constant buffer, as constant data will be stored in segments. program.constant_buffer = [] # Add to the aggregate segments cord. - segments.append(constant_segment_data) + segments.append(AlignedData(constant_segment_data)) if mutable_data is not None: mutable_segment_data, mutable_segment_offsets = _extract_constant_segment( @@ -389,31 +455,34 @@ def serialize_pte_binary( ), ] # Add to the aggregate segments cord. 
- segments.append(mutable_segment_data) + segments.append(AlignedData(mutable_segment_data)) if extract_delegate_segments: _extract_delegate_segments(program, segments) + if named_data is not None: + _extract_named_data(program, segments, named_data.buffers, named_data.pte_data) # Append all segments into a single Cord, adding any necessary padding to ensure that # each segment begins at the required alignment. # Update program.segments with the offsets to each segment. segments_data = Cord() - for data in segments: + for segment in segments: prev_end = ( (program.segments[-1].offset + program.segments[-1].size) if program.segments else 0 ) + alignment = math.lcm(segment_alignment, segment.alignment) program.segments.append( DataSegment( - offset=aligned_size(prev_end, segment_alignment), size=len(data) + offset=aligned_size(prev_end, alignment), size=len(segment.data) ) ) # Add to aggregate segments cord with padding. - padding_length = padding_required(len(segments_data), segment_alignment) + padding_length = padding_required(len(segments_data), alignment) if padding_length > 0: segments_data.append(b"\x00" * padding_length) - segments_data.append(data) + segments_data.append(segment.data) # Convert to a standard flatbuffer binary. result: _FlatbufferResult = _program_json_to_flatbuffer( diff --git a/exir/_serialize/_serialize.py b/exir/_serialize/_serialize.py index c311274922f..6351875e113 100644 --- a/exir/_serialize/_serialize.py +++ b/exir/_serialize/_serialize.py @@ -6,12 +6,12 @@ # pyre-strict - -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple from executorch.exir._serialize import _serialize_pte_binary from executorch.exir._serialize._cord import Cord +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput from executorch.exir._serialize.data_serializer import ( DataPayload, DataSerializer, @@ -28,10 +28,24 @@ def serialize_for_executorch( emitter_output: EmitterOutput, config: ExecutorchBackendConfig, data_serializer: DataSerializer, + named_data: Optional[NamedDataStoreOutput] = None, ) -> Tuple[Cord, Dict[str, Cord]]: """Serialize the output from Emitter into ExecuTorch artifacts; PTE and PTD files.""" # Serialize PTE file. + pte_named_data = None + if ( + named_data is not None + and len(named_data.buffers) > 0 + and len(named_data.pte_data) > 0 + ): + # Create a separate NamedDataStoreOutput with only pte_data; exclude + # external_data, which shouldn't be serialized with the PTE file. + pte_named_data = NamedDataStoreOutput( + buffers=named_data.buffers, + pte_data=named_data.pte_data, + external_data={}, + ) pte: Cord = _serialize_pte_binary( program=emitter_output.program, mutable_data=emitter_output.mutable_data, @@ -39,6 +53,7 @@ def serialize_for_executorch( segment_alignment=config.segment_alignment, constant_tensor_alignment=config.constant_tensor_alignment, delegate_alignment=config.delegate_alignment, + named_data=pte_named_data, ) # Serialize PTD files. 
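[Note: the packing loop above leans on three small arithmetic helpers. aligned_size and padding_required live in executorch.exir._serialize.padding; the versions below are minimal re-implementations for illustration only, with invented example numbers.]

    import math

    def aligned_size(size: int, alignment: int) -> int:
        return (size + alignment - 1) // alignment * alignment

    def padding_required(offset: int, alignment: int) -> int:
        return (alignment - offset % alignment) % alignment

    segment_alignment = 128   # config-style value, assumed for illustration
    buffer_alignment = 64     # per-buffer requirement from a NamedDataStore entry
    alignment = math.lcm(segment_alignment, buffer_alignment)  # 128
    prev_end = 130            # end offset of the previous segment
    assert aligned_size(prev_end, alignment) == 256   # next segment offset
    assert padding_required(prev_end, alignment) == 126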
@@ -88,4 +103,10 @@ def serialize_for_executorch( ) ) + if named_data is None or len(named_data.external_data) == 0: + return pte, ptd_files + + if len(named_data.buffers) == 0: + raise RuntimeError("External data exists, but there are no buffers provided.") + return pte, ptd_files diff --git a/exir/_serialize/test/TARGETS b/exir/_serialize/test/TARGETS index 853d82b8a9a..63f47720137 100644 --- a/exir/_serialize/test/TARGETS +++ b/exir/_serialize/test/TARGETS @@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") python_unittest( - name = "program", + name = "test_program", srcs = [ "test_program.py", ], @@ -15,7 +15,7 @@ python_unittest( ) python_unittest( - name = "flatbuffer", + name = "test_flatbuffer", srcs = [ "test_flatbuffer.py", ], @@ -25,7 +25,7 @@ python_unittest( ) python_unittest( - name = "cord", + name = "test_cord", srcs = [ "test_cord.py", ], @@ -33,3 +33,13 @@ python_unittest( "//executorch/exir/_serialize:lib", ], ) + +python_unittest( + name = "test_named_data_store", + srcs = [ + "test_named_data_store.py", + ], + deps = [ + "//executorch/exir/_serialize:lib", + ], +) diff --git a/exir/_serialize/test/test_named_data_store.py b/exir/_serialize/test/test_named_data_store.py new file mode 100644 index 00000000000..ffe6f2ddce7 --- /dev/null +++ b/exir/_serialize/test/test_named_data_store.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import unittest + +from executorch.exir._serialize._named_data_store import BufferEntry, NamedDataStore + + +class TestNamedDataStore(unittest.TestCase): + def test_add(self) -> None: + store = NamedDataStore() + store.add_named_data("key1", b"data1", None, None) + store.add_named_data("key2", b"data2", 16, "file1") + store.add_named_data("key3", b"data3", 16, "file1") + + output = store.get_named_data_store_output() + + self.assertEqual(len(output.buffers), 3) + self.assertEqual(output.buffers[0], BufferEntry(b"data1", 1)) + self.assertEqual(output.buffers[1], BufferEntry(b"data2", 16)) + self.assertEqual(output.buffers[2], BufferEntry(b"data3", 16)) + + self.assertEqual(len(output.pte_data), 1) + self.assertEqual(output.pte_data["key1"], 0) + + self.assertEqual(len(output.external_data), 1) + self.assertEqual(len(output.external_data["file1"]), 2) + self.assertEqual(output.external_data["file1"]["key2"], 1) + self.assertEqual(output.external_data["file1"]["key3"], 2) + + def test_add_duplicate_name_and_data(self) -> None: + store = NamedDataStore() + store.add_named_data("key", b"data", None, None) + store.add_named_data("key", b"data", None, None) + + output = store.get_named_data_store_output() + + self.assertEqual(len(output.buffers), 1) + self.assertEqual(output.buffers[0], BufferEntry(b"data", 1)) + + self.assertEqual(len(output.pte_data), 1) + self.assertEqual(output.pte_data["key"], 0) + + self.assertEqual(len(output.external_data), 0) + + def test_add_same_data_with_different_alignment(self) -> None: + store = NamedDataStore() + store.add_named_data("key", b"data", 3, None) + store.add_named_data("key1", b"data", 4, None) + + output = store.get_named_data_store_output() + + self.assertEqual(len(output.buffers), 1) + # Check that we take the LCM of the two alignments (3, 4) = 12 + self.assertEqual(output.buffers[0], BufferEntry(b"data", 12)) + + 
self.assertEqual(len(output.pte_data), 2)
+        self.assertEqual(output.pte_data["key"], 0)
+        self.assertEqual(output.pte_data["key1"], 0)
+
+        self.assertEqual(len(output.external_data), 0)
+
+    def test_add_duplicate_key_fail(self) -> None:
+        store = NamedDataStore()
+        store.add_named_data("key", b"data", None, None)
+
+        # Cannot add an item with the same key but different data.
+        self.assertRaises(ValueError, store.add_named_data, "key", b"data1", None, None)
+        self.assertRaises(
+            ValueError, store.add_named_data, "key", b"data1", 16, "file1"
+        )
+
+        output = store.get_named_data_store_output()
+
+        self.assertEqual(len(output.buffers), 1)
+        self.assertEqual(output.buffers[0], BufferEntry(b"data", 1))
+
+        self.assertEqual(len(output.pte_data), 1)
+        self.assertEqual(output.pte_data["key"], 0)
+        self.assertEqual(len(output.external_data), 0)
+
+    def test_merge(self) -> None:
+        store1 = NamedDataStore()
+        store1.add_named_data("key1", b"data1", None, None)
+        store1.add_named_data("key2", b"data2", 16, "file1")
+
+        # Check the items in store1.
+        output = store1.get_named_data_store_output()
+        self.assertEqual(len(output.buffers), 2)
+        self.assertEqual(len(output.pte_data), 1)
+        self.assertEqual(len(output.external_data), 1)
+        self.assertEqual(len(output.external_data["file1"]), 1)
+
+        store2 = NamedDataStore()
+        store2.add_named_data("key1", b"data1", None, None)
+        store2.add_named_data("key3", b"data3", None, None)
+        store2.add_named_data("key4", b"data4", 16, "file1")
+        store2.add_named_data("key5", b"data5", 16, "file2")
+
+        # Check the items in store2.
+        output2 = store2.get_named_data_store_output()
+        self.assertEqual(len(output2.buffers), 4)
+        self.assertEqual(len(output2.pte_data), 2)
+        self.assertEqual(len(output2.external_data), 2)
+        self.assertEqual(len(output2.external_data["file1"]), 1)
+        self.assertEqual(len(output2.external_data["file2"]), 1)
+
+        # Merge store2 into store1.
+        store1.merge_named_data_store(output2)
+
+        # Check that the items in store2 are merged into store1.
+        output = store1.get_named_data_store_output()
+        # (key1, data1) exists in both store1 and store2, so only one copy is kept.
+        self.assertEqual(len(output.buffers), 5)
+        self.assertEqual(len(output.pte_data), 2)
+        self.assertEqual(len(output.external_data), 2)
+        self.assertEqual(len(output.external_data["file1"]), 2)
+        self.assertEqual(len(output.external_data["file2"]), 1)
+
+    def test_merge_duplicate_error(self) -> None:
+        store1 = NamedDataStore()
+        store1.add_named_data("key1", b"data1", None, None)
+
+        # Check the items in store1.
+        output = store1.get_named_data_store_output()
+        self.assertEqual(len(output.buffers), 1)
+        self.assertEqual(len(output.pte_data), 1)
+
+        store2 = NamedDataStore()
+        store2.add_named_data("key1", b"data2", None, None)
+
+        # Check the items in store2.
+        output2 = store2.get_named_data_store_output()
+        self.assertEqual(len(output2.buffers), 1)
+        self.assertEqual(len(output2.pte_data), 1)
+
+        # Merging store2 into store1 raises an error, as key1 is already in
+        # store1 with different data.
+ self.assertRaises(ValueError, store1.merge_named_data_store, output2) diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index f20c0b39798..c67849dd28d 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -10,11 +10,16 @@ import copy import difflib import json +import math import unittest from typing import List, Sequence from executorch.exir._serialize._flatbuffer import _program_flatbuffer_to_json +from executorch.exir._serialize._named_data_store import ( + BufferEntry, + NamedDataStoreOutput, +) from executorch.exir._serialize._program import ( _ExtendedHeader, _get_extended_header, @@ -23,6 +28,7 @@ deserialize_pte_binary, serialize_pte_binary, ) +from executorch.exir._serialize.padding import aligned_size from executorch.exir.schema import ( BackendDelegate, @@ -552,11 +558,9 @@ def test_round_trip_with_segments(self) -> None: # Check the segment base offset boundary. segment_base_offset = eh.segment_base_offset self.assertEqual( - pte_data[segment_base_offset - 2 : segment_base_offset + 3], - # The padding before the first segment. - b"\x00\x00" + pte_data[segment_base_offset : segment_base_offset + 3], # The first few bytes of the first segment. - + b"\x10\x11\x11", + b"\x10\x11\x11", ) # Now that we've shown that the base offset is correct, slice off the @@ -671,7 +675,7 @@ def test_constant_segment_tensor_alignment_non_power_of_2_fails(self) -> None: constant_tensor_alignment=constant_tensor_alignment, ) - def test_constant_segment_and_delegate_segment(self) -> None: + def test_constant_delegate_and_named_data_segments(self) -> None: # Create a program with some constant tensor data and delegate data blobs. program = get_test_program() constant_blobs = ( @@ -682,10 +686,22 @@ def test_constant_segment_and_delegate_segment(self) -> None: self.gen_blob_data(SEGMENT_ALIGNMENT // 2, b"\x30\x33\x03"), self.gen_blob_data(SEGMENT_ALIGNMENT + 1, b"\x40\x44\x04"), ) - add_constant_data(program, constant_blobs) add_delegate_data(program, program.execution_plan[0], delegate_blobs) + # Create named data segment. + named_data_buffers = [ + BufferEntry( + buffer=self.gen_blob_data(8, b"\x50\x55\x05"), alignment=3 + ), # expect lcm(3, 128) = 384 + BufferEntry( + buffer=self.gen_blob_data(16, b"\x60\x66\x06"), alignment=256 + ), # expect lcm(256, 128) = 256 + ] + pte_named_data = {"key0": 0, "key1": 1} + named_data = NamedDataStoreOutput( + buffers=named_data_buffers, pte_data=pte_named_data, external_data={} + ) # Extract the blobs into segments during serialization. pte_data = bytes( serialize_pte_binary( @@ -693,6 +709,7 @@ def test_constant_segment_and_delegate_segment(self) -> None: extract_delegate_segments=True, segment_alignment=SEGMENT_ALIGNMENT, constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, + named_data=named_data, ) ) @@ -702,6 +719,7 @@ def test_constant_segment_and_delegate_segment(self) -> None: program.execution_plan[0].delegates[0].processed.location, DataLocation.INLINE, ) + self.assertEqual(program.named_data, []) # Extended header should be present in the serialized data. eh = self.get_and_validate_extended_header(pte_data) @@ -715,9 +733,12 @@ def test_constant_segment_and_delegate_segment(self) -> None: # Peek inside the actual flatbuffer data to see the segments. program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) - # Segment table should contain a constant segment and the delegate blobs. 
+        # Segment table should contain a constant segment, the delegate blobs,
+        # and the named data segments.
         segment_table: List[DataSegment] = program_with_segments.segments
-        self.assertEqual(len(segment_table), len(delegate_blobs) + 1)
+        self.assertEqual(
+            len(segment_table), len(delegate_blobs) + len(pte_named_data) + 1
+        )
         self.assertEqual(segment_table[0].offset, 0)
         # segment_table[0] is the constant segment, which
         # contains a couple of tensors with sizes:
@@ -728,6 +749,30 @@
         self.assertEqual(segment_table[1].size, SEGMENT_ALIGNMENT // 2)
         self.assertEqual(segment_table[2].offset, SEGMENT_ALIGNMENT * 2)
         self.assertEqual(segment_table[2].size, SEGMENT_ALIGNMENT + 1)
+        # Named data segments.
+        expected_offset = aligned_size(
+            (segment_table[2].offset + segment_table[2].size),
+            math.lcm(named_data_buffers[0].alignment, SEGMENT_ALIGNMENT),
+        )
+        self.assertEqual(segment_table[3].offset, expected_offset)
+        self.assertEqual(segment_table[3].size, len(named_data_buffers[0].buffer))
+        expected_offset = aligned_size(
+            (segment_table[3].offset + segment_table[3].size),
+            math.lcm(named_data_buffers[1].alignment, SEGMENT_ALIGNMENT),
+        )
+        self.assertEqual(segment_table[4].offset, expected_offset)
+        self.assertEqual(segment_table[4].size, len(named_data_buffers[1].buffer))
+
+        # Named data.
+        self.assertTrue(program_with_segments.named_data is not None)
+        program_named_data = program_with_segments.named_data
+        self.assertEqual(len(program_named_data), len(pte_named_data))
+
+        # Check named data values.
+        self.assertEqual(program_named_data[0].key, "key0")
+        self.assertEqual(program_named_data[0].segment_index, 3)
+        self.assertEqual(program_named_data[1].key, "key1")
+        self.assertEqual(program_named_data[1].segment_index, 4)

         # Check constant_segment index and offsets.
         subsegment_offsets: SubsegmentOffsets = program_with_segments.constant_segment
@@ -811,6 +856,23 @@ def test_constant_segment_and_delegate_segment(self) -> None:
             + b"\x40\x44\x44",
         )

+        # Check the named data segments.
+        self.assertEqual(
+            segment_data[
+                segment_table[3].offset : segment_table[3].offset
+                + segment_table[3].size
+            ],
+            named_data_buffers[0].buffer,
+        )
+
+        self.assertEqual(
+            segment_data[
+                segment_table[4].offset : segment_table[4].offset
+                + segment_table[4].size
+            ],
+            named_data_buffers[1].buffer,
+        )
+
         # Convert back.
         program2 = deserialize_pte_binary(pte_data)
         # Programs are the same besides constant_buffer, as deserialization
@@ -820,6 +882,104 @@ def test_constant_segment_and_delegate_segment(self) -> None:
         # Number of constant tensors should be the same.
         self.assertEqual(len(program2.constant_buffer), len(program.constant_buffer))

+    def test_named_data_segments(self) -> None:
+        # Set segment alignment to 12 to test the padding.
+        SEGMENT_ALIGNMENT: int = 12
+
+        # Create a program with some named data segments.
+        program = get_test_program()
+
+        # Create named data segments with different alignments.
+        buffers = [
+            BufferEntry(
+                buffer=self.gen_blob_data(8, b"\x10\x11\x01"), alignment=8
+            ),  # expect lcm(8, 12) = 24
+            BufferEntry(
+                buffer=self.gen_blob_data(16, b"\x20\x22\x02"), alignment=32
+            ),  # expect lcm(32, 12) = 96
+            BufferEntry(
+                buffer=self.gen_blob_data(24, b"\x30\x33\x03"), alignment=24
+            ),  # expect lcm(24, 12) = 24
+        ]
+        pte_named_data = {"key1": 0, "key2": 0, "key3": 1, "key4": 2}
+        named_data = NamedDataStoreOutput(
+            buffers=buffers, pte_data=pte_named_data, external_data={}
+        )
+        # Serialize the program with named data segments.
+ pte_data = bytes( + serialize_pte_binary( + program, + extract_delegate_segments=True, + segment_alignment=SEGMENT_ALIGNMENT, + constant_tensor_alignment=CONSTANT_TENSOR_ALIGNMENT, + named_data=named_data, + ) + ) + + # named_data is initially empty. + self.assertEqual(program.named_data, []) + # Extended header should be present in the serialized data. + eh = self.get_and_validate_extended_header(pte_data) + # Segment offset should be non-zero since there are segments. It + # should point past the end of the program data, but not beyond + # the end of the file. + self.assertGreaterEqual(eh.segment_base_offset, eh.program_size) + self.assertLess(eh.segment_base_offset, len(pte_data)) + + # Peek inside the actual flatbuffer data to see the named data segments. + program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + # pyre-ignore Incompatible parameter type [6] + self.assertEqual(len(program_with_segments.named_data), len(pte_named_data)) + + # Check Program.named_data values. + # pyre-ignore Undefined attribute [16] + self.assertEqual(program_with_segments.named_data[0].key, "key1") + self.assertEqual(program_with_segments.named_data[0].segment_index, 0) + self.assertEqual(program_with_segments.named_data[1].key, "key2") + self.assertEqual(program_with_segments.named_data[1].segment_index, 0) + self.assertEqual(program_with_segments.named_data[2].key, "key3") + self.assertEqual(program_with_segments.named_data[2].segment_index, 1) + self.assertEqual(program_with_segments.named_data[3].key, "key4") + self.assertEqual(program_with_segments.named_data[3].segment_index, 2) + + # Check Program.segments values. + segment_table: List[DataSegment] = program_with_segments.segments + self.assertEqual(len(segment_table), 3) + + for i in range(len(segment_table)): + segment_length = ( + segment_table[i - 1].offset + segment_table[i - 1].size if i > 0 else 0 + ) + expected_offset = aligned_size( + segment_length, math.lcm(SEGMENT_ALIGNMENT, buffers[i].alignment) + ) + self.assertEqual(segment_table[i].offset, expected_offset) + self.assertEqual(segment_table[i].size, len(buffers[i].buffer)) + + # Check the pte data for buffer values. + segment_data: bytes = pte_data[eh.segment_base_offset :] + self.assertEqual( + segment_data[ + segment_table[0].offset : segment_table[0].offset + + segment_table[0].size + ], + buffers[0].buffer, + ) + self.assertEqual( + segment_data[ + segment_table[1].offset : segment_table[1].offset + + segment_table[1].size + ], + buffers[1].buffer, + ) + self.assertEqual( + segment_data[ + segment_table[2].offset : segment_table[2].offset + + segment_table[2].size + ], + buffers[2].buffer, + ) + # Common data for extended header tests. The two example values should produce # the example data. diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 966cae5f022..ab2e66f7885 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
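(For readers following the named-data tests above, a condensed sketch of the NamedDataStore semantics they exercise: identical payloads added under different keys are deduplicated into a single BufferEntry whose alignment is the least common multiple of all requested alignments, while reusing a key with different data raises ValueError. This assumes the add_named_data(name, data, alignment, external_tag) signature shown in the tests; it is an illustration, not extra test coverage.)

    from executorch.exir._serialize._named_data_store import NamedDataStore

    store = NamedDataStore()
    store.add_named_data("weight", b"\x01\x02", 4, None)        # PTE-embedded blob
    store.add_named_data("weight_alias", b"\x01\x02", 6, None)  # same bytes, deduped

    out = store.get_named_data_store_output()
    assert len(out.buffers) == 1
    assert out.buffers[0].alignment == 12  # lcm(4, 6)
    assert out.pte_data["weight"] == out.pte_data["weight_alias"] == 0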
@@ -56,9 +57,9 @@ def to_backend(
 ) -> LoweredBackendModule:

     def to_backend(
-        graph_module: torch.fx.GraphModule,
-        partitioner: Type[TPartitioner],
-    ) -> torch.fx.GraphModule
+        edge_program: ExportedProgram,
+        partitioner: Partitioner,
+    ) -> ExportedProgram:
     """
     pass
@@ -119,6 +120,7 @@ def to_backend(
         backend_id=backend_id,
         processed_bytes=preprocess_result.processed_bytes,
         compile_specs=compile_specs,
+        named_data_store_output=preprocess_result.data_store_output,
     )
     lowered_module.meta = {
         "debug_handle_map": preprocess_result.debug_handle_map
@@ -399,6 +401,11 @@ def to_backend(
         tagged_exported_program,
     )

+    # The partitioner added delegation tags to the graph module nodes;
+    # remove them now that partition_and_lower has finished.
+    for node in tagged_graph_module.graph.nodes:
+        node.meta.pop("delegation_tag", None)
+
     return ExportedProgram(
         root=tagged_graph_module,
         graph=tagged_graph_module.graph,
diff --git a/exir/backend/backend_details.py b/exir/backend/backend_details.py
index bdbc1a1fafd..248d03f2b05 100644
--- a/exir/backend/backend_details.py
+++ b/exir/backend/backend_details.py
@@ -9,6 +9,8 @@

 from typing import Dict, List, Optional, Tuple, Union

+from executorch.exir._serialize._named_data_store import NamedDataStoreOutput
+
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch.export.exported_program import ExportedProgram

@@ -24,6 +26,11 @@ class PreprocessResult:
     debug_handle_map: Optional[Union[Dict[int, Tuple[int]], Dict[str, Tuple[int]]]] = (
         None
     )
+    # Data store output created from a NamedDataStore.
+
+    # It contains all the named data that is stored in the PTE file
+    # and is retrievable by delegates via the NamedDataMap at runtime.
+    data_store_output: Optional[NamedDataStoreOutput] = None


 """
diff --git a/exir/backend/test/TARGETS b/exir/backend/test/TARGETS
index 5c3a5e3eb32..f0ba618936d 100644
--- a/exir/backend/test/TARGETS
+++ b/exir/backend/test/TARGETS
@@ -38,6 +38,62 @@ python_library(
     ],
 )

+python_library(
+    name = "backend_with_named_data_map",
+    srcs = [
+        "backend_with_named_data_map.py",
+    ],
+    visibility = [
+        "//executorch/...",
+        "//executorch/test/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//caffe2/functorch:functorch_src",
+        "//executorch/exir:delegate",
+        "//executorch/exir:graph_module",
+        "//executorch/exir:lib",
+        "//executorch/exir:lowered_backend_module",
+        "//executorch/exir:print_program",
+        "//executorch/exir:schema",
+        "//executorch/exir/backend:backend_api",
+        "//executorch/exir/backend:compile_spec_schema",
+        "//executorch/exir/backend:partitioner",
+        "//executorch/exir/dialects:lib",
+        "//executorch/extension/pybindings:portable_lib",  # @manual
+        "//executorch/extension/pytree:pylib",
+        "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib",
+    ],
+)
+
+python_unittest(
+    name = "test_backend_with_named_data_map",
+    srcs = [
+        "test_backend_with_named_data_map.py",
+    ],
+    visibility = [
+        "//executorch/...",
+        "//executorch/test/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//caffe2/functorch:functorch_src",
+        "//executorch/exir:delegate",
+        "//executorch/exir:graph_module",
+        "//executorch/exir:lib",
+        "//executorch/exir:lowered_backend_module",
+        "//executorch/exir:print_program",
+        "//executorch/exir:schema",
+        "//executorch/exir/backend:backend_api",
+        "//executorch/exir/backend:compile_spec_schema",
+        "//executorch/exir/backend:partitioner",
+        "//executorch/exir/dialects:lib",
+        "//executorch/extension/pybindings:portable_lib",  # @manual
"//executorch/extension/pytree:pylib", + ":backend_with_named_data_map", + ], +) + python_library( name = "qnn_backend_demo", srcs = [ @@ -107,7 +163,7 @@ python_unittest( "test_backends.py", ], preload_deps = [ - "//executorch/kernels/portable:custom_ops_generated_lib", + "//executorch/configurations:optimized_native_cpu_ops", "//executorch/kernels/quantized:custom_ops_generated_lib", "//executorch/runtime/executor/test:test_backend_compiler_lib", ], diff --git a/exir/backend/test/backend_with_named_data_map.py b/exir/backend/test/backend_with_named_data_map.py new file mode 100644 index 00000000000..47dbc294133 --- /dev/null +++ b/exir/backend/test/backend_with_named_data_map.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, final, List, Tuple + +import torch +from executorch.exir._serialize._named_data_store import NamedDataStore + +from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, +) + +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_control_flow_submodules +from torch.export.exported_program import ExportedProgram +from torch.fx.passes.operator_support import OperatorSupportBase + + +# Backend details are final (cannot be subclassed). +@final +class BackendWithNamedDataMap(BackendDetails): + """ + Test Backend for Named Data Map Functionality + + This backend returns no processed_bytes, instead it uses + the named data store and serializes the name of the op + as the key and the data as its code value + """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + op_codes = { + exir_ops.edge.aten.sin.default: 0, + exir_ops.edge.aten.add.Tensor: 1, + exir_ops.edge.aten.sub.Tensor: 2, + exir_ops.edge.aten.mul.Tensor: 3, + exir_ops.edge.aten.div.Tensor: 4, + } + ndm = NamedDataStore() + for node in edge_program.graph.nodes: + if node.op == "call_function": + if node.target in op_codes.keys(): + ndm.add_named_data( + node.target.__name__, bytes(op_codes[node.target]) + ) + + return PreprocessResult( + processed_bytes=bytes(b""), + debug_handle_map={}, + data_store_output=ndm.get_named_data_store_output(), + ) + + +class SimpleOperatorSupport(OperatorSupportBase): + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + exir_ops.edge.aten.sin.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.div.Tensor, + ] + + +@final +class BackendWithNDMPartitioner(Partitioner): + def __init__(self) -> None: + self._op_support = SimpleOperatorSupport() + self.backend_id = BackendWithNamedDataMap.__name__ + + def _partition_gm( + self, graph_module: torch.fx.GraphModule, id_start: int = 0 + ) -> Tuple[int, Dict[str, DelegationSpec]]: + partition_tags: Dict[str, DelegationSpec] = {} + partition_list = generate_pattern_op_partitions( + graph_module, op_support=self._op_support + ) + + 
+        num_partitions_in_gm = len(partition_list)
+        for partition in partition_list:
+            curr_par_id = partition.id or 0
+            delegation_tag = f"tag_{curr_par_id + id_start}"
+            for node in partition.nodes:
+                node.meta["delegation_tag"] = delegation_tag
+            delegation_spec = DelegationSpec(self.backend_id, [])
+            partition_tags[delegation_tag] = delegation_spec
+
+        start_idx_for_submodules = num_partitions_in_gm
+        for _, submodule, _ in get_control_flow_submodules(graph_module):
+            start_idx_for_submodules, ret_partition_tags = self._partition_gm(
+                submodule, start_idx_for_submodules
+            )
+            partition_tags.update(ret_partition_tags)
+
+        return start_idx_for_submodules, partition_tags
+
+    def partition(self, edge_program: ExportedProgram) -> PartitionResult:
+        _, partition_tags = self._partition_gm(edge_program.graph_module)
+        return PartitionResult(
+            tagged_exported_program=edge_program,
+            partition_tags=partition_tags,
+        )
diff --git a/exir/backend/test/demos/rpc/CMakeLists.txt b/exir/backend/test/demos/rpc/CMakeLists.txt
index d3722e830d4..97f90ea9baa 100644
--- a/exir/backend/test/demos/rpc/CMakeLists.txt
+++ b/exir/backend/test/demos/rpc/CMakeLists.txt
@@ -20,8 +20,8 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../)
 endif()

-include(${EXECUTORCH_ROOT}/build/Test.cmake)
-include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
diff --git a/exir/backend/test/demos/rpc/ExecutorBackend.cpp b/exir/backend/test/demos/rpc/ExecutorBackend.cpp
index d398b87123b..7dc0d2b2373 100644
--- a/exir/backend/test/demos/rpc/ExecutorBackend.cpp
+++ b/exir/backend/test/demos/rpc/ExecutorBackend.cpp
@@ -72,8 +72,11 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface {
   // `processed` contains an executorch program. Wrap it in a DataLoader that
   // will return the data directly without copying it.
   MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  auto loader = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-      runtime_allocator, BufferDataLoader);
+  auto loader = runtime_allocator->allocateInstance<BufferDataLoader>();
+  if (loader == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   new (loader) BufferDataLoader(processed->data(), processed->size());
   // Can't free `processed` because the program will point into that memory.

@@ -84,8 +87,11 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface {
   }

   // Move the Program off the stack.
-  auto client_program =
-      ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, Program);
+  auto client_program = runtime_allocator->allocateInstance<Program>();
+  if (client_program == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  new (client_program) Program(std::move(program_result.get()));

   Result<MethodMeta> method_meta = client_program->method_meta("forward");
@@ -97,35 +103,56 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface {

   // Building all different allocators for the client executor
   auto num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
-  Span<uint8_t>* memory_planned_buffers = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-      runtime_allocator, Span<uint8_t>, num_memory_planned_buffers);
+  Span<uint8_t>* memory_planned_buffers =
+      runtime_allocator->allocateList<Span<uint8_t>>(
+          num_memory_planned_buffers);
+  if (memory_planned_buffers == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }

   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
     size_t buffer_size = static_cast<size_t>(
         method_meta->memory_planned_buffer_size(id).get());
-    uint8_t* buffer_i = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-        runtime_allocator, uint8_t, buffer_size);
+    uint8_t* buffer_i = runtime_allocator->allocateList<uint8_t>(buffer_size);
+    if (buffer_i == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
     memory_planned_buffers[id] = {buffer_i, buffer_size};
   }

-  auto client_planned_memory = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-      runtime_allocator, HierarchicalAllocator);
+  auto client_planned_memory =
+      runtime_allocator->allocateInstance<HierarchicalAllocator>();
+  if (client_planned_memory == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   new (client_planned_memory) HierarchicalAllocator(
       {memory_planned_buffers, num_memory_planned_buffers});

   // Allocate some memory from runtime allocator for the client executor, in
   // real case, like if it's an executor in dsp, it should allocate memory
   // dedicated to this specific hardware
-  auto client_method_allocator = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-      runtime_allocator, MemoryAllocator);
+  auto client_method_allocator =
+      runtime_allocator->allocateInstance<MemoryAllocator>();
+  if (client_method_allocator == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   const size_t kClientRuntimeMemorySize = 4 * 1024U;
-  auto runtime_pool = ET_ALLOCATE_OR_RETURN_ERROR(
-      runtime_allocator, kClientRuntimeMemorySize);
+  auto runtime_pool = runtime_allocator->allocate(kClientRuntimeMemorySize);
+  if (runtime_pool == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
   new (client_method_allocator) MemoryAllocator(
       kClientRuntimeMemorySize, static_cast<uint8_t*>(runtime_pool));

   auto client_memory_manager =
-      ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, MemoryManager);
+      runtime_allocator->allocateInstance<MemoryManager>();
+  if (client_memory_manager == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   new (client_memory_manager)
       MemoryManager(client_method_allocator, client_planned_memory);

@@ -140,8 +167,11 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface {
     return method_res.error();
   }

-  auto client_method =
-      ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, Method);
+  auto client_method = runtime_allocator->allocateInstance<Method>();
+  if (client_method == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
+  new (client_method) Method(std::move(method_res.get()));

   // Return the client method so it will be passed to `execute()` as
diff --git a/exir/backend/test/demos/rpc/TARGETS b/exir/backend/test/demos/rpc/TARGETS
index 63d24ccbda4..3fdb1d4360a 100644
--- a/exir/backend/test/demos/rpc/TARGETS
+++ b/exir/backend/test/demos/rpc/TARGETS
@@ -47,7 +47,7 @@ python_unittest(
         "test_rpc.py",
     ],
     preload_deps = [
-        "//executorch/kernels/portable:custom_ops_generated_lib",
+        "//executorch/configurations:optimized_native_cpu_ops",
         "//executorch/kernels/quantized:custom_ops_generated_lib",
         # The executor backend is prebuilt and linked when building the unit test binary. When it's linked, it'll register the backend.
         # It can also be loaded in the Python runtime via torch.ops.load_library("//executorch/exir/backend/test/demos/rpc:executor_backend")
diff --git a/exir/backend/test/demos/rpc/targets.bzl b/exir/backend/test/demos/rpc/targets.bzl
index 67935e0e373..c5cfb343a6c 100644
--- a/exir/backend/test/demos/rpc/targets.bzl
+++ b/exir/backend/test/demos/rpc/targets.bzl
@@ -23,9 +23,9 @@ def define_common_targets():
         ],
         platforms = [ANDROID, CXX],
         deps = [
-            "//executorch/runtime/executor:program",
-            "//executorch/kernels/portable:generated_lib",
+            "//executorch/configurations:optimized_native_cpu_ops",
             "//executorch/runtime/backend:interface",
+            "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:buffer_data_loader",
         ] + MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB,
         exported_deps = [
diff --git a/exir/backend/test/test_backend_with_named_data_map.py b/exir/backend/test/test_backend_with_named_data_map.py
new file mode 100644
index 00000000000..c9e458a1878
--- /dev/null
+++ b/exir/backend/test/test_backend_with_named_data_map.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+
+from executorch.exir import to_edge
+from executorch.exir.backend.backend_api import to_backend
+
+from executorch.exir.backend.test.backend_with_named_data_map import (
+    BackendWithNamedDataMap,
+    BackendWithNDMPartitioner,
+)
+
+
+class TestBackendWithNamedDataMap(unittest.TestCase):
+    def test_lowered_backend_module_has_output(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                return x + x
+
+        ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),)))
+        lowered = to_backend(
+            BackendWithNamedDataMap.__name__, ep.exported_program(), []
+        )
+
+        buffer_entries = lowered.named_data_store_output.buffers
+        self.assertTrue(len(buffer_entries) == 1)
+        stored_data = lowered.named_data_store_output.pte_data
+
+        self.assertTrue("aten.add.Tensor" in stored_data)
+        self.assertTrue(buffer_entries[0].buffer == bytes(1))
+
+    def test_named_data_with_partitioner(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                y = x + x
+                y = torch.cos(y)
+                y = y + y
+                y = torch.sin(y)
+                return y - y
+
+        ep = to_edge(torch.export.export(M(), (torch.randn(1, 2),)))
+        ep = ep.to_backend(BackendWithNDMPartitioner())
+
+        ndm_output = ep._named_data_store.get_named_data_store_output()
+        buffer_entries = ndm_output.buffers
+        stored_data = ndm_output.pte_data
+        self.assertEqual(len(buffer_entries), 3)
+        self.assertTrue("aten.add.Tensor" in stored_data)
+        self.assertTrue("aten.sub.Tensor" in stored_data)
+        self.assertTrue("aten.sin.default" in stored_data)
+
+    def test_named_data_with_control_flow(self):
+        class M(torch.nn.Module):
+            def true_branch(self, x):
+                y = x * x
+                y = torch.cos(y)
+                return torch.sin(y)
+
+            def false_branch(self, x):
+                return torch.sin(x)
+
+            def forward(self, x, y):
+                z = x / y
+                z = torch.cond(
+                    z.sum() > 0, self.true_branch, self.false_branch, [x]
+                )
+                return z - z
+
+        ep = to_edge(torch.export.export(M(), (torch.randn(1, 2), torch.randn(1, 2))))
+        ep = ep.to_backend(BackendWithNDMPartitioner())
+
+        ndm_output = ep._named_data_store.get_named_data_store_output()
+        buffer_entries = ndm_output.buffers
+        stored_data = ndm_output.pte_data
+        self.assertEqual(len(buffer_entries), 4)
+        self.assertTrue("aten.sub.Tensor" in stored_data)
+        self.assertTrue("aten.div.Tensor" in stored_data)
+        self.assertTrue("aten.sin.default" in stored_data)
+        self.assertTrue("aten.mul.Tensor" in stored_data)
diff --git a/exir/backend/utils.py b/exir/backend/utils.py
index 9487c59a848..eb9aeb19756 100644
--- a/exir/backend/utils.py
+++ b/exir/backend/utils.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,7 +9,7 @@
 import logging
 import operator

-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from functools import lru_cache
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

@@ -22,9 +23,11 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.lowered_backend_module import create_submodule_from_nodes
+from tabulate import tabulate
 from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.fx.experimental.symbolic_shapes import has_free_symbols
 from torch.fx.node import Node
+from torch.fx.passes.operator_support import OperatorSupportBase
 from torch.fx.passes.utils.source_matcher_utils import SourcePartition

 T_QuantPerTensor = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
@@ -569,3 +572,90 @@ def __call__(self, node: torch.fx.Node, reason: str) -> None:

     def __str__(self) -> str:
         return f"WhyNoPartition: Node {self.node} was not partitioned because {self.reason}."
+
+
+class WhyNoPartitionReporter:
+    """
+    Helper class for partitioners to gather, in a single report, the reasons
+    nodes were not lowered. If a node is reported multiple times, only the
+    first report is included.
+
+    Example usage:
+
+        # In your backend partitioner file(s)
+        reporter = WhyNoPartitionReporter()
+
+        # hypothetical function that checks if a node can be lowered
+        if not can_be_lowered(node):
+            reporter.report_reject(node, "This node was not lowered because ...")
+
+        # Back in the partitioner
+        logger.info(reporter.get_table_report())
+    """
+
+    def __init__(self):
+        self._rejected_nodes: OrderedDict[torch.fx.Node, str] = (
+            OrderedDict()
+        )  # {Rejected node: reason}
+
+    def report_reject(self, node: torch.fx.Node, reason: str):
+        """Report a node that was rejected from a partition, along with a reason for why."""
+        if node not in self._rejected_nodes:
+            self._rejected_nodes[node] = reason
+
+    def get_table_report(self) -> str:
+        """Returns a string containing a table listing all rejected nodes.
+        The table looks something like this:
+        ╒══════════════════════════╤══════════════════════════╤═════════════════════════════════════╤═════════════════════════════════════╕
+        │ Node name                │ Target                   │ Torch func                          │ Reason                              │
+        ╞══════════════════════════╪══════════════════════════╪═════════════════════════════════════╪═════════════════════════════════════╡
+        │ aten_convolution_default │ aten.convolution.default │ ('conv2d_1', 'builtin_function_or_m │ Convolution needs to have           │
+        │                          │                          │ ethod.conv2d')                      │ kernel_y<=64,                       │
+        │                          │                          │                                     │ kernel_x*kernel_y<=4096, got kernel │
+        │                          │                          │                                     │ (2, 65)                             │
+        ╘══════════════════════════╧══════════════════════════╧═════════════════════════════════════╧═════════════════════════════════════╛
+        """
+        reject_report = []
+        for node in self._rejected_nodes:
+            if node.op == "placeholder" or node.op == "output":
+                continue
+            if not (target := getattr(node.target, "_op", None)):
+                target = node.target
+            torch_fn = node.meta.get("torch_fn", "-")
+            reject_report.append(
+                [node.name, target, torch_fn, self._rejected_nodes[node]]
+            )
+        if len(reject_report) > 0:
+            return tabulate(
+                reject_report,
+                ["Node name", "Target", "Torch func", "Reason"],
+                tablefmt="fancy_grid",
+                maxcolwidths=35,
+            )
+        else:
+            return "No nodes rejected."
+
+    def wrap_check(
+        self, operator_support: OperatorSupportBase, message: str
+    ) -> OperatorSupportBase:
+        """Wrap the operator_support, reporting rejects with the specified message."""
+        return ReportRejected(operator_support, self, message)
+
+
+class ReportRejected(OperatorSupportBase):
+    """Class for wrapping an OperatorSupportBase, reporting rejects with the specified message to `reporter`."""
+
+    def __init__(
+        self,
+        operator_support: OperatorSupportBase,
+        reporter: WhyNoPartitionReporter,
+        message: str,
+    ):
+        self.operator_support = operator_support
+        self.reporter = reporter
+        self.message = message
+
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        is_supported = self.operator_support.is_node_supported(submodules, node)
+        if not is_supported:
+            self.reporter.report_reject(node, self.message)
+        return is_supported
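(A short usage sketch of the WhyNoPartitionReporter added above: wrap an existing operator-support check so every rejection is recorded with a uniform message. ToyOperatorSupport is a hypothetical stand-in for a backend's real check, not part of this change.)

    import torch
    from executorch.exir.backend.utils import WhyNoPartitionReporter
    from torch.fx.passes.operator_support import OperatorSupportBase

    class ToyOperatorSupport(OperatorSupportBase):
        # Hypothetical check: only add nodes are supported.
        def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
            return node.op == "call_function" and "add" in str(node.target)

    reporter = WhyNoPartitionReporter()
    supported = reporter.wrap_check(
        ToyOperatorSupport(), "Only aten.add is supported by this toy backend."
    )
    # ... run partitioning with `supported`, then log the aggregated reasons:
    print(reporter.get_table_report())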
diff --git a/exir/dialects/edge/spec/gen.py b/exir/dialects/edge/spec/gen.py
index 6a9c8d3caf8..6be6a2ae5ee 100644
--- a/exir/dialects/edge/spec/gen.py
+++ b/exir/dialects/edge/spec/gen.py
@@ -236,25 +236,6 @@ def print_error_msg(unsupported_funcs: List[str]):
         print(f)


-def is_not_dype_exception(exc: BaseException, dtype_str: str) -> bool:
-    """Check if an exception about unsupported dtype."""
-
-    # alias dtype means the alias name of dtype str, like "Boolean" is the alias name of "Bool".
- # Set default alias_dtype as twice of str(exc) to make sure default alias dtype is not part of str(exc) - alias_dtype = 2 * str(exc) - if dtype_str == "Bool": - alias_dtype = "Boolean" - - return not ( - ("not supported" in str(exc) or "not implemented" in str(exc)) - and ( - dtype_str in str(exc) - or alias_dtype in str(exc) - or dtype_str.lower() in str(exc) - ) - ) - - class EdgeOpYamlInfo: def __init__( self, diff --git a/exir/dialects/edge/test/TARGETS b/exir/dialects/edge/test/TARGETS index 52a0d3ec60e..8a689b0dba6 100644 --- a/exir/dialects/edge/test/TARGETS +++ b/exir/dialects/edge/test/TARGETS @@ -10,7 +10,7 @@ python_unittest( resources = { "//executorch/exir/dialects/edge:edge_yaml": "edge.yaml", }, - tags = ["long_running"], + labels = ["long_running"], deps = [ "fbsource//third-party/pypi/expecttest:expecttest", # @manual "//caffe2:torch", diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index f0d81ebf272..362796146ee 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -1534,7 +1534,7 @@ def forward(self, x): self.assertEqual(len(program.constant_buffer[1].storage), 8) def test_emit_lifted_tensor_constant(self) -> None: - class LiftedConstants(nn.Module): + class LiftedTensorConstants(nn.Module): def __init__(self): super().__init__() @@ -1542,18 +1542,41 @@ def forward(self, x): x = x * torch.tensor([[4, 3], [1, 2], [5, 6]], dtype=torch.float) return x - model = LiftedConstants() + model = LiftedTensorConstants() + # Specify that we want to move non-lifted constants to external file + et_cfg = ExecutorchBackendConfig(external_constants=True) + program = to_edge( + export(model, (torch.ones(3, 2),), strict=True) + ).to_executorch(et_cfg) + program = program._emitter_output.program + exec_plan = program.execution_plan[0] + # There should only be 1 input to this model. + self.assertEqual(len(exec_plan.inputs), 1) + self.assertEqual(len(program.constant_buffer), 2) + self.assertEqual(len(program.constant_buffer[1].storage), 24) + def test_emit_lifted_constant(self) -> None: + class LiftedConstants(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x + 1 + return x + + model = LiftedConstants() + # Specify that we want to move non-lifted constants to external file + et_cfg = ExecutorchBackendConfig(external_constants=True) program = to_edge( export(model, (torch.ones(3, 2),), strict=True) - ).to_executorch() + ).to_executorch(et_cfg) program = program._emitter_output.program exec_plan = program.execution_plan[0] # There should only be 1 input to this model. 
self.assertEqual(len(exec_plan.inputs), 1) self.assertEqual(len(program.constant_buffer), 2) - self.assertEqual(len(program.constant_buffer[1].storage), 24) + self.assertEqual(len(program.constant_buffer[1].storage), 8) def test_mutable_buffers(self) -> None: def count_copies(gm: torch.fx.GraphModule) -> int: diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 720877f0555..6bcc1b2f3d8 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -14,6 +14,7 @@ import torch import torch.utils._pytree as pytree from executorch.exir._serialize import _serialize_pte_binary +from executorch.exir._serialize._named_data_store import NamedDataStoreOutput from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.delegate import executorch_call_delegate, get_lowered_module_name from executorch.exir.emit import emit_program @@ -62,6 +63,9 @@ class LoweredBackendModule(torch.nn.Module): CompileSpec ] # A list of backend-specific objects with static metadata to configure the "compilation" process. _original_exported_program: ExportedProgram # The original EXIR module + _named_data_store_output: Optional[ + NamedDataStoreOutput + ] # Named Data serialized by the backend def __init__( self, @@ -69,12 +73,14 @@ def __init__( backend_id: str, processed_bytes: bytes, compile_specs: List[CompileSpec], + named_data_store_output: Optional[NamedDataStoreOutput] = None, ) -> None: super().__init__() self._original_exported_program = edge_program self._backend_id = backend_id self._processed_bytes = processed_bytes self._compile_specs = compile_specs + self._named_data_store_output = named_data_store_output # pyre-ignore def __deepcopy__(self, memo: Optional[Dict[int, Any]]) -> "LoweredBackendModule": @@ -101,6 +107,7 @@ def __deepcopy__(self, memo: Optional[Dict[int, Any]]) -> "LoweredBackendModule" backend_id=self._backend_id, processed_bytes=self._processed_bytes, compile_specs=copy.deepcopy(self._compile_specs, memo), + named_data_store_output=self._named_data_store_output, ) # pyre-fixme[16]: `LoweredBackendModule` has no attribute `meta`. 
res.meta = copy.copy(getattr(self, "meta", {})) @@ -134,6 +141,13 @@ def original_module(self) -> ExportedProgram: """ return self._original_exported_program + @property + def named_data_store_output(self) -> Optional[NamedDataStoreOutput]: + """ + Returns the Named Data Store Output + """ + return self._named_data_store_output + # TODO(chenlai): consolidate the seriailization config with serialize_to_flatbuffer api def buffer( self, @@ -154,6 +168,7 @@ def buffer( segment_alignment=segment_alignment, constant_tensor_alignment=constant_tensor_alignment, delegate_alignment=delegate_alignment, + named_data=self.named_data_store_output, ) ) return out @@ -890,7 +905,7 @@ def _unsafe_adjust_original_program( # noqa: C901 del original_program._state_dict[input_target] elif input_spec.kind == InputKind.BUFFER: if input_spec.persistent: - del original_program._state_dict[input_target] + original_program._state_dict.pop(input_target, None) else: del original_program._constants[input_spec.target] elif input_spec.kind == InputKind.CONSTANT_TENSOR: diff --git a/exir/memory_planning.py b/exir/memory_planning.py index 1fc1f0e02fd..1d5d0868c50 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -24,7 +24,7 @@ from executorch.exir.tensor import TensorSpec from torch import fx -from torch.export.exported_program import ExportGraphSignature +from torch.export.exported_program import ExportGraphSignature, InputKind from torch.fx import Node from torch.utils._pytree import tree_flatten @@ -247,7 +247,10 @@ def verify_graph_input_output(self) -> None: graph_output_allocated = allocated has_dynamic_unbound_output |= has_dynamic_unbound_tensor - if "placeholder" in check_list: + # only check if inputs are allocated if there are user inputs: + user_inputs_exist = _do_user_inputs_exist(graph_signature=self.graph_signature) + + if "placeholder" in check_list and user_inputs_exist: assert graph_input_allocated is not None, "graph_input_allocated not set" if not has_dynamic_unbound_input: assert ( @@ -327,6 +330,22 @@ def _is_mutable_buffer( return False +def _do_user_inputs_exist(graph_signature: Optional[ExportGraphSignature]) -> bool: + if graph_signature is None: + return False + + return ( + len( + list( + filter( + lambda input: input.kind == InputKind.USER_INPUT, + graph_signature.input_specs, + ) + ) + ) + ) > 0 + + def get_graph_input_tensors( nodes: Iterable[Node], graph_signature: Optional[ExportGraphSignature] = None ) -> Set[TensorSpec]: diff --git a/exir/passes/external_constants_pass.py b/exir/passes/external_constants_pass.py index bc0126a482d..e024fdcbcd2 100644 --- a/exir/passes/external_constants_pass.py +++ b/exir/passes/external_constants_pass.py @@ -17,7 +17,9 @@ def external_constants_pass( gm: GraphModule, ) -> PassResult: """ - Move all constants to external file. + Move all non-lifted constants to external file. + NOTE: Lifted constants are not moved as they are closer + to code than data. 
""" mutated = False for module in gm.modules(): @@ -25,7 +27,7 @@ def external_constants_pass( continue for node in module.graph.nodes: - if node.op == "placeholder": + if (node.op == "placeholder") and ("_lifted_tensor" not in node.name): spec = node.meta.get("spec") if isinstance(spec, TensorSpec) and spec.const: node.meta["constant_tag"] = "_default_external_constant" diff --git a/exir/program/_program.py b/exir/program/_program.py index fdf4b93e19c..7a2120f9e9b 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -16,16 +16,22 @@ import torch import torch._export from executorch.exir._serialize._cord import Cord +from executorch.exir._serialize._named_data_store import ( + NamedDataStore, + NamedDataStoreOutput, +) from executorch.exir._serialize._serialize import serialize_for_executorch from executorch.exir._serialize.data_serializer import DataSerializer from executorch.exir._warnings import experimental from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.partitioner import Partitioner from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig +from executorch.exir.delegate import executorch_call_delegate, is_lowered_module from executorch.exir.emit import emit_program, EmitterOutput from executorch.exir.emit._emitter import _DelegateDebugIdentifierMap from executorch.exir.error import ExportError from executorch.exir.graph_module import get_control_flow_submodules +from executorch.exir.operator.convert import _pybind_schema_to_native_schema from executorch.exir.pass_base import PassBase from executorch.exir.pass_manager import PassType from executorch.exir.passes import ( @@ -316,6 +322,8 @@ def lift_constant_tensor_pass(ep): new_input_specs.extend(lifted_constants) lifted_constants.clear() new_input_specs.append(s) + if len(lifted_constants) > 0: + new_input_specs = lifted_constants + new_input_specs ep.graph_signature.input_specs = new_input_specs ep.graph_module.recompile() return ep @@ -836,6 +844,9 @@ def _replace_aten_ops_with_transformed_ops( ops_set_to_not_decompose, check_op_support = partitioner.ops_to_not_decompose( program ) + ops_set_to_not_decompose = _remove_invalid_ops_for_not_decompose( + ops_set_to_not_decompose + ) for op_aten in ops_set_to_not_decompose: _register_no_decomp_op(op_aten) @@ -965,6 +976,67 @@ def _sanity_check_graph_for_non_decomp_ops( logging.warning(warning_str) +def _remove_invalid_ops_for_not_decompose( + ops_to_not_decompose: List[torch._ops.OpOverload], +) -> List[torch._ops.OpOverload]: + # To address https://github.com/pytorch/executorch/issues/8781 + def keep(op): + # Explicit allow list + allow_list = [] + try: + # Ops in torch.ops.quant are not always loaded, so we use try/except + # Aliases output, but we need to allow it for XNNPACK + allow_list.append(torch.ops.quant.choose_qparams_affine.default) + except: + pass + + if op in allow_list: + return True + + schema = op._schema + native_schema = _pybind_schema_to_native_schema(schema) + if native_schema is None: + logging.warn( + f"Torchgen is not able to parse the schema of {op._schema}. This is not fatal." + ) + else: + if native_schema.is_mutable: + logging.warn( + f"Op {op} was requested for preservation by partitioner. This request is ignored because it is mutable." + ) + return False + + if native_schema.aliased_return_names() != [None]: + logging.warn( + f"Op {op} was requested for preservation by partitioner. This request is ignored because it aliases output." 
+                )
+                return False
+
+        # Explicit block list of ops that don't work if asked for
+        # preservation.
+        if op in [
+            # Hits an infinite recursion error when the op is in the
+            # EDGE_DO_NOT_DECOMP namespace
+            torch.ops.aten._to_copy.default,
+            # Scalar-to-tensor type promotion does not work on ops
+            # in the EDGE_DO_NOT_DECOMP namespace
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.sub.Tensor,
+            torch.ops.aten.div.Tensor,
+            torch.ops.aten.item.default,
+            torch.ops.aten._local_scalar_dense.default,
+            torch.ops.aten.unbind.int,
+        ]:
+            logging.warn(
+                f"Op {op} was requested for preservation by partitioner. This request is ignored because it is in a blocklist."
+            )
+            return False
+        return True
+
+    return list(filter(keep, ops_to_not_decompose))
+
+
 def _gen_edge_manager_for_partitioners(
     partitioner: Dict[str, List[Partitioner]],
     aten_programs: Dict[str, ExportedProgram],
@@ -992,6 +1064,9 @@ def _gen_edge_manager_for_partitioners(
         all_ops_no_decomp = set()
         for curr_partitioner in partitioner.get(name, []):
             curr_ops_no_decomp, _ = curr_partitioner.ops_to_not_decompose(program)
+            curr_ops_no_decomp = _remove_invalid_ops_for_not_decompose(
+                curr_ops_no_decomp
+            )
             all_ops_no_decomp |= set(curr_ops_no_decomp)

     table = _default_decomposition_table()
@@ -1028,6 +1103,33 @@ def _gen_edge_manager_for_partitioners(
     return edge_manager


+def collect_named_data_store_from_exported_program(
+    exported_program: ExportedProgram,
+    named_data_store: NamedDataStore,
+) -> None:
+    """
+    Collects all the named data store outputs found within the exported program
+    and adds them to named_data_store.
+    """
+
+    # Collect all the named data into the named data store for deduplication.
+    def collect_named_data_store_outputs(
+        graph_module: torch.fx.GraphModule,
+    ) -> None:
+        for node in graph_module.graph.nodes:
+            if node.target == executorch_call_delegate:
+                lbm = getattr(graph_module, node.args[0].target)
+                assert is_lowered_module(lbm)
+                data_store_output = lbm.named_data_store_output
+                if data_store_output is not None:
+                    named_data_store.merge_named_data_store(data_store_output)
+
+        for _, submod, _ in get_control_flow_submodules(graph_module):
+            collect_named_data_store_outputs(submod)
+
+    collect_named_data_store_outputs(exported_program.graph_module)
+
+
 @et_logger("to_edge_transform_and_lower")
 def to_edge_transform_and_lower(
     programs: Union[ExportedProgram, Dict[str, ExportedProgram]],
@@ -1113,6 +1215,7 @@ def to_edge_transform_and_lower(
             curr_op_set, check_op_support = curr_partitioner.ops_to_not_decompose(
                 program
             )
+            curr_op_set = _remove_invalid_ops_for_not_decompose(curr_op_set)
             ops_set_to_not_decompose = ops_set_to_not_decompose.union(curr_op_set)
         _sanity_check_graph_for_non_decomp_ops(
             name,
@@ -1259,6 +1362,12 @@ def __init__(
         self._edge_programs: Dict[str, ExportedProgram] = edge_programs
         self._config_methods = constant_methods

+        self._named_data_store = NamedDataStore()
+        for _, program in self._edge_programs.items():
+            collect_named_data_store_from_exported_program(
+                program, self._named_data_store
+            )
+
     @property
     def methods(self) -> Set[str]:
         """
@@ -1369,7 +1478,9 @@ def to_backend(
             config = EdgeCompileConfig(_check_ir_validity=False)

         return EdgeProgramManager(
-            new_edge_programs, copy.deepcopy(self._config_methods), config
+            new_edge_programs,
+            copy.deepcopy(self._config_methods),
+            config,
         )

     @et_logger("to_executorch")
@@ -1444,7 +1555,10 @@ def to_executorch(
             execution_programs[name] = program

         return ExecutorchProgramManager(
-            execution_programs, self._config_methods, config
+            execution_programs,
+            self._config_methods,
+            config,
+            self._named_data_store.get_named_data_store_output(),
         )


@@ -1465,6 +1579,7 @@ def __init__(
         execution_programs: Dict[str, ExportedProgram],
         config_methods: Optional[Dict[str, Any]] = None,
         backend_config: Optional[ExecutorchBackendConfig] = None,
+        named_data: Optional[NamedDataStoreOutput] = None,
     ):
         """
         End users should not call this constructor directly. Instead, they should use
@@ -1487,6 +1602,9 @@ def __init__(
         self._execution_programs: Dict[str, ExportedProgram] = execution_programs
         self._config_methods: Optional[Dict[str, Any]] = config_methods

+        # Named data from the EdgeProgramManager.
+        self._named_data: Optional[NamedDataStoreOutput] = named_data
+
         backend_config = backend_config or ExecutorchBackendConfig()

         # Emit methods
@@ -1499,7 +1617,10 @@ def __init__(
         # Serialize emitter output, ready to be written to a file.
         self._data_serializer = FlatTensorSerializer()
         self._pte_data, self._tensor_data = serialize_for_executorch(
-            self._emitter_output, backend_config, self._data_serializer
+            self._emitter_output,
+            backend_config,
+            self._data_serializer,
+            self._named_data,
         )
         self._buffer: Optional[bytes] = None
diff --git a/exir/schema.py b/exir/schema.py
index 8e1434a2fe4..7dba623aebf 100644
--- a/exir/schema.py
+++ b/exir/schema.py
@@ -290,6 +290,12 @@ class SubsegmentOffsets:
     offsets: List[int]


+@dataclass
+class NamedData:
+    key: str
+    segment_index: int
+
+
 @dataclass
 class Program:
     version: int
@@ -299,3 +305,4 @@ class Program:
     segments: List[DataSegment]
     constant_segment: SubsegmentOffsets
     mutable_data_segments: Optional[List[SubsegmentOffsets]] = None
+    named_data: Optional[List[NamedData]] = None
diff --git a/exir/tests/common.py b/exir/tests/common.py
index fdd7a3adca4..daeea109667 100644
--- a/exir/tests/common.py
+++ b/exir/tests/common.py
@@ -79,6 +79,7 @@ def get_test_program() -> Program:
         backend_delegate_data=[],
         segments=[],
         constant_segment=SubsegmentOffsets(segment_index=0, offsets=[]),
+        named_data=[],
     )


diff --git a/exir/tests/test_joint_graph.py b/exir/tests/test_joint_graph.py
index 349fa92e826..fb74b70d313 100644
--- a/exir/tests/test_joint_graph.py
+++ b/exir/tests/test_joint_graph.py
@@ -18,6 +18,7 @@
 from torch.export._trace import _export
 from torch.export.experimental import _export_forward_backward
 from torch.export.exported_program import OutputKind
+from torch.testing import assert_close


 class TestJointGraph(unittest.TestCase):
@@ -100,7 +101,8 @@ def forward(self, x, y):
             example_inputs
         )  # ET outputs are [loss, grads, weights]

-        self.assertTrue(torch.allclose(loss, et_outputs[0]))
+        # Without rtol and atol, this test fails on macOS.
+        assert_close(loss, et_outputs[0], rtol=1e-4, atol=1e-4)
         self.assertTrue(
             torch.allclose(m.linear.weight.grad, et_outputs[1])  # pyre-ignore
         )
diff --git a/exir/tests/test_memory_format_ops_pass.py b/exir/tests/test_memory_format_ops_pass.py
index 76e994abdbf..84cd0faa485 100644
--- a/exir/tests/test_memory_format_ops_pass.py
+++ b/exir/tests/test_memory_format_ops_pass.py
@@ -24,6 +24,7 @@
 from executorch.exir.pass_base import ExportPass, ProxyValue

 from executorch.exir.tests.test_memory_format_ops_pass_utils import (
+    AmbiguousDimOrderError,
     MemoryFormatOpsPassTestUtils,
     MemoryFormatTestSet,
     PropagateToCopyChannalsLastModule,
@@ -124,8 +125,34 @@ def test_op_dim_order_propagation(self) -> None:
                 target_memory_format=torch.channels_last,
                 _load_for_executorch_from_buffer=_load_for_executorch_from_buffer,
             ),
+            check_unambiguous_dim_order=True,
         )

+    def test_op_dim_order_propagation_ambiguous(self) -> None:
+        try:
+            MemoryFormatOpsPassTestUtils.memory_format_test_runner(
+                self,
+                MemoryFormatTestSet(
+                    module=PropagateToCopyChannalsLastModule().eval(),
+                    op=torch.ops.aten._to_copy.default,
+                    sample_input=(
+                        torch.rand_like(
+                            torch.zeros(
+                                [2, 1, 2, 2]
+                            ),  # Ambiguous shape should trigger AmbiguousDimOrderError!
+                            dtype=torch.float32,
+                            memory_format=torch.contiguous_format,
+                        ),
+                    ),
+                    target_memory_format=torch.channels_last,
+                    _load_for_executorch_from_buffer=_load_for_executorch_from_buffer,
+                ),
+                check_unambiguous_dim_order=True,
+            )
+            raise AssertionError("Should have raised AmbiguousDimOrderError")
+        except AmbiguousDimOrderError:
+            pass  # Expected error
+
     # Only test dim order replacement result in lean mode test.
     # This test is irrelevant with operator mode.
     def test_dim_order_replacement(self) -> None:
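(Why the [2, 1, 2, 2] input in the test above is ambiguous: when the channel dimension has size 1, the contiguous and channels_last strides describe the same memory layout, so the dim order cannot be recovered uniquely. A small illustration, assuming a PyTorch build where Tensor.dim_order accepts the ambiguity_check argument relied on by the test utilities below:)

    import torch

    t = torch.zeros(2, 1, 2, 2)
    print(t.stride())                                        # (4, 4, 2, 1)
    print(t.to(memory_format=torch.channels_last).stride())  # (4, 1, 2, 1)
    # Only the stride of the size-1 dim differs, which never affects
    # addressing, so both formats match and dim_order() is ambiguous:
    try:
        t.dim_order(ambiguity_check=[torch.contiguous_format, torch.channels_last])
    except Exception as e:
        print("ambiguous dim order:", e)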
+ # The right course of follow up action is to ask user to try with a different example input. + try: + _ = tensor.dim_order( + ambiguity_check=[ + torch.contiguous_format, + torch.channels_last, + ] + ) + except Exception: + raise AmbiguousDimOrderError + + # any pass or passes, just using MemoryFormatOpsPass as an example + dim_order_pass_manager = PassManager(passes=[ExampleNOPPass()]) + dim_order_pass_manager.add_checks(detect_ambiguity) + dim_order_pass_manager(gm) + + class MemoryFormatOpsPassTestUtils: @staticmethod def memory_format_test_runner( - test_class: unittest.TestCase, test_set: MemoryFormatTestSet + test_class: unittest.TestCase, + test_set: MemoryFormatTestSet, + check_unambiguous_dim_order: bool = False, ): before = export( test_set.module, test_set.sample_input, strict=True @@ -121,6 +181,9 @@ def memory_format_test_runner( before, compile_config=EdgeCompileConfig(_skip_dim_order=False) ) + if check_unambiguous_dim_order: + assert_unambiguous_dim_order(epm.exported_program().graph_module) + # check memory format ops, if needed if test_set.op_level_check: aten_op_str, edge_op_str = MemoryFormatOps2Str[test_set.op] diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index d5e5627dfa6..d885239acd8 100644 --- a/exir/tests/test_memory_planning.py +++ b/exir/tests/test_memory_planning.py @@ -708,10 +708,10 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: et_program = et.executorch_program inputs = et_program.execution_plan[0].inputs self.assertNotEqual( - et_program.execution_plan[0] # pyre-ignore + et_program.execution_plan[0] .values[inputs[0]] .val.allocation_info.memory_offset_low, - et_program.execution_plan[0] # pyre-ignore + et_program.execution_plan[0] .values[inputs[1]] .val.allocation_info.memory_offset_low, ) @@ -749,7 +749,7 @@ def forward(self, input, label): net = TrainingNet(Net()) inputs = (torch.randn(1, 6, 5, 5), torch.ones(1, dtype=torch.int64)) - ep = export(net, inputs) + ep = export(net, inputs, strict=True) ep = _export_forward_backward(ep) ep = to_edge(ep) ep = ep.to_executorch() diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 8e40c49e33f..39dbd3f51d3 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1057,6 +1057,36 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: new_ep.graph_module.code ) + def test_pass_no_user_inputs(self) -> None: + class NoUserInputs(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("a", torch.ones(1)) + + def forward(self) -> torch.Tensor: + return 3 + self.a + + mod = NoUserInputs() + exported_program = export(mod, (), strict=True) + edge = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_skip_dim_order=False), + ) + ep = edge.exported_program() + # because there is no user input, the lifted constant should be the first input. + FileCheck().check("_lifted_tensor_constant1").check( + "b_a" # followed by the buffer input. 
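Why the [2, 1, 2, 2] shape used in the ambiguous test above cannot be attributed to a single format: a size-1 dimension contributes nothing to addressing, so the same bytes satisfy more than one dim order. A small sketch of the check the utility performs, reusing the dim_order(ambiguity_check=...) call from the diff; the raise-on-ambiguity behavior is assumed to match the test expectations above:

import torch

t = torch.zeros(2, 1, 2, 2)  # contiguous NCHW with C == 1
print(t.stride())  # (4, 4, 2, 1)

# A channels_last tensor of this shape would have strides (4, 1, 2, 1);
# since dim 1 has size 1, both stride patterns describe identical memory,
# so the layout cannot be pinned to a single memory format.
try:
    t.dim_order(ambiguity_check=[torch.contiguous_format, torch.channels_last])
except Exception as e:  # wrapped in AmbiguousDimOrderError by the utility above
    print(f"ambiguous layout: {type(e).__name__}")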
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index d5e5627dfa6..d885239acd8 100644 --- a/exir/tests/test_memory_planning.py +++ b/exir/tests/test_memory_planning.py @@ -708,10 +708,10 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: et_program = et.executorch_program inputs = et_program.execution_plan[0].inputs self.assertNotEqual( - et_program.execution_plan[0] # pyre-ignore + et_program.execution_plan[0] .values[inputs[0]] .val.allocation_info.memory_offset_low, - et_program.execution_plan[0] # pyre-ignore + et_program.execution_plan[0] .values[inputs[1]] .val.allocation_info.memory_offset_low, ) @@ -749,7 +749,7 @@ def forward(self, input, label): net = TrainingNet(Net()) inputs = (torch.randn(1, 6, 5, 5), torch.ones(1, dtype=torch.int64)) - ep = export(net, inputs) + ep = export(net, inputs, strict=True) ep = _export_forward_backward(ep) ep = to_edge(ep) ep = ep.to_executorch() diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 8e40c49e33f..39dbd3f51d3 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1057,6 +1057,36 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: new_ep.graph_module.code ) + def test_pass_no_user_inputs(self) -> None: + class NoUserInputs(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("a", torch.ones(1)) + + def forward(self) -> torch.Tensor: + return 3 + self.a + + mod = NoUserInputs() + exported_program = export(mod, (), strict=True) + edge = to_edge( + exported_program, + compile_config=EdgeCompileConfig(_skip_dim_order=False), + ) + ep = edge.exported_program() + # because there is no user input, the lifted constant should be the first input. + FileCheck().check("_lifted_tensor_constant1").check( "b_a" # followed by the buffer input. + ).run(ep.graph_module.code) + + # the graph signature should reflect the same input order: + self.assertEqual( + ep.graph_signature.input_specs[0].arg.name, "_lifted_tensor_constant1" + ) + self.assertEqual(ep.graph_signature.input_specs[1].arg.name, "b_a") + + # Validate that the program successfully lowers to ExecuTorch: + edge.to_executorch() + def test_constant_prop_pass_for_parameter(self) -> None: def count_additions(gm: torch.fx.GraphModule) -> int: return sum( diff --git a/exir/verification/TARGETS b/exir/verification/TARGETS index 8ee9e5546e3..092b48658df 100644 --- a/exir/verification/TARGETS +++ b/exir/verification/TARGETS @@ -10,13 +10,11 @@ cpp_python_extension( "bindings.cpp", ], deps = [ + "fbsource//third-party/pybind11:pybind11", "//caffe2:torch-cpp-cpu", "//caffe2:torch_extension", "//caffe2/c10:c10", ], - external_deps = [ - "pybind11", - ], ) python_library( diff --git a/exir/verification/test/test_verifier.py b/exir/verification/test/test_verifier.py index b2182242dd7..8520d3ce13e 100644 --- a/exir/verification/test/test_verifier.py +++ b/exir/verification/test/test_verifier.py @@ -153,7 +153,7 @@ def forward(self, input, label): net = TrainingNet(Net()) inputs = (torch.randn(1, 6, 5, 5), torch.ones(1, dtype=torch.int64)) - export_model = export(net, inputs) + export_model = export(net, inputs, strict=True) export_model = _export_forward_backward(export_model) edge = to_edge(export_model)
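Several call sites above now pin strict=True on torch.export.export. A minimal sketch of the pinned call, with an arbitrary module; as I understand current torch.export, strict mode traces with TorchDynamo and fails fast on constructs it cannot capture soundly, so pinning it keeps these tests stable if the default ever changes:

import torch
from torch.export import export


class AddOne(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 1


# Explicit strict=True mirrors the test updates above.
ep = export(AddOne(), (torch.ones(2),), strict=True)
print(ep.graph_signature)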
diff --git a/extension/android/BUCK b/extension/android/BUCK index 040c9258d42..7312545d6eb 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -5,12 +5,12 @@ oncall("executorch") fb_android_library( name = "executorch", srcs = [ - "src/main/java/org/pytorch/executorch/DType.java", - "src/main/java/org/pytorch/executorch/EValue.java", - "src/main/java/org/pytorch/executorch/Module.java", - "src/main/java/org/pytorch/executorch/NativePeer.java", - "src/main/java/org/pytorch/executorch/Tensor.java", - "src/main/java/org/pytorch/executorch/annotations/Experimental.java", + "executorch_android/src/main/java/org/pytorch/executorch/DType.java", + "executorch_android/src/main/java/org/pytorch/executorch/EValue.java", + "executorch_android/src/main/java/org/pytorch/executorch/Module.java", + "executorch_android/src/main/java/org/pytorch/executorch/NativePeer.java", + "executorch_android/src/main/java/org/pytorch/executorch/Tensor.java", + "executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java", ], autoglob = False, language = "JAVA", @@ -23,8 +23,8 @@ fb_android_library( name = "executorch_llama", srcs = [ - "src/main/java/org/pytorch/executorch/LlamaCallback.java", - "src/main/java/org/pytorch/executorch/LlamaModule.java", + "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java", + "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java", ], autoglob = False, language = "JAVA", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 70f21f2751c..3a1fe79d8f5 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(executorch_jni) @@ -17,7 +17,7 @@ if(NOT ANDROID) endif() set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..") -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_common_compile_options -Wno-deprecated-declarations -fPIC) set(_common_include_directories ${EXECUTORCH_ROOT}/..) if(NOT ANDROID_PLATFORM) @@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib) APPEND link_libraries optimized_native_cpu_ops_lib - optimized_kernels - portable_kernels - cpublas - eigen_blas ) target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() @@ -115,16 +111,10 @@ if(TARGET vulkan_backend) endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops - ) - list(APPEND link_libraries custom_ops) - target_link_options_shared_lib(custom_ops) + list(APPEND link_libraries $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>) endif() if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( executorch_jni PUBLIC diff --git a/extension/android/build.gradle b/extension/android/build.gradle index b40f08e0c45..ac031653a7a 100644 --- a/extension/android/build.gradle +++ b/extension/android/build.gradle @@ -1,24 +1,30 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ +allprojects { + buildscript { + ext { + minSdkVersion = 21 + targetSdkVersion = 34 + compileSdkVersion = 34 + buildToolsVersion = '33.0.1' -plugins { - id 'java-library' -} + fbjniJavaOnlyVersion = "0.5.1" + soLoaderNativeLoaderVersion = "0.10.5" + } -group 'org.pytorch.executorch' + repositories { + google() + mavenCentral() + } -repositories { - mavenCentral() -} + dependencies { + classpath 'com.android.tools.build:gradle:8.9.0' + classpath 'com.vanniktech:gradle-maven-publish-plugin:0.31.0' + } + + } -task makeJar(type: Jar) { - dependencies { - implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2' - implementation 'com.facebook.soloader:nativeloader:0.10.5' + repositories { + google() + jcenter() + mavenCentral() } } diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle new file mode 100644 index 00000000000..b284ce3896e --- /dev/null +++ b/extension/android/executorch_android/build.gradle @@ -0,0 +1,85 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +plugins { + id "com.android.library" version "8.9.0" + id "com.vanniktech.maven.publish" version "0.31.0" +} + +android { + namespace = "org.pytorch.executorch" + compileSdk = 34 + + defaultConfig { + minSdk = 23 + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + } + + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + + sourceSets { + androidTest { + resources.srcDirs += [ 'src/androidTest/resources' ] + } + } +} + +dependencies { + implementation 'com.facebook.fbjni:fbjni:0.5.1' + implementation 'com.facebook.soloader:nativeloader:0.10.5' + testImplementation 'junit:junit:4.12' + androidTestImplementation 'androidx.test.ext:junit:1.1.5' + androidTestImplementation 'androidx.test:rules:1.2.0' + androidTestImplementation 'commons-io:commons-io:2.4' +} + +import com.vanniktech.maven.publish.SonatypeHost + +mavenPublishing { + publishToMavenCentral(SonatypeHost.DEFAULT) + signAllPublications() + + coordinates("org.pytorch", "executorch-android", "0.5.0-SNAPSHOT") + + pom { + name = "ExecuTorch Android" + description = "ExecuTorch Android API" + inceptionYear = "2025" + url = "https://github.com/pytorch/executorch/" + licenses { + license { + name = "BSD 3-Clause" + url = "https://github.com/pytorch/executorch/blob/main/LICENSE" + distribution = "https://github.com/pytorch/executorch/blob/main/LICENSE" + } + } + developers { + developer { + id = "pytorch" + name = "pytorch" + url = "https://github.com/pytorch/executorch/" + } + } + scm { + url = "https://github.com/pytorch/executorch.git" + connection = "scm:git:https://github.com/pytorch/executorch" + developerConnection = "scm:git:git@github.com:pytorch/executorch.git" + } + } + +} + +repositories { + maven { + url "https://oss.sonatype.org/content/repositories/snapshots" + } +} diff --git a/extension/android_test/src/main/AndroidManifest.xml b/extension/android/executorch_android/src/androidTest/AndroidManifest.xml similarity index 59% rename from extension/android_test/src/main/AndroidManifest.xml rename to extension/android/executorch_android/src/androidTest/AndroidManifest.xml index b8ac862938e..7ea0516e5c2 100644 --- a/extension/android_test/src/main/AndroidManifest.xml +++ b/extension/android/executorch_android/src/androidTest/AndroidManifest.xml @@ -1,12 +1,10 @@ - + - - + android:label="Tests for ExecuTorch Modules" /> diff --git a/extension/android_test/src/androidTest/java/org/pytorch/executorch/LlamaModuleInstrumentationTest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java similarity index 81% rename from extension/android_test/src/androidTest/java/org/pytorch/executorch/LlamaModuleInstrumentationTest.java rename to extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java index 940e34d684f..b3b515d7ed0 100644 --- a/extension/android_test/src/androidTest/java/org/pytorch/executorch/LlamaModuleInstrumentationTest.java +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.java @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ -package com.example.executorch; +package org.pytorch.executorch; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -34,15 +34,12 @@ import org.apache.commons.io.FileUtils; import androidx.test.ext.junit.runners.AndroidJUnit4; import androidx.test.InstrumentationRegistry; -import org.pytorch.executorch.LlamaModule; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.Module; -import org.pytorch.executorch.EValue; -import org.pytorch.executorch.Tensor; +import org.pytorch.executorch.extension.llm.LlmCallback; +import org.pytorch.executorch.extension.llm.LlmModule; -/** Unit tests for {@link LlamaModule}. */ +/** Unit tests for {@link org.pytorch.executorch.extension.llm.LlmModule}. */ @RunWith(AndroidJUnit4.class) -public class LlamaModuleInstrumentationTest implements LlamaCallback { +public class LlmModuleInstrumentationTest implements LlmCallback { private static String TEST_FILE_NAME = "/tinyllama_portable_fp16_h.pte"; private static String TOKENIZER_FILE_NAME = "/tokenizer.bin"; private static String TEST_PROMPT = "Hello"; @@ -51,7 +48,7 @@ public class LlamaModuleInstrumentationTest implements LlamaCallback { private final List<String> results = new ArrayList<>(); private final List<Float> tokensPerSecond = new ArrayList<>(); - private LlamaModule mModule; + private LlmModule mModule; private static String getTestFilePath(String fileName) { return InstrumentationRegistry.getInstrumentation().getTargetContext().getExternalCacheDir() + fileName; @@ -70,7 +67,7 @@ public void setUp() throws IOException { FileUtils.copyInputStreamToFile(inputStream, tokenizerFile); inputStream.close(); - mModule = new LlamaModule(getTestFilePath(TEST_FILE_NAME), getTestFilePath(TOKENIZER_FILE_NAME), 0.0f); + mModule = new LlmModule(getTestFilePath(TEST_FILE_NAME), getTestFilePath(TOKENIZER_FILE_NAME), 0.0f); } @Rule @@ -82,7 +79,7 @@ public void testGenerate() throws IOException, URISyntaxException{ // Check that the model can be loaded successfully assertEquals(OK, loadResult); - mModule.generate(TEST_PROMPT, SEQ_LEN, LlamaModuleInstrumentationTest.this); + mModule.generate(TEST_PROMPT, SEQ_LEN, LlmModuleInstrumentationTest.this); assertEquals(results.size(), SEQ_LEN); assertTrue(tokensPerSecond.get(tokensPerSecond.size() - 1) > 0); } @@ -90,16 +87,16 @@ @Test public void testGenerateAndStop() throws IOException, URISyntaxException{ int seqLen = 32; - mModule.generate(TEST_PROMPT, SEQ_LEN, new LlamaCallback() { + mModule.generate(TEST_PROMPT, SEQ_LEN, new LlmCallback() { @Override public void onResult(String result) { - LlamaModuleInstrumentationTest.this.onResult(result); + LlmModuleInstrumentationTest.this.onResult(result); mModule.stop(); } @Override public void onStats(float tps) { - LlamaModuleInstrumentationTest.this.onStats(tps); + LlmModuleInstrumentationTest.this.onStats(tps); } }); diff --git a/extension/android_test/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java similarity index 96% rename from extension/android_test/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java rename to extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java index e8259969ab1..a25c0bf6343 100644 ---
a/extension/android_test/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.java @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -package com.example.executorch; +package org.pytorch.executorch; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -32,9 +32,6 @@ import org.apache.commons.io.FileUtils; import androidx.test.ext.junit.runners.AndroidJUnit4; import androidx.test.InstrumentationRegistry; -import org.pytorch.executorch.Module; -import org.pytorch.executorch.EValue; -import org.pytorch.executorch.Tensor; /** Unit tests for {@link Module}. */ @RunWith(AndroidJUnit4.class) diff --git a/extension/android_test/src/androidTest/resources/test.txt b/extension/android/executorch_android/src/androidTest/resources/test.txt similarity index 100% rename from extension/android_test/src/androidTest/resources/test.txt rename to extension/android/executorch_android/src/androidTest/resources/test.txt diff --git a/extension/android/executorch_android/src/main/AndroidManifest.xml b/extension/android/executorch_android/src/main/AndroidManifest.xml new file mode 100644 index 00000000000..cb2a2dcdc53 --- /dev/null +++ b/extension/android/executorch_android/src/main/AndroidManifest.xml @@ -0,0 +1,9 @@ + + + + + + diff --git a/extension/android/src/main/java/org/pytorch/executorch/DType.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.java similarity index 100% rename from extension/android/src/main/java/org/pytorch/executorch/DType.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.java diff --git a/extension/android/src/main/java/org/pytorch/executorch/EValue.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java similarity index 96% rename from extension/android/src/main/java/org/pytorch/executorch/EValue.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java index 599818a00d7..ab3b77ff1fb 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/EValue.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java @@ -177,10 +177,10 @@ private String getTypeName(int typeCode) { } /** - * Serializes an {@code EValue} into a byte array. + * Serializes an {@code EValue} into a byte array. Note: This method is experimental and subject + * to change without notice. * * @return The serialized byte array. - * @apiNote This method is experimental and subject to change without notice. */ public byte[] toByteArray() { if (isNone()) { @@ -212,11 +212,11 @@ public byte[] toByteArray() { } /** - * Deserializes an {@code EValue} from a byte[]. + * Deserializes an {@code EValue} from a byte[]. Note: This method is experimental and subject to + * change without notice. * * @param bytes The byte array to deserialize from. * @return The deserialized {@code EValue}. - * @apiNote This method is experimental and subject to change without notice. 
*/ public static EValue fromByteArray(byte[] bytes) { ByteBuffer buffer = ByteBuffer.wrap(bytes); diff --git a/extension/android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java similarity index 100% rename from extension/android/src/main/java/org/pytorch/executorch/Module.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java diff --git a/extension/android/src/main/java/org/pytorch/executorch/NativePeer.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/NativePeer.java similarity index 100% rename from extension/android/src/main/java/org/pytorch/executorch/NativePeer.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/NativePeer.java diff --git a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Tensor.java similarity index 98% rename from extension/android/src/main/java/org/pytorch/executorch/Tensor.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/Tensor.java index f76a247a59a..6b32d90cda8 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/Tensor.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Tensor.java @@ -682,11 +682,10 @@ private static Tensor nativeNewTensor( } /** - * Serializes a {@code Tensor} into a byte array. + * Serializes a {@code Tensor} into a byte array. Note: This method is experimental and subject to + * change without notice. This does NOT support list type. * * @return The serialized byte array. - * @apiNote This method is experimental and subject to change without notice. This does NOT - * supoprt list type. */ public byte[] toByteArray() { int dtypeSize = 0; @@ -738,12 +737,11 @@ public byte[] toByteArray() { } /** - * Deserializes a {@code Tensor} from a byte[]. + * Deserializes a {@code Tensor} from a byte[]. Note: This method is experimental and subject to + * change without notice. This does NOT support list type. * - * @param buffer The byte array to deserialize from. + * @param bytes The byte array to deserialize from. * @return The deserialized {@code Tensor}. - * @apiNote This method is experimental and subject to change without notice. This does NOT - * supoprt list type.
*/ public static Tensor fromByteArray(byte[] bytes) { if (bytes == null) { diff --git a/extension/android/src/main/java/org/pytorch/executorch/annotations/Experimental.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java similarity index 100% rename from extension/android/src/main/java/org/pytorch/executorch/annotations/Experimental.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java similarity index 92% rename from extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java index b30fa2515a9..c05b30b0625 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaCallback.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -package org.pytorch.executorch; +package org.pytorch.executorch.extension.llm; import com.facebook.jni.annotations.DoNotStrip; import org.pytorch.executorch.annotations.Experimental; @@ -18,7 +18,7 @@ *

Warning: These APIs are experimental and subject to change without notice */ @Experimental -public interface LlamaCallback { +public interface LlmCallback { /** * Called when a new result is available from JNI. Users will keep getting onResult() invocations * until generate() finishes. diff --git a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java similarity index 74% rename from extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java index 6de26bc7fe8..8262d7cfdad 100644 --- a/extension/android/src/main/java/org/pytorch/executorch/LlamaModule.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -package org.pytorch.executorch; +package org.pytorch.executorch.extension.llm; import com.facebook.jni.HybridData; import com.facebook.jni.annotations.DoNotStrip; @@ -15,13 +15,13 @@ import org.pytorch.executorch.annotations.Experimental; /** - * LlamaModule is a wrapper around the Executorch Llama model. It provides a simple interface to + * LlmModule is a wrapper around the Executorch LLM. It provides a simple interface to * generate text from the model. * *

Warning: These APIs are experimental and subject to change without notice */ @Experimental -public class LlamaModule { +public class LlmModule { public static final int MODEL_TYPE_TEXT = 1; public static final int MODEL_TYPE_TEXT_VISION = 2; @@ -39,16 +39,24 @@ public class LlamaModule { @DoNotStrip private static native HybridData initHybrid( - int modelType, String modulePath, String tokenizerPath, float temperature); + int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath); - /** Constructs a LLAMA Module for a model with given path, tokenizer, and temperature. */ - public LlamaModule(String modulePath, String tokenizerPath, float temperature) { - mHybridData = initHybrid(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature); + /** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */ + public LlmModule(String modulePath, String tokenizerPath, float temperature) { + mHybridData = initHybrid(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, null); + } + + /** + * Constructs a LLM Module for a model with given model path, tokenizer, temperature and data + * path. + */ + public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) { + mHybridData = initHybrid(MODEL_TYPE_TEXT, modulePath, tokenizerPath, temperature, dataPath); } /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. */ - public LlamaModule(int modelType, String modulePath, String tokenizerPath, float temperature) { - mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature); + public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) { + mHybridData = initHybrid(modelType, modulePath, tokenizerPath, temperature, null); } public void resetNative() { @@ -59,10 +67,10 @@ public void resetNative() { * Start generating tokens from the module. * * @param prompt Input prompt - * @param llamaCallback callback object to receive results. + * @param llmCallback callback object to receive results. */ - public int generate(String prompt, LlamaCallback llamaCallback) { - return generate(prompt, DEFAULT_SEQ_LEN, llamaCallback, DEFAULT_ECHO); + public int generate(String prompt, LlmCallback llmCallback) { + return generate(prompt, DEFAULT_SEQ_LEN, llmCallback, DEFAULT_ECHO); } /** @@ -70,21 +78,21 @@ public int generate(String prompt, LlamaCallback llamaCallback) { * * @param prompt Input prompt * @param seqLen sequence length - * @param llamaCallback callback object to receive results. + * @param llmCallback callback object to receive results. */ - public int generate(String prompt, int seqLen, LlamaCallback llamaCallback) { - return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, DEFAULT_ECHO); + public int generate(String prompt, int seqLen, LlmCallback llmCallback) { + return generate(null, 0, 0, 0, prompt, seqLen, llmCallback, DEFAULT_ECHO); } /** * Start generating tokens from the module. 
* * @param prompt Input prompt - * @param llamaCallback callback object to receive results + * @param llmCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ - public int generate(String prompt, LlamaCallback llamaCallback, boolean echo) { - return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, llamaCallback, echo); + public int generate(String prompt, LlmCallback llmCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, DEFAULT_SEQ_LEN, llmCallback, echo); } /** @@ -92,11 +100,11 @@ public int generate(String prompt, LlamaCallback llamaCallback, boolean echo) { * * @param prompt Input prompt * @param seqLen sequence length - * @param llamaCallback callback object to receive results + * @param llmCallback callback object to receive results * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ - public int generate(String prompt, int seqLen, LlamaCallback llamaCallback, boolean echo) { - return generate(null, 0, 0, 0, prompt, seqLen, llamaCallback, echo); + public int generate(String prompt, int seqLen, LlmCallback llmCallback, boolean echo) { + return generate(null, 0, 0, 0, prompt, seqLen, llmCallback, echo); } /** @@ -108,7 +116,7 @@ public int generate(String prompt, int seqLen, LlamaCallback llamaCallback, bool * @param channels Input image number of channels * @param prompt Input prompt * @param seqLen sequence length - * @param llamaCallback callback object to receive results. + * @param llmCallback callback object to receive results. * @param echo indicate whether to echo the input prompt or not (text completion vs chat) */ @DoNotStrip @@ -119,7 +127,7 @@ public native int generate( int channels, String prompt, int seqLen, - LlamaCallback llamaCallback, + LlmCallback llmCallback, boolean echo); /** @@ -178,7 +186,7 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @return The error code. */ public native int generateFromPos( - String prompt, int seqLen, long startPos, LlamaCallback callback, boolean echo); + String prompt, int seqLen, long startPos, LlmCallback callback, boolean echo); /** Stop current generate() before it finishes. 
*/ @DoNotStrip diff --git a/extension/android_test/src/test/java/org/pytorch/executorch/EValueTest.java b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java similarity index 100% rename from extension/android_test/src/test/java/org/pytorch/executorch/EValueTest.java rename to extension/android/executorch_android/src/test/java/org/pytorch/executorch/EValueTest.java diff --git a/extension/android_test/src/test/java/org/pytorch/executorch/TensorTest.java b/extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java similarity index 100% rename from extension/android_test/src/test/java/org/pytorch/executorch/TensorTest.java rename to extension/android/executorch_android/src/test/java/org/pytorch/executorch/TensorTest.java diff --git a/extension/android/gradle.properties b/extension/android/gradle.properties new file mode 100644 index 00000000000..5bac8ac5046 --- /dev/null +++ b/extension/android/gradle.properties @@ -0,0 +1 @@ +android.useAndroidX=true diff --git a/extension/android/gradle/wrapper/gradle-wrapper.properties b/extension/android/gradle/wrapper/gradle-wrapper.properties index a80b22ce5cf..e2847c82004 100644 --- a/extension/android/gradle/wrapper/gradle-wrapper.properties +++ b/extension/android/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.11.1-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/extension/android/jni/jni_layer.cpp b/extension/android/jni/jni_layer.cpp index 551307b495a..f3c62e1d70f 100644 --- a/extension/android/jni/jni_layer.cpp +++ b/extension/android/jni/jni_layer.cpp @@ -408,14 +408,14 @@ class ExecuTorchJni : public facebook::jni::HybridClass { } // namespace executorch::extension #ifdef EXECUTORCH_BUILD_LLAMA_JNI -extern void register_natives_for_llama(); +extern void register_natives_for_llm(); #else -// No op if we don't build llama -void register_natives_for_llama() {} +// No op if we don't build LLM +void register_natives_for_llm() {} #endif JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void*) { return facebook::jni::initialize(vm, [] { executorch::extension::ExecuTorchJni::registerNatives(); - register_natives_for_llama(); + register_natives_for_llm(); }); } diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 5c58c89ee91..d6ade74ee1f 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -75,14 +75,14 @@ std::string token_buffer; namespace executorch_jni { -class ExecuTorchLlamaCallbackJni - : public facebook::jni::JavaClass { +class ExecuTorchLlmCallbackJni + : public facebook::jni::JavaClass { public: constexpr static const char* kJavaDescriptor = - "Lorg/pytorch/executorch/LlamaCallback;"; + "Lorg/pytorch/executorch/extension/llm/LlmCallback;"; void onResult(std::string result) const { - static auto cls = ExecuTorchLlamaCallbackJni::javaClassStatic(); + static auto cls = ExecuTorchLlmCallbackJni::javaClassStatic(); static const auto method = cls->getMethod)>("onResult"); @@ -99,7 +99,7 @@ class ExecuTorchLlamaCallbackJni } void onStats(const llm::Stats& result) const { - static auto cls = ExecuTorchLlamaCallbackJni::javaClassStatic(); + static auto cls = ExecuTorchLlmCallbackJni::javaClassStatic(); static const 
auto method = cls->getMethod("onStats"); double eval_time = (double)(result.inference_end_ms - result.prompt_eval_end_ms); @@ -111,8 +111,7 @@ class ExecuTorchLlamaCallbackJni } }; -class ExecuTorchLlamaJni - : public facebook::jni::HybridClass { +class ExecuTorchLlmJni : public facebook::jni::HybridClass { private: friend HybridBase; int model_type_category_; @@ -121,7 +120,7 @@ class ExecuTorchLlamaJni public: constexpr static auto kJavaDescriptor = - "Lorg/pytorch/executorch/LlamaModule;"; + "Lorg/pytorch/executorch/extension/llm/LlmModule;"; constexpr static int MODEL_TYPE_CATEGORY_LLM = 1; constexpr static int MODEL_TYPE_CATEGORY_MULTIMODAL = 2; @@ -132,16 +131,22 @@ class ExecuTorchLlamaJni jint model_type_category, facebook::jni::alias_ref model_path, facebook::jni::alias_ref tokenizer_path, - jfloat temperature) { + jfloat temperature, + facebook::jni::alias_ref data_path) { return makeCxxInstance( - model_type_category, model_path, tokenizer_path, temperature); + model_type_category, + model_path, + tokenizer_path, + temperature, + data_path); } - ExecuTorchLlamaJni( + ExecuTorchLlmJni( jint model_type_category, facebook::jni::alias_ref model_path, facebook::jni::alias_ref tokenizer_path, - jfloat temperature) { + jfloat temperature, + facebook::jni::alias_ref data_path = nullptr) { #if defined(ET_USE_THREADPOOL) // Reserve 1 thread for the main thread. uint32_t num_performant_cores = @@ -160,10 +165,18 @@ class ExecuTorchLlamaJni tokenizer_path->toStdString().c_str(), temperature); } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) { - runner_ = std::make_unique( - model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), - temperature); + if (data_path != nullptr) { + runner_ = std::make_unique( + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + temperature, + data_path->toStdString().c_str()); + } else { + runner_ = std::make_unique( + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + temperature); + } #if defined(EXECUTORCH_BUILD_MEDIATEK) } else if (model_type_category == MODEL_TYPE_MEDIATEK_LLAMA) { runner_ = std::make_unique( @@ -183,7 +196,7 @@ class ExecuTorchLlamaJni jint channels, facebook::jni::alias_ref prompt, jint seq_len, - facebook::jni::alias_ref callback, + facebook::jni::alias_ref callback, jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { auto image_size = image->size(); @@ -282,7 +295,7 @@ class ExecuTorchLlamaJni facebook::jni::alias_ref prompt, jint seq_len, jlong start_pos, - facebook::jni::alias_ref callback, + facebook::jni::alias_ref callback, jboolean echo) { if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { return static_cast(Error::NotSupported); @@ -315,22 +328,22 @@ class ExecuTorchLlamaJni static void registerNatives() { registerHybrid({ - makeNativeMethod("initHybrid", ExecuTorchLlamaJni::initHybrid), - makeNativeMethod("generate", ExecuTorchLlamaJni::generate), - makeNativeMethod("stop", ExecuTorchLlamaJni::stop), - makeNativeMethod("load", ExecuTorchLlamaJni::load), + makeNativeMethod("initHybrid", ExecuTorchLlmJni::initHybrid), + makeNativeMethod("generate", ExecuTorchLlmJni::generate), + makeNativeMethod("stop", ExecuTorchLlmJni::stop), + makeNativeMethod("load", ExecuTorchLlmJni::load), makeNativeMethod( - "prefillImagesNative", ExecuTorchLlamaJni::prefill_images), + "prefillImagesNative", ExecuTorchLlmJni::prefill_images), makeNativeMethod( - "prefillPromptNative", ExecuTorchLlamaJni::prefill_prompt), + 
"prefillPromptNative", ExecuTorchLlmJni::prefill_prompt), makeNativeMethod( - "generateFromPos", ExecuTorchLlamaJni::generate_from_pos), + "generateFromPos", ExecuTorchLlmJni::generate_from_pos), }); } }; } // namespace executorch_jni -void register_natives_for_llama() { - executorch_jni::ExecuTorchLlamaJni::registerNatives(); +void register_natives_for_llm() { + executorch_jni::ExecuTorchLlmJni::registerNatives(); } diff --git a/extension/android/settings.gradle b/extension/android/settings.gradle index 2a0a78b30cc..95d46203058 100644 --- a/extension/android/settings.gradle +++ b/extension/android/settings.gradle @@ -5,10 +5,20 @@ * For more detailed information on multi-project builds, please refer to https://docs.gradle.org/8.6/userguide/multi_project_builds.html in the Gradle documentation. */ +pluginManagement { + repositories { + google() + gradlePluginPortal() + mavenCentral() + mavenLocal() + } +} + plugins { // Apply the foojay-resolver plugin to allow automatic download of JDKs id 'org.gradle.toolchains.foojay-resolver-convention' version '0.7.0' } rootProject.name = 'executorch' -include('src') + +include('executorch_android') diff --git a/extension/android_test/.gitignore b/extension/android_test/.gitignore deleted file mode 100644 index a43b7e827a8..00000000000 --- a/extension/android_test/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -local.properties -.gradle -.idea/* -.externalNativeBuild -src/libs/* -build diff --git a/extension/android_test/TARGETS b/extension/android_test/TARGETS deleted file mode 100644 index 5c4f482b5ea..00000000000 --- a/extension/android_test/TARGETS +++ /dev/null @@ -1 +0,0 @@ -# This file needs to exist to avoid build system breakage, see https://fburl.com/workplace/jtdlgdmd diff --git a/extension/android_test/add_model.py b/extension/android_test/add_model.py deleted file mode 100644 index b7ac3955ee3..00000000000 --- a/extension/android_test/add_model.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch -from executorch.exir import to_edge -from torch.export import export - - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y - - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export(Add(), (torch.ones(1), torch.ones(1)), strict=True) - -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. Save the compiled .pte program -with open("add.pte", "wb") as file: - file.write(executorch_program.buffer) diff --git a/extension/android_test/build.gradle b/extension/android_test/build.gradle deleted file mode 100644 index d2310d8b5fb..00000000000 --- a/extension/android_test/build.gradle +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -plugins { - id("com.android.application") version "8.1.0" apply false -} - -group 'org.pytorch.executorch' - -apply plugin: "com.android.application" - -android { - namespace 'org.pytorch.executorch' - compileSdk = 34 - - defaultConfig { - minSdk = 28 - targetSdk = 33 - versionCode 1 - versionName "1.0" - testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" - } - compileOptions { - sourceCompatibility JavaVersion.VERSION_1_8 - targetCompatibility JavaVersion.VERSION_1_8 - } - sourceSets { - androidTest { - resources.srcDirs += [ 'src/androidTest/resources' ] - } - } -} - -dependencies { - implementation 'com.facebook.soloader:nativeloader:0.10.5' - implementation("com.facebook.fbjni:fbjni:0.5.1") - implementation(files("src/libs/executorch.aar")) - testImplementation 'junit:junit:4.13.2' - androidTestImplementation 'androidx.test.ext:junit:1.1.5' - androidTestImplementation 'androidx.test:rules:1.2.0' - androidTestImplementation 'commons-io:commons-io:2.4' - androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1' - androidTestImplementation 'com.google.gms:google-services:4.3.3' -} - -task('setupNativeLibs', type: Exec){ - commandLine("bash", "setup.sh") -} - -gradle.projectsEvaluated { - preBuild.dependsOn setupNativeLibs -} diff --git a/extension/android_test/gradle.properties b/extension/android_test/gradle.properties deleted file mode 100644 index 2cbd6d19d33..00000000000 --- a/extension/android_test/gradle.properties +++ /dev/null @@ -1,23 +0,0 @@ -# Project-wide Gradle settings. -# IDE (e.g. Android Studio) users: -# Gradle settings configured through the IDE *will override* -# any settings specified in this file. -# For more details on how to configure your build environment visit -# http://www.gradle.org/docs/current/userguide/build_environment.html -# Specifies the JVM arguments used for the daemon process. -# The setting is particularly useful for tweaking memory settings. -org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 -# When configured, Gradle will run in incubating parallel mode. -# This option should only be used with decoupled projects. More details, visit -# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects -# org.gradle.parallel=true -# AndroidX package structure to make it clearer which packages are bundled with the -# Android operating system, and which are packaged with your app's APK -# https://developer.android.com/topic/libraries/support-library/androidx-rn -android.useAndroidX=true -# Kotlin code style for this project: "official" or "obsolete": -kotlin.code.style=official -# Enables namespacing of each library's R class so that its R class includes only the -# resources declared in the library itself and none from the library's dependencies, -# thereby reducing the size of the R class for that library -android.nonTransitiveRClass=true diff --git a/extension/android_test/gradle/libs.versions.toml b/extension/android_test/gradle/libs.versions.toml deleted file mode 100644 index 561988cb1f6..00000000000 --- a/extension/android_test/gradle/libs.versions.toml +++ /dev/null @@ -1,12 +0,0 @@ -# This file was generated by the Gradle 'init' task. 
-# https://docs.gradle.org/current/userguide/platforms.html#sub::toml-dependencies-format - -[versions] -commons-math3 = "3.6.1" -guava = "32.1.3-jre" -junit = "4.13.2" - -[libraries] -commons-math3 = { module = "org.apache.commons:commons-math3", version.ref = "commons-math3" } -guava = { module = "com.google.guava:guava", version.ref = "guava" } -junit = { module = "junit:junit", version.ref = "junit" } diff --git a/extension/android_test/gradle/wrapper/gradle-wrapper.jar b/extension/android_test/gradle/wrapper/gradle-wrapper.jar deleted file mode 100644 index d64cd491770..00000000000 Binary files a/extension/android_test/gradle/wrapper/gradle-wrapper.jar and /dev/null differ diff --git a/extension/android_test/gradle/wrapper/gradle-wrapper.properties b/extension/android_test/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index a80b22ce5cf..00000000000 --- a/extension/android_test/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,7 +0,0 @@ -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip -networkTimeout=10000 -validateDistributionUrl=true -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists diff --git a/extension/android_test/gradlew b/extension/android_test/gradlew deleted file mode 100755 index 1aa94a42690..00000000000 --- a/extension/android_test/gradlew +++ /dev/null @@ -1,249 +0,0 @@ -#!/bin/sh - -# -# Copyright © 2015-2021 the original authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -############################################################################## -# -# Gradle start up script for POSIX generated by Gradle. -# -# Important for running: -# -# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is -# noncompliant, but you have some other compliant shell such as ksh or -# bash, then to run this script, type that shell name before the whole -# command line, like: -# -# ksh Gradle -# -# Busybox and similar reduced shells will NOT work, because this script -# requires all of these POSIX shell features: -# * functions; -# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», -# «${var#prefix}», «${var%suffix}», and «$( cmd )»; -# * compound commands having a testable exit status, especially «case»; -# * various built-in commands including «command», «set», and «ulimit». -# -# Important for patching: -# -# (2) This script targets any POSIX shell, so it avoids extensions provided -# by Bash, Ksh, etc; in particular arrays are avoided. -# -# The "traditional" practice of packing multiple parameters into a -# space-separated string is a well documented source of bugs and security -# problems, so this is (mostly) avoided, by progressively accumulating -# options in "$@", and eventually passing that to Java. -# -# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, -# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; -# see the in-line comments for details. 
-# -# There are tweaks for specific operating systems such as AIX, CygWin, -# Darwin, MinGW, and NonStop. -# -# (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt -# within the Gradle project. -# -# You can find Gradle at https://github.com/gradle/gradle/. -# -############################################################################## - -# Attempt to set APP_HOME - -# Resolve links: $0 may be a link -app_path=$0 - -# Need this for daisy-chained symlinks. -while - APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path - [ -h "$app_path" ] -do - ls=$( ls -ld "$app_path" ) - link=${ls#*' -> '} - case $link in #( - /*) app_path=$link ;; #( - *) app_path=$APP_HOME$link ;; - esac -done - -# This is normally unused -# shellcheck disable=SC2034 -APP_BASE_NAME=${0##*/} -# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) -APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD=maximum - -warn () { - echo "$*" -} >&2 - -die () { - echo - echo "$*" - echo - exit 1 -} >&2 - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -nonstop=false -case "$( uname )" in #( - CYGWIN* ) cygwin=true ;; #( - Darwin* ) darwin=true ;; #( - MSYS* | MINGW* ) msys=true ;; #( - NONSTOP* ) nonstop=true ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD=$JAVA_HOME/jre/sh/java - else - JAVACMD=$JAVA_HOME/bin/java - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD=java - if ! command -v java >/dev/null 2>&1 - then - die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -fi - -# Increase the maximum file descriptors if we can. -if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then - case $MAX_FD in #( - max*) - # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC2039,SC3045 - MAX_FD=$( ulimit -H -n ) || - warn "Could not query maximum file descriptor limit" - esac - case $MAX_FD in #( - '' | soft) :;; #( - *) - # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC2039,SC3045 - ulimit -n "$MAX_FD" || - warn "Could not set maximum file descriptor limit to $MAX_FD" - esac -fi - -# Collect all arguments for the java command, stacking in reverse order: -# * args from the command line -# * the main class name -# * -classpath -# * -D...appname settings -# * --module-path (only if needed) -# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
- -# For Cygwin or MSYS, switch paths to Windows format before running java -if "$cygwin" || "$msys" ; then - APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) - CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) - - JAVACMD=$( cygpath --unix "$JAVACMD" ) - - # Now convert the arguments - kludge to limit ourselves to /bin/sh - for arg do - if - case $arg in #( - -*) false ;; # don't mess with options #( - /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath - [ -e "$t" ] ;; #( - *) false ;; - esac - then - arg=$( cygpath --path --ignore --mixed "$arg" ) - fi - # Roll the args list around exactly as many times as the number of - # args, so each arg winds up back in the position where it started, but - # possibly modified. - # - # NB: a `for` loop captures its iteration list before it begins, so - # changing the positional parameters here affects neither the number of - # iterations, nor the values presented in `arg`. - shift # remove old arg - set -- "$@" "$arg" # push replacement arg - done -fi - - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' - -# Collect all arguments for the java command: -# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, -# and any embedded shellness will be escaped. -# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be -# treated as '${Hostname}' itself on the command line. - -set -- \ - "-Dorg.gradle.appname=$APP_BASE_NAME" \ - -classpath "$CLASSPATH" \ - org.gradle.wrapper.GradleWrapperMain \ - "$@" - -# Stop when "xargs" is not available. -if ! command -v xargs >/dev/null 2>&1 -then - die "xargs is not available" -fi - -# Use "xargs" to parse quoted args. -# -# With -n1 it outputs one arg per line, with the quotes and backslashes removed. -# -# In Bash we could simply go: -# -# readarray ARGS < <( xargs -n1 <<<"$var" ) && -# set -- "${ARGS[@]}" "$@" -# -# but POSIX shell has neither arrays nor command substitution, so instead we -# post-process each arg (as a line of input to sed) to backslash-escape any -# character that might be a shell metacharacter, then use eval to reverse -# that process (while maintaining the separation between arguments), and wrap -# the whole thing up as a single "set" statement. -# -# This will of course break if any of these variables contains a newline or -# an unmatched quote. -# - -eval "set -- $( - printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | - xargs -n1 | - sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | - tr '\n' ' ' - )" '"$@"' - -exec "$JAVACMD" "$@" diff --git a/extension/android_test/gradlew.bat b/extension/android_test/gradlew.bat deleted file mode 100644 index 25da30dbdee..00000000000 --- a/extension/android_test/gradlew.bat +++ /dev/null @@ -1,92 +0,0 @@ -@rem -@rem Copyright 2015 the original author or authors. -@rem -@rem Licensed under the Apache License, Version 2.0 (the "License"); -@rem you may not use this file except in compliance with the License. -@rem You may obtain a copy of the License at -@rem -@rem https://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, software -@rem distributed under the License is distributed on an "AS IS" BASIS, -@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -@rem See the License for the specific language governing permissions and -@rem limitations under the License. 
-@rem - -@if "%DEBUG%"=="" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%"=="" set DIRNAME=. -@rem This is normally unused -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Resolve any "." and ".." in APP_HOME to make it shorter. -for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if %ERRORLEVEL% equ 0 goto execute - -echo. 1>&2 -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 1>&2 - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto execute - -echo. 1>&2 -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 1>&2 - -goto fail - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* - -:end -@rem End local scope for the variables with windows NT shell -if %ERRORLEVEL% equ 0 goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -set EXIT_CODE=%ERRORLEVEL% -if %EXIT_CODE% equ 0 set EXIT_CODE=1 -if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% -exit /b %EXIT_CODE% - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/extension/android_test/settings.gradle b/extension/android_test/settings.gradle deleted file mode 100644 index 6b1bd4f7f85..00000000000 --- a/extension/android_test/settings.gradle +++ /dev/null @@ -1,24 +0,0 @@ -/* - * This file was generated by the Gradle 'init' task. - * - * The settings file is used to specify which projects to include in your build. - * For more detailed information on multi-project builds, please refer to https://docs.gradle.org/8.6/userguide/multi_project_builds.html in the Gradle documentation. - */ -pluginManagement { - repositories { - google() - mavenCentral() - gradlePluginPortal() - } -} - -dependencyResolutionManagement { - repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) - repositories { - google() - mavenCentral() - } -} - -rootProject.name = 'executorch' -include('src') diff --git a/extension/android_test/setup.sh b/extension/android_test/setup.sh deleted file mode 100755 index 725728b8092..00000000000 --- a/extension/android_test/setup.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -set -eu - -BUILD_AAR_DIR="$(mktemp -d)" -export BUILD_AAR_DIR - -BASEDIR=$(dirname "$0") -source "$BASEDIR"/../../build/build_android_llm_demo.sh - -build_native_library() { - ANDROID_ABI="$1" - CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" - EXECUTORCH_CMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE:-Release}" - cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -B"${CMAKE_OUT}" - - cmake --build "${CMAKE_OUT}" -j16 --target install - - cmake extension/android \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}"/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="${ANDROID_ABI}" \ - -DCMAKE_INSTALL_PREFIX=c"${CMAKE_OUT}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -B"${CMAKE_OUT}"/extension/android - - cmake --build "${CMAKE_OUT}"/extension/android -j16 - - # Copy artifacts to ABI specific directory - mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" - cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" -} - -pushd "$BASEDIR"/../../ -build_jar -build_native_library "arm64-v8a" -build_native_library "x86_64" -build_aar -bash examples/models/llama/install_requirements.sh -source ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${BUILD_AAR_DIR} -popd -mkdir -p "$BASEDIR"/src/libs -cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/src/libs/executorch.aar -python add_model.py -mv "add.pte" "$BASEDIR"/src/androidTest/resources/add.pte -unzip -o "$BUILD_AAR_DIR"/model.zip -d "$BASEDIR"/src/androidTest/resources diff --git a/extension/apple/CMakeLists.txt b/extension/apple/CMakeLists.txt index ed233da1482..b7abfbe11a2 100644 --- a/extension/apple/CMakeLists.txt +++ b/extension/apple/CMakeLists.txt @@ -18,18 +18,34 @@ endif() add_library(extension_apple) -set(EXPORTED_SOURCES ExecuTorch/Exported/ExecuTorchLog.mm) +file(GLOB EXPORTED_SOURCES + ExecuTorch/Exported/*.m + ExecuTorch/Exported/*.mm +) + +file(GLOB INTERNAL_SOURCES + ExecuTorch/Internal/*.m + ExecuTorch/Internal/*.mm +) -target_sources(extension_apple PRIVATE ${EXPORTED_SOURCES}) +target_sources(extension_apple PRIVATE + ${EXPORTED_SOURCES} + ${INTERNAL_SOURCES} +) -target_include_directories(extension_apple PUBLIC ExecuTorch/Exported) +target_include_directories(extension_apple + PUBLIC ExecuTorch/Exported + PRIVATE ExecuTorch/Internal +) find_library(FOUNDATION_FRAMEWORK Foundation) -target_link_libraries( - extension_apple PRIVATE executorch ${FOUNDATION_FRAMEWORK} +target_link_libraries(extension_apple + PRIVATE executorch ${FOUNDATION_FRAMEWORK} ) target_compile_options(extension_apple PUBLIC ${_common_compile_options}) -target_compile_options(extension_apple PRIVATE "-fobjc-arc") -target_compile_options(extension_apple PRIVATE "-fno-exceptions") -target_compile_options(extension_apple PRIVATE "-fno-rtti") +target_compile_options(extension_apple PRIVATE + "-fobjc-arc" + "-fno-exceptions" + "-fno-rtti" +) diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorch.h b/extension/apple/ExecuTorch/Exported/ExecuTorch.h index 
e16439714f2..3a12a5ddbae 100644 --- a/extension/apple/ExecuTorch/Exported/ExecuTorch.h +++ b/extension/apple/ExecuTorch/Exported/ExecuTorch.h @@ -6,4 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#import "ExecuTorchError.h" #import "ExecuTorchLog.h" +#import "ExecuTorchModule.h" +#import "ExecuTorchTensor.h" +#import "ExecuTorchValue.h" diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchError.h b/extension/apple/ExecuTorch/Exported/ExecuTorchError.h new file mode 100644 index 00000000000..cdf52051d05 --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchError.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import <Foundation/Foundation.h> + +NS_ASSUME_NONNULL_BEGIN + +FOUNDATION_EXPORT NSErrorDomain const ExecuTorchErrorDomain NS_SWIFT_NAME(ErrorDomain); + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchError.m b/extension/apple/ExecuTorch/Exported/ExecuTorchError.m new file mode 100644 index 00000000000..43996dc213e --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchError.m @@ -0,0 +1,11 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchError.h" + +NSErrorDomain const ExecuTorchErrorDomain = @"org.pytorch.executorch.error"; diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h new file mode 100644 index 00000000000..5e6e0ecaf47 --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchValue.h" + +NS_ASSUME_NONNULL_BEGIN + +NS_SWIFT_NAME(Module) +__attribute__((deprecated("This API is experimental."))) +@interface ExecuTorchModule : NSObject + ++ (instancetype)new NS_UNAVAILABLE; +- (instancetype)init NS_UNAVAILABLE; + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm new file mode 100644 index 00000000000..866dcc6901b --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchModule.mm @@ -0,0 +1,20 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchModule.h" + +#import "ExecuTorchError.h" + +#import <executorch/extension/module/module.h> +#import <executorch/extension/tensor/tensor.h> + +@implementation ExecuTorchModule { + std::unique_ptr<executorch::extension::Module> _module; +} + +@end diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h new file mode 100644 index 00000000000..220e377b60d --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
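The error domain just added is the one constant the rest of the new ObjC surface reports through; NS_SWIFT_NAME exposes it to Swift simply as ErrorDomain. A hypothetical Swift-side check (assuming the ExecuTorch module is imported):

```
import Foundation
import ExecuTorch

// ExecuTorchErrorDomain surfaces in Swift as ErrorDomain
// ("org.pytorch.executorch.error").
func isExecuTorchError(_ error: Error) -> Bool {
  (error as NSError).domain == ErrorDomain
}
```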
+ */ + +#import <Foundation/Foundation.h> + +NS_ASSUME_NONNULL_BEGIN + +NS_SWIFT_NAME(Tensor) +__attribute__((deprecated("This API is experimental."))) +@interface ExecuTorchTensor : NSObject + ++ (instancetype)new NS_UNAVAILABLE; +- (instancetype)init NS_UNAVAILABLE; + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm new file mode 100644 index 00000000000..4b072444bec --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.mm @@ -0,0 +1,19 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchTensor.h" + +#import "ExecuTorchError.h" + +#import <executorch/extension/tensor/tensor.h> + +@implementation ExecuTorchTensor { + ::executorch::extension::TensorPtr _tensor; +} + +@end diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchValue.h b/extension/apple/ExecuTorch/Exported/ExecuTorchValue.h new file mode 100644 index 00000000000..9b2c8aaaae6 --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchValue.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchTensor.h" + +NS_ASSUME_NONNULL_BEGIN + +NS_SWIFT_NAME(Value) +__attribute__((deprecated("This API is experimental."))) +@interface ExecuTorchValue : NSObject + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchValue.m b/extension/apple/ExecuTorch/Exported/ExecuTorchValue.m new file mode 100644 index 00000000000..98a6f774176 --- /dev/null +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchValue.m @@ -0,0 +1,13 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecuTorchValue.h" + +@implementation ExecuTorchValue + +@end diff --git a/extension/apple/ExecuTorch/Internal/ExecuTorchUtils.h b/extension/apple/ExecuTorch/Internal/ExecuTorchUtils.h new file mode 100644 index 00000000000..e53999cd355 --- /dev/null +++ b/extension/apple/ExecuTorch/Internal/ExecuTorchUtils.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import <Foundation/Foundation.h> diff --git a/extension/apple/ExecuTorch/__tests__/ModuleTest.swift b/extension/apple/ExecuTorch/__tests__/ModuleTest.swift new file mode 100644 index 00000000000..609727ec93f --- /dev/null +++ b/extension/apple/ExecuTorch/__tests__/ModuleTest.swift @@ -0,0 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
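All three exported classes are tagged `__attribute__((deprecated("This API is experimental.")))`, so Swift callers get a warning on every use until the API stabilizes; note that `ExecuTorchValue` is the only one of the three that leaves the default initializer available at this stage. For example (hypothetical consumer code):

```
import ExecuTorch

// Module and Tensor mark init/new NS_UNAVAILABLE, so Value is the only class
// constructible yet; this line compiles but warns:
// 'Value' is deprecated: This API is experimental.
let value = Value()
```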
+ */ + +@testable import ExecuTorch + +import XCTest + +class ModuleTest: XCTestCase { + func test() throws { + let bundle = Bundle(for: type(of: self)) + guard let modelPath = bundle.path(forResource: "add", ofType: "pte") else { + XCTFail("Couldn't find the model file") + return + } + } +} diff --git a/extension/apple/ExecuTorch/__tests__/TensorTest.swift b/extension/apple/ExecuTorch/__tests__/TensorTest.swift new file mode 100644 index 00000000000..f5c2ccdbeba --- /dev/null +++ b/extension/apple/ExecuTorch/__tests__/TensorTest.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +@testable import ExecuTorch + +import XCTest + +class TensorTest: XCTestCase { + func test() { + } +} diff --git a/extension/apple/ExecuTorch/__tests__/ValueTest.swift b/extension/apple/ExecuTorch/__tests__/ValueTest.swift new file mode 100644 index 00000000000..56802ee540c --- /dev/null +++ b/extension/apple/ExecuTorch/__tests__/ValueTest.swift @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +@testable import ExecuTorch + +import XCTest + +class ValueTest: XCTestCase { + func test() { + } +} diff --git a/extension/apple/ExecuTorch/__tests__/resources/add.pte b/extension/apple/ExecuTorch/__tests__/resources/add.pte new file mode 100644 index 00000000000..43252ca7d3d Binary files /dev/null and b/extension/apple/ExecuTorch/__tests__/resources/add.pte differ diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h new file mode 100644 index 00000000000..103f0781017 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
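ModuleTest above only locates the bundled add.pte (the Module class has no behavior to exercise yet). A sketch of how these placeholder tests might at least pin the resource lookup down, using nothing beyond XCTest and Foundation; ResourceTest is a hypothetical companion, not part of the diff:

```
import XCTest

final class ResourceTest: XCTestCase {
  func testBundledModelExists() throws {
    // Same lookup as ModuleTest: the .pte file is copied into the test bundle.
    let bundle = Bundle(for: type(of: self))
    let modelPath = try XCTUnwrap(bundle.path(forResource: "add", ofType: "pte"))
    XCTAssertTrue(FileManager.default.fileExists(atPath: modelPath))
  }
}
```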
+ */ + +#import <Foundation/Foundation.h> + +#ifdef __cplusplus + #import <executorch/runtime/core/exec_aten/exec_aten.h> + #import <vector> +#endif + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeTensorValue : NSObject + +@property (nonatomic, readonly) NSArray<NSNumber *> *shape; + +- (instancetype)init NS_UNAVAILABLE; ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithFloatArray:(NSArray<NSNumber *> *)floatArray shape:(NSArray<NSNumber *> *)sizes NS_SWIFT_NAME(init(floatArray:shape:)); + +#ifdef __cplusplus +- (nullable instancetype)initWithTensor:(torch::executor::Tensor)tensor error:(NSError * _Nullable * _Nullable)error; +- (instancetype)initWithData:(std::vector<float>)floatData + shape:(std::vector<int32_t>)shape NS_DESIGNATED_INITIALIZER; +- (torch::executor::Tensor)backedValue; +#endif + +#pragma mark - +- (NSArray<NSNumber *> * _Nullable)floatArrayAndReturnError:(NSError * _Nullable * _Nullable)error; + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm new file mode 100644 index 00000000000..cb3dcddb45f --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeTensorValue.mm @@ -0,0 +1,116 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecutorchRuntimeTensorValue.h" + +#import <memory> + +#import <executorch/runtime/core/error.h> + +using torch::executor::TensorImpl; +using torch::executor::ScalarType; + +@implementation ExecutorchRuntimeTensorValue +{ + std::unique_ptr<TensorImpl> _tensor; + // TensorImpl DOES NOT take ownership. + // This float vector is what keeps the data in memory.
+ std::vector<float> _floatData; + std::vector<int32_t> _shape; +} + +- (instancetype)initWithData:(std::vector<float>)floatData + shape:(std::vector<int32_t>)shape +{ + if (self = [super init]) { + _floatData.assign(floatData.begin(), floatData.end()); + _shape.assign(shape.begin(), shape.end()); + _tensor = std::make_unique<TensorImpl>(ScalarType::Float, std::size(_shape), _shape.data(), _floatData.data()); + } + return self; +} + +- (instancetype)initWithFloatArray:(NSArray<NSNumber *> *)floatArray shape:(NSArray<NSNumber *> *)shape +{ + std::vector<float> floatVector; + std::vector<int32_t> shapeVector; + + floatVector.reserve(floatArray.count); + for (int i = 0; i < floatArray.count; i++) { + floatVector.push_back([floatArray[i] floatValue]); + } + shapeVector.reserve(shape.count); + for (int i = 0; i < shape.count; i++) { + shapeVector.push_back([shape[i] intValue]); + } + + return [self initWithData:floatVector shape:shapeVector]; +} + +- (nullable instancetype)initWithTensor:(torch::executor::Tensor)tensor error:(NSError * _Nullable * _Nullable)error +{ + if (tensor.scalar_type() != ScalarType::Float) { + if (error) { + *error = [NSError + errorWithDomain:@"ExecutorchRuntimeEngine" + code:(NSInteger)executorch::runtime::Error::InvalidArgument + userInfo: @{NSDebugDescriptionErrorKey: [NSString stringWithFormat:@"Invalid type: torch::executor::ScalarType::%hhd, expected torch::executor::ScalarType::Float", tensor.scalar_type()]}]; + } + return nil; + } + + std::vector<float> floatVector; + std::vector<int32_t> shapeVector; + shapeVector.assign(tensor.sizes().begin(), tensor.sizes().end()); + floatVector.assign(tensor.const_data_ptr<float>(), tensor.const_data_ptr<float>() + tensor.numel()); + return [self initWithData:floatVector shape:shapeVector]; +} + +- (NSArray<NSNumber *> *)shape +{ + const auto sizes = _tensor->sizes(); + std::vector<int32_t> tensorSizes(sizes.begin(), sizes.end()); + + NSMutableArray<NSNumber *> *sizesArray = [[NSMutableArray alloc] initWithCapacity:tensorSizes.size()]; + for (int &tensorSize : tensorSizes) { + [sizesArray addObject:@(tensorSize)]; + } + + return sizesArray; +} + +- (NSArray<NSNumber *> * _Nullable)floatArrayAndReturnError:(NSError * _Nullable * _Nullable)error { + if (_tensor->scalar_type() == torch::executor::ScalarType::Float) { + const auto *tensorPtr = static_cast<const float *>(_tensor->data()); + const auto sizes = _tensor->sizes(); + std::vector<float> tensorVec(tensorPtr, tensorPtr + _tensor->numel()); + std::vector<int32_t> tensorSizes(sizes.begin(), sizes.end()); + + NSMutableArray<NSNumber *> *floatArray = [[NSMutableArray alloc] initWithCapacity:tensorVec.size()]; + for (float &i : tensorVec) { + [floatArray addObject:@(i)]; + } + return floatArray; + } + + if (error) { + *error = [NSError + errorWithDomain:@"ExecutorchRuntimeEngine" + code:(NSInteger)executorch::runtime::Error::InvalidArgument + userInfo: @{NSDebugDescriptionErrorKey: [NSString stringWithFormat:@"Invalid type: torch::executor::ScalarType::%hhd, expected torch::executor::ScalarType::Float", _tensor->scalar_type()]}]; + } + + return nil; +} + +- (torch::executor::Tensor)backedValue +{ + return torch::executor::Tensor(_tensor.get()); +} + +@end diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h new file mode 100644 index 00000000000..fc1c2c4a35f --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved.
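Because the float-array initializer carries NS_SWIFT_NAME(init(floatArray:shape:)) and the accessor follows the ...AndReturnError: convention, Swift importers see the class roughly as below (a sketch; assumes the ExecutorchRuntimeBridge headers are visible to Swift):

```
// initWithFloatArray:shape: imports as init(floatArray:shape:);
// floatArrayAndReturnError: imports as a throwing floatArray().
let tensor = ExecutorchRuntimeTensorValue(floatArray: [1, 2, 3], shape: [3])
print(tensor.shape)                   // [3]
let floats = try tensor.floatArray()  // [1.0, 2.0, 3.0]
```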
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef __cplusplus + #import <executorch/runtime/core/evalue.h> + #import <executorch/runtime/core/exec_aten/exec_aten.h> +#endif + +#import "ExecutorchRuntimeTensorValue.h" + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeValue : NSObject + +- (instancetype)init NS_UNAVAILABLE; ++ (instancetype)new NS_UNAVAILABLE; + +- (instancetype)initWithTensor:(ExecutorchRuntimeTensorValue *)tensorValue; + +#ifdef __cplusplus +- (instancetype)initWithEValue:(torch::executor::EValue)value NS_DESIGNATED_INITIALIZER; +- (torch::executor::EValue)getBackedValue; +#endif + +#pragma mark - +- (ExecutorchRuntimeTensorValue *_Nullable)asTensorValueAndReturnError:(NSError * _Nullable * _Nullable)error; + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm new file mode 100644 index 00000000000..9cedc6d2afc --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/Data/ExecutorchRuntimeValue.mm @@ -0,0 +1,64 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecutorchRuntimeValue.h" + +#import <executorch/runtime/core/error.h> +#import <executorch/runtime/core/evalue.h> + +#import "ExecutorchRuntimeTensorValue.h" + +using torch::executor::EValue; + +@implementation ExecutorchRuntimeValue +{ + EValue _value; + // IMPORTANT + // Tensor value keeps a reference to the original tensor value. However, the value that is wrapped by ExecutorchRuntimeTensorValue DOES NOT TAKE OWNERSHIP OF THE RAW DATA! + // This means once the wrapper is deallocated, the tensor value will be deallocated as well. + // This reference here is to keep the tensor value alive until the runtime is deallocated. + ExecutorchRuntimeTensorValue *_tensorValue; +} + +- (instancetype)initWithEValue:(EValue)value +{ + if (self = [super init]) { + _value = value; + } + return self; +} + +- (instancetype)initWithTensor:(ExecutorchRuntimeTensorValue *)tensorValue +{ + if (self = [self initWithEValue:EValue([tensorValue backedValue])]) { + _tensorValue = tensorValue; + } + return self; +} + +- (nullable ExecutorchRuntimeTensorValue *)asTensorValueAndReturnError:(NSError * _Nullable * _Nullable)error +{ + if (_value.isTensor()) { + return [[ExecutorchRuntimeTensorValue alloc] initWithTensor:_value.toTensor() error:error]; + } + + if (error) { + *error = [NSError + errorWithDomain:@"ExecutorchRuntimeEngine" + code:static_cast<NSInteger>(executorch::runtime::Error::InvalidArgument) + userInfo: @{NSDebugDescriptionErrorKey: [NSString stringWithFormat:@"Invalid type: Tag::%d, expected Tag::Tensor", _value.tag]}]; + } + return nil; +} + +- (EValue)getBackedValue +{ + return _value; +} + +@end diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h new file mode 100644 index 00000000000..a03f6b3c62f --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved.
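The IMPORTANT comment above is the load-bearing design note: the EValue copied into the wrapper does not own the tensor's raw storage, so initWithTensor: deliberately retains the ExecutorchRuntimeTensorValue to keep the backing floats alive for the wrapper's lifetime. In Swift terms (a sketch, same import assumptions as before):

```
let tensor = ExecutorchRuntimeTensorValue(floatArray: [2.0], shape: [1])
// init(tensor:) stores the wrapper, so the float storage outlives this scope.
let wrapped = ExecutorchRuntimeValue(tensor: tensor)
// asTensorValueAndReturnError: imports as a throwing asTensorValue();
// it throws InvalidArgument when the wrapped EValue is not a tensor.
let roundTripped = try wrapped.asTensorValue()
```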
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import <Foundation/Foundation.h> + +#import "ExecutorchRuntimeValue.h" + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeEngine : NSObject + +- (nonnull instancetype)init NS_UNAVAILABLE; ++ (nonnull instancetype)new NS_UNAVAILABLE; + +- (nullable instancetype)initWithModelPath:(NSString *)modelPath + modelMethodName:(NSString *)modelMethodName + error:(NSError * _Nullable * _Nullable)error NS_DESIGNATED_INITIALIZER; + +- (nullable NSArray<ExecutorchRuntimeValue *> *)infer:(NSArray<ExecutorchRuntimeValue *> *)values + error:(NSError * _Nullable * _Nullable)error NS_SWIFT_NAME(infer(input:)); + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm new file mode 100644 index 00000000000..756ca94f114 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/Exported/ExecutorchRuntimeEngine.mm @@ -0,0 +1,68 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#import "ExecutorchRuntimeEngine.h" + +#import <memory> +#import <vector> + +#import <executorch/extension/module/module.h> + +@implementation ExecutorchRuntimeEngine +{ + NSString *_modelPath; + NSString *_modelMethodName; + std::unique_ptr<torch::executor::Module> _module; +} + +- (instancetype)initWithModelPath:(NSString *)modelPath + modelMethodName:(NSString *)modelMethodName + error:(NSError **)error +{ + if (self = [super init]) { + _modelPath = modelPath; + _modelMethodName = modelMethodName; + _module = std::make_unique<torch::executor::Module>(modelPath.UTF8String); + const auto e = _module->load_method(modelMethodName.UTF8String); + if (e != executorch::runtime::Error::Ok) { + if (error) { + *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine" + code:(NSInteger)e + userInfo:nil]; + } + return nil; + } + } + return self; +} + +- (nullable NSArray<ExecutorchRuntimeValue *> *)infer:(NSArray<ExecutorchRuntimeValue *> *)values + error:(NSError **)error +{ + std::vector<torch::executor::EValue> inputEValues; + inputEValues.reserve(values.count); + for (ExecutorchRuntimeValue *inputValue in values) { + inputEValues.push_back([inputValue getBackedValue]); + } + const auto result = _module->execute(_modelMethodName.UTF8String, inputEValues); + if (!result.ok()) { + if (error) { + *error = [NSError errorWithDomain:@"ExecutorchRuntimeEngine" + code:(NSInteger)result.error() + userInfo:nil]; + } + return nil; + } + NSMutableArray<ExecutorchRuntimeValue *> *const resultValues = [NSMutableArray new]; + for (const auto &evalue : result.get()) { + [resultValues addObject:[[ExecutorchRuntimeValue alloc] initWithEValue:evalue]]; + } + return resultValues; +} + +@end diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm new file mode 100644 index 00000000000..23bc59396b2 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeEngineTests.mm @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
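End to end, the engine plus the two value types give a small inference API. A Swift sketch against the bundled add model, mirroring the ObjC test that follows; modelPath is assumed to point at add.pte, and the error-returning initializer and infer:error: import as throwing:

```
let modelPath = "path/to/add.pte"  // illustrative
let engine = try ExecutorchRuntimeEngine(modelPath: modelPath,
                                         modelMethodName: "forward")
let two = ExecutorchRuntimeValue(
  tensor: ExecutorchRuntimeTensorValue(floatArray: [2.0], shape: [1]))
let outputs = try engine.infer(input: [two, two])          // NS_SWIFT_NAME(infer(input:))
let sum = try outputs.first?.asTensorValue().floatArray()  // [4.0]
```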
+ */ + +#import <XCTest/XCTest.h> + +#import <ExecutorchRuntimeBridge/ExecutorchRuntimeEngine.h> + +NS_ASSUME_NONNULL_BEGIN + +@interface ExecutorchRuntimeEngineTests : XCTestCase +@end + +@implementation ExecutorchRuntimeEngineTests + +- (void)testInvalidModel +{ + NSString *const modelPath = @"invalid_model_path"; + + NSError *runtimeInitError = nil; + ExecutorchRuntimeEngine *const engine = [[ExecutorchRuntimeEngine alloc] initWithModelPath:modelPath modelMethodName:@"forward" error:&runtimeInitError]; + XCTAssertNil(engine); + XCTAssertNotNil(runtimeInitError); + + XCTAssertEqual(runtimeInitError.code, 34); + // 34 is the code for AccessFailed. +} + +- (void)testValidModel +{ + NSBundle *const bundle = [NSBundle bundleForClass:[self class]]; + // This is a simple model that adds two tensors. + NSString *const modelPath = [bundle pathForResource:@"add" ofType:@"pte"]; + NSError *runtimeInitError = nil; + ExecutorchRuntimeEngine *const engine = [[ExecutorchRuntimeEngine alloc] initWithModelPath:modelPath modelMethodName:@"forward" error:&runtimeInitError]; + XCTAssertNotNil(engine); + XCTAssertNil(runtimeInitError); + + ExecutorchRuntimeTensorValue *inputTensor = [[ExecutorchRuntimeTensorValue alloc] initWithFloatArray:@[@2.0] shape:@[@1]]; + ExecutorchRuntimeValue *inputValue = [[ExecutorchRuntimeValue alloc] initWithTensor:inputTensor]; + + NSError *inferenceError = nil; + const auto output = [engine infer:@[inputValue, inputValue] error:&inferenceError]; + XCTAssertNil(inferenceError); + + XCTAssertEqual(output.count, 1); + NSError *tensorValueError = nil; + NSError *floatRepresentationError = nil; + const auto tensorValue = [output.firstObject asTensorValueAndReturnError:&tensorValueError]; + const auto resultFloatArray = [tensorValue floatArrayAndReturnError:&floatRepresentationError]; + const auto resultShape = tensorValue.shape; + + XCTAssertNil(tensorValueError); + XCTAssertNil(floatRepresentationError); + XCTAssertEqual(resultFloatArray.count, 1); + XCTAssertEqual(resultShape.count, 1); + XCTAssertEqual(resultFloatArray.firstObject.floatValue, 4.0); + XCTAssertEqual(resultShape.firstObject.integerValue, 1); +} + +@end + +NS_ASSUME_NONNULL_END diff --git a/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm new file mode 100644 index 00000000000..c3d3599fef2 --- /dev/null +++ b/extension/apple/ExecutorchRuntimeBridge/ExecutorchRuntimeBridge/__tests__/ExecutorchRuntimeValueTests.mm @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
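The invalid-path test above pins the NSError code to 34, which is Error::AccessFailed in the C++ runtime's error enum. The same assertion in Swift (sketch):

```
XCTAssertThrowsError(try ExecutorchRuntimeEngine(modelPath: "invalid_model_path",
                                                 modelMethodName: "forward")) { error in
  XCTAssertEqual((error as NSError).code, 34)  // Error::AccessFailed
}
```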
+ */ + +#import <XCTest/XCTest.h> + +#import <ExecutorchRuntimeBridge/ExecutorchRuntimeTensorValue.h> +#import <ExecutorchRuntimeBridge/ExecutorchRuntimeValue.h> + +using torch::executor::EValue; +using torch::executor::TensorImpl; +using torch::executor::ScalarType; + +@interface ExecutorchRuntimeValueTests : XCTestCase +@end + +@implementation ExecutorchRuntimeValueTests + +- (void)testTensorValue +{ + NSMutableArray *data = [NSMutableArray new]; + for (int i = 0; i < 10; i++) { + [data addObject:@(i + 0.5f)]; + } + + NSArray *shape = @[@(10)]; + + ExecutorchRuntimeTensorValue *tensorValue = [[ExecutorchRuntimeTensorValue alloc] initWithFloatArray:data shape:shape]; + + const auto floatArray = [tensorValue floatArrayAndReturnError:nil]; + const auto shapeArray = [tensorValue shape]; + + XCTAssertEqualObjects(floatArray, data); + XCTAssertEqualObjects(shapeArray, shape); +} + +- (void)testTensorValueWithFloatArrayWithError +{ + std::vector<int32_t> data = {1, 2, 3}; + std::vector<int32_t> shape = {3}; + TensorImpl tensorImpl(ScalarType::Int, std::size(shape), shape.data(), data.data()); + + XCTAssertNil([[ExecutorchRuntimeTensorValue alloc] initWithTensor:*new torch::executor::Tensor(&tensorImpl) error:nil]); + NSError *error = nil; + XCTAssertNil([[ExecutorchRuntimeTensorValue alloc] initWithTensor:*new torch::executor::Tensor(&tensorImpl) error:&error]); + XCTAssertNotNil(error); + XCTAssertEqual(error.code, static_cast<NSInteger>(executorch::runtime::Error::InvalidArgument)); + XCTAssertEqualObjects(error.userInfo[NSDebugDescriptionErrorKey], @"Invalid type: torch::executor::ScalarType::3, expected torch::executor::ScalarType::Float"); +} + +- (void)testTensorValueWithError +{ + ExecutorchRuntimeValue *value = [[ExecutorchRuntimeValue alloc] initWithEValue:EValue((int64_t)1)]; + XCTAssertNil([value asTensorValueAndReturnError:nil]); + NSError *error = nil; + XCTAssertNil([value asTensorValueAndReturnError:&error]); + XCTAssertNotNil(error); + XCTAssertEqual(error.code, static_cast<NSInteger>(executorch::runtime::Error::InvalidArgument)); + XCTAssertEqualObjects(error.userInfo[NSDebugDescriptionErrorKey], @"Invalid type: Tag::4, expected Tag::Tensor"); +} + +@end diff --git a/extension/aten_util/make_aten_functor_from_et_functor.h b/extension/aten_util/make_aten_functor_from_et_functor.h index 64a8fcc2887..4a158ec5302 100644 --- a/extension/aten_util/make_aten_functor_from_et_functor.h +++ b/extension/aten_util/make_aten_functor_from_et_functor.h @@ -166,24 +166,6 @@ struct type_convert<std::optional<F>, torch::executor::optional<T>> final { } }; -// Optionals: ETen to ATen. -template <class F, class T> -struct type_convert<torch::executor::optional<F>, std::optional<T>> final { - public: - torch::executor::optional<F> val; - std::unique_ptr<type_convert<F, T>> convert_struct; - explicit type_convert(torch::executor::optional<F> value) : val(value) {} - std::optional<T> call() { - if (val.has_value()) { - convert_struct = std::make_unique<type_convert<F, T>>( - type_convert<F, T>(val.value())); - return std::optional<T>(convert_struct->call()); - } else { - return std::optional<T>(); - } - } -}; - // ArrayRefs: ATen to ETen. template <class F, class T> struct type_convert<c10::ArrayRef<F>, torch::executor::ArrayRef<T>> final { diff --git a/extension/benchmark/android/benchmark/README.md b/extension/benchmark/android/benchmark/README.md index cfc5ef0e594..f6731023f47 100644 --- a/extension/benchmark/android/benchmark/README.md +++ b/extension/benchmark/android/benchmark/README.md @@ -15,7 +15,7 @@ Minibench is useful for giving reference performance data when developers integ You will need executorch AAR for Java and JNI dependencies. ``` export ANDROID_NDK= -sh build/build_android_llm_demo.sh +sh scripts/build_android_library.sh ``` and copy the AAR to `app/libs`.
``` diff --git a/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 b/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 index ae25a071e5c..4f8e72d21bc 100644 --- a/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 +++ b/extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2 @@ -35,6 +35,10 @@ phases: test: commands: + # Fail the test if the model doesn't exist, doing it here so that AWS can report the status back + - echo "Verify model" + - curl -I --fail '{{ model_path }}' || false + # By default, the following ADB command is used by Device Farm to run your Instrumentation test. # Please refer to Android's documentation for more options on running instrumentation tests with adb: # https://developer.android.com/studio/test/command-line#run-tests-with-adb @@ -105,6 +109,10 @@ phases: - | adb -s $DEVICEFARM_DEVICE_UDID shell am force-stop org.pytorch.minibench + adb -s $DEVICEFARM_DEVICE_UDID shell dumpsys deviceidle force-idle + adb -s $DEVICEFARM_DEVICE_UDID shell dumpsys deviceidle unforce + adb -s $DEVICEFARM_DEVICE_UDID shell sleep 180 + if [ -n "$BIN_FOUND" ]; then adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n org.pytorch.minibench/.LlmBenchmarkActivity \ --es "model_dir" "/data/local/tmp/minibench" \ diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java index 15f527475bc..78830d5a54d 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java @@ -12,6 +12,7 @@ import android.content.Intent; import android.os.AsyncTask; import android.os.Bundle; +import android.os.Debug; import android.system.ErrnoException; import android.system.Os; import com.google.gson.Gson; @@ -20,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import org.pytorch.executorch.Module; @@ -44,6 +46,9 @@ protected void onCreate(Bundle savedInstanceState) { .get(); int numIter = intent.getIntExtra("num_iter", 50); + int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 10); + + long pssIdle = Debug.getPss(); // TODO: Format the string with a parsable format Stats stats = new Stats(); @@ -58,6 +63,10 @@ protected Void doInBackground(Void... 
voids) { stats.errorCode = module.loadMethod("forward"); stats.loadEnd = System.nanoTime(); + for (int i = 0; i < numWarmupIter; i++) { + module.forward(); + } + for (int i = 0; i < numIter; i++) { long start = System.nanoTime(); module.forward(); @@ -75,12 +84,25 @@ protected void onPostExecute(Void aVoid) { final List<BenchmarkMetric> results = new ArrayList<>(); // The list of metrics we have atm includes: // Avg inference latency after N iterations + // Currently the result has large variance from outliers, so only use + // 80% samples in the middle (trimmean 0.2) + Collections.sort(stats.latency); + int resultSize = stats.latency.size(); + List<Long> usedLatencyResults = + stats.latency.subList(resultSize / 10, resultSize * 9 / 10); + results.add( new BenchmarkMetric( benchmarkModel, "avg_inference_latency(ms)", stats.latency.stream().mapToDouble(l -> l).average().orElse(0.0f), 0.0f)); + results.add( + new BenchmarkMetric( + benchmarkModel, + "trimmean_inference_latency(ms)", + usedLatencyResults.stream().mapToDouble(l -> l).average().orElse(0.0f), + 0.0f)); // Model load time results.add( new BenchmarkMetric( @@ -90,6 +112,10 @@ protected void onPostExecute(Void aVoid) { 0.0f)); // Load status results.add(new BenchmarkMetric(benchmarkModel, "load_status", stats.errorCode, 0)); + // RAM PSS usage + results.add( + new BenchmarkMetric( + benchmarkModel, "ram_pss_usage(mb)", (Debug.getPss() - pssIdle) / 1024, 0)); try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { Gson gson = new Gson(); diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java index 9e9b9e003d8..6ba1f57c4f3 100644 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java @@ -12,12 +12,12 @@ import android.os.HandlerThread; import android.os.Looper; import android.os.Message; -import org.pytorch.executorch.LlamaCallback; -import org.pytorch.executorch.LlamaModule; +import org.pytorch.executorch.extension.llm.LlmCallback; +import org.pytorch.executorch.extension.llm.LlmModule; /** A helper class to handle all model running logic within this class.
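The benchmark change above adds a 0.2 trimmed mean next to the plain average: latencies are sorted and only the middle 80% of samples contribute, which suppresses outlier-driven variance across device-farm runs. The same computation, sketched in Swift:

```
// Trimmed mean as in BenchmarkActivity: drop the lowest and highest 10% of
// sorted samples (trim factor 0.2), then average what remains.
func trimmedMean(_ samples: [Double], trim: Double = 0.2) -> Double {
  let sorted = samples.sorted()
  let drop = Int(Double(sorted.count) * trim / 2)
  let kept = sorted.dropFirst(drop).dropLast(drop)
  return kept.isEmpty ? 0 : kept.reduce(0, +) / Double(kept.count)
}
```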
*/ -public class ModelRunner implements LlamaCallback { - LlamaModule mModule = null; +public class ModelRunner implements LlmCallback { + LlmModule mModule = null; String mModelFilePath = ""; String mTokenizerFilePath = ""; @@ -44,7 +44,7 @@ public class ModelRunner implements LlamaCallback { mTokenizerFilePath = tokenizerFilePath; mCallback = callback; - mModule = new LlamaModule(mModelFilePath, mTokenizerFilePath, 0.8f); + mModule = new LlmModule(mModelFilePath, mTokenizerFilePath, 0.8f); mHandlerThread = new HandlerThread("ModelRunner"); mHandlerThread.start(); mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index c43b701e885..9fc1d47cb22 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -3,15 +3,13 @@ archiveVersion = 1; classes = { }; - objectVersion = 56; + objectVersion = 60; objects = { /* Begin PBXBuildFile section */ 032A73CA2CAFBA8600932D36 /* LLaMATests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 032A73C82CAFBA8600932D36 /* LLaMATests.mm */; }; 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73DB2CAFBB7800932D36 /* text_decoder_runner.cpp */; }; - 032A741A2CAFBB7800932D36 /* bpe_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */; }; 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73DD2CAFBB7800932D36 /* text_prefiller.cpp */; }; - 032A741E2CAFBB7800932D36 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */; }; 032A741F2CAFBB7800932D36 /* sampler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A73E62CAFBB7800932D36 /* sampler.cpp */; }; 032A74232CAFC1B300932D36 /* runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A74222CAFC1B300932D36 /* runner.cpp */; }; 032A74262CAFC34800932D36 /* llama_tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */; }; @@ -20,21 +18,18 @@ 03B011912CAD114E00054791 /* ResourceTestCase.m in Sources */ = {isa = PBXBuildFile; fileRef = 03B011902CAD114E00054791 /* ResourceTestCase.m */; }; 03B2D3682C8A515A0046936E /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3672C8A515A0046936E /* App.swift */; }; 03B2D37A2C8A515C0046936E /* GenericTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03B2D3792C8A515C0046936E /* GenericTests.mm */; }; - 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */; }; - 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */; }; - 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */; }; - 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */; }; - 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A12C8FE44600FE4619 /* 
backend_xnnpack.xcframework */; }; - 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */; }; - 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A32C8FE44600FE4619 /* executorch.xcframework */; settings = {ATTRIBUTES = (Required, ); }; }; - 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */; }; 03E7E6792CBDCAE900205E71 /* CoreMLTests.mm in Sources */ = {isa = PBXBuildFile; fileRef = 03E7E6782CBDC1C900205E71 /* CoreMLTests.mm */; }; - 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */; }; - 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */; }; - 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */; }; - 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */; }; - 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D162C8AB00500F2D6EE /* CoreML.framework */; }; - 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */; }; + 03F1814E2D7262FC0058BDF9 /* backend_coreml in Frameworks */ = {isa = PBXBuildFile; productRef = 03F1814D2D7262FC0058BDF9 /* backend_coreml */; }; + 03F181502D7262FC0058BDF9 /* backend_mps in Frameworks */ = {isa = PBXBuildFile; productRef = 03F1814F2D7262FC0058BDF9 /* backend_mps */; }; + 03F181522D7262FC0058BDF9 /* backend_xnnpack in Frameworks */ = {isa = PBXBuildFile; productRef = 03F181512D7262FC0058BDF9 /* backend_xnnpack */; }; + 03F181542D7262FC0058BDF9 /* executorch in Frameworks */ = {isa = PBXBuildFile; productRef = 03F181532D7262FC0058BDF9 /* executorch */; }; + 03F181562D7262FC0058BDF9 /* kernels_custom in Frameworks */ = {isa = PBXBuildFile; productRef = 03F181552D7262FC0058BDF9 /* kernels_custom */; }; + 03F181582D7262FC0058BDF9 /* kernels_optimized in Frameworks */ = {isa = PBXBuildFile; productRef = 03F181572D7262FC0058BDF9 /* kernels_optimized */; }; + 03F1815A2D7262FC0058BDF9 /* kernels_portable in Frameworks */ = {isa = PBXBuildFile; productRef = 03F181592D7262FC0058BDF9 /* kernels_portable */; }; + 03F1815C2D7262FC0058BDF9 /* kernels_quantized in Frameworks */ = {isa = PBXBuildFile; productRef = 03F1815B2D7262FC0058BDF9 /* kernels_quantized */; }; + F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */; }; + F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */; }; + F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B01A2D88AF3500BE6839 /* tiktoken.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -61,12 +56,6 @@ 032A73DF2CAFBB7800932D36 /* util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = 
""; }; 032A73E52CAFBB7800932D36 /* sampler.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 032A73E62CAFBB7800932D36 /* sampler.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; - 032A73F82CAFBB7800932D36 /* base64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = base64.h; sourceTree = ""; }; - 032A73F92CAFBB7800932D36 /* bpe_tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bpe_tokenizer.h; sourceTree = ""; }; - 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = bpe_tokenizer.cpp; sourceTree = ""; }; - 032A73FD2CAFBB7800932D36 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; - 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tiktoken.cpp; sourceTree = ""; }; - 032A73FF2CAFBB7800932D36 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; 032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../../examples/models/llama/runner/runner.h; sourceTree = SOURCE_ROOT; }; 032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../../examples/models/llama/runner/runner.cpp; sourceTree = SOURCE_ROOT; }; 032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../../examples/models/llama/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; }; @@ -83,21 +72,18 @@ 03B2D3752C8A515C0046936E /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 03B2D3792C8A515C0046936E /* GenericTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = GenericTests.mm; sourceTree = ""; }; 03C7FA322C8AA24200E6E9AE /* Resources */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Resources; sourceTree = SOURCE_ROOT; }; - 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_coreml.xcframework; path = Frameworks/backend_coreml.xcframework; sourceTree = ""; }; - 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_custom.xcframework; path = Frameworks/kernels_custom.xcframework; sourceTree = ""; }; - 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_portable.xcframework; path = Frameworks/kernels_portable.xcframework; sourceTree = ""; }; - 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_optimized.xcframework; path = Frameworks/kernels_optimized.xcframework; sourceTree = ""; }; - 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_xnnpack.xcframework; path = Frameworks/backend_xnnpack.xcframework; sourceTree = ""; }; - 
03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = backend_mps.xcframework; path = Frameworks/backend_mps.xcframework; sourceTree = ""; }; - 03DD00A32C8FE44600FE4619 /* executorch.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = executorch.xcframework; path = Frameworks/executorch.xcframework; sourceTree = ""; }; - 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = kernels_quantized.xcframework; path = Frameworks/kernels_quantized.xcframework; sourceTree = ""; }; 03E7E6782CBDC1C900205E71 /* CoreMLTests.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CoreMLTests.mm; sourceTree = ""; }; - 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = libsqlite3.0.tbd; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/usr/lib/libsqlite3.0.tbd; sourceTree = DEVELOPER_DIR; }; - 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShadersGraph.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShadersGraph.framework; sourceTree = DEVELOPER_DIR; }; - 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = DEVELOPER_DIR; }; - 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Metal.framework; sourceTree = DEVELOPER_DIR; }; - 03ED6D162C8AB00500F2D6EE /* CoreML.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreML.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/CoreML.framework; sourceTree = DEVELOPER_DIR; }; - 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk/System/Library/Frameworks/Accelerate.framework; sourceTree = DEVELOPER_DIR; }; + F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer_base.cpp; path = src/bpe_tokenizer_base.cpp; sourceTree = ""; }; + F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama2c_tokenizer.cpp; path = src/llama2c_tokenizer.cpp; sourceTree = ""; }; + F292B01A2D88AF3500BE6839 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = src/tiktoken.cpp; sourceTree = ""; }; + F292B0222D88AF4800BE6839 /* base64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = base64.h; sourceTree = ""; }; + F292B0232D88AF4800BE6839 /* bpe_tokenizer_base.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = bpe_tokenizer_base.h; sourceTree 
= ""; }; + F292B0242D88AF4800BE6839 /* error.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = error.h; sourceTree = ""; }; + F292B0262D88AF4800BE6839 /* llama2c_tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = llama2c_tokenizer.h; sourceTree = ""; }; + F292B0272D88AF4800BE6839 /* log.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = log.h; sourceTree = ""; }; + F292B0292D88AF4800BE6839 /* result.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = result.h; sourceTree = ""; }; + F292B02B2D88AF4800BE6839 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; + F292B02D2D88AF4800BE6839 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -112,20 +98,14 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 03ED6D192C8AB00A00F2D6EE /* Accelerate.framework in Frameworks */, - 03ED6D172C8AB00500F2D6EE /* CoreML.framework in Frameworks */, - 03ED6D152C8AAFFF00F2D6EE /* Metal.framework in Frameworks */, - 03ED6D132C8AAFF700F2D6EE /* MetalPerformanceShaders.framework in Frameworks */, - 03ED6D112C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework in Frameworks */, - 03ED6D0F2C8AAFE900F2D6EE /* libsqlite3.0.tbd in Frameworks */, - 03DD00A92C8FE44600FE4619 /* backend_coreml.xcframework in Frameworks */, - 03DD00B22C8FE44600FE4619 /* backend_mps.xcframework in Frameworks */, - 03DD00B12C8FE44600FE4619 /* backend_xnnpack.xcframework in Frameworks */, - 03DD00B32C8FE44600FE4619 /* executorch.xcframework in Frameworks */, - 03DD00AA2C8FE44600FE4619 /* kernels_custom.xcframework in Frameworks */, - 03DD00B02C8FE44600FE4619 /* kernels_optimized.xcframework in Frameworks */, - 03DD00AF2C8FE44600FE4619 /* kernels_portable.xcframework in Frameworks */, - 03DD00B52C8FE44600FE4619 /* kernels_quantized.xcframework in Frameworks */, + 03F181542D7262FC0058BDF9 /* executorch in Frameworks */, + 03F1815C2D7262FC0058BDF9 /* kernels_quantized in Frameworks */, + 03F181502D7262FC0058BDF9 /* backend_mps in Frameworks */, + 03F1814E2D7262FC0058BDF9 /* backend_coreml in Frameworks */, + 03F181522D7262FC0058BDF9 /* backend_xnnpack in Frameworks */, + 03F181562D7262FC0058BDF9 /* kernels_custom in Frameworks */, + 03F1815A2D7262FC0058BDF9 /* kernels_portable in Frameworks */, + 03F181582D7262FC0058BDF9 /* kernels_optimized in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -137,7 +117,7 @@ children = ( 032A73E02CAFBB7800932D36 /* runner */, 032A73E92CAFBB7800932D36 /* sampler */, - 032A74022CAFBB7800932D36 /* tokenizer */, + 032A74022CAFBB7800932D36 /* tokenizers */, 032A73C82CAFBA8600932D36 /* LLaMATests.mm */, ); path = LLaMA; @@ -173,20 +153,18 @@ path = ../../../llm/sampler; sourceTree = SOURCE_ROOT; }; - 032A74022CAFBB7800932D36 /* tokenizer */ = { + 032A74022CAFBB7800932D36 /* tokenizers */ = { isa = PBXGroup; children = ( - 032A73F82CAFBB7800932D36 /* base64.h */, - 032A73F92CAFBB7800932D36 /* bpe_tokenizer.h */, - 032A73FA2CAFBB7800932D36 /* bpe_tokenizer.cpp */, + F292B0302D88AF4800BE6839 /* include */, + F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */, + F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */, + F292B01A2D88AF3500BE6839 /* tiktoken.cpp */, 032A74242CAFC34800932D36 /* llama_tiktoken.h */, 032A74252CAFC34800932D36 /* llama_tiktoken.cpp 
*/, - 032A73FD2CAFBB7800932D36 /* tiktoken.h */, - 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */, - 032A73FF2CAFBB7800932D36 /* tokenizer.h */, ); - name = tokenizer; - path = ../../../llm/tokenizer; + name = tokenizers; + path = ../../../llm/tokenizers; sourceTree = SOURCE_ROOT; }; 03B0118D2CAC567900054791 /* TestUtils */ = { @@ -204,7 +182,6 @@ isa = PBXGroup; children = ( 03B2D3662C8A515A0046936E /* App */, - 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */, 03C7FA322C8AA24200E6E9AE /* Resources */, 03B2D3782C8A515C0046936E /* Tests */, 03B0118D2CAC567900054791 /* TestUtils */, @@ -242,26 +219,36 @@ path = Tests; sourceTree = SOURCE_ROOT; }; - 03ED6CEB2C8AAF5300F2D6EE /* Frameworks */ = { + F292B02E2D88AF4800BE6839 /* tokenizers */ = { isa = PBXGroup; children = ( - 03ED6D182C8AB00A00F2D6EE /* Accelerate.framework */, - 03ED6D162C8AB00500F2D6EE /* CoreML.framework */, - 03ED6D142C8AAFFF00F2D6EE /* Metal.framework */, - 03ED6D122C8AAFF700F2D6EE /* MetalPerformanceShaders.framework */, - 03ED6D102C8AAFF200F2D6EE /* MetalPerformanceShadersGraph.framework */, - 03ED6D0E2C8AAFE900F2D6EE /* libsqlite3.0.tbd */, - 03DD00992C8FE44600FE4619 /* backend_coreml.xcframework */, - 03DD00A22C8FE44600FE4619 /* backend_mps.xcframework */, - 03DD00A12C8FE44600FE4619 /* backend_xnnpack.xcframework */, - 03DD00A32C8FE44600FE4619 /* executorch.xcframework */, - 03DD009A2C8FE44600FE4619 /* kernels_custom.xcframework */, - 03DD00A02C8FE44600FE4619 /* kernels_optimized.xcframework */, - 03DD009F2C8FE44600FE4619 /* kernels_portable.xcframework */, - 03DD00A52C8FE44600FE4619 /* kernels_quantized.xcframework */, - ); - name = Frameworks; - sourceTree = SOURCE_ROOT; + F292B0222D88AF4800BE6839 /* base64.h */, + F292B0232D88AF4800BE6839 /* bpe_tokenizer_base.h */, + F292B0242D88AF4800BE6839 /* error.h */, + F292B0262D88AF4800BE6839 /* llama2c_tokenizer.h */, + F292B0272D88AF4800BE6839 /* log.h */, + F292B0292D88AF4800BE6839 /* result.h */, + F292B02B2D88AF4800BE6839 /* tiktoken.h */, + F292B02D2D88AF4800BE6839 /* tokenizer.h */, + ); + path = tokenizers; + sourceTree = ""; + }; + F292B02F2D88AF4800BE6839 /* pytorch */ = { + isa = PBXGroup; + children = ( + F292B02E2D88AF4800BE6839 /* tokenizers */, + ); + path = pytorch; + sourceTree = ""; + }; + F292B0302D88AF4800BE6839 /* include */ = { + isa = PBXGroup; + children = ( + F292B02F2D88AF4800BE6839 /* pytorch */, + ); + path = include; + sourceTree = ""; }; /* End PBXGroup section */ @@ -329,6 +316,9 @@ Base, ); mainGroup = 03B2D35B2C8A515A0046936E; + packageReferences = ( + 03F1814C2D7262FC0058BDF9 /* XCLocalSwiftPackageReference "../../../.." */, + ); productRefGroup = 03B2D3652C8A515A0046936E /* Products */; projectDirPath = ""; projectRoot = ""; @@ -367,7 +357,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . --prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"CMake not found, please install CMake. \\n1. Download CMake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to the /Applications folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install the CMake command-line tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -386,11 +376,12 @@ files = ( 03B0118E2CAC567900054791 /* DynamicTestCase.m in Sources */, 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */, - 032A741A2CAFBB7800932D36 /* bpe_tokenizer.cpp in Sources */, 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */, - 032A741E2CAFBB7800932D36 /* tiktoken.cpp in Sources */, 032A741F2CAFBB7800932D36 /* sampler.cpp in Sources */, 03B011912CAD114E00054791 /* ResourceTestCase.m in Sources */, + F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */, + F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */, + F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */, 03E7E6792CBDCAE900205E71 /* CoreMLTests.mm in Sources */, 032A74232CAFC1B300932D36 /* runner.cpp in Sources */, 03B2D37A2C8A515C0046936E /* GenericTests.mm in Sources */, @@ -671,6 +662,48 @@ defaultConfigurationName = Release; }; /* End XCConfigurationList section */ + +/* Begin XCLocalSwiftPackageReference section */ + 03F1814C2D7262FC0058BDF9 /* XCLocalSwiftPackageReference "../../../.." */ = { + isa = XCLocalSwiftPackageReference; + relativePath = ../../../..; + }; +/* End XCLocalSwiftPackageReference section */ + +/* Begin XCSwiftPackageProductDependency section */ + 03F1814D2D7262FC0058BDF9 /* backend_coreml */ = { + isa = XCSwiftPackageProductDependency; + productName = backend_coreml; + }; + 03F1814F2D7262FC0058BDF9 /* backend_mps */ = { + isa = XCSwiftPackageProductDependency; + productName = backend_mps; + }; + 03F181512D7262FC0058BDF9 /* backend_xnnpack */ = { + isa = XCSwiftPackageProductDependency; + productName = backend_xnnpack; + }; + 03F181532D7262FC0058BDF9 /* executorch */ = { + isa = XCSwiftPackageProductDependency; + productName = executorch; + }; + 03F181552D7262FC0058BDF9 /* kernels_custom */ = { + isa = XCSwiftPackageProductDependency; + productName = kernels_custom; + }; + 03F181572D7262FC0058BDF9 /* kernels_optimized */ = { + isa = XCSwiftPackageProductDependency; + productName = kernels_optimized; + }; + 03F181592D7262FC0058BDF9 /* kernels_portable */ = { + isa = XCSwiftPackageProductDependency; + productName = kernels_portable; + }; + 03F1815B2D7262FC0058BDF9 /* kernels_quantized */ = { + isa = XCSwiftPackageProductDependency; + productName = kernels_quantized; + }; +/* End XCSwiftPackageProductDependency section */ }; rootObject = 03B2D35C2C8A515A0046936E /* Project object */; } diff --git a/extension/benchmark/apple/Benchmark/Frameworks/download_frameworks.sh b/extension/benchmark/apple/Benchmark/Frameworks/download_frameworks.sh deleted file mode 100755 index 6cd1a56a0f7..00000000000 --- a/extension/benchmark/apple/Benchmark/Frameworks/download_frameworks.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -VERSION="0.5.0.20250130" -FRAMEWORKS=( - "backend_coreml" - "backend_mps" - "backend_xnnpack" - "executorch" - "kernels_custom" - "kernels_optimized" - "kernels_portable" - "kernels_quantized" -) - -cd "$(dirname "$0")" || exit - -for FRAMEWORK in "${FRAMEWORKS[@]}"; do - rm -f "${FRAMEWORK}-${VERSION}.zip" - rm -rf "${FRAMEWORK}.xcframework" - curl -sSLO "https://ossci-ios.s3.amazonaws.com/executorch/${FRAMEWORK}-${VERSION}.zip" && \ - unzip -q "${FRAMEWORK}-${VERSION}.zip" && \ - rm "${FRAMEWORK}-${VERSION}.zip" -done diff --git a/extension/benchmark/apple/Benchmark/README.md b/extension/benchmark/apple/Benchmark/README.md index 79daf070e44..e993ae4f970 100644 --- a/extension/benchmark/apple/Benchmark/README.md +++ b/extension/benchmark/apple/Benchmark/README.md @@ -30,47 +30,11 @@ cd executorch This command performs a shallow clone to speed up the process. -### Set Up the Frameworks +### Build the Frameworks -The Benchmark App relies on prebuilt ExecuTorch frameworks. -You have two options: +The Benchmark App is configured to use a Swift PM package that provides the prebuilt ExecuTorch frameworks. -

-Option 1: Download Prebuilt Frameworks -
- -Run the provided script to download the prebuilt frameworks: - -```bash -./extension/benchmark/apple/Benchmark/Frameworks/download_frameworks.sh -``` -
- -
-Option 2: Build Frameworks Locally -
- -Alternatively, you can build the frameworks yourself by following the [guide](https://pytorch.org/executorch/main/apple-runtime.html#local-build). -
- -Once the frameworks are downloaded or built, verify that the `Frameworks` directory contains the necessary `.xcframework` files: - -```bash -ls extension/benchmark/apple/Benchmark/Frameworks -``` - -You should see: - -``` -backend_coreml.xcframework -backend_mps.xcframework -backend_xnnpack.xcframework -executorch.xcframework -kernels_custom.xcframework -kernels_optimized.xcframework -kernels_portable.xcframework -kernels_quantized.xcframework -``` +By default, the package references locally built binaries, so you must first build the frameworks by following the [guide](https://pytorch.org/executorch/main/using-executorch-ios.html#building-from-source). ## Adding Models and Resources diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index 16c1c1c1d6a..332c3986b0b 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -82,7 +82,7 @@ @implementation LLaMATests return; } TokensPerSecondMetric *tokensPerSecondMetric = [TokensPerSecondMetric new]; - [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTMemoryMetric new] ] + [testCase measureWithMetrics:@[ tokensPerSecondMetric, [XCTClockMetric new], [XCTMemoryMetric new] ] block:^{ tokensPerSecondMetric.tokenCount = 0; const auto status = runner->generate( diff --git a/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig b/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig index 25c3f9a6267..0172f28b1bb 100644 --- a/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig +++ b/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig @@ -3,20 +3,21 @@ ET_PLATFORM[sdk=iphoneos*] = ios ET_PLATFORM[sdk=macos*] = macos OTHER_LDFLAGS = $(inherited) \ - -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized-$(ET_PLATFORM)-release.a \ - -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized-$(ET_PLATFORM)-release.a \ + -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_$(ET_PLATFORM).a \ + -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_$(ET_PLATFORM).a \ @$(TEMP_DIR)/cmake/linker_flags // LLaMARunner requires additional dependencies built with CMake in a custom run script phase. // Include headers and libraries from $(TEMP_DIR)/cmake for it. HEADER_SEARCH_PATHS = $(inherited) \ $(SRCROOT)/../../../../..
\ - $(TEMP_DIR)/cmake/include + $(TEMP_DIR)/cmake/include \ + $(SRCROOT)/../../../../extension/llm/tokenizers/include LIBRARY_SEARCH_PATHS = $(inherited) \ $(TEMP_DIR)/cmake/lib diff --git a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 index 05816685638..a24c0257100 100644 --- a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 +++ b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 @@ -34,6 +34,11 @@ phases: # The test phase includes commands that run your test suite execution. test: commands: + # Fail the test if the model doesn't exist, doing it here so that AWS can report the status back + - echo "Verify model" + - curl -I --fail '{{ model_path }}' || false + + # Run the benchmark - xcodebuild test-without-building -destination id=$DEVICEFARM_DEVICE_UDID -xctestrun $DEVICEFARM_TEST_PACKAGE_PATH/*.xctestrun -derivedDataPath $DEVICEFARM_LOG_DIR # The post test phase includes commands that are run after your tests are executed. diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp index 1d097cfd989..1a9ddad259f 100644 --- a/extension/data_loader/file_data_loader.cpp +++ b/extension/data_loader/file_data_loader.cpp @@ -14,10 +14,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -71,6 +71,9 @@ FileDataLoader::~FileDataLoader() { std::free(const_cast(file_name_)); // fd_ can be -1 if this instance was moved from; in that case there is // nothing to close. + if (fd_ == -1) { + return; + } ::close(fd_); } diff --git a/extension/data_loader/mman.h b/extension/data_loader/mman.h new file mode 100644 index 00000000000..246068986ea --- /dev/null +++ b/extension/data_loader/mman.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file ensures that mman.h compatible functions are defined in the global +// namespace for Windows and POSIX environments. + +#pragma once + +#include + +#ifndef _WIN32 + +#include +#include + +ET_INLINE long get_os_page_size() { + return sysconf(_SC_PAGESIZE); +} + +#else + +#define NOMINMAX +#include +#undef NOMINMAX +#include + +#include + +ET_INLINE long get_os_page_size() { + SYSTEM_INFO si; + GetSystemInfo(&si); + long pagesize = si.dwAllocationGranularity > si.dwPageSize + ? si.dwAllocationGranularity + : si.dwPageSize; + return pagesize; +} + +#endif diff --git a/extension/data_loader/mman_windows.cpp b/extension/data_loader/mman_windows.cpp new file mode 100644 index 00000000000..2a7f462f99c --- /dev/null +++ b/extension/data_loader/mman_windows.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) Google Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the MIT license. + */ + +/* + * Adapted from: https://code.google.com/archive/p/mman-win32/ + * + * mman-win32 + * mman library for Windows + * + * A light implementation of the mmap functions for MinGW. + * + * The mmap-win32 library implements a wrapper for mmap functions around the + * memory mapping Windows API. 
+ */ + +#include + +#include +#include +#include + +#ifndef STATUS_SECTION_TOO_BIG +#define STATUS_SECTION_TOO_BIG ((NTSTATUS)0xC0000040L) +#endif + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif /* FILE_MAP_EXECUTE */ + +#define RETURN_IF_FAILED(hr) \ + do { \ + if (FAILED((hr))) { \ + return hr; \ + } \ + } while (false) + +namespace { + +HRESULT try_grow_process_memory_working_set(DWORD dwSizeRequired) { + // Get current working set + size_t minWorkingSetInitial; + size_t maxWorkingSet; + if (!GetProcessWorkingSetSize( + GetCurrentProcess(), &minWorkingSetInitial, &maxWorkingSet)) { + return GetLastError(); + } + + // Calculate new sizes + size_t minWorkingSet = minWorkingSetInitial + dwSizeRequired; + if (minWorkingSet < minWorkingSetInitial) { + return HRESULT_FROM_WIN32(ERROR_ARITHMETIC_OVERFLOW); + } + + if (maxWorkingSet < minWorkingSet) { + maxWorkingSet = minWorkingSet; + } + + // Grow working set + if (!SetProcessWorkingSetSize( + GetCurrentProcess(), minWorkingSet, maxWorkingSet)) { + return GetLastError(); + } + return S_OK; +} + +HRESULT virtual_lock(void* pMem, DWORD dwSize) { + if (!VirtualLock(pMem, dwSize)) { + return GetLastError(); + } + return S_OK; +} + +HRESULT virtual_lock_allowing_working_set_growth(void* pMem, DWORD dwSize) { + HRESULT hr = virtual_lock(pMem, dwSize); + + if (hr == HRESULT_FROM_WIN32(STATUS_SECTION_TOO_BIG)) { + // Attempt to grow the process working set and try again + RETURN_IF_FAILED(try_grow_process_memory_working_set(dwSize)); + RETURN_IF_FAILED(virtual_lock(pMem, dwSize)); + } + + return hr; +} + +static int __map_mman_error(const DWORD err, const int deferr) { + if (err == 0) { + return 0; + } + // TODO: implement + return err; +} + +static DWORD __map_mmap_prot_page(const int prot) { + DWORD protect = 0; + + if (prot == PROT_NONE) { + return protect; + } + if ((prot & PROT_EXEC) != 0) { + protect = + ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } else { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY; + } + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) { + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) { + return desiredAccess; + } + if ((prot & PROT_READ) != 0) { + desiredAccess |= FILE_MAP_READ; + } + if ((prot & PROT_WRITE) != 0) { + desiredAccess |= FILE_MAP_WRITE; + } + if ((prot & PROT_EXEC) != 0) { + desiredAccess |= FILE_MAP_EXECUTE; + } + return desiredAccess; +} + +} // namespace + +void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) { + HANDLE fm, h; + + void* map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4293) +#endif + + const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) + ? (DWORD)off + : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) + ? (DWORD)0 + : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const off_t maxSize = off + (off_t)len; + + const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) + ? (DWORD)maxSize + : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) + ? 
(DWORD)0 + : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Unsupported protection combinations */ + || prot == PROT_EXEC) { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes) + : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = + MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +int munmap(void* addr, size_t len) { + if (UnmapViewOfFile(addr)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mprotect(void* addr, size_t len, int prot) { + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int msync(void* addr, size_t len, int flags) { + if (FlushViewOfFile(addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mlock(const void* addr, size_t len) { + HRESULT hr = virtual_lock_allowing_working_set_growth((LPVOID)addr, len); + if (SUCCEEDED(hr)) { + return 0; + } + + errno = __map_mman_error(hr, EPERM); + + return -1; +} + +int munlock(const void* addr, size_t len) { + if (VirtualUnlock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} diff --git a/extension/data_loader/mman_windows.h b/extension/data_loader/mman_windows.h new file mode 100644 index 00000000000..563db5d8b21 --- /dev/null +++ b/extension/data_loader/mman_windows.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) Google Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the MIT license. + */ + +/* + * Adapted from: https://code.google.com/archive/p/mman-win32/ + * + * mman-win32 + * mman library for Windows + * + * A light implementation of the mmap functions for MinGW. + * + * The mmap-win32 library implements a wrapper for mmap functions around the + * memory mapping Windows API. + */ + +#pragma once + +#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later. +#define _WIN32_WINNT \ + 0x0501 // Change this to the appropriate value to target other versions of + // Windows. +#endif + +/* All the headers include this file. */ +#ifndef _MSC_VER +#include <_mingw.h> +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void*)-1) + +/* Flags for msync. 
*/ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off); +int munmap(void* addr, size_t len); +int mprotect(void* addr, size_t len, int prot); +int msync(void* addr, size_t len, int flags); +int mlock(const void* addr, size_t len); +int munlock(const void* addr, size_t len); + +#ifdef __cplusplus +}; +#endif diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp index e990117d586..53fd7bdf624 100644 --- a/extension/data_loader/mmap_data_loader.cpp +++ b/extension/data_loader/mmap_data_loader.cpp @@ -13,11 +13,10 @@ #include #include -#include #include #include -#include +#include #include #include #include @@ -63,14 +62,16 @@ MmapDataLoader::~MmapDataLoader() { std::free(const_cast(file_name_)); // fd_ can be -1 if this instance was moved from; in that case there is // nothing to close. - ::close(fd_); + if (fd_ != -1) { + ::close(fd_); + } } Result MmapDataLoader::from( const char* file_name, MmapDataLoader::MlockConfig mlock_config) { // Cache the page size. - long page_size = sysconf(_SC_PAGESIZE); + long page_size = get_os_page_size(); if (page_size < 0) { ET_LOG(Error, "Could not get page size: %s (%d)", ::strerror(errno), errno); return Error::AccessFailed; @@ -182,12 +183,22 @@ Result MmapDataLoader::load( Range range = get_overlapping_pages(static_cast(offset), size, page_size_); + size_t map_size = range.size; + if (range.start + map_size > file_size_) { + // Clamp to the end of the file. + // + // The Windows implementation of mmap uses CreateFileMapping which returns + // error STATUS_SECTION_TOO_BIG (0xc0000040) if we try to map past the end + // of the last page of a file mapped in as read-only. + map_size = file_size_ - range.start; + } + // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since // the data is read-only, but use PRIVATE just to further avoid accidentally // modifying the file. void* pages = ::mmap( nullptr, - range.size, + map_size, PROT_READ, MAP_PRIVATE, fd_, diff --git a/extension/data_loader/targets.bzl b/extension/data_loader/targets.bzl index fcc7cba5419..50e779b4bd8 100644 --- a/extension/data_loader/targets.bzl +++ b/extension/data_loader/targets.bzl @@ -69,8 +69,20 @@ def define_common_targets(): runtime.cxx_library( name = "mmap_data_loader", - srcs = ["mmap_data_loader.cpp"], - exported_headers = ["mmap_data_loader.h"], + srcs = [ + "mmap_data_loader.cpp" + ] + select({ + "DEFAULT": [], + "ovr_config//os:windows": ["mman_windows.cpp"], + }), + headers = select({ + "DEFAULT": [], + "ovr_config//os:windows": ["mman_windows.h"], + }), + exported_headers = [ + "mman.h", + "mmap_data_loader.h" + ], visibility = [ "//executorch/test/...", "//executorch/extension/pybindings/...", diff --git a/extension/data_loader/test/CMakeLists.txt b/extension/data_loader/test/CMakeLists.txt index 1ce1d5b15fe..fbb8a4901d7 100644 --- a/extension/data_loader/test/CMakeLists.txt +++ b/extension/data_loader/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
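
To make the portability seam above concrete, here is a minimal sketch (not part of the diff) of how the pieces compose: `get_os_page_size()` and the global `mmap()`/`munmap()` symbols come from the new `extension/data_loader/mman.h` (native POSIX calls on Linux/macOS, the mman-win32 wrappers on Windows). The page rounding and end-of-file clamp mirror what `MmapDataLoader::load()` does; the `map_file_region` helper name and the caller-supplied `fd`/`file_size` are illustrative assumptions, not APIs from the tree.

```cpp
// Sketch only: map [offset, offset + size) of an open file read-only,
// rounding down to a page boundary and clamping to the end of the file so
// Windows' CreateFileMapping does not fail with STATUS_SECTION_TOO_BIG.
#include <cstddef>
#include <cstdint>
#include <sys/types.h>

#include <executorch/extension/data_loader/mman.h>

const void* map_file_region(int fd, size_t file_size, size_t offset, size_t size) {
  const size_t page_size = static_cast<size_t>(get_os_page_size());
  const size_t start = (offset / page_size) * page_size; // page-aligned start
  size_t map_size = (offset - start) + size;
  if (start + map_size > file_size) {
    map_size = file_size - start; // clamp, as MmapDataLoader::load() now does
  }
  void* pages = mmap(
      nullptr, map_size, PROT_READ, MAP_PRIVATE, fd, static_cast<off_t>(start));
  if (pages == MAP_FAILED) {
    return nullptr;
  }
  // The caller must eventually munmap(pages, map_size); the returned pointer
  // is offset into the mapping to the originally requested byte.
  return static_cast<const uint8_t*>(pages) + (offset - start);
}
```

The same translation unit compiles on both platforms because the header, not the caller, decides whether `mmap` resolves to the OS call or to the mman-win32 shim.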
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs buffer_data_loader_test.cpp shared_ptr_data_loader_test.cpp file_data_loader_test.cpp mmap_data_loader_test.cpp diff --git a/extension/data_loader/test/mmap_data_loader_test.cpp b/extension/data_loader/test/mmap_data_loader_test.cpp index a76121109a8..c01b3454493 100644 --- a/extension/data_loader/test/mmap_data_loader_test.cpp +++ b/extension/data_loader/test/mmap_data_loader_test.cpp @@ -10,10 +10,9 @@ #include -#include - #include +#include #include #include #include @@ -34,7 +33,7 @@ class MmapDataLoaderTest : public ::testing::Test { executorch::runtime::runtime_init(); // Get the page size and ensure it's a power of 2. - long page_size = sysconf(_SC_PAGESIZE); + long page_size = get_os_page_size(); ASSERT_GT(page_size, 0); ASSERT_EQ(page_size & ~(page_size - 1), page_size); page_size_ = page_size; diff --git a/extension/evalue_util/targets.bzl b/extension/evalue_util/targets.bzl index e700ea21467..47934eb78af 100644 --- a/extension/evalue_util/targets.bzl +++ b/extension/evalue_util/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/extension/evalue_util/test/CMakeLists.txt b/extension/evalue_util/test/CMakeLists.txt index fe92778dda9..452657ca7a5 100644 --- a/extension/evalue_util/test/CMakeLists.txt +++ b/extension/evalue_util/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs print_evalue_test.cpp ../print_evalue.cpp) diff --git a/extension/evalue_util/test/targets.bzl b/extension/evalue_util/test/targets.bzl index 9e78f665c97..5d6161d09e9 100644 --- a/extension/evalue_util/test/targets.bzl +++ b/extension/evalue_util/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_test( diff --git a/extension/export_util/export_hf_model.py b/extension/export_util/export_hf_model.py deleted file mode 100644 index 929773fa4d3..00000000000 --- a/extension/export_util/export_hf_model.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os - -import torch -import torch.export._trace -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge -from torch.nn.attention import SDPBackend -from transformers import AutoModelForCausalLM -from transformers.generation.configuration_utils import GenerationConfig -from transformers.integrations.executorch import convert_and_export_with_cache -from transformers.modeling_utils import PreTrainedModel - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "-hfm", - "--hf_model_repo", - required=True, - default=None, - help="a valid huggingface model repo name", - ) - parser.add_argument( - "-d", - "--dtype", - type=str, - choices=["float32", "float16", "bfloat16"], - default="float32", - help="specify the dtype for loading the model", - ) - parser.add_argument( - "-o", - "--output_name", - required=False, - default=None, - help="output name of the exported model", - ) - - args = parser.parse_args() - - # Configs to HF model - device = "cpu" - # TODO: remove getattr once https://github.com/huggingface/transformers/pull/33741 is merged - dtype = getattr(torch, args.dtype) - batch_size = 1 - max_length = 123 - cache_implementation = "static" - attn_implementation = "sdpa" - - # Load and configure a HF model - model = AutoModelForCausalLM.from_pretrained( - args.hf_model_repo, - attn_implementation=attn_implementation, - device_map=device, - torch_dtype=dtype, - generation_config=GenerationConfig( - use_cache=True, - cache_implementation=cache_implementation, - max_length=max_length, - cache_config={ - "batch_size": batch_size, - "max_cache_len": max_length, - }, - ), - ) - print(f"{model.config}") - print(f"{model.generation_config}") - - input_ids = torch.tensor([[1]], dtype=torch.long) - cache_position = torch.tensor([0], dtype=torch.long) - - def _get_constant_methods(model: PreTrainedModel): - metadata = { - "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6, - "get_bos_id": model.config.bos_token_id, - "get_eos_id": model.config.eos_token_id, - "get_head_dim": model.config.hidden_size / model.config.num_attention_heads, - "get_max_batch_size": model.generation_config.cache_config.batch_size, - "get_max_seq_len": model.generation_config.cache_config.max_cache_len, - "get_n_kv_heads": model.config.num_key_value_heads, - "get_n_layers": model.config.num_hidden_layers, - "get_vocab_size": model.config.vocab_size, - "use_kv_cache": model.generation_config.use_cache, - } - return {k: v for k, v in metadata.items() if v is not None} - - with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): - - exported_prog = convert_and_export_with_cache(model, input_ids, cache_position) - prog = ( - to_edge( - exported_prog, - compile_config=EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, - ), - constant_methods=_get_constant_methods(model), - ) - .to_backend(XnnpackPartitioner()) - .to_executorch(ExecutorchBackendConfig(extract_delegate_segments=True)) - ) - out_name = args.output_name if args.output_name else model.config.model_type - filename = os.path.join("./", f"{out_name}.pte") - with open(filename, "wb") as f: - prog.write_to_file(f) - print(f"Saved exported program to {filename}") - - -if __name__ == "__main__": - main() diff --git a/extension/flat_tensor/CMakeLists.txt b/extension/flat_tensor/CMakeLists.txt index 14d49d244e3..caacd96b557 100644 --- 
a/extension/flat_tensor/CMakeLists.txt +++ b/extension/flat_tensor/CMakeLists.txt @@ -36,6 +36,9 @@ install( DESTINATION ${_common_include_directories} ) +add_subdirectory(serialize) +add_dependencies(extension_flat_tensor flat_tensor_schema) + if(BUILD_TESTING) add_subdirectory(test) endif() diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp index 20ebc99994a..bf54ae014b5 100644 --- a/extension/flat_tensor/flat_tensor_data_map.cpp +++ b/extension/flat_tensor/flat_tensor_data_map.cpp @@ -52,11 +52,14 @@ Result get_flat_tensor_metadata( for (int i = 0; i < tensors->size(); i++) { if (std::strcmp(tensors->Get(i)->fully_qualified_name()->c_str(), key) == 0) { - // TODO(T214294528): Support multiple segments in FlatTensor. - if (tensors->Get(i)->segment_index() != 0) { - return Error::InvalidExternalData; - } - return tensors->Get(i); + const auto* metadata = tensors->Get(i); + ET_CHECK_OR_RETURN_ERROR( + metadata->segment_index() >= 0 && metadata->offset() >= 0, + InvalidExternalData, + "Invalid segment_index %d or offset %" PRIu64 "; malformed PTD file.", + metadata->segment_index(), + metadata->offset()); + return metadata; } } return Error::NotFound; @@ -75,6 +78,23 @@ Result create_tensor_layout( scalar_type); } +Result get_and_check_segment_offset( + const flatbuffers::Vector< + flatbuffers::Offset>* segments, + const flat_tensor_flatbuffer::TensorMetadata* metadata) { + ET_CHECK_OR_RETURN_ERROR( + segments != nullptr, + InvalidExternalData, + "No segments in external data flatbuffer."); + + ET_CHECK_OR_RETURN_ERROR( + metadata->segment_index() < segments->size(), + InvalidExternalData, + "Invalid segment_index %d; malformed PTD file.", + metadata->segment_index()); + return segments->Get(metadata->segment_index())->offset(); +} + } // namespace ET_NODISCARD Result FlatTensorDataMap::get_metadata( @@ -89,39 +109,73 @@ ET_NODISCARD Result FlatTensorDataMap::get_metadata( ET_NODISCARD Result FlatTensorDataMap::get_data( const char* key) const { - auto tensor_metadata = flat_tensor_->tensors(); - - Result metadata_res = - get_flat_tensor_metadata(key, tensor_metadata); - if (!metadata_res.ok()) { - return metadata_res.error(); + Result metadata = + get_flat_tensor_metadata(key, flat_tensor_->tensors()); + if (!metadata.ok()) { + return metadata.error(); } - const auto metadata = metadata_res.get(); - if (metadata->segment_index() < 0 || metadata->offset() < 0) { - // Invalid segment_index/offset; malformed PTD file. - return Error::InvalidExternalData; + Result tensor_layout = + create_tensor_layout(metadata.get()); + if (!tensor_layout.ok()) { + return tensor_layout.error(); } - - Result tensor_layout_res = create_tensor_layout(metadata); - if (!tensor_layout_res.ok()) { - return tensor_layout_res.error(); + Result segment_offset = + get_and_check_segment_offset(flat_tensor_->segments(), metadata.get()); + if (!segment_offset.ok()) { + return segment_offset.error(); } - // This FreeableBuffer doesn't own the underlying data, and will not free it, - // which is why the free function is a nullptr. - // TODO(T214294528): Remove data_ro_ and instead load the data here, letting - // FreeableBuffer own it. - return FreeableBuffer( - static_cast(data_ro_.data()) + metadata->offset(), - tensor_layout_res.get().nbytes(), - nullptr); + // Load constant data. 
+ ET_CHECK_OR_RETURN_ERROR( + segment_offset.get() < + header_.segment_base_offset + header_.segment_data_size, + InvalidExternalData, + "Invalid segment offset %d: larger than segment_base_offset + segment_data_size %" PRIu64 + "; malformed PTD file.", + segment_offset.get(), + header_.segment_base_offset + header_.segment_data_size); + return loader_->load( + header_.segment_base_offset + segment_offset.get() + + metadata.get()->offset(), + tensor_layout.get().nbytes(), + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); } -ET_NODISCARD Result FlatTensorDataMap::load_data_into( +ET_NODISCARD Error FlatTensorDataMap::load_data_into( const char* key, void* buffer, size_t size) const { - return Error::NotImplemented; + Result metadata = + get_flat_tensor_metadata(key, flat_tensor_->tensors()); + if (!metadata.ok()) { + return metadata.error(); + } + Result tensor_layout = + create_tensor_layout(metadata.get()); + if (!tensor_layout.ok()) { + return tensor_layout.error(); + } + ET_CHECK_OR_RETURN_ERROR( + size >= tensor_layout.get().nbytes(), + InvalidArgument, + "Buffer size %zu is smaller than tensor size %zu", + size, + tensor_layout.get().nbytes()); + + Result segment_offset = + get_and_check_segment_offset(flat_tensor_->segments(), metadata.get()); + if (!segment_offset.ok()) { + return segment_offset.error(); + } + // Load mutable data. + DataLoader::SegmentInfo info = DataLoader::SegmentInfo( + DataLoader::SegmentInfo::Type::Mutable, 0, nullptr); + return loader_->load_into( + header_.segment_base_offset + segment_offset.get() + + metadata.get()->offset(), + tensor_layout.get().nbytes(), + info, + buffer); } ET_NODISCARD Result FlatTensorDataMap::get_num_keys() const { @@ -133,50 +187,40 @@ ET_NODISCARD Result FlatTensorDataMap::get_key( if (index < 0 || index >= flat_tensor_->tensors()->size()) { return Error::InvalidArgument; } + return flat_tensor_->tensors()->Get(index)->fully_qualified_name()->c_str(); } /* static */ Result FlatTensorDataMap::load( DataLoader* loader) { - // Load data map. - size_t flatbuffer_offset = 0; - size_t flatbuffer_size = 0; - size_t segment_base_offset = 0; - size_t segment_data_size = 0; - { - // Check header. - Result header = loader->load( - /*offset=*/0, - FlatTensorHeader::kNumHeadBytes, - DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); - if (!header.ok()) { - return header.error(); - } - Result fh = - FlatTensorHeader::Parse(header->data(), header->size()); - if (fh.ok()) { - // The header has the data map size. - flatbuffer_offset = fh->flatbuffer_offset; - flatbuffer_size = fh->flatbuffer_size; - segment_base_offset = fh->segment_base_offset; - segment_data_size = fh->segment_data_size; - } else if (fh.error() == Error::NotFound) { - // No header, throw error. - ET_LOG(Error, "No FlatTensorHeader found."); - return fh.error(); - } else { - // corruption, throw error. - ET_LOG(Error, "Flat tensor header may be corrupt."); - return fh.error(); - } + // Check header. + Result header = loader->load( + /*offset=*/0, + FlatTensorHeader::kNumHeadBytes, + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); + if (!header.ok()) { + ET_LOG(Error, "Failed to load header."); + return header.error(); + } + Result fh = + FlatTensorHeader::Parse(header->data(), header->size()); + if (fh.error() == Error::NotFound) { + // No header, throw error. 
+ ET_LOG(Error, "No FlatTensorHeader found."); + return fh.error(); + } else if (fh.error() != Error::Ok) { + // corruption, throw error. + ET_LOG(Error, "Flat tensor header may be corrupt."); + return fh.error(); } // Load flatbuffer data as a segment. Result flat_tensor_data = loader->load( /*offset=*/0, - flatbuffer_offset + flatbuffer_size, + fh->flatbuffer_offset + fh->flatbuffer_size, DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); if (!flat_tensor_data.ok()) { + ET_LOG(Error, "Failed to load flat_tensor data."); return flat_tensor_data.error(); } @@ -204,54 +248,8 @@ ET_NODISCARD Result FlatTensorDataMap::get_key( const flat_tensor_flatbuffer::FlatTensor* flat_tensor = flat_tensor_flatbuffer::GetFlatTensor(flat_tensor_data->data()); - // Validate flatbuffer data. - flatbuffers::Verifier verifier( - reinterpret_cast(flat_tensor_data->data()), - flat_tensor_data->size()); - bool ok = flat_tensor_flatbuffer::VerifyFlatTensorBuffer(verifier); - ET_CHECK_OR_RETURN_ERROR( - ok, - InvalidExternalData, - "Verification failed; data may be truncated or corrupt"); - - // Get pointer to tensor metadata. - const auto* s_tensor_metadata = flat_tensor->tensors(); - if (s_tensor_metadata == nullptr) { - ET_LOG(Error, "FlatTensor has no tensor metadata."); - return Error::InvalidExternalData; - } - - // Load constant data. - const auto* s_data_segment = flat_tensor->segments(); - - // TODO(T214294528): Support multiple segments in FlatTensor. - if (s_data_segment->size() != 1) { - ET_LOG( - Error, - "FlatTensor has %u segments, only 1 supported.", - s_data_segment->size()); - } - // First segment size should be <= the total segment data size. - int segment_size = s_data_segment->Get(0)->size(); - int segment_offset = s_data_segment->Get(0)->offset(); - if (segment_size > segment_data_size) { - ET_LOG( - Error, - "FlatTensor segment size %d > segment data size %zu", - segment_size, - segment_data_size); - } - - Result data_ro = loader->load( - /*offset=*/segment_base_offset + segment_offset, - segment_size, - DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); - if (!data_ro.ok()) { - return data_ro.error(); - } - return FlatTensorDataMap( - std::move(flat_tensor_data.get()), flat_tensor, std::move(data_ro.get())); + fh.get(), std::move(flat_tensor_data.get()), flat_tensor, loader); } } // namespace extension diff --git a/extension/flat_tensor/flat_tensor_data_map.h b/extension/flat_tensor/flat_tensor_data_map.h index 7bd33e68927..972a5fa9c55 100644 --- a/extension/flat_tensor/flat_tensor_data_map.h +++ b/extension/flat_tensor/flat_tensor_data_map.h @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -41,17 +43,50 @@ class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { static executorch::runtime::Result load( executorch::runtime::DataLoader* loader); + /** + * Retrieve the metadata for the specified key. + * + * @param[in] key The name of the tensor to get metadata on. + * + * @return Error::NotFound if the key is not present. + */ ET_NODISCARD executorch::runtime::Result get_metadata(const char* key) const override; + + /** + * Retrieve read-only data for the specified key. + * + * @param[in] key The name of the tensor to get data on. + * + * @return error if the key is not present or data cannot be loaded. + */ ET_NODISCARD executorch::runtime::Result get_data( const char* key) const override; - ET_NODISCARD executorch::runtime::Result + + /** + * Loads the data of the specified tensor into the provided buffer. 
+ * + * @param[in] key The name of the tensor to get the data of. + * @param[in] buffer The buffer to load data into. Must point to at least + * `size` bytes of memory. + * @param[in] size The number of bytes to load. + * + * @returns an Error indicating if the load was successful. + */ + ET_NODISCARD executorch::runtime::Error load_data_into(const char* key, void* buffer, size_t size) const override; + /** + * @returns The number of keys in the map. + */ ET_NODISCARD executorch::runtime::Result get_num_keys() const override; + + /** + * @returns The key at the specified index, error if index out of bounds. + */ ET_NODISCARD executorch::runtime::Result get_key( size_t index) const override; @@ -61,26 +96,31 @@ class FlatTensorDataMap final : public executorch::runtime::NamedDataMap { private: FlatTensorDataMap( + const FlatTensorHeader& header, executorch::runtime::FreeableBuffer&& flat_tensor_data, const flat_tensor_flatbuffer::FlatTensor* flat_tensor, - executorch::runtime::FreeableBuffer&& data_ro) - : flat_tensor_data_(std::move(flat_tensor_data)), + executorch::runtime::DataLoader* loader) + : header_(header), + flat_tensor_data_(std::move(flat_tensor_data)), flat_tensor_(flat_tensor), - data_ro_(std::move(data_ro)) {} + loader_(loader) {} // Not copyable or assignable. FlatTensorDataMap(const FlatTensorDataMap& rhs) = delete; FlatTensorDataMap& operator=(FlatTensorDataMap&& rhs) noexcept = delete; FlatTensorDataMap& operator=(const FlatTensorDataMap& rhs) = delete; + // FlatTensor header, containing segment_base_offset and segment_data_size. + const FlatTensorHeader header_; + // Serialized flat_tensor flatbuffer data. executorch::runtime::FreeableBuffer flat_tensor_data_; // Flatbuffer representation of the flat_tensor. const flat_tensor_flatbuffer::FlatTensor* flat_tensor_; - // Loaded read-only tensor data. - executorch::runtime::FreeableBuffer data_ro_; + // Data loader, used to load segment data. + executorch::runtime::DataLoader* loader_; }; } // namespace extension diff --git a/extension/flat_tensor/serialize/CMakeLists.txt b/extension/flat_tensor/serialize/CMakeLists.txt index f1278c804db..d1ae797f8b3 100644 --- a/extension/flat_tensor/serialize/CMakeLists.txt +++ b/extension/flat_tensor/serialize/CMakeLists.txt @@ -9,10 +9,6 @@ # cmake-format -i CMakeLists.txt # ~~~ -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # The include directory that will contain the generated schema headers. set(_flat_tensor_schema__include_dir "${CMAKE_BINARY_DIR}/extension/flat_tensor/include") set(_flat_tensor_schema__output_dir "${_flat_tensor_schema__include_dir}/executorch/extension/flat_tensor/serialize") @@ -37,7 +33,7 @@ function(generate_flat_tensor_schema _schema_srcs _schema_name) ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o "${_flat_tensor_schema__output_dir}" ${_schema_srcs} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs} + DEPENDS flatc ${_schema_srcs} COMMENT "Generating ${_schema_name} headers" VERBATIM ) @@ -49,7 +45,7 @@ function(generate_flat_tensor_schema _schema_srcs _schema_name) # and some users need an alignment larger than the default, which is typically # 32. 
target_compile_definitions( - ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024 + ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT} ) target_include_directories( diff --git a/extension/flat_tensor/serialize/targets.bzl b/extension/flat_tensor/serialize/targets.bzl index 78054af30e9..717418ec7e6 100644 --- a/extension/flat_tensor/serialize/targets.bzl +++ b/extension/flat_tensor/serialize/targets.bzl @@ -39,7 +39,9 @@ def define_common_targets(): name = "flat_tensor_header", srcs = ["flat_tensor_header.cpp"], exported_headers = ["flat_tensor_header.h"], - visibility = ["//executorch/..."], + visibility = [ + "//executorch/...", + ], exported_deps = ["//executorch/runtime/core:core"], ) @@ -54,6 +56,7 @@ def define_common_targets(): exported_headers = ["serialize.h"], visibility = [ "//executorch/...", + "@EXECUTORCH_CLIENTS", ], exported_external_deps = ["flatbuffers-api"], ) diff --git a/extension/flat_tensor/targets.bzl b/extension/flat_tensor/targets.bzl index ed2adefc581..0d49995aa6e 100644 --- a/extension/flat_tensor/targets.bzl +++ b/extension/flat_tensor/targets.bzl @@ -8,14 +8,16 @@ def define_common_targets(): ], exported_headers = ["flat_tensor_data_map.h"], deps = [ - "//executorch/extension/flat_tensor/serialize:generated_headers", - "//executorch/extension/flat_tensor/serialize:flat_tensor_header", "//executorch/runtime/core:core", "//executorch/runtime/core:evalue", "//executorch/runtime/core:named_data_map", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/exec_aten/util:tensor_util", ], + exported_deps = [ + "//executorch/extension/flat_tensor/serialize:flat_tensor_header", + "//executorch/extension/flat_tensor/serialize:generated_headers", + ], visibility = [ "//executorch/...", ], diff --git a/extension/flat_tensor/test/CMakeLists.txt b/extension/flat_tensor/test/CMakeLists.txt index 81fc7d63c5b..e22025215c5 100644 --- a/extension/flat_tensor/test/CMakeLists.txt +++ b/extension/flat_tensor/test/CMakeLists.txt @@ -16,11 +16,11 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" COMMAND python -m test.models.export_program --modules "ModuleLinear" --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null @@ -30,12 +30,12 @@ add_custom_command( add_custom_target( extension_flat_tensor_test_resources DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" ) set(test_env "ET_MODULE_LINEAR_PROGRAM_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" ) set(_test_srcs flat_tensor_data_map_test.cpp flat_tensor_header_test.cpp) diff --git a/extension/flat_tensor/test/flat_tensor_data_map_test.cpp b/extension/flat_tensor/test/flat_tensor_data_map_test.cpp index 681bc39a129..ac4583eda88 100644 --- a/extension/flat_tensor/test/flat_tensor_data_map_test.cpp +++ b/extension/flat_tensor/test/flat_tensor_data_map_test.cpp @@ -137,3 +137,26 @@ TEST_F(FlatTensorDataMapTest, FlatTensorDataMap_Keys) { Result key2_res = data_map->get_key(2); EXPECT_EQ(key2_res.error(), Error::InvalidArgument); } + +TEST_F(FlatTensorDataMapTest, FlatTensorDataMap_LoadInto) { + Result data_map = + FlatTensorDataMap::load(data_map_loader_.get()); + EXPECT_EQ(data_map.error(), Error::Ok); + + // get the metadata + auto meta_data_res = data_map->get_metadata("a"); + ASSERT_EQ(meta_data_res.error(), Error::Ok); + + // get data blob + void* data = malloc(meta_data_res->nbytes()); + auto load_into_error = + data_map->load_data_into("a", data, meta_data_res->nbytes()); + ASSERT_EQ(load_into_error, Error::Ok); + + // Check tensor data is correct. + float* data_a = static_cast(data); + for (int i = 0; i < 4; i++) { + EXPECT_EQ(data_a[i], 3.0); + } + free(data); +} diff --git a/extension/flat_tensor/test/targets.bzl b/extension/flat_tensor/test/targets.bzl index bc04edfbe1e..a2b96526ab5 100644 --- a/extension/flat_tensor/test/targets.bzl +++ b/extension/flat_tensor/test/targets.bzl @@ -40,16 +40,13 @@ def define_common_targets(is_fbcode=False): } runtime.cxx_test( - name = "flat_tensor_data_map", + name = "flat_tensor_data_map_test", srcs = [ "flat_tensor_data_map_test.cpp", ], deps = [ "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/flat_tensor:flat_tensor_data_map", - "//executorch/extension/flat_tensor/serialize:flat_tensor_header", - "//executorch/extension/flat_tensor/serialize:generated_headers", - "//executorch/extension/flat_tensor/serialize:schema", "//executorch/runtime/core:named_data_map", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/extension/kernel_util/test/CMakeLists.txt b/extension/kernel_util/test/CMakeLists.txt index ee86c41d23e..0e4ce6c462a 100644 --- a/extension/kernel_util/test/CMakeLists.txt +++ b/extension/kernel_util/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
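
The new `LoadInto` test above doubles as a usage reference for the reworked, loader-backed map. Here is a compact sketch of the same flow (the `.ptd` path and tensor key are placeholders; the calls mirror the `Result`/`Error` conventions used in the diff, and the `read_tensor` helper is illustrative):

```cpp
// Sketch only: pull one named tensor out of a .ptd file into a caller buffer.
#include <cstdlib>

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>

using executorch::extension::FileDataLoader;
using executorch::extension::FlatTensorDataMap;
using executorch::runtime::Error;

Error read_tensor(const char* ptd_path, const char* key) {
  auto loader = FileDataLoader::from(ptd_path);
  if (!loader.ok()) {
    return loader.error();
  }
  // The map holds a pointer to the loader, so the loader must outlive it.
  auto map = FlatTensorDataMap::load(&loader.get());
  if (!map.ok()) {
    return map.error();
  }
  auto metadata = map->get_metadata(key);
  if (!metadata.ok()) {
    return metadata.error();
  }
  // load_data_into() resolves segment_base_offset + segment offset + tensor
  // offset and reads straight from the file into the caller's buffer.
  void* buffer = std::malloc(metadata->nbytes());
  Error err = map->load_data_into(key, buffer, metadata->nbytes());
  std::free(buffer);
  return err;
}
```

Note the design change the diff makes: instead of eagerly mapping one read-only segment at `load()` time, the map now keeps the header and loader and fetches each tensor's bytes on demand, which is what makes multiple segments and mutable loads possible.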
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs make_boxed_from_unboxed_functor_test.cpp) diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 4b793905339..fd2ead6c8b0 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -22,8 +22,8 @@ endif() set(_common_compile_options -Wno-deprecated-declarations -fPIC) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. @@ -45,6 +45,22 @@ list(APPEND custom_ops_libs cpuinfo) list(APPEND custom_ops_libs cpublas) list(APPEND custom_ops_libs eigen_blas) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv7)$") + list(APPEND _custom_ops__srcs + "extension/llm/custom_ops/spinquant/third-party/FFHT/fht_neon.c" + ) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)") + list(APPEND _custom_ops__srcs + "extension/llm/custom_ops/spinquant/third-party/FFHT/fht_avx.c" + ) +else() + message( + FATAL_ERROR + "Unsupported CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}. (If \ 32-bit x86, try using fht_avx.c and send a PR if it works!)" ) +endif() + list(TRANSFORM _custom_ops__srcs PREPEND "${EXECUTORCH_ROOT}/") if(NOT EXECUTORCH_BUILD_XNNPACK) @@ -62,16 +78,14 @@ target_include_directories( target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core) target_compile_options( - custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL + custom_ops PUBLIC ${_common_compile_options} ) install(TARGETS custom_ops DESTINATION lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) # Add an AOT library - if(NOT TARGET torch) - find_package(Torch CONFIG REQUIRED) - endif() + find_package_torch() add_library( custom_ops_aot_lib SHARED ${_custom_ops__srcs} @@ -116,10 +130,15 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions - ${_common_compile_options} -DET_USE_THREADPOOL + ${_common_compile_options} ) install(TARGETS custom_ops_aot_lib LIBRARY DESTINATION executorch/extension/llm/custom_ops ) endif() + +add_subdirectory(spinquant/third-party/FFHT) +if(BUILD_TESTING) + add_subdirectory(spinquant/test) +endif() diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py index b3b05db68fb..d299b314816 100644 --- a/extension/llm/custom_ops/custom_ops.py +++ b/extension/llm/custom_ops/custom_ops.py @@ -22,23 +22,19 @@ op2 = torch.ops.llama.fast_hadamard_transform.default assert op2 is not None except: - import glob - - import executorch - # This is needed to ensure that custom ops are registered from executorch.extension.pybindings import portable_lib # noqa # usort: skip # Ideally the package is installed in only one location, but usage of # PYTHONPATH can result in multiple locations. # ATM this is mainly used in CI for qnn runner. 
Will need to revisit this - executorch_package_path = executorch.__path__[-1] - logging.info(f"Looking for libcustom_ops_aot_lib.so in {executorch_package_path}") - libs = list( - glob.glob( - f"{executorch_package_path}/**/libcustom_ops_aot_lib.*", recursive=True - ) - ) + from pathlib import Path + + package_path = Path(__file__).parent.resolve() + logging.info(f"Looking for libcustom_ops_aot_lib.so in {package_path}") + + libs = list(package_path.glob("**/libcustom_ops_aot_lib.*")) + assert len(libs) == 1, f"Expected 1 library but got {len(libs)}" logging.info(f"Loading custom ops library: {libs[0]}") torch.ops.load_library(libs[0]) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index d23572d8d04..14b2bc694a6 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include @@ -452,6 +452,7 @@ void cpu_flash_attention( // However, lets just fix that as well. int64_t num_keys = is_causal ? std::min(m + start_pos + qBlockSize, kvSize) : kvSize; + int64_t m_start_pos = m + start_pos; auto j_kv = j / num_reps; for (int64_t n = 0; n < num_keys; n += kvSplitSize) { int64_t kvBlockSize = std::min(kvSplitSize, kvSize - n); @@ -471,29 +472,62 @@ void cpu_flash_attention( static_cast(0), qk_data, kvBlockSize); - // Apply causal mask, fill unused, i.e. future values, with -inf - // Say you have q @ k.T size = [16, 32] - // With qblock size = 4, say you are processing - // q seq len dim = 8:11. - // Say kvSplitSize = 4 - // Then for causal mask, the entries that needs to be - // ignored are - // [8, 9:31], [9, 10:31], [10, 10:31], [11, 11:31] - // Following condition says that num_keys = 8 + 4 =12 - // (num_keys - n) <= kvSplitSize - // num_keys <= n + kvSplitSize - // If n + kvSplitSize is larger than 12, then some - // entries need masked out. In our example n = 4 - // will qualify for that - if (is_causal && num_keys - n <= kvSplitSize) { + // There are 4 cases that is_causal has to cover to fill + // not-attendable-position with -inf + /* 1. Everything is attended to. This happens when m_start_pos > n + + kvSplitSize e.g m_pos [8:15] and n_pos [0:7]. Since you must attend to + all previous tokens matrix is full + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2. Everything is not attended to. However only some tokens at the + beginning dont attend to everything. This happens when m_start_pos <= n + + kvSplitSize but m_start_pos + qBlockSize > n + kvSplitSize m_start_pos + = 8 qBlockSize = 8 n = 4 kvSplitSize = 8 For example m_pos [8:15] but + n_pos is [4:11] + + + + + + - - - + + + + + + + - - + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 3. In this case only last few tokens have something to attend to. + This happens when m_start_pos < n and m_start_pos + qBlockSize >= n and + m_start_pos + qBlockSize <= n + kvSplitSize m_start_pos = 8 qBlockSize = + 8 n = 13 kvSplitSize = 8 For example m_pos [8:15] but n_pos is [13:20] + - - - - - - - - + - - - - - - - - + - - - - - - - - + - - - - - - - - + - - - - - - - - + + - - - - - - - + + + - - - - - - + + + + - - - - - + 4. In this no tokens attend to anything, but we dont really have to + take care of this case because the loop for (int64_t n = 0; n < + num_keys; n += kvSplitSize) will exit before that. 
+ */ + if (is_causal && m_start_pos <= n + kvSplitSize) { // For this fn to work k_split_size > q_split_size - for (int32_t row = 0; row < qBlockSize; ++row) { - int64_t last_col = m + (row + start_pos) - n; + for (int32_t row = 0; + row < qBlockSize && (m_start_pos + row < n + (kvSplitSize - 1)); + ++row) { + // When last_col is 0, the entire row is not attended to because + // m_pos is smaller than n_pos; everything in this n block lies in + // the future. + int64_t last_col = + n > (m_start_pos + row) ? 0 : row + m_start_pos + 1 - n; accum_t* row_ptr = qk_data + row * kvBlockSize; fill_stub( - row_ptr + last_col + 1, + row_ptr + last_col, -std::numeric_limits::infinity(), - kvBlockSize - last_col - 1); + kvBlockSize - last_col); } } // Update attention weights with attention mask @@ -594,46 +628,46 @@ bool validate_flash_attention_args( const Tensor& key, const Tensor& value, const optional& attn_mask) { - ET_LOG_MSG_AND_RETURN_IF_FALSE(query.dim() == 4, "query must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE(key.dim() == 4, "key must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE(value.dim() == 4, "value must be a 4D tensor"); + ET_CHECK_OR_RETURN_FALSE(query.dim() == 4, "query must be a 4D tensor"); + ET_CHECK_OR_RETURN_FALSE(key.dim() == 4, "key must be a 4D tensor"); + ET_CHECK_OR_RETURN_FALSE(value.dim() == 4, "value must be a 4D tensor"); // Sizes - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (query.size(3) == value.size(3)) && (key.size(3) == value.size(3)), "scaled_dot_product_attention_flash_attention: Q/K/V should have the same head size"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (query.scalar_type() == ScalarType::Float), "Query must be Float type"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (query.scalar_type() == key.scalar_type()) && (query.scalar_type() == value.scalar_type()), "Key and Value must have the same data type as Query"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !attn_mask.has_value() || attn_mask.value().dim() == 2, "Attention mask must be a 2D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !attn_mask.has_value() || attn_mask.value().scalar_type() == query.scalar_type(), "Attention mask must have the same data type as Query"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order(query.dim_order().data(), query.dim()), "query must be in contiguous dim order"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order(key.dim_order().data(), key.dim()), "key must be in contiguous dim order"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order(value.dim_order().data(), value.dim()), "value must be in contiguous dim order"); if (attn_mask.has_value()) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order( attn_mask.value().dim_order().data(), attn_mask.value().dim()), "attention mask must be in contiguous dim order"); @@ -647,21 +681,19 @@ bool validate_cache_params( const Tensor& v_cache, int64_t start_pos, int64_t seq_length) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - k_cache.dim() == 4, "kcache must be a 4D tensor"); + ET_CHECK_OR_RETURN_FALSE(k_cache.dim() == 4, "kcache must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - v_cache.dim() == 4, "v_cache must be a 4D tensor"); + ET_CHECK_OR_RETURN_FALSE(v_cache.dim() == 4, "v_cache must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( start_pos
< k_cache.size(1), "start_pos must be less than key cache at dim 1"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( start_pos < v_cache.size(1), "start_pos must be less than value cache at dim 1"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (start_pos + seq_length) <= k_cache.size(1), "start_post + seq_length must be less than max seq length supported by key cache." "start pos: %" PRId64 ", seq_length: %" PRId64 @@ -671,7 +703,7 @@ bool validate_cache_params( seq_length, k_cache.size(1)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (start_pos + seq_length) <= v_cache.size(1), "start_post + seq_length must be less than max seq length supported by key cache." "start pos: %" PRId64 ", seq_length: %" PRId64 @@ -682,11 +714,11 @@ bool validate_cache_params( v_cache.size(1)); // Make sure they are in contiguous dim order - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order(k_cache.dim_order().data(), k_cache.dim()), "key cache must be in contiguous dim order"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order(v_cache.dim_order().data(), v_cache.dim()), "value cache must be in contiguous dim order"); diff --git a/extension/llm/custom_ops/op_update_cache.cpp b/extension/llm/custom_ops/op_update_cache.cpp index bbc0190dab1..323b7a65ddb 100644 --- a/extension/llm/custom_ops/op_update_cache.cpp +++ b/extension/llm/custom_ops/op_update_cache.cpp @@ -25,17 +25,17 @@ bool validate_cache_params( const Tensor& quantized_cache, int64_t start_pos, int64_t seq_length) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( quantized_cache.dim() == 4, "quantized cache must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( quantized_value.dim() == 4, "quantized_value must be a 4D tensor"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( start_pos < quantized_cache.size(1), "start_pos must be less than cache size at dim 1"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (start_pos + seq_length) <= quantized_cache.size(1), "start_post + seq_length must be less than max seq length supported by cache." "start pos: %" PRId64 ", seq_length: %" PRId64 @@ -46,12 +46,12 @@ bool validate_cache_params( quantized_cache.size(1)); // Make sure they are in contiguous dim order - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order( quantized_cache.dim_order().data(), quantized_cache.dim()), "quantized cache must be in contiguous dim order"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( is_contiguous_dim_order( quantized_value.dim_order().data(), quantized_value.dim()), "quantized value must be in contiguous dim order"); diff --git a/extension/llm/custom_ops/spinquant/test/CMakeLists.txt b/extension/llm/custom_ops/spinquant/test/CMakeLists.txt new file mode 100644 index 00000000000..7bca0df6513 --- /dev/null +++ b/extension/llm/custom_ops/spinquant/test/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# @generated by test/utils/generate_gtest_cmakelists.py +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) 
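The op_sdpa.cpp and op_update_cache.cpp hunks above migrate every argument check from ET_LOG_MSG_AND_RETURN_IF_FALSE to ET_CHECK_OR_RETURN_FALSE. A minimal sketch of a validator written against the new macro; validate_new_cache is a made-up name, and the macro's definition is not shown in this diff, only its call shape (a condition plus a printf-style message) is assumed from the call sites above:

#include <cinttypes>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

// Hypothetical validator following the pattern used in op_update_cache.cpp.
bool validate_new_cache(
    const executorch::aten::Tensor& cache,
    int64_t start_pos) {
  // Each check logs its message and returns false when the condition fails.
  ET_CHECK_OR_RETURN_FALSE(cache.dim() == 4, "cache must be a 4D tensor");
  ET_CHECK_OR_RETURN_FALSE(
      start_pos < cache.size(1),
      "start_pos %" PRId64 " must be less than cache size at dim 1",
      start_pos);
  return true;
}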
+ +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + +set(_test_srcs + fast_hadamard_transform_test.cpp fast_hadamard_transform_test_impl.cpp + op_fast_hadamard_transform_test.cpp +) + +et_cxx_test( + extension_llm_custom_ops_spinquant_test SOURCES ${_test_srcs} EXTRA_LIBS + custom_ops dumb_fht +) diff --git a/extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeLists.txt b/extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeLists.txt new file mode 100644 index 00000000000..2e3089be72e --- /dev/null +++ b/extension/llm/custom_ops/spinquant/third-party/FFHT/CMakeLists.txt @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +add_library(dumb_fht dumb_fht.c) diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py index 9c8029c7b70..a1f054a153e 100644 --- a/extension/llm/custom_ops/test_sdpa_with_kv_cache.py +++ b/extension/llm/custom_ops/test_sdpa_with_kv_cache.py @@ -590,3 +590,14 @@ def test_sdpa_with_cache_seq_len_llava_example_gqa(self): self._test_sdpa_common( n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len ) + + def test_sdpa_to_repro_long_seq_failure(self): + n_heads_kv = 16 + n_heads_q = 32 + head_dim = 128 + max_seq_len = 2048 + seq_len = 508 + next_iter_seq_len = 127 + self._test_sdpa_common( + n_heads_kv, n_heads_q, head_dim, max_seq_len, seq_len, next_iter_seq_len + ) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 88d2bc0cab9..751e2d16175 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -21,7 +21,7 @@ DuplicateDynamicQuantChainPass, ) from executorch.backends.xnnpack._passes.convert_to_linear import ConvertToLinearPass -from executorch.exir import EdgeProgramManager +from executorch.exir import EdgeProgramManager, to_edge_transform_and_lower from executorch.exir.backend.partitioner import Partitioner from executorch.exir.backend.utils import format_delegated_graph @@ -39,7 +39,7 @@ from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer import Quantizer from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer -from torch.export import export_for_training +from torch.export import export_for_training, ExportedProgram from torch.nn.attention import SDPBackend FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -61,6 +61,17 @@ def to_torch_dtype(self) -> torch.dtype: raise ValueError(f"Unsupported dtype {self}") return mapping[self] + @staticmethod + def from_torch_dtype(dtype: torch.dtype): + mapping = { + torch.float32: DType.fp32, + torch.float16: DType.fp16, + torch.bfloat16: DType.bf16, + } + if dtype not in mapping: +
raise ValueError(f"Unsupported torch.dtype {dtype}") + return mapping[dtype] + class LLMEdgeManager: """ @@ -89,8 +100,11 @@ def __init__( dynamic_shapes: Optional[Any] = None, ): self.model = model - # graph module returned from export() - self.pre_autograd_graph_module: Optional[torch.fx.GraphModule] = None + # Note: treat this as the source of truth for the result of + # torch.export'ing a model. If the overall ExportedProgram is needed, + # make sure to re-export this graph module to persist any changes. See + # https://github.com/pytorch/pytorch/blob/main/torch/export/exported_program.py#L921 + self.pre_autograd_graph_module: Optional[torch.nn.Module] = None self.modelname = modelname self.max_seq_len = max_seq_len self.dtype = dtype @@ -184,7 +198,7 @@ def _get_edge_config(self) -> EdgeCompileConfig: ) return edge_config - def export(self) -> "LLMEdgeManager": + def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram: dynamic_shape = self._get_dynamic_shape() # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up) @@ -201,29 +215,42 @@ def export(self) -> "LLMEdgeManager": # TODO: this is temporary and export_for_training doesn't work with qnn either. We need a # functional graph. See issue https://github.com/pytorch/executorch/pull/4627 for more details exported_module = torch.export.export( - self.model, + self.model if not module else module, self.example_inputs, self.example_kwarg_inputs, dynamic_shapes=dynamic_shape, strict=True, ) else: - logging.info("Exporting with:") + if module: + logging.info("Re-exporting with:") + else: + logging.info("Exporting with:") logging.info(f"inputs: {self.example_inputs}") logging.info(f"kwargs: {self.example_kwarg_inputs}") logging.info(f"dynamic shapes: {dynamic_shape}") exported_module = export_for_training( - self.model, + self.model if not module else module, self.example_inputs, kwargs=self.example_kwarg_inputs, dynamic_shapes=dynamic_shape, ) - # pyre-fixme[8]: Attribute has type `Optional[GraphModule]`; used as - # `Module`. - self.pre_autograd_graph_module = exported_module.module() - if hasattr(self.args, "export_only") and self.args.export_only: - torch.export.save(exported_module, self.args.output_name) + return exported_module + def export(self) -> "LLMEdgeManager": + """ + Exports the model pre-autograd. This is not a full export, since it uses + torch.export_for_training() to keep autograd-safe ops from getting decomposed. + The full torch.export() if called later on during to_edge() or + to_edge_transform_and_lower(). + """ + exported_module = self._export() + # Need to store the graph module to record transformation passes. + # Persisting those changes back to an ExportedProgram will require + # an additional export(). 
+ self.pre_autograd_graph_module = exported_module.module() + if hasattr(self.args, "export_only") and self.args.export_only: + torch.export.save(exported_module, self.args.output_name) return self def run_canonical_optimizations(self): @@ -330,7 +357,10 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage assert ( self.pre_autograd_graph_module is not None ), "Please run export() first" - m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer) + m = prepare_pt2e( + self.pre_autograd_graph_module, # pyre-ignore[6] + composed_quantizer, + ) logging.info( f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" ) @@ -357,7 +387,10 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage logging.info( "No calibration provided, using dummy input to calibrate..." ) - m(*self.example_inputs) + if self.example_kwarg_inputs: + m(*self.example_inputs, **self.example_kwarg_inputs) + else: + m(*self.example_inputs) m = convert_pt2e(m) DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m @@ -430,6 +463,26 @@ def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManag return self + def to_edge_transform_and_lower( + self, partitioners: Optional[List[Partitioner]] + ) -> "LLMEdgeManager": + if partitioners is None: + logging.info("No partitioner provided, skipping backend lowering...") + + # Need to construct ExportedProgram with the new transformed graph module. + exported_module = self._export(self.pre_autograd_graph_module) + + edge_config = self._get_edge_config() + self.edge_manager = to_edge_transform_and_lower( + exported_module, + partitioner=partitioners, + compile_config=edge_config, + constant_methods=self.metadata, + ) + if self.verbose: + logging.info(f"Exported graph:\n{self.edge_manager.exported_program()}") + return self + def to_executorch( self, passes: Optional[List[ExportPass]] = None ) -> "LLMEdgeManager": diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index 55e530553f0..40d81075d9f 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -107,10 +107,8 @@ def check_embedding_byte_registered(): raise RuntimeError( "Need to specify shared library path to register quantized ops (and their out variants) into EXIR.\n" "Follow the following steps to build the needed lib via cmake.\n" - 'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n' - "Set that as TORCH_PACKAGE_DIR.\n" "Then from root executorch dir do the following:\n" - "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON ..) && cmake --build . -j16\n" + "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON ..) && cmake --build . -j16\n" 'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n' "Then specify the said library via -s _srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. 
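Several of the runner hunks below (multimodal_runner.h, text_token_generator.h, util.h) switch from the in-tree Tokenizer to ::tokenizers::Tokenizer from the pytorch/tokenizers submodule, whose calls return a tokenizers result type rather than an executorch one. A sketch of the decode path under the new API, mirroring the text_token_generator.h change; emit_token is a hypothetical wrapper, and the decode signature is assumed from its use in this diff:

#include <cstdint>
#include <functional>
#include <string>

// ET_UNWRAP_TOKENIZER (defined in util.h below) converts a failed tokenizers
// result into Error::InvalidArgument; otherwise it yields the unwrapped value.
executorch::runtime::Error emit_token(
    ::tokenizers::Tokenizer* tokenizer,
    uint64_t prev_token,
    uint64_t cur_token,
    const std::function<void(const std::string&)>& token_callback) {
  token_callback(ET_UNWRAP_TOKENIZER(tokenizer->decode(prev_token, cur_token)));
  return executorch::runtime::Error::Ok;
}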
@@ -51,5 +51,5 @@ target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) target_include_directories( extension_llm_runner INTERFACE ${_common_include_directories} - ${EXECUTORCH_ROOT} + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 94539c65cc6..c17e039c11b 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -26,8 +26,8 @@ #include #include #include -#include #include +#include namespace executorch { namespace extension { @@ -129,7 +129,7 @@ class ET_EXPERIMENTAL MultimodalRunner { std::unique_ptr image_prefiller_; std::unique_ptr text_token_generator_; std::string tokenizer_path_; - std::unique_ptr tokenizer_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; // stats Stats stats_; diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index aa42c22b1b9..03b593cacf5 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -49,7 +49,7 @@ def define_common_targets(): ], exported_deps = [ ":text_decoder_runner" + aten_suffix, - "//executorch/extension/llm/tokenizer:tokenizer_header", + "//pytorch/tokenizers:headers", "//executorch/extension/module:module" + aten_suffix, "//executorch/extension/tensor:tensor" + aten_suffix, ], @@ -63,7 +63,7 @@ def define_common_targets(): ], exported_deps = [ ":text_decoder_runner" + aten_suffix, - "//executorch/extension/llm/tokenizer:tokenizer_header", + "//pytorch/tokenizers:headers", "//executorch/extension/module:module" + aten_suffix, "//executorch/extension/tensor:tensor" + aten_suffix, ], diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h index 2f1d5ae2b75..007f8188f56 100644 --- a/extension/llm/runner/text_prefiller.h +++ b/extension/llm/runner/text_prefiller.h @@ -12,8 +12,6 @@ #pragma once #include -#include -#include namespace executorch { namespace extension { diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h index 62b924a57d8..e8bf891f8ec 100644 --- a/extension/llm/runner/text_token_generator.h +++ b/extension/llm/runner/text_token_generator.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include namespace executorch { namespace extension { @@ -21,7 +21,7 @@ namespace llm { class ET_EXPERIMENTAL TextTokenGenerator { public: TextTokenGenerator( - Tokenizer* tokenizer, + ::tokenizers::Tokenizer* tokenizer, TextDecoderRunner* text_decoder_runner, bool use_kv_cache, std::unique_ptr>&& eos_ids, @@ -106,7 +106,8 @@ class ET_EXPERIMENTAL TextTokenGenerator { } // print the token as string, decode it with the Tokenizer object - token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token))); + token_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token))); if (should_stop_) { break; @@ -130,7 +131,7 @@ class ET_EXPERIMENTAL TextTokenGenerator { } private: - Tokenizer* tokenizer_; + ::tokenizers::Tokenizer* tokenizer_; TextDecoderRunner* text_decoder_runner_; std::unique_ptr> eos_ids_; bool use_kv_cache_; diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index 04d4eccc4a7..d6ab23827f9 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -15,6 +15,30 @@ #include #endif +#define ET_UNWRAP_TOKENIZER(result__) \ + ({ \ + auto tk_result__ = (result__); \ + if (!tk_result__.ok()) { \ + ET_LOG( \ + Error, \ + "Tokenizers error code %d", \ + 
static_cast(tk_result__.error())); \ + return ::executorch::runtime::Error::InvalidArgument; \ + } \ + std::move(*tk_result__); \ + }) + +#define ET_CHECK_TK_OK_OR_RETURN_ERROR(result__, ...) \ + ({ \ + auto tk_result__ = (result__); \ + if (tk_result__ != ::tokenizers::Error::Ok) { \ + ET_LOG( \ + Error, "Tokenizer error: %d", static_cast(tk_result__)); \ + ET_CHECK_OK_OR_RETURN_ERROR( \ + ::executorch::runtime::Error::InvalidArgument, ##__VA_ARGS__); \ + } \ + }) + namespace executorch { namespace extension { namespace llm { diff --git a/extension/llm/third-party/TARGETS b/extension/llm/third-party/TARGETS deleted file mode 100644 index 978c12371fe..00000000000 --- a/extension/llm/third-party/TARGETS +++ /dev/null @@ -1,47 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.cxx_library( - name = "abseil", - public_include_directories = ["abseil-cpp"], - srcs = glob( - ["abseil-cpp/absl/**/*.cc"], - exclude = [ - "abseil-cpp/absl/**/*test*.cc", - "abseil-cpp/absl/**/*mock*.cc", - "abseil-cpp/absl/**/*matchers*.cc", - "abseil-cpp/absl/**/*benchmark*.cc", - ], - ), - exported_linker_flags = select( - { - "DEFAULT": [], - "ovr_config//os:macos": ["-Wl,-framework,CoreFoundation"], - }, - ), - visibility = ["PUBLIC"], - _is_external_target = True, -) - -runtime.cxx_library( - name = "re2", - public_include_directories = ["re2"], - srcs = glob( - [ - "re2/re2/**/*.cc", - "re2/util/**/*.cc", - ], - exclude = [ - "re2/re2/**/*test*.cc", - "re2/re2/testing/*.cc", - "re2/re2/fuzzing/*.cc", - "re2/re2/**/*benchmark*.cc", - ], - ), - exported_deps = [ - ":abseil", - ], - visibility = ["PUBLIC"], - _is_external_target = True, -) diff --git a/extension/llm/third-party/abseil-cpp b/extension/llm/third-party/abseil-cpp deleted file mode 160000 index eb852207758..00000000000 --- a/extension/llm/third-party/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit eb852207758a773965301d0ae717e4235fc5301a diff --git a/extension/llm/third-party/re2 b/extension/llm/third-party/re2 deleted file mode 160000 index 6dcd83d60f7..00000000000 --- a/extension/llm/third-party/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6dcd83d60f7944926bfd308cc13979fc53dd69ca diff --git a/extension/llm/third-party/sentencepiece b/extension/llm/third-party/sentencepiece deleted file mode 160000 index 6225e08edb2..00000000000 --- a/extension/llm/third-party/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6225e08edb2577757163b3f5dbba4c0b670ef445 diff --git a/extension/llm/tokenizer/CMakeLists.txt b/extension/llm/tokenizer/CMakeLists.txt index 8745da6780a..779d9190187 100644 --- a/extension/llm/tokenizer/CMakeLists.txt +++ b/extension/llm/tokenizer/CMakeLists.txt @@ -21,11 +21,11 @@ set(ABSL_PROPAGATE_CXX_STD ON) set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp ) add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2 + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2 ) set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) @@ -35,6 +35,7 @@ add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs}) target_include_directories( extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/.. 
${_common_include_directories} + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/include ) target_link_libraries(extension_llm_tokenizer re2::re2) @@ -53,7 +54,7 @@ install( target_include_directories( extension_llm_tokenizer PRIVATE ${CMAKE_INSTALL_PREFIX}/include - ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp ) if(BUILD_TESTING) diff --git a/extension/llm/tokenizer/string_integer_map.h b/extension/llm/tokenizer/string_integer_map.h new file mode 100644 index 00000000000..e8ff2d023e0 --- /dev/null +++ b/extension/llm/tokenizer/string_integer_map.h @@ -0,0 +1,569 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +/** + * StringIntegerMap is an immutable bidirectional map between strings and 64-bit + * unsigned integers. The element data is stored in a contiguous array and is + * shared between both the string buckets and the integer buckets, offering a + * compact representation. + * + * Variable sized integers are used internally, which are sized based on the + * data being stored. Custom hash functions are supported, with a stateful hash + * functor being optionally provided at construction time. + */ +template < + typename TStringHash = std::hash, + typename TIntegerHash = std::hash, + typename TAllocator = std::allocator> +class StringIntegerMap { + public: + /// @name Constructors + /// @{ + + /// Default constructor is deleted, as this container is intended to be + /// constructed with a map of strings to integers. + StringIntegerMap() = delete; + + /** + * Construct a StringIntegerMap from a map of strings to integers. Each + * string and integer in the map must be unique. + * @param map map of strings to integers + */ + explicit StringIntegerMap( + const std::unordered_map& map); + + /** + * Construct a StringIntegerMap from a map of strings to integers, explicitly + * initializing the integer and string hash objects. Each string and integer + * in the map must be unique. + * @param map map of strings to integers + */ + StringIntegerMap( + const std::unordered_map& map, + TStringHash string_hasher, + TIntegerHash integer_hasher); + + /// @} + /// @name Accessors + /// @{ + + /** + * Attempts to retrieve the integer mapped for the given string. + * @param str string to lookup + * @return a std::optional containing the integer if the string was found, + * std::nullopt otherwise + */ + std::optional tryGetInteger(std::string_view str) const; + + /** + * Attempts to retrieve the string mapped for the given integer.
+ * @param integer integer to lookup + * @return a std::optional containing the string if the integer was found, + * std::nullopt otherwise + */ + std::optional tryGetString(std::uint64_t integer) const; + + /// @} + + private: + template + class VariableSizedInteger { + public: + VariableSizedInteger() = default; + + explicit VariableSizedInteger(TLogical max_value) { + while (max_value != 0) { + ++byte_count_; + max_value >>= 8; + } + + mask_ = (TLogical(1) << (byte_count_ * 8)) - TLogical(1); + } + + std::size_t getByteCount() const { + return byte_count_; + } + + TLogical getMask() const { + return mask_; + } + + std::uint8_t* write(std::uint8_t* target, TLogical value) const { + std::memcpy(target, &value, byte_count_); + return target + byte_count_; + } + + TLogical read(const std::uint8_t* source) const { + TLogical value; + std::memcpy(&value, source, sizeof(TLogical)); + return value & mask_; + } + + private: + std::size_t byte_count_ = 0; + TLogical mask_ = 0; + }; + + bool tryGetInteger(std::string_view str, std::uint64_t& result) const; + + bool tryGetString(std::uint64_t integer, std::string_view& result) const; + + std::size_t getBucketIndex(std::string_view value) const; + + std::size_t getBucketIndex(std::uint64_t value) const; + + static std::uint8_t getSmallHash(std::size_t hash); + + /// The hasher used for strings. + const TStringHash string_hasher_ = {}; + + /// The hasher used for integers. + const TIntegerHash integer_hasher_ = {}; + + /// Integer bucket references. + std::vector integer_bucket_data_; + + /// Integer bucket elements. + /// Laid out as: + /// struct { + /// std::uint64_t integer; - Physically using integer_ bytes. + /// std::size_t string_size; - Physically using string_size_ bytes + /// std::size_t string_offset; - Physically using string_offset_ bytes + /// } + std::vector integer_element_data_; + + /// String bucket references. + std::vector string_bucket_data_; + + /// String bucket elements. + /// Laid out as: + /// struct { + /// std::uint64_t integer; - Physically using integer_ bytes. + /// std::size_t string_size; - Physically using string_size_ bytes + /// std::uint8_t small_hash; - Using std::uint8_t bytes. + /// char string[string_size]; - String data, not zero terminated. + /// } + std::vector string_element_data_; + + /// Number of hash buckets to use. + std::size_t bucket_count_ = 0; + + /// Variable sized element offset info. + VariableSizedInteger element_offset_; + + /// Variable sized string offset info. + VariableSizedInteger string_offset_; + + /// Variable sized string size info. + VariableSizedInteger string_size_; + + /// Variable sized integer info. + VariableSizedInteger integer_; +}; + +template +StringIntegerMap::StringIntegerMap( + const std::unordered_map& map) + : StringIntegerMap(map, TStringHash(), TIntegerHash()) {} + +template +StringIntegerMap::StringIntegerMap( + const std::unordered_map& map, + TStringHash string_hasher, + TIntegerHash integer_hasher) + : string_hasher_(string_hasher), integer_hasher_(integer_hasher) { + assert(map.size() <= std::numeric_limits::max()); + bucket_count_ = map.size(); + + struct BuilderElement { + std::uint64_t integer = 0; + std::string_view string; + std::size_t hash = 0; + std::size_t element_offset = 0; + }; + + std::vector builder_string_elements; + std::vector builder_integer_elements; + + // + // Calculate various item sizes and gather the builder elements.
+ // + + std::size_t largest_string_size = 0; + std::uint64_t largest_integer = 0; + std::size_t total_string_size = 0; + + for (const auto& [str, integer] : map) { + total_string_size += str.size(); + largest_string_size = std::max(largest_string_size, str.size()); + largest_integer = std::max(largest_integer, integer); + builder_string_elements.push_back({integer, str, string_hasher_(str)}); + builder_integer_elements.push_back( + {integer, str, integer_hasher_(integer)}); + } + + integer_ = VariableSizedInteger(largest_integer); + string_size_ = VariableSizedInteger(largest_string_size); + string_offset_ = VariableSizedInteger(total_string_size); + + const auto string_element_data_size = + ((integer_.getByteCount() + string_size_.getByteCount() + 1) * + map.size()) + + total_string_size; + const auto integer_element_size = integer_.getByteCount() + + string_offset_.getByteCount() + string_size_.getByteCount(); + const auto integer_element_data_size = integer_element_size * map.size(); + + element_offset_ = VariableSizedInteger( + std::max(string_element_data_size, integer_element_data_size)); + + string_bucket_data_.resize( + ((bucket_count_ + 1) * element_offset_.getByteCount()) + + sizeof(std::uint64_t)); + integer_bucket_data_.resize( + ((bucket_count_ + 1) * element_offset_.getByteCount()) + + sizeof(std::uint64_t)); + + // + // Set up terminal bucket indices. + // + + element_offset_.write( + string_bucket_data_.data() + + (bucket_count_ * element_offset_.getByteCount()), + string_element_data_size); + element_offset_.write( + integer_bucket_data_.data() + + (bucket_count_ * element_offset_.getByteCount()), + integer_element_data_size); + // + // Sort the builder elements. + // + + std::sort( + std::begin(builder_string_elements), + std::end(builder_string_elements), + [this](const BuilderElement& first, const BuilderElement& second) { + const auto first_bucket = first.hash % bucket_count_; + const auto second_bucket = second.hash % bucket_count_; + if (first_bucket == second_bucket) { + const auto first_small_hash = getSmallHash(first.hash); + const auto second_small_hash = getSmallHash(second.hash); + return first_small_hash < second_small_hash; + } + + return first_bucket < second_bucket; + }); + + std::sort( + std::begin(builder_integer_elements), + std::end(builder_integer_elements), + [this](const BuilderElement& first, const BuilderElement& second) { + const auto first_bucket = first.hash % bucket_count_; + const auto second_bucket = second.hash % bucket_count_; + if (first_bucket == second_bucket) { + return first.integer < second.integer; + } + + return first_bucket < second_bucket; + }); + + // + // Lay out the string elements and record their positions. 
+ // + + std::unordered_map + string_element_byte_index_map; + string_element_data_.resize(string_element_data_size + sizeof(std::uint64_t)); + auto* string_element = string_element_data_.data(); + for (auto& builder_element : builder_string_elements) { + builder_element.element_offset = + string_element - string_element_data_.data(); + + auto insert_result = string_element_byte_index_map.insert( + {builder_element.string, builder_element.element_offset}); + assert(insert_result.second); + (void)insert_result; + + string_element = integer_.write(string_element, builder_element.integer); + string_element = + string_size_.write(string_element, builder_element.string.size()); + *string_element = getSmallHash(builder_element.hash); + string_element++; + std::memcpy( + string_element, + builder_element.string.data(), + builder_element.string.size()); + string_element += builder_element.string.size(); + assert( + string_element >= string_element_data_.data() && + string_element <= + string_element_data_.data() + string_element_data_size); + } + + // + // Lay out the integer elements. + // + + integer_element_data_.resize( + integer_element_data_size + sizeof(std::uint64_t)); + auto* integer_element = integer_element_data_.data(); + for (auto& builder_element : builder_integer_elements) { + builder_element.element_offset = + integer_element - integer_element_data_.data(); + auto string_element_byte_index_iter = + string_element_byte_index_map.find(builder_element.string); + assert( + string_element_byte_index_iter != + std::end(string_element_byte_index_map)); + integer_element = integer_.write(integer_element, builder_element.integer); + integer_element = + string_size_.write(integer_element, builder_element.string.size()); + integer_element = string_offset_.write( + integer_element, string_element_byte_index_iter->second); + assert( + integer_element >= integer_element_data_.data() && + integer_element <= + integer_element_data_.data() + integer_element_data_size); + } + + // + // Both the string elements and integer elements are laid out in order of + // their respective hashes. Generate the hash indexes for the string elements + // and integer elements. + // + + auto builder_string_elements_iter = std::begin(builder_string_elements); + auto builder_integer_elements_iter = std::begin(builder_integer_elements); + + for (std::size_t bucket_idx = 0; bucket_idx < bucket_count_; ++bucket_idx) { + auto* string_bucket = string_bucket_data_.data() + + (bucket_idx * element_offset_.getByteCount()); + if (builder_string_elements_iter != std::end(builder_string_elements)) { + element_offset_.write( + string_bucket, builder_string_elements_iter->element_offset); + } else { + element_offset_.write(string_bucket, string_element_data_size); + } + + auto* integer_bucket = integer_bucket_data_.data() + + (bucket_idx * element_offset_.getByteCount()); + if (builder_integer_elements_iter != std::end(builder_integer_elements)) { + element_offset_.write( + integer_bucket, builder_integer_elements_iter->element_offset); + } else { + element_offset_.write(integer_bucket, integer_element_data_size); + } + + // + // Advance the string element iterator past all string elements that map + // into this bucket. + // + + while (builder_string_elements_iter != std::end(builder_string_elements) && + getBucketIndex(builder_string_elements_iter->string) == bucket_idx) { + ++builder_string_elements_iter; + } + + // + // Advance the integer element index past all integer elements that map into + // this bucket. 
+ // + + while ( + builder_integer_elements_iter != std::end(builder_integer_elements) && + getBucketIndex(builder_integer_elements_iter->integer) == bucket_idx) { + ++builder_integer_elements_iter; + } + } +} + +template +std::optional +StringIntegerMap::tryGetInteger( + std::string_view str) const { + std::uint64_t result; + return tryGetInteger(str, result) ? std::optional(result) + : std::nullopt; +} + +template +bool StringIntegerMap::tryGetInteger( + std::string_view str, + std::uint64_t& result) const { + if (bucket_count_ == 0) { + return false; + } + + const auto hash = string_hasher_(str); + const auto bucket_index = hash % bucket_count_; + const auto small_hash = getSmallHash(hash); + + const auto* bucket_data = string_bucket_data_.data() + + (bucket_index * element_offset_.getByteCount()); + const auto lower_element_offset = element_offset_.read(bucket_data); + const auto upper_element_offset = + element_offset_.read(bucket_data + element_offset_.getByteCount()); + + const auto integer_size = integer_.getByteCount(); + const auto string_size_size = string_size_.getByteCount(); + + std::size_t element_size = 0; + auto* element_data_end = string_element_data_.data() + upper_element_offset; + for (auto* element_data = string_element_data_.data() + lower_element_offset; + element_data < element_data_end; + element_data += element_size) { + // + // Read the string length. + // + + const auto element_string_length = + string_size_.read(element_data + integer_size); + element_size = integer_size + string_size_size + 1 + element_string_length; + + // + // Read the string small hash. + // + + const auto element_small_hash = + element_data[integer_size + string_size_size]; + if (element_small_hash < small_hash) { + continue; + } else if (element_small_hash > small_hash) { + break; + } + + // + // Get a view on the string for a full comparison. + // + + std::string_view element_string( + reinterpret_cast( + element_data + integer_size + string_size_size + 1), + element_string_length); + if (str == element_string) { + result = integer_.read(element_data); + return true; + } + } + + return false; +} + +template +std::optional +StringIntegerMap::tryGetString( + std::uint64_t integer) const { + std::string_view result; + return tryGetString(integer, result) ? 
std::optional(result) + : std::nullopt; +} + +template +bool StringIntegerMap::tryGetString( + std::uint64_t integer, + std::string_view& result) const { + if (bucket_count_ == 0) { + return false; + } + + const auto bucket_index = getBucketIndex(integer); + + const auto* bucket_data = integer_bucket_data_.data() + + (bucket_index * element_offset_.getByteCount()); + const auto lower_element_offset = element_offset_.read(bucket_data); + const auto upper_element_offset = + element_offset_.read(bucket_data + element_offset_.getByteCount()); + + const auto integer_element_size = integer_.getByteCount() + + string_offset_.getByteCount() + string_size_.getByteCount(); + auto* element_data_end = integer_element_data_.data() + upper_element_offset; + for (auto* element_data = integer_element_data_.data() + lower_element_offset; + element_data < element_data_end; + element_data += integer_element_size) { + const auto element_integer = integer_.read(element_data); + if (element_integer == integer) { + const auto element_string_size = + string_size_.read(element_data + integer_.getByteCount()); + const auto element_string_offset = string_offset_.read( + element_data + integer_.getByteCount() + string_size_.getByteCount()); + const auto* string_element = + string_element_data_.data() + element_string_offset; + const auto* string_data = reinterpret_cast( + string_element + integer_.getByteCount() + + string_size_.getByteCount() + 1); + result = std::string_view(string_data, element_string_size); + return true; + } else if (element_integer > integer) { + break; + } + } + + return false; +} + +template +std::size_t +StringIntegerMap::getBucketIndex( + std::string_view value) const { + return string_hasher_(value) % bucket_count_; +} + +template +std::size_t +StringIntegerMap::getBucketIndex( + std::uint64_t value) const { + return integer_hasher_(value) % bucket_count_; +} + +template +std::uint8_t +StringIntegerMap::getSmallHash( + std::size_t hash) { + const auto shift = (sizeof(std::size_t) * 8) - 8; + return static_cast(hash >> shift); +} + +template < + typename TStringHash = std::hash, + typename TIntegerHash = std::hash, + typename TAllocator = std::allocator> +struct StringIntegerMapTypeBuilder { + using Map = StringIntegerMap; + + template + using WithStringHash = + StringIntegerMapTypeBuilder; + + template + using WithIntegerHash = + StringIntegerMapTypeBuilder; + + template + using WithAllocator = + StringIntegerMapTypeBuilder; +}; +} // namespace executorch::extension::llm diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl index be5606ccd2b..7b545054390 100644 --- a/extension/llm/tokenizer/targets.bzl +++ b/extension/llm/tokenizer/targets.bzl @@ -10,6 +10,7 @@ def define_common_targets(): name = "tokenizer_py_lib", srcs = [ "__init__.py", + "hf_tokenizer.py", "tokenizer.py", "utils.py", ], @@ -82,6 +83,7 @@ def define_common_targets(): exported_headers = [ "tiktoken.h", "base64.h", + "string_integer_map.h", ], exported_deps = [ ":tokenizer_header", diff --git a/extension/llm/tokenizer/test/CMakeLists.txt b/extension/llm/tokenizer/test/CMakeLists.txt index ffc37f9e46f..a700b847cbc 100644 --- a/extension/llm/tokenizer/test/CMakeLists.txt +++ b/extension/llm/tokenizer/test/CMakeLists.txt @@ -15,11 +15,11 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) 
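Taken together, the string_integer_map.h header added above boils down to a small immutable API: build once from an unordered_map whose keys and values are both unique, then query in either direction through std::optional. A usage sketch under those assumptions (the vocabulary entries are made up):

#include <cstdint>
#include <string>
#include <unordered_map>
#include <executorch/extension/llm/tokenizer/string_integer_map.h>

using executorch::extension::llm::StringIntegerMap;

int main() {
  std::unordered_map<std::string, std::uint64_t> vocab = {
      {"hello", 31373}, {"world", 995}};
  // Both strings and integers must be unique across the input map.
  StringIntegerMap<> map(vocab);
  // Forward lookup: string -> id; missing keys come back as std::nullopt.
  auto id = map.tryGetInteger("hello");  // contains 31373
  // Reverse lookup: id -> string_view into the map's internal storage.
  auto str = map.tryGetString(995);      // contains "world"
  return (id.has_value() && str.has_value()) ? 0 : 1;
}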
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(test_env "RESOURCES_PATH=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources") -set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp) +set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp test_string_integer_map.cpp) et_cxx_test( extension_llm_tokenizer_test SOURCES ${_test_srcs} EXTRA_LIBS diff --git a/extension/llm/tokenizer/test/targets.bzl b/extension/llm/tokenizer/test/targets.bzl index 2c314a98230..8755ae6273f 100644 --- a/extension/llm/tokenizer/test/targets.bzl +++ b/extension/llm/tokenizer/test/targets.bzl @@ -22,6 +22,20 @@ def define_common_targets(): ], ) + runtime.cxx_test( + name = "test_string_integer_map", + srcs = [ + "test_string_integer_map.cpp", + ], + deps = [ + "//executorch/extension/llm/tokenizer:tiktoken", + ], + env = { + "RESOURCES_PATH": "$(location :resources)/resources", + }, + platforms = [CXX, ANDROID], # Cannot bundle resources on Apple platform. + ) + runtime.cxx_test( name = "test_bpe_tokenizer", srcs = [ @@ -59,3 +73,8 @@ def define_common_targets(): "resources/**", ]), ) + + runtime.export_file( + name = "test_tiktoken_tokenizer_model", + src = "resources/test_tiktoken_tokenizer.model", + ) diff --git a/extension/llm/tokenizer/test/test_string_integer_map.cpp b/extension/llm/tokenizer/test/test_string_integer_map.cpp new file mode 100644 index 00000000000..24a9853429d --- /dev/null +++ b/extension/llm/tokenizer/test/test_string_integer_map.cpp @@ -0,0 +1,318 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef EXECUTORCH_FB_BUCK +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#if defined(__APPLE__) || defined(WIN32) || defined(__linux__) +#define TEST_MEMORY_COMPARISON 1 + +#if defined(__APPLE__) +#include +#else +#include +#endif +#endif + +using namespace ::testing; +using ::executorch::extension::llm::StringIntegerMap; +using ::executorch::extension::llm::StringIntegerMapTypeBuilder; +using ::executorch::extension::llm::base64::decode; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; +using TokenizerMap = std::unordered_map; + +class StringIntegerMapTest : public Test { + public: + void SetUp() override { + ::executorch::runtime::runtime_init(); +#ifdef EXECUTORCH_FB_BUCK + modelPath_ = facebook::xplat::testing::getPathForTestResource( + "resources/test_tiktoken_tokenizer.model"); +#else + modelPath_ = std::getenv("RESOURCES_PATH") + + std::string("/test_tiktoken_tokenizer.model"); +#endif + } + + Result loadModel() { + std::ifstream file(modelPath_); + ET_CHECK_OR_RETURN_ERROR( + file, + InvalidArgument, + "failed to open encoder file: %s", + modelPath_.c_str()); + + TokenizerMap model; + for (std::string line; std::getline(file, line);) { + if (line.empty()) { + continue; + } + + auto pos = line.find(' '); + auto token = ET_UNWRAP(decode({line.data(), pos})); + uint64_t rank = 0; + try { + rank = std::stoul(line.substr(pos + 1)); + } catch (const std::exception&) { + ET_CHECK_OR_RETURN_ERROR( + false, InvalidArgument, "invalid encoder rank: %s", line.c_str()); + } + model[token] = rank; + } + + return model; + } + + std::string modelPath_; +}; + +#if defined(TEST_MEMORY_COMPARISON) && TEST_MEMORY_COMPARISON + +class TrackingAllocatorBase { + public: + static void reset(); + static 
std::size_t getSize(); + + protected: + static void* allocate(std::size_t size); + static void deallocate(void* ptr); + + static std::size_t size_; +}; + +void TrackingAllocatorBase::reset() { + size_ = 0; +} + +std::size_t TrackingAllocatorBase::getSize() { + return size_; +} + +void* TrackingAllocatorBase::allocate(std::size_t size) { + void* ptr = malloc(size); + if (!ptr) { + return nullptr; + } + +#if defined(WIN32) + size_ += _msize(ptr); +#elif defined(__APPLE__) + size_ += malloc_size(const_cast(ptr)); +#else + size_ += malloc_usable_size(ptr); +#endif + + return ptr; +} + +void TrackingAllocatorBase::deallocate(void* ptr) { + if (!ptr) { + return; + } + +#if defined(WIN32) + size_ -= _msize(ptr); +#elif defined(__APPLE__) + size_ -= malloc_size(const_cast(ptr)); +#else + size_ -= malloc_usable_size(ptr); +#endif + + free(ptr); +} + +std::size_t TrackingAllocatorBase::size_ = 0; + +template +class TrackingAllocator : public TrackingAllocatorBase { + public: + using value_type = T; + TrackingAllocator() noexcept = default; + template + explicit TrackingAllocator(TrackingAllocator const&) noexcept {} + + value_type* allocate(std::size_t count) { + return static_cast( + TrackingAllocatorBase::allocate(count * sizeof(value_type))); // NOLINT + } + + void deallocate(value_type* ptr, std::size_t /*count*/) noexcept { + TrackingAllocatorBase::deallocate(ptr); + } +}; + +template +bool operator==( + TrackingAllocator const&, + TrackingAllocator const&) noexcept { + return true; +} + +template +bool operator!=( + TrackingAllocator const& lhs, + TrackingAllocator const& rhs) noexcept { + return !(lhs == rhs); +} + +#endif + +TEST_F(StringIntegerMapTest, CreateFromModel) { + const auto res = loadModel(); + ASSERT_EQ(res.ok(), true); + const auto& model = res.get(); + StringIntegerMap map(model); + + for (const auto& [model_key, model_value] : model) { + EXPECT_THAT(map.tryGetInteger(model_key), testing::Optional(model_value)) + << model_key; + EXPECT_THAT(map.tryGetString(model_value), testing::Optional(model_key)) + << model_value; + } + + EXPECT_FALSE(map.tryGetInteger("Ich weiß nicht")); + EXPECT_FALSE(map.tryGetString(999999999)); +} + +#if defined(TEST_MEMORY_COMPARISON) && TEST_MEMORY_COMPARISON + +TEST_F(StringIntegerMapTest, MemoryConsumptionComparison) { + TrackingAllocatorBase::reset(); + EXPECT_EQ(TrackingAllocatorBase::getSize(), 0); + + const auto res = loadModel(); + ASSERT_EQ(res.ok(), true); + const auto& model = res.get(); + + std::size_t string_integer_map_size = 0; + { + typename StringIntegerMapTypeBuilder<>::WithAllocator< + TrackingAllocator>::Map map(model); + string_integer_map_size = TrackingAllocatorBase::getSize(); + } + + EXPECT_EQ(TrackingAllocatorBase::getSize(), 0); + + std::size_t unordered_map_size = 0; + { + std::unordered_map< + std::string, + std::uint64_t, + std::hash, + std::equal_to, + TrackingAllocator>> + strings_to_ints; + std::unordered_map< + std::uint64_t, + std::string, + std::hash, + std::equal_to, + TrackingAllocator>> + ints_to_strings; + for (const auto& [k, v] : model) { + strings_to_ints.emplace(k, v); + ints_to_strings.emplace(v, k); + } + + unordered_map_size = TrackingAllocatorBase::getSize(); + } + + EXPECT_LT(string_integer_map_size, unordered_map_size); + +#if 1 + std::cout << "string integer map size = " << string_integer_map_size + << std::endl; + std::cout << "unordered map size = " << unordered_map_size << std::endl; +#endif +} + +#endif + +template +struct FixedHash { + std::size_t operator()(const std::string_view& str) const { + 
if (str.empty()) { + return hash_offset; + } + + return str.size() - 1 + hash_offset; + } + + std::size_t operator()(std::uint64_t value) const { + if (value == 0) { + return hash_offset; + } + + return static_cast(std::log10(value)) + hash_offset; + } +}; + +template +class StringIntegerMapHashTest : public Test { + public: + using Container = typename StringIntegerMapTypeBuilder<>::WithIntegerHash< + THash>::template WithStringHash::Map; +}; + +using StringIntegerMapHashTestTypes = + ::testing::Types, FixedHash<1>, FixedHash<2>, FixedHash<3>>; +TYPED_TEST_SUITE(StringIntegerMapHashTest, StringIntegerMapHashTestTypes); + +TYPED_TEST(StringIntegerMapHashTest, HashCollisions) { + std::unordered_map source = { + {"a", 0}, + {"b", 1}, + {"c", 2}, + {"d", 3}, + }; + + typename TestFixture::Container map(source); + + // + // Check that the strings exist in the map. + // + + EXPECT_THAT(map.tryGetInteger("a"), Optional(0ull)); + EXPECT_THAT(map.tryGetInteger("b"), Optional(1ull)); + EXPECT_THAT(map.tryGetInteger("c"), Optional(2ull)); + EXPECT_THAT(map.tryGetInteger("d"), Optional(3ull)); + + EXPECT_FALSE(map.tryGetInteger("e")); + + // + // Check that the integers exist in the map. + // + + EXPECT_THAT(map.tryGetString(0), Optional(std::string_view("a"))); + EXPECT_THAT(map.tryGetString(1), Optional(std::string_view("b"))); + EXPECT_THAT(map.tryGetString(2), Optional(std::string_view("c"))); + EXPECT_THAT(map.tryGetString(3), Optional(std::string_view("d"))); + + EXPECT_FALSE(map.tryGetString(4)); + + // + // Test a lookup into the next bucket (which should be empty). + // + + EXPECT_FALSE(map.tryGetInteger("aa")); + EXPECT_FALSE(map.tryGetInteger("aaa")); + EXPECT_FALSE(map.tryGetInteger("aaaa")); + + EXPECT_FALSE(map.tryGetString(10)); + EXPECT_FALSE(map.tryGetString(100)); + EXPECT_FALSE(map.tryGetString(1000)); +} diff --git a/extension/llm/tokenizer/tiktoken.cpp b/extension/llm/tokenizer/tiktoken.cpp index f99ac2e955e..725a3fe453d 100644 --- a/extension/llm/tokenizer/tiktoken.cpp +++ b/extension/llm/tokenizer/tiktoken.cpp @@ -29,7 +29,9 @@ #include #include #include +#include #include +#include using ::executorch::runtime::Error; using ::executorch::runtime::Result; @@ -97,6 +99,7 @@ static Result _load_encoder(const std::string& path) { Encoder encoder; std::string line; + std::unordered_set ranks; while (std::getline(file, line)) { auto [token, rank] = ET_UNWRAP(_parse(line)); @@ -105,28 +108,20 @@ static Result _load_encoder(const std::string& path) { InvalidArgument, "duplicate item: %s", line.c_str()); - } - return encoder; -} - -static Result _build_decoder(const Encoder& encoder) { - Decoder decoder; - for (const auto& [k, v] : encoder) { - decoder.emplace(v, k); + ET_CHECK_OR_RETURN_ERROR( + ranks.insert(rank).second, + InvalidArgument, + "duplicate rank: %s", + line.c_str()); } - ET_CHECK_OR_RETURN_ERROR( - encoder.size() == decoder.size(), - InvalidArgument, - "duplicate items in encoder"); - - return decoder; + return encoder; } static std::vector _byte_pair_merge( const std::string& piece, - const std::unordered_map& ranks, + const StringIntegerMap<>& ranks, std::function func) { // This is a vector of (start, rank). // The rank is of the byte pair starting at position start. 
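The tiktoken.cpp hunk that follows replaces unordered_map lookups with StringIntegerMap's optional-returning lookup, so the iterator find/end dance collapses into returning the optional directly. The shape of that change in isolation; lookup_rank is a stand-in name for the lambda body shown below:

#include <cstdint>
#include <optional>
#include <string>

// Before: iterator-based lookup against std::unordered_map.
//   auto iter = ranks.find(key);
//   if (iter != ranks.end()) { return iter->second; }
//   return std::nullopt;
// After: tryGetInteger already yields std::optional, so forward it as-is.
std::optional<std::uint64_t> lookup_rank(
    const executorch::extension::llm::StringIntegerMap<>& ranks,
    const std::string& key) {
  return ranks.tryGetInteger(key);
}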
@@ -145,10 +140,7 @@ static std::vector _byte_pair_merge( auto s = parts[start_idx].first; auto e = parts[start_idx + skip + 2].first; auto key = piece.substr(s, e - s); - auto iter = ranks.find(key); - if (iter != ranks.end()) { - return iter->second; - } + return ranks.tryGetInteger(key); } return std::nullopt; }; @@ -230,11 +222,11 @@ static std::vector _byte_pair_merge( static std::vector _byte_pair_encode( const std::string& piece, - const Encoder& encoder) { + const StringIntegerMap<>& tokenizer) { if (piece.size() == 1) { - auto iter = encoder.find(piece); - if (iter != encoder.end()) { - return std::vector({iter->second}); + const auto result = tokenizer.tryGetInteger(piece); + if (result) { + return std::vector(*result); } else { // TODO: is it possible? return {}; @@ -242,11 +234,11 @@ static std::vector _byte_pair_encode( } return _byte_pair_merge( - piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { + piece, tokenizer, [&piece, &tokenizer](uint64_t start, uint64_t stop) { std::string key = piece.substr(start, stop - start); - auto iter = encoder.find(key); - if (iter != encoder.end()) { - return iter->second; + const auto result = tokenizer.tryGetInteger(key); + if (result) { + return *result; } else { // TODO: what if key does not exist? Should we return `unknown`? // assert(false); // ?? @@ -278,7 +270,7 @@ Tiktoken::_split_with_allowed_special_token( break; } - if (allowed_special.count(special) == 1) { + if (allowed_special.tryGetInteger(special).has_value()) { // Found an allowed special token, split the text with it. #if __cplusplus >= 202002L return std::make_pair( @@ -302,13 +294,13 @@ void Tiktoken::_encode( std::string piece; assert(_regex); while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { - auto iter = _encoder.find(piece); - if (iter != _encoder.end()) { + const auto result = _token_map->tryGetInteger(piece); + if (result) { last_piece_token_len = 1; - ret.push_back(iter->second); + ret.push_back(*result); continue; } - auto tokens = _byte_pair_encode(piece, _encoder); + auto tokens = _byte_pair_encode(piece, *_token_map); last_piece_token_len = tokens.size(); ret.insert(ret.end(), tokens.begin(), tokens.end()); } @@ -328,16 +320,14 @@ std::pair, uint64_t> Tiktoken::_encode_with_special_token( _encode(sub_input, tokens, last_piece_token_len); if (special) { - uint64_t token = 0; - try { - token = _special_token_encoder.at(*special); - } catch (const std::out_of_range&) { + const auto result = _special_token_map->tryGetInteger(*special); + if (!result) { // Should never go here, since special pattern includes all special // chars. 
ET_CHECK_MSG(false, "unknown special token: %s", special->c_str()); } - tokens.push_back(token); + tokens.push_back(*result); last_piece_token_len = 0; } else { break; @@ -380,11 +370,10 @@ Tiktoken::Tiktoken( } Error Tiktoken::load(const std::string& path) { - _encoder = ET_UNWRAP(_load_encoder(path)); - _special_token_encoder = _build_special_token_encoder(_encoder.size()); - - _decoder = ET_UNWRAP(_build_decoder(_encoder)); - _special_token_decoder = ET_UNWRAP(_build_decoder(_special_token_encoder)); + auto encoder = ET_UNWRAP(_load_encoder(path)); + _token_map.emplace(StringIntegerMap<>(encoder)); + auto special_token_encoder = _build_special_token_encoder(encoder.size()); + _special_token_map.emplace(StringIntegerMap<>(special_token_encoder)); _regex = _create_regex(_pattern); // Warmup re2 as it is slow on the first run, void the return value as it's @@ -392,14 +381,14 @@ Error Tiktoken::load(const std::string& path) { // https://github.com/google/re2/blob/6dcd83d60f7944926bfd308cc13979fc53dd69ca/re2/fuzzing/re2_fuzzer.cc#L136-L141 (void)_regex->ReverseProgramSize(); - _special_token_regex = _build_special_token_regex(_special_token_encoder); + _special_token_regex = _build_special_token_regex(special_token_encoder); // Same as above, warm up re2 (void)_special_token_regex->ReverseProgramSize(); // initialize vocab_size, bos_tok, eos_tok - vocab_size_ = _encoder.size() + _special_token_encoder.size(); - bos_tok_ = _special_token_encoder.at(_special_tokens->at(_bos_token_index)); - eos_tok_ = _special_token_encoder.at(_special_tokens->at(_eos_token_index)); + vocab_size_ = encoder.size() + special_token_encoder.size(); + bos_tok_ = special_token_encoder.at(_special_tokens->at(_bos_token_index)); + eos_tok_ = special_token_encoder.at(_special_tokens->at(_eos_token_index)); initialized_ = true; return Error::Ok; @@ -410,7 +399,7 @@ Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) const { if (!initialized_) { return Error::NotSupported; } - auto res = _encode_with_special_token(text, _special_token_encoder).first; + auto res = _encode_with_special_token(text, *_special_token_map).first; for (auto i = 0; i < bos; ++i) { res.insert(res.begin(), bos_tok_); } @@ -425,21 +414,19 @@ Result Tiktoken::decode(uint64_t prev, uint64_t cur) const { ET_CHECK_OK_OR_RETURN_ERROR(Tokenizer::decode_verify(cur)); std::string ret; - std::string token_bytes; - auto iter = _decoder.find(cur); - if (iter != _decoder.end()) { - token_bytes = iter->second; - } else { - iter = _special_token_decoder.find(cur); - if (iter != _special_token_decoder.end()) { - token_bytes = iter->second; - } else { + std::string_view token_bytes; + auto result = _token_map->tryGetString(cur); + if (!result) { + result = _special_token_map->tryGetString(cur); + if (!result) { ET_CHECK_MSG(false, "unknown token: %" PRIu64, cur); + } else { + token_bytes = *result; } + } else { + token_bytes = *result; } - ret += token_bytes; - - return ret; + return std::string(token_bytes); } // -------------------------public method end------------------------------- diff --git a/extension/llm/tokenizer/tiktoken.h b/extension/llm/tokenizer/tiktoken.h index 5201c07a184..d7d93b27597 100644 --- a/extension/llm/tokenizer/tiktoken.h +++ b/extension/llm/tokenizer/tiktoken.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -68,10 +69,8 @@ class ET_EXPERIMENTAL Tiktoken : public Tokenizer { // Removed negative lookahead \s+(?!\S) since it's not supported by RE2. 
const std::string _pattern = R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; - Encoder _encoder; - Encoder _special_token_encoder; - Decoder _decoder; - Decoder _special_token_decoder; + std::optional> _token_map; + std::optional> _special_token_map; Re2UPtr _regex; Re2UPtr _special_token_regex; diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers new file mode 160000 index 00000000000..d70f5a76055 --- /dev/null +++ b/extension/llm/tokenizers @@ -0,0 +1 @@ +Subproject commit d70f5a760552d8d3bb288cdd93eebde477bb6eb0 diff --git a/extension/memory_allocator/test/CMakeLists.txt b/extension/memory_allocator/test/CMakeLists.txt index 4153d76ffa5..f6fc2062d45 100644 --- a/extension/memory_allocator/test/CMakeLists.txt +++ b/extension/memory_allocator/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs malloc_memory_allocator_test.cpp) diff --git a/extension/module/CMakeLists.txt b/extension/module/CMakeLists.txt index 70441265c61..d144ce95356 100644 --- a/extension/module/CMakeLists.txt +++ b/extension/module/CMakeLists.txt @@ -27,7 +27,7 @@ if(CMAKE_TOOLCHAIN_IOS else() add_library(extension_module SHARED ${_extension_module__srcs}) endif() -target_link_libraries(extension_module PRIVATE executorch extension_data_loader) +target_link_libraries(extension_module PRIVATE executorch extension_data_loader extension_flat_tensor) target_include_directories(extension_module PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options( extension_module PUBLIC -Wno-deprecated-declarations -fPIC @@ -37,7 +37,7 @@ target_compile_options( # after cleaning up CMake targets. add_library(extension_module_static STATIC ${_extension_module__srcs}) target_link_libraries( - extension_module_static PRIVATE executorch extension_data_loader + extension_module_static PRIVATE executorch extension_data_loader extension_flat_tensor ) target_include_directories(extension_module_static PUBLIC ${EXECUTORCH_ROOT}/..) 
target_compile_options( diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 99cc7e38bd6..400a2c45049 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -36,6 +37,32 @@ namespace executorch { namespace extension { +namespace { +runtime::Result> load_file( + const std::string& file_path, + Module::LoadMode mode) { + std::unique_ptr res = nullptr; + switch (mode) { + case Module::LoadMode::File: + res = ET_UNWRAP_UNIQUE(FileDataLoader::from(file_path.c_str())); + break; + case Module::LoadMode::Mmap: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from( + file_path.c_str(), MmapDataLoader::MlockConfig::NoMlock)); + break; + case Module::LoadMode::MmapUseMlock: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from(file_path.c_str())); + break; + case Module::LoadMode::MmapUseMlockIgnoreErrors: + res = ET_UNWRAP_UNIQUE(MmapDataLoader::from( + file_path.c_str(), + MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); + break; + } + return res; +} +} // namespace + Module::Module( const std::string& file_path, const LoadMode load_mode, @@ -44,7 +71,25 @@ Module::Module( load_mode_(load_mode), memory_allocator_(std::make_unique()), temp_allocator_(std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(nullptr), + data_map_(nullptr) { + runtime::runtime_init(); +} + +Module::Module( + const std::string& file_path, + const std::string& data_map_path, + const LoadMode load_mode, + std::unique_ptr event_tracer) + : file_path_(file_path), + data_map_path_(data_map_path), + load_mode_(load_mode), + memory_allocator_(std::make_unique()), + temp_allocator_(std::make_unique()), + event_tracer_(std::move(event_tracer)), + data_map_loader_(nullptr), + data_map_(nullptr) { runtime::runtime_init(); } @@ -52,7 +97,8 @@ Module::Module( std::unique_ptr data_loader, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr event_tracer, + std::unique_ptr data_map_loader) : data_loader_(std::move(data_loader)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -60,7 +106,9 @@ Module::Module( temp_allocator_( temp_allocator ? std::move(temp_allocator) : std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(std::move(data_map_loader)), + data_map_(nullptr) { runtime::runtime_init(); } @@ -68,7 +116,8 @@ Module::Module( std::shared_ptr program, std::unique_ptr memory_allocator, std::unique_ptr temp_allocator, - std::unique_ptr event_tracer) + std::unique_ptr event_tracer, + std::unique_ptr data_map_loader) : program_(std::move(program)), memory_allocator_( memory_allocator ? std::move(memory_allocator) @@ -76,33 +125,37 @@ Module::Module( temp_allocator_( temp_allocator ? 
std::move(temp_allocator) : std::make_unique()), - event_tracer_(std::move(event_tracer)) { + event_tracer_(std::move(event_tracer)), + data_map_loader_(std::move(data_map_loader)), + data_map_(nullptr) { runtime::runtime_init(); } runtime::Error Module::load(const runtime::Program::Verification verification) { if (!is_loaded()) { + // Load the program if (!data_loader_) { - switch (load_mode_) { - case LoadMode::File: - data_loader_ = - ET_UNWRAP_UNIQUE(FileDataLoader::from(file_path_.c_str())); - break; - case LoadMode::Mmap: - data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( - file_path_.c_str(), MmapDataLoader::MlockConfig::NoMlock)); - break; - case LoadMode::MmapUseMlock: - data_loader_ = - ET_UNWRAP_UNIQUE(MmapDataLoader::from(file_path_.c_str())); - break; - case LoadMode::MmapUseMlockIgnoreErrors: - data_loader_ = ET_UNWRAP_UNIQUE(MmapDataLoader::from( - file_path_.c_str(), - MmapDataLoader::MlockConfig::UseMlockIgnoreErrors)); - break; + auto res = load_file(file_path_, load_mode_); + if (!res.ok()) { + return res.error(); } - }; + data_loader_ = std::move(res.get()); + } + // If a .ptd path was given, load it. + if (data_map_path_ != "") { + auto res = load_file(data_map_path_, load_mode_); + if (!res.ok()) { + return res.error(); + } + data_map_loader_ = std::move(res.get()); + } + // If we have a .ptd loader, then load the map. + if (data_map_loader_) { + data_map_ = + ET_UNWRAP_UNIQUE(FlatTensorDataMap::load(data_map_loader_.get())); + } + // else: either the map itself was provided or we have no data map, either + // way, no work to do. auto program = ET_UNWRAP_UNIQUE( runtime::Program::load(data_loader_.get(), verification)); program_ = std::shared_ptr( @@ -111,6 +164,11 @@ runtime::Error Module::load(const runtime::Program::Verification verification) { return runtime::Error::Ok; } +runtime::Result Module::num_methods() { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + return program_->num_methods(); +} + runtime::Result> Module::method_names() { ET_CHECK_OK_OR_RETURN_ERROR(load()); const auto method_count = program_->num_methods(); @@ -125,37 +183,41 @@ runtime::Result> Module::method_names() { runtime::Error Module::load_method( const std::string& method_name, + runtime::HierarchicalAllocator* planned_memory, torch::executor::EventTracer* event_tracer) { if (!is_method_loaded(method_name)) { ET_CHECK_OK_OR_RETURN_ERROR(load()); MethodHolder method_holder; - const auto method_metadata = - ET_UNWRAP(program_->method_meta(method_name.c_str())); - const auto planned_buffersCount = - method_metadata.num_memory_planned_buffers(); - method_holder.planned_buffers.reserve(planned_buffersCount); - method_holder.planned_spans.reserve(planned_buffersCount); - - for (auto index = 0; index < planned_buffersCount; ++index) { - const auto buffer_size = - method_metadata.memory_planned_buffer_size(index).get(); - method_holder.planned_buffers.emplace_back(buffer_size); - method_holder.planned_spans.emplace_back( - method_holder.planned_buffers.back().data(), buffer_size); + + if (!planned_memory) { + const auto method_metadata = + ET_UNWRAP(program_->method_meta(method_name.c_str())); + const auto planned_buffers_count = + method_metadata.num_memory_planned_buffers(); + method_holder.planned_buffers.reserve(planned_buffers_count); + method_holder.planned_spans.reserve(planned_buffers_count); + + for (auto index = 0; index < planned_buffers_count; ++index) { + const auto buffer_size = + method_metadata.memory_planned_buffer_size(index).get(); +
method_holder.planned_buffers.emplace_back(buffer_size); + method_holder.planned_spans.emplace_back( + method_holder.planned_buffers.back().data(), buffer_size); + } + method_holder.planned_memory = + std::make_unique(runtime::Span( + method_holder.planned_spans.data(), + method_holder.planned_spans.size())); + planned_memory = method_holder.planned_memory.get(); } - method_holder.planned_memory = - std::make_unique(runtime::Span( - method_holder.planned_spans.data(), - method_holder.planned_spans.size())); method_holder.memory_manager = std::make_unique( - memory_allocator_.get(), - method_holder.planned_memory.get(), - temp_allocator_.get()); + memory_allocator_.get(), planned_memory, temp_allocator_.get()); method_holder.method = ET_UNWRAP_UNIQUE(program_->load_method( method_name.c_str(), method_holder.memory_manager.get(), - event_tracer ? event_tracer : this->event_tracer())); + event_tracer ? event_tracer : this->event_tracer(), + data_map_.get())); method_holder.inputs.resize(method_holder.method->inputs_size()); methods_.emplace(method_name, std::move(method_holder)); } diff --git a/extension/module/module.h b/extension/module/module.h index 45ed38a7ff2..45d2cc1d14b 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -51,6 +51,21 @@ class Module { const LoadMode load_mode = LoadMode::MmapUseMlock, std::unique_ptr event_tracer = nullptr); + /** + * Constructs an instance by loading a program from a file, and external + * weights from a .ptd file, with the specified memory locking behavior. + * + * @param[in] file_path The path to the ExecuTorch program file to load. + * @param[in] data_map_path The path to a .ptd file holding external weights. + * @param[in] load_mode The loading mode to use. + * @param[in] event_tracer An EventTracer used for tracking and logging events. + */ + explicit Module( + const std::string& file_path, + const std::string& data_map_path, + const LoadMode load_mode = LoadMode::MmapUseMlock, + std::unique_ptr event_tracer = nullptr); + /** * Constructs an instance with the provided data loader and memory allocator. * @@ -59,12 +74,14 @@ class Module { * @param[in] temp_allocator A MemoryAllocator to use when allocating * temporary data during kernel or delegate execution. * @param[in] event_tracer A EventTracer used for tracking and logging events. + * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( std::unique_ptr data_loader, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr event_tracer = nullptr, + std::unique_ptr data_map_loader = nullptr); /** * Constructs an instance using an existing shared program. * @@ -75,12 +92,14 @@ class Module { * @param[in] temp_allocator A MemoryAllocator to use when allocating * temporary data. * @param[in] event_tracer A EventTracer used for tracking and logging events. + * @param[in] data_map_loader A DataLoader used for loading external weights. */ explicit Module( std::shared_ptr program, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr); + std::unique_ptr event_tracer = nullptr, + std::unique_ptr data_map_loader = nullptr); Module(const Module&) = delete; Module& operator=(const Module&) = delete; @@ -119,6 +138,14 @@ class Module { return program_; } + /** + * Get the number of methods available in the loaded program. + * + * @returns A Result object containing either the number of methods available + * or an error to indicate failure.
+ */ + runtime::Result num_methods(); + /** * Get a list of method names available in the loaded program. * Loads the program and method if needed. @@ -133,6 +160,8 @@ class Module { * needed. The loaded method is cached to reuse the next time it's executed. * * @param[in] method_name The name of the method to load. + * @param[in] planned_memory The memory-planned buffers to use for mutable + * tensor data when executing a method. * @param[in] event_tracer Per-method event tracer to profile/trace methods * individually. When not given, the event tracer passed to the Module * constructor is used. Otherwise, this per-method event tracer takes @@ -143,20 +172,35 @@ class Module { ET_NODISCARD runtime::Error load_method( const std::string& method_name, + runtime::HierarchicalAllocator* planned_memory = nullptr, torch::executor::EventTracer* event_tracer = nullptr); + ET_DEPRECATED ET_NODISCARD runtime::Error inline load_method( + const std::string& method_name, + torch::executor::EventTracer* event_tracer) { + return load_method(method_name, nullptr, event_tracer); + } + /** * Load the 'forward' method from the program and set up memory management if * needed. The loaded method is cached to reuse the next time it's executed. * + * @param[in] planned_memory The memory-planned buffers to use for mutable + * tensor data when executing the 'forward' method. * @param[in] event_tracer An event tracer used for tracking and logging * events. * * @returns An Error to indicate success or failure. */ ET_NODISCARD inline runtime::Error load_forward( + runtime::HierarchicalAllocator* planned_memory = nullptr, torch::executor::EventTracer* event_tracer = nullptr) { - return load_method("forward", event_tracer); + return load_method("forward", planned_memory, event_tracer); + } + + ET_DEPRECATED ET_NODISCARD inline runtime::Error load_forward( + torch::executor::EventTracer* event_tracer) { + return load_forward(nullptr, event_tracer); } /** @@ -433,14 +477,16 @@ class Module { std::vector inputs; }; - private: std::string file_path_; + std::string data_map_path_; LoadMode load_mode_{LoadMode::MmapUseMlock}; std::shared_ptr program_; std::unique_ptr data_loader_; std::unique_ptr memory_allocator_; std::unique_ptr temp_allocator_; std::unique_ptr event_tracer_; + std::unique_ptr data_map_loader_; + std::unique_ptr data_map_; protected: std::unordered_map methods_; diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index 61251047dc8..09a610a1fca 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( @@ -25,6 +25,7 @@ def define_common_targets(): "//executorch/extension/memory_allocator:malloc_memory_allocator", "//executorch/extension/data_loader:file_data_loader", "//executorch/extension/data_loader:mmap_data_loader", + "//executorch/extension/flat_tensor:flat_tensor_data_map", ], exported_deps = [ "//executorch/runtime/executor:program" + aten_suffix, diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt index dbd0bccbda8..0192b63e632 100644 --- a/extension/module/test/CMakeLists.txt +++ b/extension/module/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs module_test.cpp) diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp index 2dbb0fea936..a82e257a703 100644 --- a/extension/module/test/module_test.cpp +++ b/extension/module/test/module_test.cpp @@ -22,14 +22,20 @@ using namespace ::executorch::runtime; class ModuleTest : public ::testing::Test { protected: static void SetUpTestSuite() { - model_path_ = std::getenv("RESOURCES_PATH") + std::string("/add.pte"); + std::string resources_path; + if (const char* env = std::getenv("RESOURCES_PATH")) { + resources_path = env; + } + model_path_ = resources_path + "/add.pte"; + linear_path_ = resources_path + "/linear.pte"; + linear_data_path_ = resources_path + "/linear.ptd"; } - static std::string model_path_; + static inline std::string model_path_; + static inline std::string linear_path_; + static inline std::string linear_data_path_; }; -std::string ModuleTest::model_path_; - TEST_F(ModuleTest, TestLoad) { Module module(model_path_); @@ -63,6 +69,14 @@ TEST_F(ModuleTest, TestMethodNames) { EXPECT_EQ(method_names.get(), std::unordered_set{"forward"}); } +TEST_F(ModuleTest, TestNumMethods) { + Module module(model_path_); + + const auto num_methods = module.num_methods(); + EXPECT_EQ(num_methods.error(), Error::Ok); + EXPECT_EQ(num_methods.get(), 1); +} + TEST_F(ModuleTest, TestNonExistentMethodNames) { Module module("/path/to/nonexistent/file.pte"); @@ -435,3 +449,14 @@ TEST_F(ModuleTest, TestSetOutputInvalidType) { EXPECT_NE(module.set_output(EValue()), Error::Ok); } + +TEST_F(ModuleTest, TestPTD) { + Module module(linear_path_, linear_data_path_); + + ASSERT_EQ(module.load_method("forward"), Error::Ok); + + auto tensor1 = + make_tensor_ptr({3, 3}, {2.f, 3.f, 4.f, 2.f, 3.f, 4.f, 2.f, 3.f, 4.f}); + + ASSERT_EQ(module.forward(tensor1).error(), Error::Ok); +} diff --git a/extension/module/test/resources/README.md b/extension/module/test/resources/README.md index e2b54633fae..ecbdd41c107 100644 --- a/extension/module/test/resources/README.md +++ b/extension/module/test/resources/README.md @@ -1,11 +1,23 @@ ## Resources -### model.pte +### add.pte, linear.pte, linear.ptd - Internally generated after D62209852, 2024-09-06 with: ``` buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="add" ``` + + and + + ``` + buck2 run fbcode//executorch/examples/portable/scripts:export -- --model_name="linear" -examples + ``` - In OSS, the same file can be generated after [#5145](https://github.com/pytorch/executorch/pull/5145), 2024-09-06 with: ``` python -m examples.portable.scripts.export --model_name="add" ``` + + and 
+ + ``` + python -m examples.portable.scripts.export --model_name="linear" -e + ``` diff --git a/extension/module/test/resources/linear.ptd b/extension/module/test/resources/linear.ptd new file mode 100644 index 00000000000..edab857bb3f Binary files /dev/null and b/extension/module/test/resources/linear.ptd differ diff --git a/extension/module/test/resources/linear.pte b/extension/module/test/resources/linear.pte new file mode 100644 index 00000000000..707815ad881 Binary files /dev/null and b/extension/module/test/resources/linear.pte differ diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl index bc4ce2c6af7..19ba09cf4e6 100644 --- a/extension/module/test/targets.bzl +++ b/extension/module/test/targets.bzl @@ -3,7 +3,7 @@ load( "ANDROID", "CXX", ) -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -12,7 +12,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_test( diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index b1da51b6171..dbfb3ff160c 100644 --- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,24 +7,16 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): - aten_suffix = ("_aten" if aten_mode else "") - - runtime.cxx_library( - name = "thread_parallel" + aten_suffix, - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) + runtime.cxx_library( + name = "thread_parallel", + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], + ) diff --git a/extension/parallel/test/CMakeLists.txt b/extension/parallel/test/CMakeLists.txt deleted file mode 100644 index ab37f66c17d..00000000000 --- a/extension/parallel/test/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# - -cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) - -set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
- -include(${EXECUTORCH_ROOT}/build/Test.cmake) - -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) - -et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo - extension_threadpool -) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,7 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. - * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. 
+#include diff --git a/extension/pybindings/TARGETS b/extension/pybindings/TARGETS index 17ccbb2477c..2e77127bf56 100644 --- a/extension/pybindings/TARGETS +++ b/extension/pybindings/TARGETS @@ -70,5 +70,8 @@ runtime.python_library( "//executorch/runtime/...", "@EXECUTORCH_CLIENTS", ], - deps = [":_portable_lib"], + deps = [ + ":_portable_lib", + "//executorch/exir:_warnings", + ], ) diff --git a/extension/pybindings/portable_lib.py b/extension/pybindings/portable_lib.py index 25624ad60c0..758e41545d1 100644 --- a/extension/pybindings/portable_lib.py +++ b/extension/pybindings/portable_lib.py @@ -38,11 +38,14 @@ _create_profile_block, # noqa: F401 _dump_profile_results, # noqa: F401 _get_operator_names, # noqa: F401 + _get_registered_backend_names, # noqa: F401 + _is_available, # noqa: F401 _load_bundled_program_from_buffer, # noqa: F401 _load_for_executorch, # noqa: F401 _load_for_executorch_from_buffer, # noqa: F401 _load_for_executorch_from_bundled_program, # noqa: F401 _reset_profile_results, # noqa: F401 + _unsafe_reset_threadpool, # noqa: F401 BundledModule, # noqa: F401 ExecuTorchModule, # noqa: F401 MethodMeta, # noqa: F401 diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp index 97bff671149..0f2689b7068 100644 --- a/extension/pybindings/pybindings.cpp +++ b/extension/pybindings/pybindings.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include #include @@ -87,10 +89,14 @@ using ::executorch::extension::BufferDataLoader; using ::executorch::extension::MallocMemoryAllocator; using ::executorch::extension::MmapDataLoader; using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::BackendInterface; using ::executorch::runtime::DataLoader; using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::EventTracerDebugLogLevel; +using ::executorch::runtime::get_backend_class; +using ::executorch::runtime::get_backend_name; +using ::executorch::runtime::get_num_registered_backends; using ::executorch::runtime::get_registered_kernels; using ::executorch::runtime::HierarchicalAllocator; using ::executorch::runtime::Kernel; @@ -975,6 +981,26 @@ py::list get_operator_names() { return res; } +py::list get_registered_backend_names() { + size_t n_of_registered_backends = get_num_registered_backends(); + py::list res; + for (size_t i = 0; i < n_of_registered_backends; i++) { + auto backend_name_res = get_backend_name(i); + THROW_IF_ERROR(backend_name_res.error(), "Failed to get backend name"); + auto backend_name = backend_name_res.get(); + res.append(backend_name); + } + return res; +} + +py::bool_ is_available(const std::string& backend_name) { + BackendInterface* backend = get_backend_class(backend_name.c_str()); + if (backend == nullptr) { + return false; + } + return backend->is_available(); +} + } // namespace PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { @@ -1028,12 +1054,25 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) { prof_result.num_bytes); }, call_guard); + m.def( + "_get_registered_backend_names", + &get_registered_backend_names, + call_guard); m.def("_get_operator_names", &get_operator_names); + m.def("_is_available", &is_available, py::arg("backend_name"), call_guard); m.def("_create_profile_block", &create_profile_block, call_guard); m.def( "_reset_profile_results", []() { EXECUTORCH_RESET_PROFILE_RESULTS(); }, call_guard); + m.def( + "_unsafe_reset_threadpool", + [](int num_threads) { + executorch::extension::threadpool::get_threadpool() + 
->_unsafe_reset_threadpool(num_threads); + }, + py::arg("num_threads"), + call_guard); py::class_(m, "ExecuTorchModule") .def("load_bundled_input", &PyModule::load_bundled_input, call_guard) diff --git a/extension/pybindings/pybindings.pyi b/extension/pybindings/pybindings.pyi index fc44ce388a4..64ea14f08ff 100644 --- a/extension/pybindings/pybindings.pyi +++ b/extension/pybindings/pybindings.pyi @@ -211,6 +211,15 @@ def _load_bundled_program_from_buffer( """ ... +@experimental("This API is experimental and subject to change without notice.") +def _is_available(backend_name: str) -> bool: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + @experimental("This API is experimental and subject to change without notice.") def _get_operator_names() -> List[str]: """ @@ -220,6 +229,15 @@ def _get_operator_names() -> List[str]: """ ... +@experimental("This API is experimental and subject to change without notice.") +def _get_registered_backend_names() -> List[str]: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... + @experimental("This API is experimental and subject to change without notice.") def _create_profile_block(name: str) -> None: """ @@ -246,3 +264,12 @@ def _reset_profile_results() -> None: This API is experimental and subject to change without notice. """ ... + +@experimental("This API is experimental and subject to change without notice.") +def _unsafe_reset_threadpool(num_threads: int) -> None: + """ + .. warning:: + + This API is experimental and subject to change without notice. + """ + ... diff --git a/extension/pybindings/test/TARGETS b/extension/pybindings/test/TARGETS index 73063deb651..4770bebbcc4 100644 --- a/extension/pybindings/test/TARGETS +++ b/extension/pybindings/test/TARGETS @@ -47,3 +47,11 @@ runtime.python_test( "//executorch/kernels/quantized:aot_lib", ], ) + +runtime.python_test( + name = "test_backend_pybinding", + srcs = ["test_backend_pybinding.py"], + deps = [ + "//executorch/runtime:runtime", + ], +) diff --git a/extension/pybindings/test/test_backend_pybinding.py b/extension/pybindings/test/test_backend_pybinding.py new file mode 100644 index 00000000000..4dafc2fae15 --- /dev/null +++ b/extension/pybindings/test/test_backend_pybinding.py @@ -0,0 +1,27 @@ +import unittest + +from executorch.runtime import Runtime + + +class TestBackendsPybinding(unittest.TestCase): + def test_backend_name_list( + self, + ) -> None: + + runtime = Runtime.get() + registered_backend_names = runtime.backend_registry.registered_backend_names + self.assertGreaterEqual(len(registered_backend_names), 1) + self.assertIn("XnnpackBackend", registered_backend_names) + + def test_backend_is_available( + self, + ) -> None: + # XnnpackBackend is available + runtime = Runtime.get() + self.assertTrue( + runtime.backend_registry.is_available(backend_name="XnnpackBackend") + ) + # NonExistBackend doesn't exist and is not available + self.assertFalse( + runtime.backend_registry.is_available(backend_name="NonExistBackend") + ) diff --git a/extension/pytree/TARGETS b/extension/pytree/TARGETS index 400a5b9504c..005c5c9c2d7 100644 --- a/extension/pytree/TARGETS +++ b/extension/pytree/TARGETS @@ -16,11 +16,9 @@ cpp_python_extension( ], base_module = "executorch.extension.pytree", deps = [ + "fbsource//third-party/pybind11:pybind11", ":pytree", ], - external_deps = [ - "pybind11", - ], ) cpp_python_extension( @@ -30,11 +28,9 @@ cpp_python_extension( ], base_module = "executorch.extension.pytree", deps = [
+ "fbsource//third-party/pybind11:pybind11", ":pytree", ], - external_deps = [ - "pybind11", - ], ) python_library( diff --git a/extension/pytree/test/CMakeLists.txt b/extension/pytree/test/CMakeLists.txt index a1514bc728f..5d99bad1339 100644 --- a/extension/pytree/test/CMakeLists.txt +++ b/extension/pytree/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs function_ref_test.cpp test_pytree.cpp) diff --git a/extension/runner_util/targets.bzl b/extension/runner_util/targets.bzl index bc0fee197d6..3ab0c26cc72 100644 --- a/extension/runner_util/targets.bzl +++ b/extension/runner_util/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 82e9fe22ecc..99136f1f3e0 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte" diff --git a/extension/runner_util/test/targets.bzl b/extension/runner_util/test/targets.bzl index f55a1ea995f..95d5804ecdf 100644 --- a/extension/runner_util/test/targets.bzl +++ b/extension/runner_util/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(is_fbcode = False): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(is_fbcode = False): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") # TODO(dbort): Find a way to make these run for ANDROID/APPLE in xplat. The diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index 97654094af6..bf1485aaba5 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h index e3351fe37ec..eb3745d34e2 100644 --- a/extension/tensor/tensor_ptr_maker.h +++ b/extension/tensor/tensor_ptr_maker.h @@ -555,8 +555,7 @@ inline TensorPtr rand( } /** - * Creates a TensorPtr filled with random values between 0 and 1, with specified - * strides. + * Creates a TensorPtr filled with random values from a normal distribution. * * @param sizes A vector specifying the size of each dimension. * @param strides A vector specifying the stride for each dimension. @@ -597,8 +596,7 @@ inline TensorPtr randn_like( } /** - * Creates a TensorPtr filled with random values sampled from a normal - * distribution. + * Creates a TensorPtr filled with random values from a normal distribution. * * @param sizes A vector specifying the size of each dimension. * @param type The scalar type of the tensor elements. diff --git a/extension/tensor/test/CMakeLists.txt b/extension/tensor/test/CMakeLists.txt index c6c3009be69..0e5fd1d97ef 100644 --- a/extension/tensor/test/CMakeLists.txt +++ b/extension/tensor/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs tensor_ptr_maker_test.cpp tensor_ptr_test.cpp) diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 29c8bff84bc..5bf8c7019b8 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_test( diff --git a/extension/testing_util/targets.bzl b/extension/testing_util/targets.bzl index a04ffb90c9f..05b825645e8 100644 --- a/extension/testing_util/targets.bzl +++ b/extension/testing_util/targets.bzl @@ -12,9 +12,12 @@ def define_common_targets(): srcs = [], exported_headers = ["temp_file.h"], visibility = [ + "//executorch/devtools/etdump/tests/...", "//executorch/extension/data_loader/test/...", "//executorch/extension/testing_util/test/...", "//executorch/extension/fb/ptez/decompression_methods/test/...", "//executorch/extension/fb/ptez/test/...", + "//executorch/runtime/executor/test/...", + "//executorch/backends/xnnpack/test/...", ], ) diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 90288656674..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -32,6 +33,7 @@ target_include_directories( PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries @@ -41,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/cpuinfo_utils.cpp b/extension/threadpool/cpuinfo_utils.cpp index 5dc3fa7fae5..21862fbd4aa 100644 --- a/extension/threadpool/cpuinfo_utils.cpp +++ b/extension/threadpool/cpuinfo_utils.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -84,7 +85,7 @@ bool populate_available_cpu_mids() { cpu_midrs->resize(num_possible_cores); const std::string kMidrFilePathPrefix = "/sys/devices/system/cpu/cpu"; const std::string kMidrFilePathSuffix = "/regs/identification/midr_el1"; - for (int32_t i = 0; i < num_possible_cores; ++i) { + for (const auto i : c10::irange(num_possible_cores)) { std::string midr_file_path = kMidrFilePathPrefix + std::to_string(i) + kMidrFilePathSuffix; ET_LOG(Info, "Reading file %s", midr_file_path.c_str()); @@ -115,7 +116,7 @@ uint32_t _get_num_performant_cores() { ET_LOG(Info, "CPU info and manual query on # of cpus dont match."); return 0; } - for (int32_t i = 0; i < cpu_midrs->size(); ++i) { + for (const auto i : c10::irange(cpu_midrs->size())) { uint32_t masked_midr = (*cpu_midrs)[i] & RIVISION_MASK; switch (masked_midr) { case CPUINFO_ARM_MIDR_CORTEX_A520: @@ -148,7 +149,7 @@ uint32_t get_num_performant_cores() { uint32_t num_possible_cores = cpuinfo_get_processors_count(); uint32_t num_non_performant_core = 0; if (uarch_count > 1) { - for (int32_t i = 0; i < uarch_count; ++i) { + for (const auto i : c10::irange(uarch_count)) { const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i); if (is_non_performant_core(uarch_info)) { num_non_performant_core += uarch_info->processor_count; diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 4a7185ce972..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -23,11 +24,14 @@ def define_common_targets(): srcs = _THREADPOOL_SRCS, deps = [ "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_headers = _THREADPOOL_HEADERS, exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/threadpool/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt new file mode 100644 index 00000000000..e9b074ca47c --- /dev/null +++ b/extension/threadpool/test/CMakeLists.txt @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# @generated by test/utils/generate_gtest_cmakelists.py +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +cmake_minimum_required(VERSION 3.19) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
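The `thread_parallel` tests consolidated below run each case both through the threadpool and through a serial fallback, and the relocated `parallel_for` implementation now rejects bad ranges with `ET_CHECK_OR_RETURN_FALSE`. Here is a sequential sketch of the contract those tests rely on, assuming work is handed out in contiguous chunks of at most `grain_size` items; the real implementation computes `num_tasks`/`chunk_size` via `calc_num_tasks_and_chunk_size` and dispatches chunks to worker threads:

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Sequential model of the parallel_for contract: validate the range,
// then invoke the callback on contiguous [chunk_begin, chunk_end)
// slices of at most grain_size items. A threadpool backend runs the
// chunks on workers; the callback pattern seen by callers is the same.
bool parallel_for_sketch(
    std::int64_t begin,
    std::int64_t end,
    std::int64_t grain_size,
    const std::function<void(std::int64_t, std::int64_t)>& f) {
  if (begin < 0 || end < begin || grain_size <= 0) {
    return false;  // mirrors the ET_CHECK_OR_RETURN_FALSE guards in the diff
  }
  for (std::int64_t chunk_begin = begin; chunk_begin < end;
       chunk_begin += grain_size) {
    f(chunk_begin, std::min(chunk_begin + grain_size, end));
  }
  return true;
}

int main() {
  std::vector<int> data(10, 0);
  // Same shape as TestChunkSize3: range [0, 10) in chunks of 3.
  const bool ok =
      parallel_for_sketch(0, 10, 3, [&](std::int64_t b, std::int64_t e) {
        for (std::int64_t i = b; i < e; ++i) {
          data[i] = static_cast<int>(i);
        }
      });
  std::cout << "ok=" << ok << " data[9]=" << data[9] << "\n";  // ok=1 data[9]=9
  // Inverted ranges are rejected, as in TestInvalidRange.
  std::cout << parallel_for_sketch(10, 0, 1, [](std::int64_t, std::int64_t) {})
            << "\n";  // prints 0
}
```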
+ +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) + +et_cxx_test( + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS + extension_threadpool +) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..fd72211a789 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,13 @@ #include #include -#include +#include #include using namespace ::testing; using ::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +42,25 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +70,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +83,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +95,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +107,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +120,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, 
TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +133,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +149,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +159,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +175,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +185,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +195,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +204,8 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 82% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index dfbb911d3a9..b8705ef8ecd 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -6,11 +6,13 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include #include -#include #include -#include +#include +#include #include namespace executorch { @@ -53,14 +55,17 @@ bool parallel_for( const int64_t end, const int64_t grain_size, const std::function& f) { - ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); - ET_LOG_AND_RETURN_IF_FALSE(end >= begin); - ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); - auto task = [f, begin, end, chunk_size](size_t task_id) { + auto task = [&f, begin, end, chunk_size](size_t task_id) { set_thread_num(task_id); int64_t local_start = begin + static_cast(task_id) * chunk_size; if (local_start < end) { diff --git a/extension/training/CMakeLists.txt b/extension/training/CMakeLists.txt index e50bb3c71eb..97e75955837 100644 --- a/extension/training/CMakeLists.txt +++ b/extension/training/CMakeLists.txt @@ -26,7 +26,7 @@ target_include_directories( target_include_directories(extension_training PUBLIC ${EXECUTORCH_ROOT}/..) target_compile_options(extension_training PUBLIC ${_common_compile_options}) target_link_libraries(extension_training executorch_core - extension_data_loader extension_module extension_tensor) + extension_data_loader extension_module extension_tensor extension_flat_tensor) list(TRANSFORM _train_xor__srcs PREPEND "${EXECUTORCH_ROOT}/") diff --git a/extension/training/examples/XOR/export_model.py b/extension/training/examples/XOR/export_model.py index bfbe0ce2138..98e04f09a2f 100644 --- a/extension/training/examples/XOR/export_model.py +++ b/extension/training/examples/XOR/export_model.py @@ -11,14 +11,14 @@ import os import torch -from executorch.exir import to_edge +from executorch.exir import ExecutorchBackendConfig, to_edge from executorch.extension.training.examples.XOR.model import Net, TrainingNet from torch.export import export from torch.export.experimental import _export_forward_backward -def _export_model(): +def _export_model(external_mutable_weights: bool = False): net = TrainingNet(Net()) x = torch.randn(1, 2) @@ -30,7 +30,11 @@ def _export_model(): # Lower the graph to edge dialect. ep = to_edge(ep) # Lower the graph to executorch. - ep = ep.to_executorch() + ep = ep.to_executorch( + config=ExecutorchBackendConfig( + external_mutable_weights=external_mutable_weights + ) + ) return ep @@ -44,19 +48,27 @@ def main() -> None: "--outdir", type=str, required=True, - help="Path to the directory to write xor.pte files to", + help="Path to the directory to write xor.pte and xor.ptd files to", + ) + parser.add_argument( + "--external", + action="store_true", + help="Export the model with external weights", ) args = parser.parse_args() - ep = _export_model() + ep = _export_model(args.external) # Write out the .pte file. os.makedirs(args.outdir, exist_ok=True) outfile = os.path.join(args.outdir, "xor.pte") with open(outfile, "wb") as fp: - fp.write( - ep.buffer, - ) + ep.write_to_file(fp) + + if args.external: + # current infra doesn't easily allow renaming this file, so just hackily do it here.
+ ep._tensor_data["xor"] = ep._tensor_data.pop("_default_external_constant") + ep.write_tensor_data_to_file(args.outdir) if __name__ == "__main__": diff --git a/extension/training/examples/XOR/train.cpp b/extension/training/examples/XOR/train.cpp index 746daebbf1b..af1c37a6a50 100644 --- a/extension/training/examples/XOR/train.cpp +++ b/extension/training/examples/XOR/train.cpp @@ -23,12 +23,18 @@ using executorch::extension::training::optimizer::SGDOptions; using executorch::runtime::Error; using executorch::runtime::Result; DEFINE_string(model_path, "xor.pte", "Model serialized in flatbuffer format."); +DEFINE_string(ptd_path, "", "Model weights serialized in flatbuffer format."); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - if (argc != 1) { + if (argc == 0) { + ET_LOG(Error, "Please provide a model path."); + return 1; + } else if (argc > 2) { std::string msg = "Extra commandline args: "; - for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) { + for (int i = 2 /* skip argv[0] (pte path) and argv[1] (ptd path) */; + i < argc; + i++) { msg += argv[i]; } ET_LOG(Error, "%s", msg.c_str()); @@ -46,7 +52,21 @@ int main(int argc, char** argv) { auto loader = std::make_unique( std::move(loader_res.get())); - auto mod = executorch::extension::training::TrainingModule(std::move(loader)); + std::unique_ptr ptd_loader = nullptr; + if (!FLAGS_ptd_path.empty()) { + executorch::runtime::Result + ptd_loader_res = + executorch::extension::FileDataLoader::from(FLAGS_ptd_path.c_str()); + if (ptd_loader_res.error() != Error::Ok) { + ET_LOG(Error, "Failed to open ptd file: %s", FLAGS_ptd_path.c_str()); + return 1; + } + ptd_loader = std::make_unique( + std::move(ptd_loader_res.get())); + } + + auto mod = executorch::extension::training::TrainingModule( + std::move(loader), nullptr, nullptr, nullptr, std::move(ptd_loader)); // Create full data set of input and labels. std::vector(param_res.error())); return 1; } @@ -112,5 +135,6 @@ int main(int argc, char** argv) { std::string(param.first.data()), param.second}); } - executorch::extension::flat_tensor::save_ptd("xor.ptd", param_map, 16); + executorch::extension::flat_tensor::save_ptd( + "trained_xor.ptd", param_map, 16); } diff --git a/extension/training/module/state_dict_util.cpp b/extension/training/module/state_dict_util.cpp new file mode 100644 index 00000000000..7c742d11c08 --- /dev/null +++ b/extension/training/module/state_dict_util.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace extension { +namespace training { + +runtime::Result> +load_state_dict(const runtime::NamedDataMap& data_map) { + std::map state_dict; + auto num_key_res = data_map.get_num_keys(); + if (!num_key_res.ok()) { + return num_key_res.error(); + } + for (size_t i = 0; i < num_key_res.get(); i++) { + // get the key + auto key_res = data_map.get_key(i); + if (!key_res.ok()) { + return key_res.error(); + } + + // get the metadata + auto metadata_res = data_map.get_metadata(key_res.get()); + if (!metadata_res.ok()) { + return metadata_res.error(); + } + + // get data blob + void* data = nullptr; + static constexpr size_t kMallocAlignment = alignof(std::max_align_t); + if constexpr (kMallocAlignment < 8) { + // Skip manually aligning the memory since PyTorch doesn't have dtypes > + // 8 bytes wide, and I don't expect to ever encounter a platform where + // malloc aligns to less than 8. + ET_LOG( + Error, + "kMallocAlignment is too small: %zu. Cannot safely create buffer to load tensor. Please open an issue on https://github.com/pytorch/executorch/issues", + kMallocAlignment); + return runtime::Error::NotSupported; + } + + data = malloc(metadata_res->nbytes()); + if (data == nullptr && metadata_res->nbytes() != 0) { + ET_LOG(Error, "Failed to allocate memory for tensor, malloc failed"); + return runtime::Error::MemoryAllocationFailed; + } + auto load_into_error = + data_map.load_data_into(key_res.get(), data, metadata_res->nbytes()); + if (load_into_error != runtime::Error::Ok) { + ET_LOG( + Error, + "Failed to load data into tensor, likely a malformed .ptd 0x%" PRIx32, + static_cast(load_into_error)); + return load_into_error; + } + + // Get metadata + std::vector sizes; + for (auto x : metadata_res->sizes()) { + sizes.push_back(x); + } + std::vector dim_order; + for (auto x : metadata_res->dim_order()) { + dim_order.push_back(x); + } + std::vector strides; + for (auto stride_index = 0; stride_index < metadata_res->sizes().size(); + stride_index++) { + if (stride_index == 0) { + strides.push_back(1); + } else { + strides.insert( + strides.begin(), + sizes.at(stride_index) * strides.at(stride_index - 1)); + } + } + + // create tensor + auto tensor = make_tensor_ptr( + sizes, + data, + dim_order, + strides, + metadata_res->scalar_type(), + exec_aten::TensorShapeDynamism::STATIC, + [](void* ptr) { + free(ptr); + ptr = nullptr; + }); + + // add to state dict + state_dict.insert({std::string(key_res.get()), std::move(tensor)}); + } + + return state_dict; +} + +} // namespace training +} // namespace extension +} // namespace executorch diff --git a/extension/training/module/state_dict_util.h b/extension/training/module/state_dict_util.h new file mode 100644 index 00000000000..f98dd77a5af --- /dev/null +++ b/extension/training/module/state_dict_util.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace training { + +/** + * Generate a map of string to tensor. + * + * @param data The NamedDataMap to load the tensors and names from. + * @return A result containing a map of tensor names to tensors if + * successful, an error otherwise. 
+ */ +ET_EXPERIMENTAL +runtime::Result> +load_state_dict(const runtime::NamedDataMap& data); + +} // namespace training +} // namespace extension +} // namespace executorch diff --git a/extension/training/module/targets.bzl b/extension/training/module/targets.bzl index 88da84ed131..0ae00aa447d 100644 --- a/extension/training/module/targets.bzl +++ b/extension/training/module/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,25 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + runtime.cxx_library( + name = "state_dict_util", + srcs = [ + "state_dict_util.cpp", + ], + exported_headers = [ + "state_dict_util.h", + ], + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:named_data_map", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + ], + ) + + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/extension/training/module/test/state_dict_util_test.cpp b/extension/training/module/test/state_dict_util_test.cpp new file mode 100644 index 00000000000..14e5b0d4f0d --- /dev/null +++ b/extension/training/module/test/state_dict_util_test.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +using namespace ::testing; +using executorch::extension::FlatTensorDataMap; +using executorch::extension::FlatTensorHeader; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; +using executorch::runtime::TensorLayout; +using torch::executor::util::FileDataLoader; + +class LoadStateDictTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + executorch::runtime::runtime_init(); + + // Load data map. 
+ // The eager linear model is defined at: + // //executorch/test/models/linear_model.py + const char* path = std::getenv("ET_MODULE_LINEAR_DATA_PATH"); + Result loader = FileDataLoader::from(path); + ASSERT_EQ(loader.error(), Error::Ok); + + Result header = loader->load( + /*offset=*/0, + FlatTensorHeader::kNumHeadBytes, + /*segment_info=*/ + DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External)); + + ASSERT_EQ(header.error(), Error::Ok); + + data_map_loader_ = + std::make_unique(std::move(loader.get())); + } + std::unique_ptr data_map_loader_; +}; + +TEST_F(LoadStateDictTest, LoadDataMap) { + Result data_map = + FlatTensorDataMap::load(data_map_loader_.get()); + EXPECT_EQ(data_map.error(), Error::Ok); + + auto state_dict = + executorch::extension::training::load_state_dict(data_map.get()); + ASSERT_TRUE(state_dict.ok()); + + EXPECT_EQ(state_dict->size(), 2); + EXPECT_EQ(state_dict->at("a")->sizes().size(), 2); + EXPECT_EQ(state_dict->at("a")->sizes()[0], 2); + EXPECT_EQ(state_dict->at("a")->sizes()[1], 2); + EXPECT_EQ( + state_dict->at("a")->scalar_type(), torch::executor::ScalarType::Float); + EXPECT_EQ(state_dict->at("a")->dim(), 2); + EXPECT_EQ(state_dict->at("a")->const_data_ptr()[0], 3.f); + EXPECT_EQ(state_dict->at("a")->const_data_ptr()[1], 3.f); + EXPECT_EQ(state_dict->at("a")->const_data_ptr()[2], 3.f); + EXPECT_EQ(state_dict->at("a")->const_data_ptr()[3], 3.f); + + EXPECT_EQ(state_dict->size(), 2); + EXPECT_EQ(state_dict->at("b")->sizes().size(), 2); + EXPECT_EQ(state_dict->at("b")->sizes()[0], 2); + EXPECT_EQ(state_dict->at("b")->sizes()[1], 2); + EXPECT_EQ( + state_dict->at("b")->scalar_type(), torch::executor::ScalarType::Float); + EXPECT_EQ(state_dict->at("b")->dim(), 2); + EXPECT_EQ(state_dict->at("b")->const_data_ptr()[0], 2.f); + EXPECT_EQ(state_dict->at("b")->const_data_ptr()[1], 2.f); + EXPECT_EQ(state_dict->at("b")->const_data_ptr()[2], 2.f); + EXPECT_EQ(state_dict->at("b")->const_data_ptr()[3], 2.f); +} diff --git a/extension/training/module/test/targets.bzl b/extension/training/module/test/targets.bzl index 8b260e2a7e8..17e8d1fe6ef 100644 --- a/extension/training/module/test/targets.bzl +++ b/extension/training/module/test/targets.bzl @@ -16,6 +16,10 @@ def define_common_targets(is_fbcode = False): # an fbcode target path because the authoring/export tools # intentionally don't work in xplat (since they're host-only tools). 
"ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", + "ET_MODULE_LINEAR_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.ptd])", + "ET_MODULE_LINEAR_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleLinear.pte])", + "ET_MODULE_TRAIN_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleSimpleTrain.ptd])", + "ET_MODULE_TRAIN_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleSimpleTrainProgram.pte])", "ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])", } @@ -27,8 +31,23 @@ def define_common_targets(is_fbcode = False): deps = [ "//executorch/extension/training/module:training_module", "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/flat_tensor:flat_tensor_data_map", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", "//executorch/kernels/portable:generated_lib", ], env = modules_env, ) + + runtime.cxx_test( + name = "state_dict_util_test", + srcs = [ + "state_dict_util_test.cpp", + ], + deps = [ + "//executorch/extension/data_loader:file_data_loader", + "//executorch/extension/flat_tensor:flat_tensor_data_map", + "//executorch/extension/training/module:state_dict_util", + "//executorch/runtime/core/exec_aten:lib", + ], + env = modules_env, + ) diff --git a/extension/training/module/test/training_module_test.cpp b/extension/training/module/test/training_module_test.cpp index ccd1c995554..3ba46c6f653 100644 --- a/extension/training/module/test/training_module_test.cpp +++ b/extension/training/module/test/training_module_test.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -18,9 +19,17 @@ using namespace ::testing; using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::extension::FlatTensorDataMap; +using executorch::extension::FlatTensorHeader; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; +using executorch::runtime::TensorLayout; using torch::executor::Error; using torch::executor::Span; using torch::executor::testing::TensorFactory; +using torch::executor::util::FileDataLoader; class TrainingModuleTest : public ::testing::Test { protected: @@ -105,3 +114,42 @@ TEST_F(TrainingModuleTest, NonTrainingModuleTest) { auto res = mod.execute_forward_backward("forward", inputs); ASSERT_EQ(res.error(), Error::InvalidArgument); } + +TEST_F(TrainingModuleTest, SeperateDataTest) { + // Load data map. 
+  // The eager linear model is defined at:
+  // //executorch/test/models/linear_model.py
+  const char* ptd_path = std::getenv("ET_MODULE_TRAIN_DATA_PATH");
+  Result<FileDataLoader> data_map_loader_res = FileDataLoader::from(ptd_path);
+  ASSERT_EQ(data_map_loader_res.error(), Error::Ok);
+
+  auto data_map_loader =
+      std::make_unique<FileDataLoader>(
+          std::move(data_map_loader_res.get()));
+
+  const char* pte_path = std::getenv("ET_MODULE_TRAIN_PROGRAM_PATH");
+  Result<FileDataLoader> pte_loader_res = FileDataLoader::from(pte_path);
+  ASSERT_EQ(pte_loader_res.error(), Error::Ok);
+
+  auto pte_loader = std::make_unique<FileDataLoader>(
+      std::move(pte_loader_res.get()));
+
+  auto mod = executorch::extension::training::TrainingModule(
+      std::move(pte_loader),
+      nullptr,
+      nullptr,
+      nullptr,
+      std::move(data_map_loader));
+
+  TensorFactory<ScalarType::Float> tf;
+  Tensor input = tf.make({3}, {1.0, 1.0, 1.0});
+  Tensor label = tf.make({3}, {1.0, 0.0, 0.0});
+
+  std::vector<executorch::runtime::EValue> inputs;
+  inputs.push_back(input);
+  inputs.push_back(label);
+
+  auto res = mod.execute_forward_backward("forward", inputs);
+  ASSERT_EQ(res.error(), Error::Ok);
+  ASSERT_EQ(res.get().size(), 1);
+}
diff --git a/extension/training/module/training_module.cpp b/extension/training/module/training_module.cpp
index 52d293c69ef..d119738715e 100644
--- a/extension/training/module/training_module.cpp
+++ b/extension/training/module/training_module.cpp
@@ -43,7 +43,6 @@ TrainingModule::execute_forward_backward(
   uint64_t param_start = param_res.get()[0].toInt();
 
   // Execute the forward and backward pass.
-
   auto outputs = torch::executor::Module::execute(method_name, input);
   if (!outputs.ok()) {
     return outputs.error();
@@ -56,19 +55,23 @@ TrainingModule::execute_forward_backward(
     user_outputs.push_back(outputs.get().at(i));
   }
 
-  // Extract and store the gradients.
+  // Extract and store the gradients and params if this is the first time seeing
+  // this method.
   if (method_named_gradients_.find(method_name) ==
       method_named_gradients_.end()) {
+    // Fully qualified names
+    std::vector<executorch::runtime::EValue> fqn_list;
     method_named_gradients_.insert({method_name, {}});
     auto& gradients_map = method_named_gradients_.at(method_name);
-    // Get names.
+
+    // Get names if we haven't seen this method before.
     const std::string fqn_method_name = fqn_method_prefix + method_name;
     auto fqn_res = executorch::extension::Module::execute(fqn_method_name);
     if (!fqn_res.ok()) {
       return fqn_res.error();
     }
-    const auto& fqn_list = fqn_res.get();
+    fqn_list = fqn_res.get();
 
     // Only have to initialize the dict once because the tensors in the dict and
     // the tensors in the method alias the same TensorImpl, so updating one will
@@ -87,43 +90,49 @@ TrainingModule::execute_forward_backward(
 runtime::Result<
     const std::map<executorch::aten::string_view, executorch::aten::Tensor>>
 TrainingModule::named_parameters(const std::string& method_name) {
-  std::map<executorch::aten::string_view, executorch::aten::Tensor>
-      named_parameters;
-  const std::string fqn_method_name = fqn_method_prefix + method_name;
-  const std::string parameters_method_name =
-      parameters_method_prefix + method_name;
+  // If we haven't seen this method before, populate the dict.
+  if (method_named_parameters_.find(method_name) ==
+      method_named_parameters_.end()) {
+    const std::string fqn_method_name = fqn_method_prefix + method_name;
+    const std::string parameters_method_name =
+        parameters_method_prefix + method_name;
 
-  // get names.
-  auto fqn_res = executorch::extension::Module::execute(fqn_method_name);
-  if (!fqn_res.ok()) {
-    return fqn_res.error();
-  }
-  const auto& fqn_list = fqn_res.get();
+    method_named_parameters_.insert({method_name, {}});
 
-  // get params start.
- auto param_res = - executorch::extension::Module::execute(parameters_method_name); - if (!param_res.ok()) { - return param_res.error(); - } + // get names. + auto fqn_res = executorch::extension::Module::execute(fqn_method_name); + if (!fqn_res.ok()) { + return fqn_res.error(); + } + const auto& fqn_list = fqn_res.get(); - uint64_t param_start = param_res.get()[0].toInt(); + // get params start. + auto param_res = + executorch::extension::Module::execute(parameters_method_name); + if (!param_res.ok()) { + return param_res.error(); + } - auto e = executorch::extension::Module::load_method(method_name); - if (e != runtime::Error::Ok) { - return e; - } - auto& method = methods_.at(method_name).method; - - // create dict - size_t name_index = 0; - for (size_t param_index = param_start; param_index < method->outputs_size(); - ++param_index, ++name_index) { - executorch::aten::string_view fqn = fqn_list.at(name_index).toString(); - executorch::aten::Tensor param = method->get_output(param_index).toTensor(); - named_parameters.insert({fqn, param}); + uint64_t param_start = param_res.get()[0].toInt(); + + // Load the method if it is not already loaded. + auto e = executorch::extension::Module::load_method(method_name); + if (e != runtime::Error::Ok) { + return e; + } + auto& method = methods_.at(method_name).method; + + // populate dict + size_t name_index = 0; + for (size_t param_index = param_start; param_index < method->outputs_size(); + ++param_index, ++name_index) { + executorch::aten::string_view fqn = fqn_list.at(name_index).toString(); + executorch::aten::Tensor param = + method->get_output(param_index).toTensor(); + method_named_parameters_.at(method_name).insert({fqn, param}); + } } - return named_parameters; + return method_named_parameters_.at(method_name); } runtime::Result< diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h index 9e7aa49cacf..7bf81623c04 100644 --- a/extension/training/module/training_module.h +++ b/extension/training/module/training_module.h @@ -33,13 +33,16 @@ class ET_EXPERIMENTAL TrainingModule final std::unique_ptr data_loader, std::unique_ptr memory_allocator = nullptr, std::unique_ptr temp_allocator = nullptr, - std::unique_ptr event_tracer = nullptr) + std::unique_ptr event_tracer = nullptr, + std::unique_ptr data_map_data_loader = nullptr) : executorch::extension::Module( std::move(data_loader), std::move(memory_allocator), std::move(temp_allocator), - std::move(event_tracer)), - method_named_gradients_({}) {} + std::move(event_tracer), + std::move(data_map_data_loader)), + method_named_gradients_({}), + method_named_parameters_({}) {} explicit TrainingModule(const Module&) = delete; TrainingModule& operator=(const Module&) = delete; @@ -97,6 +100,11 @@ class ET_EXPERIMENTAL TrainingModule final std::string, std::map> method_named_gradients_; + + std::unordered_map< + std::string, + std::map> + method_named_parameters_; }; } // namespace training diff --git a/extension/training/optimizer/targets.bzl b/extension/training/optimizer/targets.bzl index 3b00ae0bfdc..fb33f41f1ca 100644 --- a/extension/training/optimizer/targets.bzl +++ b/extension/training/optimizer/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" # if aten_mode: diff --git a/extension/training/optimizer/test/targets.bzl b/extension/training/optimizer/test/targets.bzl index 11269bfa180..7a93337a379 100644 --- a/extension/training/optimizer/test/targets.bzl +++ b/extension/training/optimizer/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_test( name = "sgd_test" + aten_suffix, diff --git a/extension/training/pybindings/TARGETS b/extension/training/pybindings/TARGETS index 6aa11ea6726..19b54961493 100644 --- a/extension/training/pybindings/TARGETS +++ b/extension/training/pybindings/TARGETS @@ -17,13 +17,11 @@ runtime.cxx_python_extension( types = ["_training_lib.pyi"], visibility = ["//executorch/extension/training/..."], deps = [ + "fbsource//third-party/pybind11:pybind11", "//executorch/extension/aten_util:aten_bridge", "//executorch/extension/training/optimizer:sgd", ], - external_deps = [ - "pybind11", - "libtorch_python", - ], + external_deps = ["libtorch_python"], ) runtime.python_library( diff --git a/install_executorch.bat b/install_executorch.bat index 863ade7bdbb..e6d5c5db363 100644 --- a/install_executorch.bat +++ b/install_executorch.bat @@ -7,14 +7,8 @@ rem This batch file provides a basic functionality similar to the bash script. cd /d "%~dp0" -rem Find the names of the python tools to use (replace with your actual python installation) -if "%PYTHON_EXECUTABLE%"=="" ( - if "%CONDA_DEFAULT_ENV%"=="" OR "%CONDA_DEFAULT_ENV%"=="base" OR NOT EXIST "python" ( - set PYTHON_EXECUTABLE=python3 - ) else ( - set PYTHON_EXECUTABLE=python - ) -) +rem Under windows, it's always python +set PYTHON_EXECUTABLE=python "%PYTHON_EXECUTABLE%" install_executorch.py %* diff --git a/install_executorch.py b/install_executorch.py index 4797f5b2e2c..3b2a4091888 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -14,6 +14,7 @@ import shutil import subprocess import sys +from contextlib import contextmanager from install_requirements import ( install_requirements, @@ -28,6 +29,17 @@ logger = logging.getLogger() +@contextmanager +def pushd(new_dir): + """Change the current directory to new_dir and yield. 
When exiting the context, change back to the original directory.""" + original_dir = os.getcwd() + os.chdir(new_dir) + try: + yield + finally: + os.chdir(original_dir) + + def clean(): print("Cleaning build artifacts...") print("Cleaning pip-out/...") @@ -65,6 +77,8 @@ def clean(): "prelude": "BUCK", "pthreadpool": "CMakeLists.txt", "pybind11": "CMakeLists.txt", + "shim": "BUCK", + "tokenizers": "CMakeLists.txt", "XNNPACK": "CMakeLists.txt", } @@ -116,6 +130,11 @@ def check_folder(folder: str, file: str) -> bool: logger.error(f"{file} not found in {path}.") logger.error("Please run `git submodule update --init`.") exit(1) + # Go into tokenizers submodule and install its submodules + tokenizers_path = get_required_submodule_paths().get("tokenizers", None) + if tokenizers_path: + with pushd(tokenizers_path): + subprocess.check_call(["git", "submodule", "update", "--init"]) logger.info("All required submodules are present.") @@ -138,6 +157,14 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="build from the pinned PyTorch commit instead of nightly", ) + parser.add_argument( + "--editable", + "-e", + action="store_true", + help="build an editable pip wheel, changes to python code will be " + "picked up without rebuilding the wheel. Extension libraries will be " + "installed inside the source tree.", + ) return parser @@ -226,6 +253,9 @@ def main(args): "-m", "pip", "install", + ] + + (["--editable"] if args.editable else []) + + [ ".", "--no-build-isolation", "-v", diff --git a/install_requirements.py b/install_requirements.py index e0e9629db78..0331f76522a 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -6,6 +6,7 @@ # LICENSE file in the root directory of this source tree. import argparse +import os import platform import re import subprocess @@ -67,7 +68,10 @@ def python_is_compatible(): # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION = "dev20250131" +# +# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt +# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ +NIGHTLY_VERSION = "dev20250311" def install_requirements(use_pytorch_nightly): @@ -117,6 +121,8 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. + new_env = os.environ.copy() + new_env["USE_CPP"] = "1" # install torchao kernels subprocess.run( [ sys.executable, @@ -127,6 +133,7 @@ def install_requirements(use_pytorch_nightly): "--no-build-isolation", *LOCAL_REQUIREMENTS, ], + env=new_env, check=True, ) @@ -143,8 +150,6 @@ def main(args): if __name__ == "__main__": - import os - # Before doing anything, cd to the directory containing this script. 
os.chdir(os.path.dirname(os.path.abspath(__file__))) if not python_is_compatible(): diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 463ef0f9d32..7069f9140ab 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -249,6 +249,8 @@ - op: max_pool2d_with_indices.out +- op: max_pool2d_with_indices_backward.grad_input + - op: max.dim_max - op: max.unary_out @@ -403,6 +405,8 @@ - op: unbind_copy.int_out +- op: unfold_copy.out + - op: unsafe_split.Tensor_out - op: unsqueeze_copy.dim_out diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 1f3aff57ecf..7cba9e91fe5 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -33,8 +33,8 @@ list(APPEND _common_compile_options -DET_BUILD_WITH_BLAS) # probably need to figure out how to detect compiler flag that suggest we are # compiling for avx2 for now punting this to come back -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -43,18 +43,18 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PRIVATE executorch_core eigen_blas extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) # Generate C++ bindings to register kernels into both PyTorch (for AOT) and # Executorch (for runtime). Here select all ops in optimized.yaml -set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized-oss.yaml") +set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml") gen_selected_ops(LIB_NAME "optimized_ops_lib" OPS_SCHEMA_YAML "${_yaml}") generate_bindings_for_kernels( LIB_NAME "optimized_ops_lib" FUNCTIONS_YAML - ${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml + ${CMAKE_CURRENT_SOURCE_DIR}/optimized.yaml ADD_EXCEPTION_BOUNDARY ) message("Generated files ${gen_command_sources}") @@ -63,7 +63,7 @@ list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft") target_link_libraries( - optimized_kernels PRIVATE executorch_core cpublas extension_threadpool + optimized_kernels PUBLIC executorch_core cpublas extension_threadpool ) target_compile_options(optimized_kernels PUBLIC ${_common_compile_options}) # Build a library for _optimized_kernels_srcs diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include diff --git a/kernels/optimized/cpu/op_add.cpp b/kernels/optimized/cpu/op_add.cpp index dbf828e5882..5f164f1eb13 100644 --- a/kernels/optimized/cpu/op_add.cpp +++ b/kernels/optimized/cpu/op_add.cpp @@ -14,59 +14,11 @@ #include #include +#include + namespace torch { namespace executor { namespace native { -namespace { - -template < - bool can_cast, - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct AddInner; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct 
AddInner { - static void - run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) { - apply_binary_elementwise_fn( - // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) - [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = a_casted + alpha_val * b_casted; - - return static_cast(value); - }, - a, - b, - out); - } -}; - -template -struct ReportCanCastBug { - static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) { - ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); - } -}; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct AddInner - : public ReportCanCastBug {}; - -} // namespace - using Tensor = executorch::aten::Tensor; using ScalarType = executorch::aten::ScalarType; @@ -76,8 +28,6 @@ Tensor& opt_add_out( const Tensor& b, const Scalar& alpha, Tensor& out) { - (void)ctx; - ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); @@ -95,7 +45,9 @@ Tensor& opt_add_out( ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { CTYPE alpha_val; ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); + ctx, + torch::executor::native::utils::extract_scalar(alpha, &alpha_val), + InvalidArgument, ); CTYPE_B b_val = *b.const_data_ptr(); CTYPE b_casted = static_cast(b_val); @@ -115,100 +67,9 @@ Tensor& opt_add_out( return opt_add_out(ctx, b, a, alpha, out); } - auto selected_optimized_path = select_optimized_path(a, b, out); - if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { - // Resize for dynamic shape - auto error = resize_tensor(out, a.sizes()); - ET_KERNEL_CHECK_MSG( - ctx, - error == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - - ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( - [alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; }, - out.mutable_data_ptr(), - a.const_data_ptr(), - b.const_data_ptr(), - out.numel()); - }); - } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { - ET_SWITCH_REALB_TYPES(out_type, ctx, "add.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK_MSG( - ctx, - utils::extract_scalar(alpha, &alpha_val), - InvalidArgument, - out, - "Failed to extract scalar alpha."); - using Vec = executorch::vec::Vectorized; - Vec alpha_val_vec(alpha_val); - if (selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments || - selected_optimized_path == - ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments || - selected_optimized_path == - ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) { - // Reason we swap out args here is because handle_broadcast_elementwise - // handles this selected_optimized_path option a bit differently. - // This should really be resolved in handle_broadcast_elementwise. - // However, the current blocker is that handle_broadcast_elementwise - // tries to be agnostic of op. This should be fixed, likely by moving - // lambda creation to handle_broadcast_elementwise and it be aware of - // which op is being executed. 
- auto add_lambda = [&alpha_val_vec](auto x, auto y) { - return y + alpha_val_vec * x; - }; - return torch::executor::handle_broadcast_elementwise( - ctx, add_lambda, a, b, out, selected_optimized_path, alpha); - } else { - auto add_lambda = [&alpha_val_vec](auto x, auto y) { - return x + alpha_val_vec * y; - }; - return torch::executor::handle_broadcast_elementwise( - ctx, add_lambda, a, b, out, selected_optimized_path, alpha); - } - }); - } else { - ScalarType common_type = - promoteTypes(a_type, b_type, /*half_to_float*/ true); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_KERNEL_CHECK( - ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() { - ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() { - CTYPE_IN alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - AddInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, alpha_val, out); - }); - }); - }); - } - - return out; + static constexpr const char op_name[] = "add.out"; + return torch::executor::kernels::impl::opt_add_sub_out_impl( + ctx, a, b, alpha, out); } Tensor& opt_add_scalar_out( diff --git a/kernels/optimized/cpu/op_add_sub_impl.h b/kernels/optimized/cpu/op_add_sub_impl.h new file mode 100644 index 00000000000..6fb8574688b --- /dev/null +++ b/kernels/optimized/cpu/op_add_sub_impl.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace executor {
+namespace kernels {
+namespace impl {
+
+namespace {
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted + alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
+
+using Tensor = executorch::aten::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+
+template <bool is_sub, const char* op_name>
+Tensor& opt_add_sub_out_impl(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Tensor& b,
+    const Scalar& alpha,
+    Tensor& out) {
+  (void)ctx;
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = b.scalar_type();
+  ScalarType out_type = out.scalar_type();
+
+  auto selected_optimized_path = select_optimized_path(a, b, out);
+  if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
+    // Resize for dynamic shape
+    auto error = resize_tensor(out, a.sizes());
+    ET_KERNEL_CHECK_MSG(
+        ctx,
+        error == Error::Ok,
+        InvalidArgument,
+        out,
+        "Failed to resize output tensor.");
+
+    ET_SWITCH_REALB_TYPES(a_type, ctx, op_name, CTYPE, [&]() {
+      CTYPE alpha_val;
+      ET_KERNEL_CHECK(
+          ctx,
+          torch::executor::native::utils::extract_scalar(alpha, &alpha_val),
+          InvalidArgument, );
+      if constexpr (is_sub) {
+        alpha_val = -alpha_val;
+      }
+      using Vec = executorch::vec::Vectorized<CTYPE>;
+      executorch::vec::map2<CTYPE>(
+          [alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
+          out.mutable_data_ptr<CTYPE>(),
+          a.const_data_ptr<CTYPE>(),
+          b.const_data_ptr<CTYPE>(),
+          out.numel());
+    });
+  } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
+    // Cannot apply the trick of -alpha here because alpha is Scalar without
+    // support for - operator. At least not right now.
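+    // (In the 1d path above, sub reuses the add kernel by negating the
+    // already-extracted alpha: a - alpha * b == a + (-alpha) * b. In this
+    // branch alpha is still boxed as a Scalar, so the is_sub cases below
+    // bake the sign into the lambdas instead.)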
+ ET_SWITCH_REALB_TYPES(out_type, ctx, op_name, CTYPE, [&]() { + CTYPE alpha_val; + ET_KERNEL_CHECK_MSG( + ctx, + torch::executor::native::utils::extract_scalar(alpha, &alpha_val), + InvalidArgument, + out, + "Failed to extract scalar alpha."); + using Vec = executorch::vec::Vectorized; + Vec alpha_val_vec(alpha_val); + if constexpr (is_sub) { + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments || + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments || + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) { + auto add_lambda = [&alpha_val_vec](auto x, auto y) { + return y - alpha_val_vec * x; + }; + return torch::executor::handle_broadcast_elementwise( + ctx, add_lambda, a, b, out, selected_optimized_path, alpha); + } else { + auto add_lambda = [&alpha_val_vec](auto x, auto y) { + return x - alpha_val_vec * y; + }; + return torch::executor::handle_broadcast_elementwise( + ctx, add_lambda, a, b, out, selected_optimized_path, alpha); + } + } else { + if (selected_optimized_path == + ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments || + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments || + selected_optimized_path == + ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) { + // Reason we swap out args here is because + // handle_broadcast_elementwise handles this selected_optimized_path + // option a bit differently. This should really be resolved in + // handle_broadcast_elementwise. However, the current blocker is that + // handle_broadcast_elementwise tries to be agnostic of op. This + // should be fixed, likely by moving lambda creation to + // handle_broadcast_elementwise and it be aware of which op is being + // executed. 
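+          // (Concretely: in the ReverseArguments cases the helper walks
+          // the operands as (b, a), so the lambda receives x = b, y = a
+          // and computes y + alpha * x to still realize a + alpha * b.)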
+ auto add_lambda = [&alpha_val_vec](auto x, auto y) { + return y + alpha_val_vec * x; + }; + return torch::executor::handle_broadcast_elementwise( + ctx, add_lambda, a, b, out, selected_optimized_path, alpha); + } else { + auto add_lambda = [&alpha_val_vec](auto x, auto y) { + return x + alpha_val_vec * y; + }; + return torch::executor::handle_broadcast_elementwise( + ctx, add_lambda, a, b, out, selected_optimized_path, alpha); + } + } + }); + } else { + ScalarType common_type = + promoteTypes(a_type, b_type, /*half_to_float*/ true); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, op_name, CTYPE_A, [&]() { + ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, op_name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() { + CTYPE_IN alpha_val; + ET_KERNEL_CHECK( + ctx, + torch::executor::native::utils::extract_scalar(alpha, &alpha_val), + InvalidArgument, ); + if constexpr (is_sub) { + alpha_val = -alpha_val; + } + + AddInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, alpha_val, out); + }); + }); + }); + } + + return out; +} +} // namespace impl +} // namespace kernels +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/op_bmm.cpp b/kernels/optimized/cpu/op_bmm.cpp index 21ae7dfca90..5e7fa1dd839 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -31,39 +31,38 @@ namespace { // Verifies that the parameters are valid. bool check_bmm_out_args(const Tensor& self, const Tensor& mat2, Tensor& out) { // Ensure dimensions is 3 for all input and out - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.dim() == mat2.dim(), "self.dim() %zd != mat2.dim() %zd", self.dim(), mat2.dim()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.dim() == out.dim(), "self.dim() %zd != out.dim() %zd", self.dim(), out.dim()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - self.dim() == 3, "self.dim() %zd != 3", self.dim()); + ET_CHECK_OR_RETURN_FALSE(self.dim() == 3, "self.dim() %zd != 3", self.dim()); // Ensure batch larger than or equals to 0 - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.size(0) >= 0, "self.size(0) %zd < 0", self.size(0)); // Ensure batches are the same - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.size(0) == mat2.size(0), "self.size(0) %zd != mat2.size(0) %zd", self.size(0), mat2.size(0)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.size(0) == out.size(0), "self.size(0) %zd != out.size(0) %zd", self.size(0), out.size(0)); // Ensure the out size is compatible with input tensors - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( mat2.size(2) == out.size(2), "mat2.size(2) %zd != out.size(2) %zd", mat2.size(2), out.size(2)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.size(1) == out.size(1), "self.size(1) %zd != out.size(1) %zd", self.size(1), diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp index 4d7b8efe9e3..e630f1c03bd 100644 --- a/kernels/optimized/cpu/op_div.cpp +++ b/kernels/optimized/cpu/op_div.cpp @@ -120,46 +120,22 @@ Tensor& opt_div_out( out.numel()); }); } else if (selected_optimized_path != 
ElementwiseOptimizedPath::kNone) {
-    const Tensor* lhs;
-    const Tensor* rhs;
-    if (selected_optimized_path ==
-        ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
-      lhs = &b;
-      rhs = &a;
-    } else {
-      // Catch failure to update logic when subing new broadcasting possibility.
-      ET_DCHECK(
-          selected_optimized_path ==
-          ElementwiseOptimizedPath::kBroadcast2dBy1d);
-      lhs = &a;
-      rhs = &b;
-    }
-    auto error = resize_tensor(out, lhs->sizes());
-    ET_KERNEL_CHECK_MSG(
-        ctx,
-        error == Error::Ok,
-        InvalidArgument,
-        out,
-        "Failed to resize output tensor.");
-    ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
-      using Vec = executorch::vec::Vectorized<CTYPE>;
+    // Reason for using alpha is because handle_broadcast_elementwise
+    // is used for add and sub as well:
+    ET_SWITCH_REALB_TYPES(out_type, ctx, "div.out", CTYPE, [&]() {
       if (selected_optimized_path ==
-          ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
-        executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
-            [](Vec x, Vec y) { return y / x; },
-            out.mutable_data_ptr<CTYPE>(),
-            lhs->const_data_ptr<CTYPE>(),
-            rhs->const_data_ptr<CTYPE>(),
-            lhs->sizes()[lhs->dim() - 2],
-            lhs->sizes()[lhs->dim() - 1]);
+              ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments ||
+          selected_optimized_path ==
+              ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments ||
+          selected_optimized_path ==
+              ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) {
+        auto div_lambda = [](auto x, auto y) { return y / x; };
+        return torch::executor::handle_broadcast_elementwise(
+            ctx, div_lambda, a, b, out, selected_optimized_path);
       } else {
-        executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
-            [](Vec x, Vec y) { return x / y; },
-            out.mutable_data_ptr<CTYPE>(),
-            lhs->const_data_ptr<CTYPE>(),
-            rhs->const_data_ptr<CTYPE>(),
-            lhs->sizes()[lhs->dim() - 2],
-            lhs->sizes()[lhs->dim() - 1]);
+        auto div_lambda = [](auto x, auto y) { return x / y; };
+        return torch::executor::handle_broadcast_elementwise(
+            ctx, div_lambda, a, b, out, selected_optimized_path);
       }
     });
   } else {
diff --git a/kernels/optimized/cpu/op_gelu.cpp b/kernels/optimized/cpu/op_gelu.cpp
index dcb6bbc4279..ebe8923b590 100644
--- a/kernels/optimized/cpu/op_gelu.cpp
+++ b/kernels/optimized/cpu/op_gelu.cpp
@@ -98,6 +98,12 @@ Tensor& opt_gelu_out(
   ET_KERNEL_CHECK(
       context, check_gelu_args(input, approximate, out), InvalidArgument, out);
 
+  ET_KERNEL_CHECK(
+      context,
+      resize_tensor(out, input.sizes()) == Error::Ok,
+      InvalidArgument,
+      out);
+
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), context, "gelu.out", CTYPE, [&]() {
         gelu<CTYPE>(context, input, approximate, out);
diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index ccacdd5b279..94c2d5ffa76 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include
 #include
 
@@ -26,6 +27,58 @@ Tensor& opt_le_tensor_out(
     Tensor& out) {
   (void)ctx;
 
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = b.scalar_type();
+  ScalarType out_type = out.scalar_type();
+
+  if (a.numel() == 1 || b.numel() == 1) {
+    const Tensor* tensor;
+    const Tensor* scalar;
+    ScalarType tensor_type;
+    ScalarType scalar_type;
+    if (a.numel() == 1) {
+      tensor = &b;
+      tensor_type = b_type;
+      scalar = &a;
+      scalar_type = a_type;
+    } else {
+      tensor = &a;
+      tensor_type = a_type;
+      scalar = &b;
+      scalar_type = b_type;
+    }
+    ET_KERNEL_CHECK(
+        ctx,
+        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+        InvalidArgument,
+        out);
+
+    constexpr auto name = "le.Tensor_out";
+
+
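+    // One operand has a single element, so treat it as a scalar and
+    // broadcast it against the other tensor with a vectorized map; the
+    // two branches below only differ in the order of the comparison.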
ET_SWITCH_REALB_TYPES(tensor_type, ctx, name, CTYPE, [&]() { + ET_SWITCH_REALB_TYPES(scalar_type, ctx, name, CTYPE_SCALAR, [&]() { + CTYPE_SCALAR scalar_val = *scalar->const_data_ptr(); + CTYPE scalar_casted = static_cast(scalar_val); + + using Vec = executorch::vec::Vectorized; + if (a.numel() == 1) { + executorch::vec::map( + [scalar_casted](Vec x) { return Vec(scalar_casted).le(x); }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } else { + executorch::vec::map( + [scalar_casted](Vec x) { return x.le(Vec(scalar_casted)); }, + out.mutable_data_ptr(), + tensor->const_data_ptr(), + out.numel()); + } + }); + }); + return out; + } + ET_KERNEL_CHECK(ctx, tensors_have_same_shape(a, b), InvalidArgument, out); // Resize for dynamic shape @@ -37,10 +90,6 @@ Tensor& opt_le_tensor_out( out, "Failed to resize output tensor."); - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType out_type = out.scalar_type(); - if (a_type == b_type && a_type == out_type) { ET_SWITCH_REAL_TYPES_AND( Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() { diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp index c3f090a6dfe..1822a06f29f 100644 --- a/kernels/optimized/cpu/op_log_softmax.cpp +++ b/kernels/optimized/cpu/op_log_softmax.cpp @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -66,30 +68,33 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) { } // calculate sum and exponential in softmax dim OUT_T temp_sum = 0; -#ifndef __aarch64__ - for (auto d = 0; d < dim_size; ++d) { - output_data[d * dim_stride] = - std::exp(input_data[d * dim_stride] - max_input); - temp_sum += output_data[d * dim_stride]; - } -#else + using VecOut = at::vec::Vectorized; + using VecIn = at::vec::Vectorized; auto d = 0; - for (; d + 4 < dim_size; d += 4) { - auto index = d * dim_stride; - float32x4_t in = - vld1q_f32(static_cast(&input_data[index])); - float32x4_t out_ = - Sleef_expf4_u10(vsubq_f32(in, vmovq_n_f32(max_input))); - vst1q_f32(static_cast(&output_data[index]), out_); - temp_sum += vaddvq_f32(out_); + static_assert(sizeof(IN_T) == sizeof(OUT_T)); + static_assert( + std::is_same_v, + "Below loop actually only supports float."); + // It is not correct to vectorize if dim is not contiguous! 
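+  // (With dim_stride != 1 a contiguous vector load would mix elements
+  // from different softmax slices, so the vectorized loop runs only when
+  // dim_stride == 1; the scalar loop below handles the strided case and
+  // any vector-width remainder.)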
+ if (dim_stride == 1) { + const VecIn max_input_vec(max_input); + for (; d + VecOut::size() < dim_size; d += VecOut::size()) { + auto index = d * dim_stride; + auto in = VecIn::loadu(&input_data[index]); + auto out_ = (in - max_input_vec).exp(); + out_.store(&output_data[index]); +#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) + temp_sum += vaddvq_f32(out_); +#else + temp_sum += at::vec::vec_reduce_all(std::plus(), out_); +#endif + } } - for (; d < dim_size; ++d) { output_data[d * dim_stride] = std::exp(input_data[d * dim_stride] - max_input); temp_sum += output_data[d * dim_stride]; } -#endif // __aarch64__ temp_sum = std::log(temp_sum); diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp index 7ee880d9977..489421f1b2d 100644 --- a/kernels/optimized/cpu/op_sub.cpp +++ b/kernels/optimized/cpu/op_sub.cpp @@ -15,6 +15,8 @@ #include #include +#include + namespace torch { namespace executor { namespace native { @@ -138,110 +140,9 @@ Tensor& opt_sub_out( } } - auto selected_optimized_path = select_optimized_path(a, b, out); - if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) { - // Resize for dynamic shape - auto error = resize_tensor(out, a.sizes()); - ET_KERNEL_CHECK_MSG( - ctx, - error == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - - ET_SWITCH_REAL_TYPES(a_type, ctx, "sub.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - using Vec = executorch::vec::Vectorized; - executorch::vec::map2( - [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, - out.mutable_data_ptr(), - a.const_data_ptr(), - b.const_data_ptr(), - out.numel()); - }); - } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) { - const Tensor* lhs; - const Tensor* rhs; - if (selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { - lhs = &b; - rhs = &a; - } else { - // Catch failure to update logic when subing new broadcasting possibility. 
- ET_DCHECK( - selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1d); - lhs = &a; - rhs = &b; - } - auto error = resize_tensor(out, lhs->sizes()); - ET_KERNEL_CHECK_MSG( - ctx, - error == Error::Ok, - InvalidArgument, - out, - "Failed to resize output tensor."); - ET_SWITCH_REAL_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() { - CTYPE alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - using Vec = executorch::vec::Vectorized; - if (selected_optimized_path == - ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) { - executorch::vec::broadcasting_map_2d_by_1d( - [alpha_val](Vec x, Vec y) { return y - Vec(alpha_val) * x; }, - out.mutable_data_ptr(), - lhs->const_data_ptr(), - rhs->const_data_ptr(), - lhs->sizes()[lhs->dim() - 2], - lhs->sizes()[lhs->dim() - 1]); - } else { - executorch::vec::broadcasting_map_2d_by_1d( - [alpha_val](Vec x, Vec y) { return x - Vec(alpha_val) * y; }, - out.mutable_data_ptr(), - lhs->const_data_ptr(), - rhs->const_data_ptr(), - lhs->sizes()[lhs->dim() - 2], - lhs->sizes()[lhs->dim() - 1]); - } - }); - } else { - ScalarType common_type = - promoteTypes(a_type, b_type, /*half_to_float*/ true); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_KERNEL_CHECK( - ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.out", CTYPE_A, [&]() { - ET_SWITCH_REALH_TYPES(b_type, ctx, "sub.out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALH_TYPES(out_type, ctx, "sub.out", CTYPE_OUT, [&]() { - CTYPE_IN alpha_val; - ET_KERNEL_CHECK( - ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, ); - - SubInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, alpha_val, out); - }); - }); - }); - } - - return out; + static constexpr const char op_name[] = "sub.out"; + return torch::executor::kernels::impl::opt_add_sub_out_impl( + ctx, a, b, alpha, out); } Tensor& opt_sub_scalar_out( diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp new file mode 100644 index 00000000000..fb14e542891 --- /dev/null +++ b/kernels/optimized/cpu/op_where.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +Tensor& opt_where_out( + KernelRuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Common Dtype + ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, cond, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "where.self_out"; + + if (a.scalar_type() == b.scalar_type() && + a.scalar_type() == out.scalar_type() && a.scalar_type() == compute_type && + // Using a Byte tensor for cond has been deprecated for a long time. + cond.scalar_type() == ScalarType::Bool) { + auto out_numel = out.numel(); + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE* const data_a = a.const_data_ptr(); + const CTYPE_COMPUTE* const data_b = b.const_data_ptr(); + const bool* const data_cond = cond.const_data_ptr(); + CTYPE_COMPUTE* const data_out = out.data_ptr(); + executorch::extension::parallel_for( + 0, + out_numel, + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + auto range = BroadcastIndexesRange<3>(out, a, b, cond); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto [out_index, a_index, b_index, cond_index] = *begin_it; + data_out[out_index] = + data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; + } + }); + }); + } else { + // Fall back for mixed dtype to keep code size and compile time + // reasonable. + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + utils::apply_tritensor_elementwise_fn( + [](const CTYPE_COMPUTE val_a, + const CTYPE_COMPUTE val_b, + const CTYPE_COMPUTE val_c) { return val_c ? 
val_a : val_b; }, + ctx, + a, + utils::SupportedTensorDtypes::REALHBBF16, + b, + utils::SupportedTensorDtypes::REALHBBF16, + cond, + utils::SupportedTensorDtypes::BOOL_OR_BYTE, + out, + utils::SupportedTensorDtypes::SAME_AS_COMMON); + }); + } + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl index 1c62b683b8f..017dff8a127 100644 --- a/kernels/optimized/cpu/targets.bzl +++ b/kernels/optimized/cpu/targets.bzl @@ -1,11 +1,12 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "is_op_disabled", "op_target") +load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "op_target") _OPTIMIZED_ATEN_OPS = ( op_target( name = "op_add", deps = [ ":binary_ops", + ":add_sub_impl", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], @@ -45,6 +46,7 @@ _OPTIMIZED_ATEN_OPS = ( name = "op_le", deps = [ "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/kernels/portable/cpu/util:broadcast_util", ], ), op_target( @@ -56,15 +58,10 @@ _OPTIMIZED_ATEN_OPS = ( ), op_target( name = "op_log_softmax", - deps = select({ - "DEFAULT": [ - "//executorch/kernels/portable/cpu/util:activation_ops_util", - ], - "ovr_config//cpu:arm64": [ - "//executorch/kernels/portable/cpu/util:activation_ops_util", - "fbsource//third-party/sleef:sleef_arm", - ], - }), + deps = [ + "//executorch/kernels/portable/cpu/util:activation_ops_util", + "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", + ], ), op_target( name = "op_mm", @@ -94,10 +91,18 @@ _OPTIMIZED_ATEN_OPS = ( name = "op_sub", deps = [ ":binary_ops", + ":add_sub_impl", "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/kernels/portable/cpu/util:broadcast_util", ], ), + op_target( + name = "op_where", + deps = [ + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/runtime/kernel:thread_parallel_interface", + ], + ), ) @@ -114,15 +119,21 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - enabled_ops = [op for op in _OPTIMIZED_ATEN_OPS if not is_op_disabled(op["name"])] - # Define build targets for all operators registered in the tables above. 
- for op in enabled_ops: + for op in _OPTIMIZED_ATEN_OPS: define_op_target(**op) - aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops] + aten_op_targets = [":{}".format(op["name"]) for op in _OPTIMIZED_ATEN_OPS] all_op_targets = aten_op_targets + runtime.cxx_library( + name = "add_sub_impl", + srcs = [], + exported_headers = ["op_add_sub_impl.h"], + visibility = ["//executorch/kernels/optimized/cpu/..."], + exported_deps = ["//executorch/runtime/core:core"], + ) + runtime.cxx_library( name = "binary_ops", exported_headers = ["binary_ops.h"], diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..6e884457e35 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,9 +232,9 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/kernel:thread_parallel_interface", ], **get_apple_framework_deps_kwargs(is_fbcode), ) diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl index 12a5f012a38..3ac89132380 100644 --- a/kernels/optimized/op_registration_util.bzl +++ b/kernels/optimized/op_registration_util.bzl @@ -2,8 +2,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") load( "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl", - "get_vec_preprocessor_flags", "get_vec_deps", + "get_vec_preprocessor_flags", ) load( "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", @@ -137,7 +137,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # All ops are enabled for internal builds. - return False diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml deleted file mode 100644 index a24aa9ca173..00000000000 --- a/kernels/optimized/optimized-oss.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This yaml file contains operators that have optimized kernels available. -# Note that this is a copy of optimized.yaml that does not include log_softmax, -# due to the OSS build not currently including sleef. 
-# TODO (T183193812) - -- op: _fft_r2c.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_fft_r2c_out - -- op: add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_add_scalar_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_bmm_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_div_scalar_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_exp_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sigmoid_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_gelu_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_le_tensor_out - -- op: linear.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_linear_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_native_layer_norm_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_neg_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::opt_sub_scalar_out diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index fd5143b1511..4f90059aa93 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -101,3 +101,8 @@ kernels: - arg_meta: null kernel_name: torch::executor::opt_sub_scalar_out + +- op: where.self_out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_where_out diff --git a/kernels/optimized/targets.bzl b/kernels/optimized/targets.bzl index 9978d4196dd..c655cb149a3 100644 --- a/kernels/optimized/targets.bzl +++ b/kernels/optimized/targets.bzl @@ -19,14 +19,6 @@ def define_common_targets(is_fbcode=False): ], ) - runtime.export_file( - name = "optimized-oss.yaml", - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - ) - runtime.cxx_library( name = "optimized_operators", srcs = [], diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 885c509246b..e27ba12ac0d 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -23,8 +23,8 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() @@ -63,6 +63,22 @@ gen_operators_lib( LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) +# Portable kernels support optional parallelization (and, in the +# future, perhaps other performance features). If support is present, +# produce an optimized version. 
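+#
+# The optimized variant is compiled from the same portable sources; the
+# intent is that it differs only in linking against extension_threadpool
+# so kernels can use parallel_for (see the reduction changes below).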
+set(BUILD_OPTIMIZED_PORTABLE_KERNELS ${EXECUTORCH_BUILD_PTHREADPOOL}) + +if(BUILD_OPTIMIZED_PORTABLE_KERNELS) + add_library(optimized_portable_kernels ${_portable_kernels__srcs}) + target_link_libraries(optimized_portable_kernels PRIVATE executorch) + target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) + target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + install( + TARGETS optimized_portable_kernels + DESTINATION lib + ) +endif() + install( TARGETS portable_kernels portable_ops_lib DESTINATION lib diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index efb74e3a01f..40ce86e8fdc 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -6,6 +6,8 @@ * LICENSE file in the root directory of this source tree. */ +#include <c10/util/irange.h> + #include #include #include @@ -41,7 +43,7 @@ int64_t coordinateToIndexWithDimOrder( dim_order_to_stride_nocheck( sizes.data(), dim_order.data(), sizes.size(), strides); - for (size_t i = 0; i < self.dim(); ++i) { + for (const auto i : c10::irange(self.dim())) { index += cur_indices[i] * strides[i]; } return index; @@ -59,7 +61,7 @@ void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) { for (ssize_t i = 0; i < self.numel(); i++) { // Update the current indices. for (ssize_t j = self.dim() - 1; j >= 0; j--) { - if (coordinate[j] + 1 < self.size(j)) { + if (coordinate[j] + 1 < static_cast<size_t>(self.size(j))) { coordinate[j]++; break; } else { diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp index 9f879179ec6..4ad409d4820 100644 --- a/kernels/portable/cpu/op_amax.cpp +++ b/kernels/portable/cpu/op_amax.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include <c10/util/irange.h> #include #include @@ -42,17 +43,20 @@ Tensor& amax_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr<CTYPE>(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - out_data[out_ix] = reduce_over_dim_list<CTYPE>( - [](CTYPE v, CTYPE max_v) { - return std::isnan(v) || v > max_v ? v : max_v; - }, - in, - dim_list, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute<CTYPE>( + [](CTYPE v, CTYPE max_v) { + return std::isnan(v) || v > max_v ? v : max_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp index 4f6f3ce52e5..396cb6c016d 100644 --- a/kernels/portable/cpu/op_amin.cpp +++ b/kernels/portable/cpu/op_amin.cpp @@ -5,7 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree.
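The CMake stanza and the amax hunk above share one idea: when a threadpool is linked, a reduction splits its output index space into contiguous [begin, end) chunks, and each chunk writes a disjoint slice of out, so no locking is needed. Below is a minimal standalone sketch of that chunking contract; the parallel_for helper and the grain size are hypothetical stand-ins, not the ExecuTorch thread_parallel_interface API.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <thread>
    #include <vector>

    // Run f(begin, end) over disjoint chunks of [0, n); serial if one chunk.
    // Sketch only: a real backend would reuse a thread pool, not spawn threads.
    template <typename F>
    void parallel_for(std::int64_t n, std::int64_t grain, const F& f) {
      const std::int64_t num_chunks = (n + grain - 1) / grain;
      if (num_chunks <= 1) {
        f(std::int64_t{0}, n);  // serial fallback, like the pre-change loop
        return;
      }
      std::vector<std::thread> workers;
      for (std::int64_t c = 0; c < num_chunks; ++c) {
        const std::int64_t begin = c * grain;
        const std::int64_t end = std::min(begin + grain, n);
        workers.emplace_back([=] { f(begin, end); });
      }
      for (auto& t : workers) t.join();
    }

    int main() {
      // Toy "reduce over dim": row-wise max of a 4x3 matrix, one output index
      // per row; each chunk writes only its own rows, so no synchronization.
      const double in[4][3] = {{1, 5, 2}, {9, 0, 3}, {4, 4, 4}, {7, 8, 6}};
      double out[4];
      parallel_for(4, 2, [&](std::int64_t begin, std::int64_t end) {
        for (std::int64_t i = begin; i < end; ++i) {
          double m = in[i][0];
          for (int j = 1; j < 3; ++j) m = in[i][j] > m ? in[i][j] : m;
          out[i] = m;
        }
      });
      assert(out[0] == 5 && out[1] == 9 && out[2] == 4 && out[3] == 8);
      return 0;
    }

Build with -pthread; when only one chunk is produced it degrades to the old serial loop, which is exactly the fallback the optional extension_threadpool dependency buys.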
*/ - +#include #include #include @@ -42,17 +42,20 @@ Tensor& amin_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - out_data[out_ix] = reduce_over_dim_list( - [](CTYPE v, CTYPE min_v) { - return std::isnan(v) || v < min_v ? v : min_v; - }, - in, - dim_list, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute( + [](CTYPE v, CTYPE min_v) { + return std::isnan(v) || v < min_v ? v : min_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index ef09e4837ab..ee9e54fc0c3 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -6,9 +6,12 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include +#include + namespace torch { namespace executor { namespace native { @@ -34,7 +37,7 @@ Tensor& any_all_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { const auto data_in = in.const_data_ptr(); auto data_out = out.mutable_data_ptr(); data_out[0] = static_cast(false); - for (auto i = 0; i < in.numel(); ++i) { + for (const auto i : c10::irange(in.numel())) { if (static_cast(data_in[i])) { data_out[0] = static_cast(true); break; @@ -78,28 +81,36 @@ Tensor& any_dims_out( ScalarType out_type = out.scalar_type(); constexpr auto name = "any.dims_out"; + const bool in_not_empty = in.numel() > 0; + std::optional plan; + if ((!dim_list.has_value() || !dim_list.value().empty()) && in_not_empty) { + plan.emplace(in, dim_list); + } ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_TWO_TYPES(Bool, Byte, out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); if (dim_list.has_value() && dim_list.value().empty()) { const CTYPE_IN* in_data = in.const_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_data[out_ix] = static_cast(static_cast(in_data[out_ix])); } } else { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - bool any = false; - if (in.numel() > 0) { - any = map_reduce_over_dim_list( - [](CTYPE_IN v) { return static_cast(v); }, - [](bool outv, bool acc) { return acc || outv; }, - in, - dim_list, - out_ix); - } - out_data[out_ix] = static_cast(any); - } + const bool success = + parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + bool any = false; + if (in_not_empty) { + any = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](bool outv, bool acc) { return acc || outv; }, + out_ix); + } + out_data[out_ix] = static_cast(any); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } }); }); @@ -138,22 +149,26 @@ Tensor& any_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_TWO_TYPES(Bool, Byte, out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) 
{ - CTYPE_OUT any = false; - if (in.numel() > 0) { - std::tuple<bool, long> acc = - map_reduce_over_dim<CTYPE_IN, bool>( - [](CTYPE_IN v) { return static_cast<bool>(v); }, - [](bool outv, long, bool acc, long) { - return std::tuple<bool, long>{acc || outv, 0}; - }, - in, - dim, - out_ix); - any = std::get<0>(acc); - } - out_data[out_ix] = any; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT any = false; + if (in.numel() > 0) { + std::tuple<bool, long> acc = + map_reduce_over_dim<CTYPE_IN, bool>( + [](CTYPE_IN v) { return static_cast<bool>(v); }, + [](bool outv, long, bool acc, long) { + return std::tuple<bool, long>{acc || outv, 0}; + }, + in, + dim, + out_ix); + any = std::get<0>(acc); + } + out_data[out_ix] = any; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index 5eb656d5b76..ffbc469c53d 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include <c10/util/irange.h> #include #include @@ -46,20 +47,27 @@ Tensor& argmax_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmax.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr<long>(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple<CTYPE, long>{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(acc_val) && (isnan(v) || v > acc_val). See the + // argument in op_argmin.cpp. + if (!std::isnan(acc_val) && !(v <= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple<CTYPE, long>{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 1c4a2572ea8..b0816596e4e 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree.
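The any.out rewrite above is a map-reduce fold: map each element to bool, reduce with logical OR. Stripped of the tensor machinery, the per-output-index computation is just the following (a sketch with plain arrays standing in for tensors, reducing over the last dim):

    #include <cassert>

    int main() {
      // map: v -> static_cast<bool>(v); reduce: logical OR, one fold per row.
      const float in[2][4] = {{0.f, 0.f, 2.f, 0.f}, {0.f, 0.f, 0.f, 0.f}};
      bool out[2];
      for (int row = 0; row < 2; ++row) {
        bool any = false;
        for (int col = 0; col < 4; ++col) {
          any = any || static_cast<bool>(in[row][col]);
        }
        out[row] = any;
      }
      assert(out[0] == true && out[1] == false);
      return 0;
    }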
*/ +#include <c10/util/irange.h> #include #include @@ -46,20 +47,34 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr<long>(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple<CTYPE, long>{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(acc_val) && (isnan(v) || v < acc_val). cases: + // - if neither acc_val nor v is NaN, !(v >= acc_val) is + // trivially equivalent to v < acc_val. + // - if acc_val is NaN, the whole thing is trivially false. + // - if acc_val is not NaN and v is NaN, then v >= acc_val + // is false because all comparisons involving NaN are + // false, so the result is true. The result is trivially + // true for the above condition that uses isnan(v) as + // well. + if (!std::isnan(acc_val) && !(v >= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple<CTYPE, long>{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_cdist_forward.cpp b/kernels/portable/cpu/op_cdist_forward.cpp index 1489ec6f6ed..03d6d47ec75 100644 --- a/kernels/portable/cpu/op_cdist_forward.cpp +++ b/kernels/portable/cpu/op_cdist_forward.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include <c10/util/irange.h> #include #include #include @@ -34,7 +35,7 @@ void cdist(const Tensor& x1, const Tensor& x2, Tensor& out, double p) { // If the last dimension of x1 (which is equal to the last dimension of x2) // has size 0, then the output is filled with 0s.
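The comment above justifies rewriting the argmin update as !(v >= acc_val). A quick standalone check of that equivalence across all NaN/non-NaN cases (assumes IEEE comparison semantics, i.e. no -ffast-math):

    #include <cassert>
    #include <cmath>
    #include <limits>

    // The predicate as it was originally written.
    bool original(float v, float acc) {
      return !std::isnan(acc) && (std::isnan(v) || v < acc);
    }
    // The rewritten form: any comparison involving NaN is false, so a NaN v
    // makes v >= acc false and the negation true, matching isnan(v) above.
    bool rewritten(float v, float acc) {
      return !std::isnan(acc) && !(v >= acc);
    }

    int main() {
      const float nan = std::numeric_limits<float>::quiet_NaN();
      const float vals[] = {-1.0f, 0.0f, 2.5f, nan};
      for (float v : vals)
        for (float acc : vals)
          assert(original(v, acc) == rewritten(v, acc));
      return 0;
    }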
if (x1.numel() == 0) { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_data[out_ix] = 0; } return; @@ -64,7 +65,7 @@ void cdist(const Tensor& x1, const Tensor& x2, Tensor& out, double p) { size_t x2_inner_size = R * M; size_t out_inner_size = P * R; - for (size_t b = 0; b < out_batch_numel; ++b) { + for (const auto b : c10::irange(out_batch_numel)) { size_t x1_base_ix = b * x1_inner_size; size_t x2_base_ix = b * x2_inner_size; size_t out_base_ix = b * out_inner_size; @@ -81,14 +82,13 @@ void cdist(const Tensor& x1, const Tensor& x2, Tensor& out, double p) { x2_base_ix = linearize_access_indexes(out_base_coord, out.dim(), x2); } } - size_t out_ix = 0; - for (size_t i = 0; i < P; ++i) { + for (const auto i : c10::irange(P)) { const CTYPE* row_i = x1_data + x1_base_ix + i * M; - for (size_t j = 0; j < R; ++j) { + for (const auto j : c10::irange(R)) { const CTYPE* row_j = x2_data + x2_base_ix + j * M; CTYPE agg = 0; - for (size_t k = 0; k < M; ++k) { + for (const auto k : c10::irange(M)) { CTYPE diff = std::abs(row_i[k] - row_j[k]); agg = Norm::reduce(agg, Norm::map(diff, p)); } diff --git a/kernels/portable/cpu/op_constant_pad_nd.cpp b/kernels/portable/cpu/op_constant_pad_nd.cpp index 328207d70f3..71dc7ff658f 100644 --- a/kernels/portable/cpu/op_constant_pad_nd.cpp +++ b/kernels/portable/cpu/op_constant_pad_nd.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -56,7 +57,7 @@ void apply_padding_to_dim( size_t out_step_len = out_strides[dim]; size_t in_step_len = self_strides[dim]; - for (size_t i = 0; i < pad_before; ++i) { + for ([[maybe_unused]] const auto i : c10::irange(pad_before)) { set_all_to_value(out_data, out_step_len, value); out_data += out_step_len; } @@ -75,7 +76,7 @@ void apply_padding_to_dim( } // Otherwise, call this function recursively else { - for (size_t i = 0; i < self_sizes[dim]; ++i) { + for ([[maybe_unused]] const auto i : c10::irange(self_sizes[dim])) { apply_padding_to_dim( ndim, self_data, @@ -94,7 +95,7 @@ void apply_padding_to_dim( } } - for (int i = 0; i < pad_after; ++i) { + for ([[maybe_unused]] const auto i : c10::irange(pad_after)) { set_all_to_value(out_data, out_step_len, value); out_data += out_step_len; } @@ -124,7 +125,7 @@ void constant_pad_nd_out_impl( // Collect sizes and strides of input and output tensors and determine the // last padded dimension size_t last_padded_dim = 0; - for (size_t i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { self_sizes[i] = self.size(i); self_strides[i] = getTrailingDims(self, static_cast(i)); out_sizes[i] = out.size(i); diff --git a/kernels/portable/cpu/op_convolution.cpp b/kernels/portable/cpu/op_convolution.cpp index cdd37e8f78a..44da2cc0f1f 100644 --- a/kernels/portable/cpu/op_convolution.cpp +++ b/kernels/portable/cpu/op_convolution.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
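For each (row of x1, row of x2) pair, the cdist loop nest above folds Norm::map over the last dim and Norm::reduce into an accumulator. A sketch specialized to p = 2, where map is squaring, reduce is addition, and the finishing step (not shown in the hunk) is assumed to be a square root:

    #include <cassert>
    #include <cmath>

    int main() {
      const int P = 2, R = 2, M = 2;
      const double x1[P][M] = {{0, 0}, {1, 1}};
      const double x2[R][M] = {{3, 4}, {1, 1}};
      double out[P][R];
      for (int i = 0; i < P; ++i) {        // rows of x1
        for (int j = 0; j < R; ++j) {      // rows of x2
          double agg = 0;
          for (int k = 0; k < M; ++k) {
            double diff = std::abs(x1[i][k] - x2[j][k]);
            agg += diff * diff;            // Norm::map + Norm::reduce, p == 2
          }
          out[i][j] = std::sqrt(agg);      // assumed finishing step for p == 2
        }
      }
      assert(out[0][0] == 5.0 && out[1][1] == 0.0);  // 3-4-5 triangle; self-distance
      return 0;
    }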
*/ +#include #include #include @@ -91,25 +92,25 @@ void conv2d_impl( if (!transposed) { w_coord[0] = out_c; // Compute 2D output region - for (size_t out_y = 0; out_y < out_H; ++out_y) { + for (const auto out_y : c10::irange(out_H)) { out_coord[2] = out_y; - for (size_t out_x = 0; out_x < out_W; ++out_x) { + for (const auto out_x : c10::irange(out_W)) { out_coord[3] = out_x; CTYPE accum = 0.0f; - for (size_t in_c = in_c_start; in_c < in_c_start + in_C_per_group; - ++in_c) { + for (const auto in_c : + c10::irange(in_c_start, in_c_start + in_C_per_group)) { in_coord[1] = in_c; w_coord[1] = in_c - in_c_start; - for (size_t w_y = 0; w_y < w_H; ++w_y) { + for (const auto w_y : c10::irange(w_H)) { w_coord[2] = w_y; size_t in_y = stride_y * out_y + dilation_y * w_y - padding_y; in_coord[2] = in_y; // Only proceed if input y coordinate is within bounds if (in_y >= 0 && in_y < in_H) { - for (size_t w_x = 0; w_x < w_W; ++w_x) { + for (const auto w_x : c10::irange(w_W)) { w_coord[3] = w_x; size_t in_x = stride_x * out_x + dilation_x * w_x - padding_x; @@ -143,14 +144,14 @@ void conv2d_impl( } else { // transposed convolution w_coord[1] = out_c - out_c_start; - for (size_t in_y = 0; in_y < in_H; ++in_y) { + for (const auto in_y : c10::irange(in_H)) { in_coord[2] = in_y; - for (size_t in_x = 0; in_x < in_W; ++in_x) { + for (const auto in_x : c10::irange(in_W)) { in_coord[3] = in_x; - for (size_t in_c = in_c_start; in_c < in_c_start + in_C_per_group; - ++in_c) { + for (const auto in_c : + c10::irange(in_c_start, in_c_start + in_C_per_group)) { in_coord[1] = in_c; size_t in_idx = @@ -158,14 +159,14 @@ void conv2d_impl( CTYPE in_val = in_ptr[in_idx]; w_coord[0] = in_c; - for (size_t w_y = 0; w_y < w_H; ++w_y) { + for (const auto w_y : c10::irange(w_H)) { w_coord[2] = w_y; size_t out_y = stride_y * in_y + dilation_y * w_y - padding_y; out_coord[2] = out_y; // Only proceed if output y coordinate is within bounds if (out_y >= 0 && out_y < out_H) { - for (size_t w_x = 0; w_x < w_W; ++w_x) { + for (const auto w_x : c10::irange(w_W)) { w_coord[3] = w_x; size_t out_x = stride_x * in_x + dilation_x * w_x - padding_x; out_coord[3] = out_x; @@ -302,7 +303,7 @@ void convolution_wrapper( memset(out_ptr, 0, out.nbytes()); } else { // If bias is present, we initialize the output to the bias value - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_ptr[out_ix] = load_bias(&bias_ptr [((out_ix / out_strides[1]) % out_C) * bias.value().element_size()]); @@ -310,13 +311,13 @@ void convolution_wrapper( } } - for (size_t batch = 0; batch < out_N; ++batch) { - for (size_t group = 0; group < groups; ++group) { + for (const auto batch : c10::irange(out_N)) { + for (const auto group : c10::irange(groups)) { // Align channel offset based on the group size_t out_c_start = group * out_C_per_group; // Populate all the out channels in the group - for (size_t out_c = out_c_start; out_c < out_c_start + out_C_per_group; - ++out_c) { + for (const auto out_c : + c10::irange(out_c_start, out_c_start + out_C_per_group)) { conv2d_impl( in_ptr, in_sizes, diff --git a/kernels/portable/cpu/op_convolution_backward.cpp b/kernels/portable/cpu/op_convolution_backward.cpp index 7884ea0c44c..cd635cda8f9 100644 --- a/kernels/portable/cpu/op_convolution_backward.cpp +++ b/kernels/portable/cpu/op_convolution_backward.cpp @@ -38,9 +38,9 @@ bool check_convolution_backward_args( Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + 
ET_CHECK_OR_RETURN_FALSE( transposed == false, "Transposed Convolution Backward not supported yet"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.dim() == 4, "Only 2D Convolution Backward supported for now"); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(weight, input)); @@ -58,7 +58,7 @@ bool check_convolution_backward_args( ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_bias, input)); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( check_convolution_args( input, weight, @@ -89,7 +89,7 @@ bool check_convolution_backward_args( ET_LOG_AND_RETURN_IF_FALSE( output_size_is_valid({output_sizes, output_ndim}, input.dim() - 2)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( grad_output.dim() == input.dim(), "grad_output should have same number of dimensions as input"); diff --git a/kernels/portable/cpu/op_diagonal_copy.cpp b/kernels/portable/cpu/op_diagonal_copy.cpp index 6d923a6d904..445bfd2027f 100644 --- a/kernels/portable/cpu/op_diagonal_copy.cpp +++ b/kernels/portable/cpu/op_diagonal_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -40,20 +41,21 @@ void diagonal_copy_impl( size_t new_ndim = out.dim(); int64_t new_sizes[kTensorDimensionLimit]; - for (size_t i = 0; i < new_ndim; ++i) { + for (const auto i : c10::irange(new_ndim)) { new_sizes[i] = out.size(i); } int64_t new_strides[kTensorDimensionLimit]; size_t shift = 0; - for (size_t d = 0; d < in.dim(); ++d) { - if (d == dim1 || d == dim2) { + size_t in_dim = in.dim(); + for (const auto d : c10::irange(in_dim)) { + if (static_cast(d) == dim1 || static_cast(d) == dim2) { shift++; } else { new_strides[d - shift] = in.strides().at(d); } } - new_strides[in.dim() - 2] = in.strides().at(dim1) + in.strides().at(dim2); + new_strides[in_dim - 2] = in.strides().at(dim1) + in.strides().at(dim2); as_strided_copy( in, {new_sizes, new_ndim}, {new_strides, new_ndim}, storage_offset, out); diff --git a/kernels/portable/cpu/op_expand_copy.cpp b/kernels/portable/cpu/op_expand_copy.cpp index f1a7bfbf1fb..6c8685dd867 100644 --- a/kernels/portable/cpu/op_expand_copy.cpp +++ b/kernels/portable/cpu/op_expand_copy.cpp @@ -96,7 +96,8 @@ Tensor& expand_copy_out( ET_KERNEL_CHECK( ctx, - repeat_tensor(self, {repeats, repeats_size}, out) == Error::Ok, + repeat_tensor(self, makeArrayRef(repeats, repeats_size), out) == + Error::Ok, InvalidArgument, out); diff --git a/kernels/portable/cpu/op_flip.cpp b/kernels/portable/cpu/op_flip.cpp index 41e99953c93..8ad122b7e7e 100644 --- a/kernels/portable/cpu/op_flip.cpp +++ b/kernels/portable/cpu/op_flip.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
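diagonal_copy_impl above never gathers elements explicitly; it drops dim1/dim2, appends one dimension whose stride is stride(dim1) + stride(dim2), and hands the result to as_strided_copy. For a row-major square matrix that stride sum collapses to reading every (N+1)-th element:

    #include <cassert>

    int main() {
      const int N = 3;
      const int m[N * N] = {1, 2, 3, 4, 5, 6, 7, 8, 9};   // row-major 3x3
      const int stride_rows = N, stride_cols = 1;
      const int diag_stride = stride_rows + stride_cols;  // == N + 1
      int diag[N];
      for (int i = 0; i < N; ++i) {
        diag[i] = m[i * diag_stride];  // strided read, no coordinate math
      }
      assert(diag[0] == 1 && diag[1] == 5 && diag[2] == 9);
      return 0;
    }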
*/ +#include #include #include @@ -25,7 +26,7 @@ size_t unflip_flat_ix(size_t ix, const Tensor& in, ArrayRef flip_dim) { indexToCoordinate(in, ix, ix_coord); size_t unflip_coord[kTensorDimensionLimit]; - for (size_t d = 0; d < in.dim(); d++) { + for (const auto d : c10::irange(in.dim())) { if (flip_dim[d]) { unflip_coord[d] = in.size(d) - ix_coord[d] - 1; } else { @@ -54,10 +55,10 @@ Tensor& flip_out( ET_KERNEL_CHECK(ctx, check_flip_args(in, dims, out), InvalidArgument, out); bool flip_dim_data[kTensorDimensionLimit]; - for (size_t i = 0; i < in.dim(); i++) { + for (const auto i : c10::irange(in.dim())) { flip_dim_data[i] = false; } - for (size_t i = 0; i < dims.size(); i++) { + for (const auto i : c10::irange(dims.size())) { const auto d = dims[i] < 0 ? dims[i] + nonzero_dim(in) : dims[i]; flip_dim_data[d] = true; } @@ -70,7 +71,7 @@ Tensor& flip_out( const CTYPE* in_data = in.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); - for (size_t ix = 0; ix < out.numel(); ++ix) { + for (const auto ix : c10::irange(in.numel())) { out_data[ix] = in_data[unflip_flat_ix(ix, in, flip_dim)]; } }); diff --git a/kernels/portable/cpu/op_full.cpp b/kernels/portable/cpu/op_full.cpp index 668033a44af..69b4c8fd150 100644 --- a/kernels/portable/cpu/op_full.cpp +++ b/kernels/portable/cpu/op_full.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -44,7 +45,7 @@ Tensor& full_out( ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT val_casted = static_cast(val); auto data_out = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); ++i) { + for (const auto i : c10::irange(out.numel())) { data_out[i] = val_casted; } }); diff --git a/kernels/portable/cpu/op_full_like.cpp b/kernels/portable/cpu/op_full_like.cpp index 2aeb45d22f4..7671cd61ea9 100644 --- a/kernels/portable/cpu/op_full_like.cpp +++ b/kernels/portable/cpu/op_full_like.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -60,7 +61,7 @@ Tensor& full_like_out( ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT val_casted = static_cast(val); auto data_out = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); ++i) { + for (const auto i : c10::irange(out.numel())) { data_out[i] = val_casted; } }); diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp index 3f2e365503f..9899c21a94e 100644 --- a/kernels/portable/cpu/op_gather.cpp +++ b/kernels/portable/cpu/op_gather.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -37,12 +38,12 @@ void gather_helper( return; } - for (size_t ix = 0; ix < index.numel(); ++ix) { + for (const auto ix : c10::irange(index.numel())) { size_t ix_coord[kTensorDimensionLimit]; indexToCoordinate(index, ix, ix_coord); size_t in_coord[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim(); ++i) { + for (const auto i : c10::irange(out.dim())) { if (i == dim) { in_coord[i] = index_data[ix]; } else { diff --git a/kernels/portable/cpu/op_glu.cpp b/kernels/portable/cpu/op_glu.cpp index 20fb3cf0290..edc82c55eb8 100644 --- a/kernels/portable/cpu/op_glu.cpp +++ b/kernels/portable/cpu/op_glu.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
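The flip kernel above iterates over output elements and maps each coordinate back through unflip_flat_ix: along a flipped dim, the source index is size - coord - 1; other dims pass through. The same mapping in two dimensions:

    #include <cassert>

    int main() {
      const int H = 2, W = 3;
      const int in[H][W] = {{1, 2, 3}, {4, 5, 6}};
      int out[H][W];
      const bool flip_dim[2] = {false, true};  // flip width only, dims = [1]
      for (int y = 0; y < H; ++y) {
        for (int x = 0; x < W; ++x) {
          const int sy = flip_dim[0] ? H - y - 1 : y;
          const int sx = flip_dim[1] ? W - x - 1 : x;
          out[y][x] = in[sy][sx];
        }
      }
      assert(out[0][0] == 3 && out[0][2] == 1 && out[1][1] == 5);
      return 0;
    }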
*/ +#include #include #include #include @@ -37,7 +38,7 @@ float exp_overload(float f) { template void sigmoid_tensor(Tensor& out) { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); i++) { + for (const auto i : c10::irange(out.numel())) { out_data[i] = 1.0 / (1.0 + exp_overload(-out_data[i])); } } @@ -57,13 +58,13 @@ void mul_tensors(const Tensor& in, int64_t dim, Tensor& out) { const CTYPE_IN* input_data_base = in.const_data_ptr(); CTYPE_OUT* output_data_base = out.mutable_data_ptr(); - for (size_t i = 0; i < leading_dims; i++) { + for (const auto i : c10::irange(leading_dims)) { const CTYPE_IN* input_data = input_data_base + i * dim_length_in * trailing_dims; CTYPE_OUT* output_data = output_data_base + i * dim_length_out * trailing_dims; - for (size_t j = 0; j < num_values; j++) { - for (size_t k = 0; k < trailing_dims; ++k) { + for ([[maybe_unused]] const auto j : c10::irange(num_values)) { + for (const auto k : c10::irange(trailing_dims)) { output_data[k] = static_cast(input_data[k]) * output_data[k]; } input_data += trailing_dims; @@ -94,13 +95,13 @@ void slice_tensor( const CTYPE_IN* input_data_base = in.const_data_ptr(); CTYPE_OUT* output_data_base = out.mutable_data_ptr(); - for (size_t i = 0; i < leading_dims; i++) { + for (const auto i : c10::irange(leading_dims)) { const CTYPE_IN* input_data = input_data_base + (i * dim_length_in + non_negative_start) * trailing_dims; CTYPE_OUT* output_data = output_data_base + i * dim_length_out * trailing_dims; - for (size_t j = 0; j < num_values; j++) { - for (size_t k = 0; k < trailing_dims; ++k) { + for ([[maybe_unused]] const auto j : c10::irange(num_values)) { + for (const auto k : c10::irange(trailing_dims)) { output_data[k] = static_cast(input_data[k]); } input_data += trailing_dims; diff --git a/kernels/portable/cpu/op_index_put.cpp b/kernels/portable/cpu/op_index_put.cpp index f22026d759c..942892c31ec 100644 --- a/kernels/portable/cpu/op_index_put.cpp +++ b/kernels/portable/cpu/op_index_put.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -116,7 +117,7 @@ Tensor& index_put_out( // Compute the number of elements in the indexed space size_t x_numel = 1; - for (size_t i = 0; i < x_dim; i++) { + for (const auto i : c10::irange(x_dim)) { x_numel *= x_sizes[i]; } @@ -124,7 +125,7 @@ Tensor& index_put_out( const CTYPE* const values_data = values.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); - for (auto x_ix = 0; x_ix < x_numel; x_ix++) { + for (const auto x_ix : c10::irange(x_numel)) { size_t in_ix = 0; size_t x_coord[kTensorDimensionLimit]; diff --git a/kernels/portable/cpu/op_index_select.cpp b/kernels/portable/cpu/op_index_select.cpp index 98f8f9f7ab0..fb39a42e5a2 100644 --- a/kernels/portable/cpu/op_index_select.cpp +++ b/kernels/portable/cpu/op_index_select.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
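The glu helpers above implement slice, sigmoid, multiply: the input is split in half along dim and the output is first_half * sigmoid(second_half). A sketch of those semantics on a flat vector split down the middle (the helper APIs themselves are elided):

    #include <cassert>
    #include <cmath>

    int main() {
      const double in[4] = {1.0, 2.0, 0.0, 100.0};  // halves: [1, 2], [0, 100]
      double out[2];
      for (int i = 0; i < 2; ++i) {
        const double a = in[i];          // slice: first half
        const double b = in[i + 2];      // slice: second half
        out[i] = a * (1.0 / (1.0 + std::exp(-b)));  // multiply by sigmoid
      }
      assert(out[0] == 0.5);   // 1 * sigmoid(0)
      assert(out[1] > 1.99);   // 2 * sigmoid(100) ~= 2
      return 0;
    }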
*/ +#include #include #include #include @@ -73,10 +74,10 @@ Tensor& index_select_out( ET_SWITCH_TWO_TYPES( Long, Int, ix_type, ctx, "index_select.out", CTYPE, [&]() { const CTYPE* const index_arr = index.mutable_data_ptr(); - for (int i = 0; i < leading_dims; i++) { + for (const auto i : c10::irange(leading_dims)) { const char* src = input_data + i * in_dim_length * length_per_step; char* dest = out_data + i * out_dim_length * length_per_step; - for (auto j = 0; j < out_dim_length; j++) { + for (const auto j : c10::irange(out_dim_length)) { const char* copy_src = src + index_arr[j] * length_per_step; memcpy(dest, copy_src, length_per_step); dest += length_per_step; diff --git a/kernels/portable/cpu/op_linear_scratch_example.cpp b/kernels/portable/cpu/op_linear_scratch_example.cpp index b217e9ad942..096fea8bc4c 100644 --- a/kernels/portable/cpu/op_linear_scratch_example.cpp +++ b/kernels/portable/cpu/op_linear_scratch_example.cpp @@ -40,13 +40,13 @@ bool check_linear_scratch_example_args( const optional& bias, Tensor& out, Tensor& scratch) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( input.size(1) == weight.size(1), "Unexpected weight size 1"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( scratch.size(0) == input.size(0), "Unexpected scratch size 0"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( scratch.size(1) == weight.size(0), "Unexpected scratch size 1"); return true; @@ -102,7 +102,9 @@ Tensor& linear_scratch_example( // add the bias if (bias.has_value()) { - ET_CHECK_MSG(K == bias.value().numel(), "Unexpected numel for bias"); + ET_CHECK_MSG( + static_cast(K) == bias.value().numel(), + "Unexpected numel for bias"); for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < K; ++j) { scalar_t* scratch_ptr = diff --git a/kernels/portable/cpu/op_masked_select.cpp b/kernels/portable/cpu/op_masked_select.cpp index b176000f6c8..88a568be5ac 100644 --- a/kernels/portable/cpu/op_masked_select.cpp +++ b/kernels/portable/cpu/op_masked_select.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
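Because index_select over a leading dim moves whole contiguous slices, the kernel above reduces to one memcpy per selected index (length_per_step bytes each). The same pattern on plain arrays:

    #include <cassert>
    #include <cstring>

    int main() {
      const int in[3][2] = {{1, 2}, {3, 4}, {5, 6}};
      const long index[2] = {2, 0};              // select rows 2 and 0
      int out[2][2];
      const size_t row_bytes = 2 * sizeof(int);  // length_per_step
      for (int j = 0; j < 2; ++j) {
        std::memcpy(out[j], in[index[j]], row_bytes);
      }
      assert(out[0][0] == 5 && out[1][0] == 1);
      return 0;
    }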
*/ +#include #include #include @@ -53,14 +54,14 @@ Tensor& masked_select_out( ctx, false, InvalidArgument, out, "Failed to broadcast input and mask"); } size_t broadcast_numel = 1; - for (size_t i = 0; i < broadcast_ndim; i++) { + for (const auto i : c10::irange(broadcast_ndim)) { broadcast_numel *= broadcast_sizes[i]; } // Compute the number of out elements size_t mask_true_count = 0; const bool* const mask_data = mask.const_data_ptr(); - for (size_t i = 0; i < mask.numel(); ++i) { + for (const auto i : c10::irange(mask.numel())) { if (mask_data[i]) { mask_true_count++; } @@ -79,10 +80,10 @@ Tensor& masked_select_out( // Figure out if `in` is broadcasted bool in_is_broadcasted = false; - if (in.dim() != broadcast_ndim) { + if (in.dim() != static_cast(broadcast_ndim)) { in_is_broadcasted = true; } else { - for (size_t i = 0; i < in.dim(); ++i) { + for (const auto i : c10::irange(in.dim())) { if (in.size(i) != broadcast_sizes[i]) { in_is_broadcasted = true; } @@ -91,10 +92,10 @@ Tensor& masked_select_out( // Figure out if `mask` is broadcasted bool mask_is_broadcasted = false; - if (mask.dim() != broadcast_ndim) { + if (mask.dim() != static_cast(broadcast_ndim)) { mask_is_broadcasted = true; } else { - for (size_t i = 0; i < mask.dim(); ++i) { + for (const auto i : c10::irange(mask.dim())) { if (mask.size(i) != broadcast_sizes[i]) { mask_is_broadcasted = true; } @@ -105,7 +106,7 @@ Tensor& masked_select_out( bool any_is_broadcasted = (in_is_broadcasted || mask_is_broadcasted); size_t out_ix = 0; - for (size_t i = 0; i < broadcast_numel; ++i) { + for (const auto i : c10::irange(broadcast_numel)) { size_t in_linear_index = i; size_t mask_linear_index = i; diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index c5b5d2fb6bc..3f4a1d27c0e 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
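masked_select above makes two passes: one to count mask hits (the count is needed to resize the output before writing), one to compact the selected values in order:

    #include <cassert>

    int main() {
      const int in[5] = {10, 20, 30, 40, 50};
      const bool mask[5] = {true, false, true, false, true};
      int count = 0;
      for (int i = 0; i < 5; ++i) count += mask[i] ? 1 : 0;  // pass 1: size
      assert(count == 3);
      int out[3];
      int out_ix = 0;
      for (int i = 0; i < 5; ++i) {                          // pass 2: gather
        if (mask[i]) out[out_ix++] = in[i];
      }
      assert(out[0] == 10 && out[1] == 30 && out[2] == 50);
      return 0;
    }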
*/ +#include #include #include @@ -82,21 +83,26 @@ std::tuple max_out( CTYPE* max_data = max.mutable_data_ptr(); long* max_indices_data = max_indices.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < max.numel(); ++out_ix) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - max_data[out_ix] = std::get<0>(acc); - max_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, max, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v > acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + max_data[out_ix] = std::get<0>(acc); + max_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {max, max_indices}; @@ -124,7 +130,7 @@ max_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { const auto data_in = in.const_data_ptr(); auto data_out = out.mutable_data_ptr(); data_out[0] = lower_bound(); - for (auto i = 0; i < in.numel(); ++i) { + for (const auto i : c10::irange(in.numel())) { CTYPE_OUT val = static_cast(data_in[i]); if (std::isnan(val)) { data_out[0] = val; diff --git a/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp b/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp new file mode 100644 index 00000000000..5edce5a2c67 --- /dev/null +++ b/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; +using ScalarType = executorch::aten::ScalarType; +using IntArrayRef = executorch::aten::ArrayRef; + +namespace { + +bool check_max_pool2d_backward_args( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices, + const Tensor& grad_input) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_output, input)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(grad_input, input)); + + ET_CHECK_OR_RETURN_FALSE( + check_max_pool2d_with_indices_args( + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + grad_output, + indices), + "Invalid max_pool_2d arguments"); + + size_t output_ndim = 0; + // @lint-ignore CLANGTIDY facebook-hte-CArray + executorch::aten::SizesType output_sizes[kTensorDimensionLimit]; + get_max_pool2d_with_indices_out_target_size( + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + output_sizes, + &output_ndim); + + ET_LOG_AND_RETURN_IF_FALSE( + output_size_is_valid({output_sizes, output_ndim}, 2)); + + ET_CHECK_OR_RETURN_FALSE( + grad_output.dim() == input.dim(), + "grad_output should have same number of dimensions as input"); + + ET_LOG_AND_RETURN_IF_FALSE( + tensor_has_expected_size(grad_output, {output_sizes, output_ndim})); + + return true; +} + +template +void max_pool_backward_impl( + const Tensor& grad_input, + const Tensor& grad_output, + const Tensor& indices) { + const CTYPE* grad_output_data = grad_output.const_data_ptr(); + const int64_t* indices_data = indices.const_data_ptr(); + CTYPE* grad_input_data = grad_input.mutable_data_ptr(); + + // treat batch size and channels as one dimension + // + // MaxPool2d: + // ndim == 3: CHW + // ndim == 4: NCHW + // + // MaxPool3d: + // ndim == 4: CDHW + // ndim == 5: NCDHW + int64_t ndim = grad_output.dim(); + int64_t channels; + if (is_3d) { + channels = ndim == 4 ? grad_output.size(0) + : grad_output.size(0) * grad_output.size(1); + } else { + channels = ndim == 3 ? grad_output.size(0) + : grad_output.size(0) * grad_output.size(1); + } + int64_t input_depth = is_3d ? grad_input.size(-3) : 1; + + int64_t input_height = grad_input.size(ndim - 2); + int64_t input_width = grad_input.size(ndim - 1); + int64_t output_depth = is_3d ? 
grad_output.size(ndim - 3) : 1; + int64_t output_height = grad_output.size(ndim - 2); + int64_t output_width = grad_output.size(ndim - 1); + + for (int64_t c = 0; c < channels; ++c) { + CTYPE* grad_input_ptr = + grad_input_data + c * input_depth * input_height * input_width; + const CTYPE* grad_output_ptr = + grad_output_data + c * output_depth * output_height * output_width; + const int64_t* indices_ptr = + indices_data + c * output_depth * output_height * output_width; + + for (int64_t od = 0; od < output_depth; od++) { + for (int64_t oh = 0; oh < output_height; oh++) { + for (int64_t ow = 0; ow < output_width; ow++) { + // retrieve position of max + int64_t index = + od * output_height * output_width + oh * output_width + ow; + int64_t maxindex = indices_ptr[index]; + if (maxindex != -1) { + // update gradient + grad_input_ptr[maxindex] += grad_output_ptr[index]; + } + } + } + } + } +} + +} // namespace + +Tensor& max_pool2d_with_indices_backward_out( + KernelRuntimeContext& ctx, + const Tensor& grad_output, + const Tensor& input, + ET_UNUSED IntArrayRef kernel_size, + ET_UNUSED IntArrayRef stride, + ET_UNUSED IntArrayRef padding, + ET_UNUSED IntArrayRef dilation, + ET_UNUSED bool ceil_mode, + const Tensor& indices, + Tensor& grad_input) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_max_pool2d_backward_args( + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, + grad_input), + InvalidArgument, + grad_input); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(grad_input, input.sizes()) == Error::Ok, + InvalidArgument, + grad_input); + + constexpr auto name = "max_pool2d_with_indices_backward.grad_input"; + + ET_SWITCH_FLOATHBF16_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { + max_pool_backward_impl(grad_input, grad_output, indices); + }); + + return grad_input; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index c0316e685d6..423c2564232 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
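The backward pass above is a scatter-add: every grad_output element flows to the input slot the forward pass recorded in indices, and += handles windows that share a max, while the -1 check skips positions with no recorded source. A 1-D sketch:

    #include <cassert>

    int main() {
      // 1-D pooling of 4 inputs -> 2 outputs; forward picked inputs 1 and 2.
      const double grad_output[2] = {1.0, 10.0};
      const long indices[2] = {1, 2};
      double grad_input[4] = {0, 0, 0, 0};
      for (int o = 0; o < 2; ++o) {
        if (indices[o] != -1) {
          grad_input[indices[o]] += grad_output[o];  // overlaps accumulate
        }
      }
      assert(grad_input[0] == 0 && grad_input[1] == 1.0 && grad_input[2] == 10.0);
      return 0;
    }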
*/ +#include <c10/util/irange.h> #include #include @@ -44,24 +45,28 @@ Tensor& mean_dim_out( InvalidArgument, out); - ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES( - out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>(); - const size_t num = get_reduced_dim_product(in, dim_list); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - CTYPE_OUT sum = 0; - if (in.numel() > 0) { - sum = map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>( - [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); + MapReduceOverDimListPlan plan(in, dim_list); + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "mean.out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>(); + const size_t num = get_reduced_dim_product(in, dim_list); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = plan.execute<CTYPE_IN, CTYPE_OUT>( + [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + } + out_data[out_ix] = sum / static_cast<CTYPE_OUT>(num); } - out_data[out_ix] = sum / static_cast<CTYPE_OUT>(num); - } - }); + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); }); return out; diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index ca8a9135ccd..8b70bcd40f5 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree.
*/ +#include #include #include @@ -82,21 +83,26 @@ std::tuple min_out( CTYPE* min_data = min.mutable_data_ptr(); long* min_indices_data = min_indices.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < min.numel(); ++out_ix) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - min_data[out_ix] = std::get<0>(acc); - min_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, min, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v < acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + min_data[out_ix] = std::get<0>(acc); + min_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {min, min_indices}; @@ -124,7 +130,7 @@ min_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { const auto data_in = in.const_data_ptr(); auto data_out = out.mutable_data_ptr(); data_out[0] = upper_bound(); - for (auto i = 0; i < in.numel(); ++i) { + for (const auto i : c10::irange(in.numel())) { CTYPE_OUT val = static_cast(data_in[i]); if (std::isnan(val)) { data_out[0] = val; diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp index d4937532161..c373dfe26bd 100644 --- a/kernels/portable/cpu/op_native_group_norm.cpp +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -51,7 +52,7 @@ void group_norm( CTYPE* rstd_data = rstd.mutable_data_ptr(); if (inner_size == 0) { - for (int i = 0; i < leading; ++i) { + for (const auto i : c10::irange(leading)) { mean_data[i] = static_cast(0); rstd_data[i] = static_cast(NAN); } @@ -72,7 +73,7 @@ void group_norm( bias_data = nullptr; } - for (int i = 0; i < leading; ++i) { + for (const auto i : c10::irange(leading)) { const CTYPE* x = input_data + i * inner_size; // compute E[X] and Var[x] = E[x^2] - E[x]^2 @@ -86,12 +87,12 @@ void group_norm( // Calculate the elements of output if (weight_data == nullptr && bias_data == nullptr) { CTYPE* y = out_data + i * inner_size; - for (size_t j = 0; j < inner_size; j++) { + for (const auto j : c10::irange(inner_size)) { y[j] = (x[j] - mean_value) * rstd_value; } } else { const size_t g = i % G; - for (size_t j = 0; j < D; j++) { + for (const auto j : c10::irange(D)) { const size_t ch = g * D + j; const CTYPE scale = rstd_value * (weight_data == nullptr ? 1.0 : weight_data[ch]); @@ -99,7 +100,7 @@ void group_norm( -scale * mean_value + (bias_data == nullptr ? 
0.0 : bias_data[ch]); x = input_data + (i * D + j) * HxW; CTYPE* y = out_data + (i * D + j) * HxW; - for (size_t k = 0; k < HxW; k++) { + for (const auto k : c10::irange(HxW)) { y[k] = scale * x[k] + beta; } } diff --git a/kernels/portable/cpu/op_native_layer_norm.cpp b/kernels/portable/cpu/op_native_layer_norm.cpp index 2e70e5d2ba9..66c80b7cccc 100644 --- a/kernels/portable/cpu/op_native_layer_norm.cpp +++ b/kernels/portable/cpu/op_native_layer_norm.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -45,7 +46,7 @@ void layer_norm( CTYPE* rstd_data = rstd.mutable_data_ptr(); if (normalized == 0) { - for (int i = 0; i < leading; ++i) { + for (const auto i : c10::irange(leading)) { mean_data[i] = static_cast(0); rstd_data[i] = static_cast(NAN); } @@ -67,7 +68,7 @@ void layer_norm( } const CTYPE ct_normalized = static_cast(normalized); - for (int i = 0; i < leading; ++i) { + for (const auto i : c10::irange(leading)) { const CTYPE* x = input_data + i * normalized; CTYPE* y = out_data + i * normalized; @@ -79,7 +80,7 @@ void layer_norm( CTYPE std = std::sqrt(variance + eps); // Calculate the elements of output - for (int j = 0; j < normalized; ++j) { + for (const auto j : c10::irange(normalized)) { CTYPE w = weight_data ? weight_data[j] : static_cast(1); CTYPE b = bias_data ? bias_data[j] : static_cast(0); y[j] = (x[j] - mean_value) / std * w + b; diff --git a/kernels/portable/cpu/op_nonzero.cpp b/kernels/portable/cpu/op_nonzero.cpp index 20e10be4b65..5a319b95749 100644 --- a/kernels/portable/cpu/op_nonzero.cpp +++ b/kernels/portable/cpu/op_nonzero.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -26,7 +27,7 @@ namespace { void increment_index(size_t* index, const ArrayRef sizes) { for (ssize_t i = sizes.size() - 1; i >= 0; --i) { index[i]++; - if (index[i] == sizes[i]) { + if (static_cast(index[i]) == sizes[i]) { index[i] = 0; } else { return; @@ -45,7 +46,7 @@ void nonzero(KernelRuntimeContext& ctx, const Tensor& input, Tensor& output) { int32_t num_nonzero = 0; // Count number of non zeros - for (size_t i = 0; i < lim; ++i) { + for (const auto i : c10::irange(lim)) { if (in_data[i] != 0) { num_nonzero++; } @@ -68,9 +69,9 @@ void nonzero(KernelRuntimeContext& ctx, const Tensor& input, Tensor& output) { size_t out_idx = 0; // Loop again and this time write the proper indices into out - for (size_t i = 0; i < lim; i++) { + for (const auto i : c10::irange(lim)) { if (in_data[i] != 0) { - for (size_t j = 0; j < input.dim(); j++) { + for (const auto j : c10::irange(input.dim())) { out_data[out_idx++] = index[j]; } } diff --git a/kernels/portable/cpu/op_ones.cpp b/kernels/portable/cpu/op_ones.cpp index 9135966e9d8..68826ab1a1f 100644 --- a/kernels/portable/cpu/op_ones.cpp +++ b/kernels/portable/cpu/op_ones.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
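The layer_norm kernel above computes a per-row mean and (biased) variance, then y = (x - mean) / sqrt(var + eps) * w + b. The same arithmetic for one row, with w = 1 and b = 0:

    #include <cassert>
    #include <cmath>

    int main() {
      const double x[4] = {1.0, 2.0, 3.0, 4.0};
      const double eps = 1e-5;
      double mean = 0, var = 0;
      for (double v : x) mean += v;
      mean /= 4;                                  // E[x] = 2.5
      for (double v : x) var += (v - mean) * (v - mean);
      var /= 4;                                   // biased Var[x] = 1.25
      const double stddev = std::sqrt(var + eps);
      double y[4];
      for (int j = 0; j < 4; ++j) {
        y[j] = (x[j] - mean) / stddev;            // w = 1, b = 0
      }
      assert(std::abs(y[0] + y[3]) < 1e-12);      // normalized row is symmetric
      assert(std::abs(y[1] + y[2]) < 1e-12);
      return 0;
    }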
*/ +#include #include @@ -22,7 +23,7 @@ Tensor& ones_out(KernelRuntimeContext& ctx, IntArrayRef size, Tensor& out) { ScalarType out_type = out.scalar_type(); ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, __func__, CTYPE, [&] { auto out_data = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); i++) { + for (const auto i : c10::irange(out.numel())) { out_data[i] = static_cast(1); } }); diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp index 237b31ee988..719f8fcb445 100644 --- a/kernels/portable/cpu/op_permute_copy.cpp +++ b/kernels/portable/cpu/op_permute_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -26,7 +27,7 @@ void increment_coordinate_permuted( for (int i = dims.size() - 1; i >= 0; i--) { size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim(); coordinate[d]++; - if (coordinate[d] == tensor.size(d)) { + if (static_cast(coordinate[d]) == tensor.size(d)) { coordinate[d] = 0; } else { return; @@ -70,7 +71,7 @@ Tensor& permute_copy_out( const CTYPE* const in_data = in.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); ++i) { + for (const auto i : c10::irange(out.numel())) { out_data[i] = in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo( in, in_coord, trailing_dims_memo)]; diff --git a/kernels/portable/cpu/op_pixel_shuffle.cpp b/kernels/portable/cpu/op_pixel_shuffle.cpp index a3bb417d9d5..fd9f1739e57 100644 --- a/kernels/portable/cpu/op_pixel_shuffle.cpp +++ b/kernels/portable/cpu/op_pixel_shuffle.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -38,12 +39,12 @@ void pixel_shuffle_impl(const Tensor& in, int64_t upscale_factor, Tensor& out) { // input tensor shape of [n, c, s1, s2, h, w] // output tensor shape of [n, c, h, s1, w, s2] size_t i = 0; - for (size_t n = 0; n < leading_dims; n++) { - for (size_t c = 0; c < sub_channels; c++) { - for (size_t h = 0; h < height; h++) { - for (size_t s1 = 0; s1 < S; s1++) { - for (size_t w = 0; w < width; w++) { - for (size_t s2 = 0; s2 < S; s2++) { + for (const auto n : c10::irange(leading_dims)) { + for (const auto c : c10::irange(sub_channels)) { + for (const auto h : c10::irange(height)) { + for (const auto s1 : c10::irange(S)) { + for (const auto w : c10::irange(width)) { + for (const auto s2 : c10::irange(S)) { size_t input_offset = n * stride_n + c * stride_c + s1 * stride_s1 + s2 * stride_s2 + h * stride_h + w; std::memcpy( diff --git a/kernels/portable/cpu/op_pixel_unshuffle.cpp b/kernels/portable/cpu/op_pixel_unshuffle.cpp index f0bd5e4d10f..68d7bbbc27a 100644 --- a/kernels/portable/cpu/op_pixel_unshuffle.cpp +++ b/kernels/portable/cpu/op_pixel_unshuffle.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
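pixel_shuffle above reinterprets the channel index as (c, s1, s2) and writes each value to spatial position (h*r + s1, w*r + s2), as the [n, c, s1, s2, h, w] -> [n, c, h, s1, w, s2] shape comments describe. The smallest possible case, C = 4 -> 1 with r = 2 and H = W = 1:

    #include <cassert>

    int main() {
      const int r = 2;
      const int in[4] = {1, 2, 3, 4};  // channels c = 0..3 at the single pixel
      int out[2][2];                   // one channel, 2x2 spatial block
      for (int s1 = 0; s1 < r; ++s1) {
        for (int s2 = 0; s2 < r; ++s2) {
          out[s1][s2] = in[s1 * r + s2];  // h = w = 0, so position is (s1, s2)
        }
      }
      assert(out[0][0] == 1 && out[0][1] == 2 && out[1][0] == 3 && out[1][1] == 4);
      return 0;
    }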
*/ +#include #include #include @@ -41,12 +42,12 @@ void pixel_unshuffle_impl( // input tensor shape of [n, c, h, s1, w, s2] // output tensor shape of [n, c, s1, s2, h, w] size_t i = 0; - for (size_t n = 0; n < leading_dims; n++) { - for (size_t c = 0; c < sub_channels; c++) { - for (size_t h = 0; h < height; h++) { - for (size_t s1 = 0; s1 < S; s1++) { - for (size_t w = 0; w < width; w++) { - for (size_t s2 = 0; s2 < S; s2++) { + for (const auto n : c10::irange(leading_dims)) { + for (const auto c : c10::irange(sub_channels)) { + for (const auto h : c10::irange(height)) { + for (const auto s1 : c10::irange(S)) { + for (const auto w : c10::irange(width)) { + for (const auto s2 : c10::irange(S)) { size_t output_offset = n * stride_n + c * stride_c + s1 * stride_s1 + s2 * stride_s2 + h * stride_h + w; std::memcpy( diff --git a/kernels/portable/cpu/op_prod.cpp b/kernels/portable/cpu/op_prod.cpp index 61bda38f68f..54580459d7c 100644 --- a/kernels/portable/cpu/op_prod.cpp +++ b/kernels/portable/cpu/op_prod.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -38,7 +39,7 @@ Tensor& prod_out( const auto data_in = in.const_data_ptr(); auto data_out = out.mutable_data_ptr(); data_out[0] = static_cast(1); - for (auto i = 0; i < in.numel(); ++i) { + for (const auto i : c10::irange(in.numel())) { data_out[0] *= static_cast(data_in[i]); } }); @@ -76,22 +77,26 @@ Tensor& prod_int_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - CTYPE_OUT prod = 1; - if (in.numel() > 0) { - std::tuple acc = - map_reduce_over_dim( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { - return std::tuple{acc * outv, 0}; - }, - in, - dim, - out_ix); - prod = std::get<0>(acc); - } - out_data[out_ix] = prod; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT prod = 1; + if (in.numel() > 0) { + std::tuple acc = + map_reduce_over_dim( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { + return std::tuple{acc * outv, 0}; + }, + in, + dim, + out_ix); + prod = std::get<0>(acc); + } + out_data[out_ix] = prod; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); diff --git a/kernels/portable/cpu/op_repeat.cpp b/kernels/portable/cpu/op_repeat.cpp index 8b64eefde31..1d42cc90189 100644 --- a/kernels/portable/cpu/op_repeat.cpp +++ b/kernels/portable/cpu/op_repeat.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -23,17 +24,17 @@ bool calculate_output_size( Tensor::SizesType* out_sizes_ptr) { ET_LOG_AND_RETURN_IF_FALSE(repeats.size() < kTensorDimensionLimit); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( repeats.size() >= self_sizes.size(), "Repeats vector size is %zu must be >= self_sizes %zu.", repeats.size(), self_sizes.size()); - int32_t i = 0; + size_t i = 0; for (; i < (repeats.size() - self_sizes.size()); ++i) { out_sizes_ptr[i] = static_cast(repeats[i]); } - int32_t j = 0; + size_t j = 0; for (; i < repeats.size(); ++i) { out_sizes_ptr[i] = static_cast(repeats[i]) * self_sizes[j]; diff --git a/kernels/portable/cpu/op_repeat_interleave.cpp b/kernels/portable/cpu/op_repeat_interleave.cpp index c8a84e8c748..4ee77695f86 100644 --- a/kernels/portable/cpu/op_repeat_interleave.cpp +++ b/kernels/portable/cpu/op_repeat_interleave.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include @@ -18,26 +19,26 @@ bool check_repeat_interleave_args( int64_t output_size_value, int64_t repeats_sum, Tensor& out) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( repeats.scalar_type() == ScalarType::Int || repeats.scalar_type() == ScalarType::Long, "repeats must be int or long"); - ET_LOG_MSG_AND_RETURN_IF_FALSE(repeats.dim() == 1, "repeats must be 1D"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE(repeats.dim() == 1, "repeats must be 1D"); + ET_CHECK_OR_RETURN_FALSE( output_size_value == repeats_sum, "output_size, if provided, must be equal to repeats.sum()"); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(repeats, out)); if (repeats.scalar_type() == ScalarType::Long) { const int64_t* const repeats_data = repeats.const_data_ptr(); - for (size_t i = 0; i < repeats.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(repeats.numel())) { + ET_CHECK_OR_RETURN_FALSE( repeats_data[i] >= 0, "repeats cannot be negative"); } } else { const int32_t* const repeats_data = repeats.const_data_ptr(); - for (size_t i = 0; i < repeats.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(repeats.numel())) { + ET_CHECK_OR_RETURN_FALSE( repeats_data[i] >= 0, "repeats cannot be negative"); } } @@ -62,7 +63,7 @@ Tensor& repeat_interleave_Tensor_out( ET_SWITCH_TWO_TYPES(Int, Long, repeats.scalar_type(), ctx, name, CTYPE, [&] { const CTYPE* repeats_data = repeats.const_data_ptr(); - for (size_t ix = 0; ix < repeats.numel(); ++ix) { + for (const auto ix : c10::irange(repeats.numel())) { repeats_sum += static_cast(repeats_data[ix]); } }); @@ -96,7 +97,7 @@ Tensor& repeat_interleave_Tensor_out( const CTYPE* repeats_data = repeats.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); size_t out_ix = 0; - for (size_t ix = 0; ix < repeats.numel(); ix++) { + for (const auto ix : c10::irange(repeats.numel())) { for (CTYPE i = 0; i < repeats_data[ix]; i++, out_ix++) { out_data[out_ix] = static_cast(ix); } diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp index ee735758c52..109be64fbed 100644 --- a/kernels/portable/cpu/op_roll.cpp +++ b/kernels/portable/cpu/op_roll.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
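repeat_interleave_Tensor_out above emits index ix exactly repeats[ix] times, which is also why the checked output_size must equal repeats.sum():

    #include <cassert>

    int main() {
      const long repeats[3] = {2, 0, 3};
      long out[5];
      int out_ix = 0;
      for (long ix = 0; ix < 3; ++ix) {
        for (long i = 0; i < repeats[ix]; ++i) {
          out[out_ix++] = ix;  // repeats = [2, 0, 3] -> [0, 0, 2, 2, 2]
        }
      }
      assert(out_ix == 5);  // == repeats.sum(), the required output size
      assert(out[0] == 0 && out[1] == 0 && out[2] == 2 && out[4] == 2);
      return 0;
    }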
*/ +#include #include #include @@ -36,7 +37,7 @@ size_t unshift_flat_ix(size_t ix, const Tensor& in, IntArrayRef dim_shifts) { indexToCoordinate(in, ix, ix_coord); size_t shifted_coord[kTensorDimensionLimit]; - for (size_t d = 0; d < in.dim(); d++) { + for (const auto d : c10::irange(in.dim())) { shifted_coord[d] = (ix_coord[d] + in.size(d) - dim_shifts[d] % in.size(d)) % in.size(d); } @@ -68,10 +69,10 @@ Tensor& roll_out( } int64_t dim_shift_array[kTensorDimensionLimit]; - for (size_t i = 0; i < in.dim(); i++) { + for (const auto i : c10::irange(in.dim())) { dim_shift_array[i] = 0; } - for (size_t i = 0; i < dims.size(); i++) { + for (const auto i : c10::irange(dims.size())) { const auto d = dims[i] < 0 ? dims[i] + in.dim() : dims[i]; dim_shift_array[d] += shifts[i]; } @@ -85,7 +86,7 @@ Tensor& roll_out( const CTYPE* in_data = in.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); - for (size_t ix = 0; ix < out.numel(); ++ix) { + for (const auto ix : c10::irange(out.numel())) { out_data[ix] = in_data[unshift_flat_ix(ix, in, dim_shifts)]; } }); diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp index af4ca8a8390..f8f4b21264e 100644 --- a/kernels/portable/cpu/op_scatter.cpp +++ b/kernels/portable/cpu/op_scatter.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -41,7 +42,7 @@ void scatter_src_helper( dim += nonzero_dim(in); } - for (size_t ix = 0; ix < index.numel(); ++ix) { + for (const auto ix : c10::irange(index.numel())) { // @lint-ignore CLANGTIDY facebook-hte-CArray size_t ix_coord[kTensorDimensionLimit]; indexToCoordinate(index, ix, ix_coord); @@ -50,7 +51,7 @@ void scatter_src_helper( // @lint-ignore CLANGTIDY facebook-hte-CArray size_t out_coord[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim(); ++i) { + for (const auto i : c10::irange(out.dim())) { if (i == dim) { out_coord[i] = index_data[ix]; } else { @@ -80,14 +81,14 @@ void scatter_value_helper( dim += nonzero_dim(in); } - for (size_t ix = 0; ix < index.numel(); ++ix) { + for (const auto ix : c10::irange(index.numel())) { // @lint-ignore CLANGTIDY facebook-hte-CArray size_t ix_coord[kTensorDimensionLimit]; indexToCoordinate(index, ix, ix_coord); // @lint-ignore CLANGTIDY facebook-hte-CArray size_t out_coord[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim(); ++i) { + for (const auto i : c10::irange(out.dim())) { if (i == dim) { out_coord[i] = index_data[ix]; } else { diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp index 1b53777e731..b83a56c2e01 100644 --- a/kernels/portable/cpu/op_scatter_add.cpp +++ b/kernels/portable/cpu/op_scatter_add.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
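roll's unshift_flat_ix, shown earlier in this hunk, reads out[i] from in[(i + n - shift % n) % n]; adding n before the final modulo keeps the left operand non-negative, so the math stays correct for negative shifts:

    #include <cassert>

    int src_index(int i, int n, int shift) {
      return (i + n - shift % n) % n;
    }

    int main() {
      const int in[5] = {1, 2, 3, 4, 5};
      int out[5];
      for (int i = 0; i < 5; ++i) out[i] = in[src_index(i, 5, 2)];
      // shift = 2 rotates right: [4, 5, 1, 2, 3]
      assert(out[0] == 4 && out[2] == 1 && out[4] == 3);
      for (int i = 0; i < 5; ++i) out[i] = in[src_index(i, 5, -1)];
      // shift = -1 rotates left: [2, 3, 4, 5, 1]
      assert(out[0] == 2 && out[4] == 1);
      return 0;
    }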
*/ +#include #include #include #include @@ -28,14 +29,14 @@ void scatter_add_helper( const Tensor& index, Tensor& out, int64_t dim) { - for (size_t ix = 0; ix < index.numel(); ++ix) { + for (const auto ix : c10::irange(index.numel())) { size_t ix_coord[kTensorDimensionLimit]; indexToCoordinate(index, ix, ix_coord); size_t src_ix = coordinateToIndex(src, ix_coord); size_t out_coord[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim(); ++i) { + for (const auto i : c10::irange(out.dim())) { if (i == dim) { out_coord[i] = index_data[ix]; } else { diff --git a/kernels/portable/cpu/op_select_scatter.cpp b/kernels/portable/cpu/op_select_scatter.cpp index e4622d8fda2..18c39c005d5 100644 --- a/kernels/portable/cpu/op_select_scatter.cpp +++ b/kernels/portable/cpu/op_select_scatter.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -79,8 +80,8 @@ Tensor& select_scatter_out( CTYPE* const out_data = out.mutable_data_ptr(); const CTYPE_SRC* const src_data = src.const_data_ptr(); - for (size_t i = 0; i < leading_dims; ++i) { - for (size_t j = 0; j < trailing_stride; ++j) { + for (const auto i : c10::irange(leading_dims)) { + for (const auto j : c10::irange(trailing_stride)) { out_data[start_offset + i * out_step + j] = convert(src_data[i * trailing_stride + j]); } diff --git a/kernels/portable/cpu/op_slice_scatter.cpp b/kernels/portable/cpu/op_slice_scatter.cpp index c2fe2d70581..5a9138a0359 100644 --- a/kernels/portable/cpu/op_slice_scatter.cpp +++ b/kernels/portable/cpu/op_slice_scatter.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -85,10 +86,10 @@ Tensor& slice_scatter_out( size_t src_offset = 0; - for (int i = 0; i < leading_dims; i++) { + for (const auto i : c10::irange(leading_dims)) { size_t out_offset = (i * dim_length + start) * trailing_dims; - for (int j = 0; j < num_values; j++) { - for (size_t k = 0; k < trailing_dims; ++k) { + for ([[maybe_unused]] const auto j : c10::irange(num_values)) { + for (const auto k : c10::irange(trailing_dims)) { out_data[out_offset + k] = convert(src_data[src_offset + k]); } diff --git a/kernels/portable/cpu/op_split_with_sizes_copy.cpp b/kernels/portable/cpu/op_split_with_sizes_copy.cpp index daa0845c6f3..c99a7fb6815 100644 --- a/kernels/portable/cpu/op_split_with_sizes_copy.cpp +++ b/kernels/portable/cpu/op_split_with_sizes_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
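// scatter_src_helper/scatter_add_helper above walk every element of `index`,
// delinearize its flat position, replace the coordinate along `dim` with
// index_data[ix], and write (or accumulate) the matching src element. The
// 1-D case collapses to this sketch (hypothetical free function):
#include <cstdint>
#include <vector>

void scatter_add_1d(std::vector<float>& out,
                    const std::vector<int64_t>& index,
                    const std::vector<float>& src) {
  for (size_t ix = 0; ix < index.size(); ++ix) {
    // scatter would assign here; scatter_add accumulates instead.
    out[static_cast<size_t>(index[ix])] += src[ix];
  }
}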
*/ +#include #include #include @@ -38,7 +39,8 @@ void split_with_sizes_copy_out( check_split_with_sizes_copy_args(in, split_sizes, dim, out), InvalidArgument, ); - for (size_t i = 0; i < out.size(); ++i) { + // All output tensors must have the same dim order as the input + for (const auto i : c10::irange(out.size())) { ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out[i]), InvalidArgument, ); } @@ -52,11 +54,12 @@ void split_with_sizes_copy_out( // Check that all chunks broadcast to their respective out tensor Tensor::SizesType target_out_sizes[kTensorDimensionLimit]; size_t target_out_ndim = in.dim(); - for (size_t d = 0; d < in.dim(); ++d) { + + for (const auto d : c10::irange(in.dim())) { target_out_sizes[d] = static_cast(in.size(d)); } - for (size_t i = 0; i < split_sizes.size(); i++) { + for (const auto i : c10::irange(split_sizes.size())) { target_out_sizes[dim] = static_cast(split_sizes[i]); ET_KERNEL_CHECK( ctx, @@ -76,7 +79,7 @@ void split_with_sizes_copy_out( const CTYPE_IN* in_data = in.const_data_ptr(); // Iterate through list of out tensors - for (size_t i = 0; i < out.size(); ++i) { + for (const auto i : c10::irange(out.size())) { const Tensor& out_tensor = out[i]; // If out tensor is empty, no action is required @@ -99,8 +102,8 @@ void split_with_sizes_copy_out( // Simpler logic if there's no broadcasting if (!is_broadcasted) { const CTYPE_IN* src = in_data; - for (size_t j = 0; j < leading_dims; ++j) { - for (size_t k = 0; k < chunk_step; ++k) { + for ([[maybe_unused]] const auto j : c10::irange(leading_dims)) { + for (const auto k : c10::irange(chunk_step)) { out_data[k] = convert(src[k]); } src += step; @@ -119,7 +122,7 @@ void split_with_sizes_copy_out( // For each element in the out tensor, find its corresponding index // in the input tensor and copy it over - for (size_t ix = 0; ix < out_tensor.numel(); ++ix) { + for (const auto ix : c10::irange(out_tensor.numel())) { size_t out_coord[kTensorDimensionLimit]; delinearize_index(ix, out_tensor, out_coord, kTensorDimensionLimit); diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp index 0fec3e37f2a..550f6b9572f 100644 --- a/kernels/portable/cpu/op_sum.cpp +++ b/kernels/portable/cpu/op_sum.cpp @@ -5,11 +5,14 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
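// split_with_sizes_copy slices the input along `dim` into chunks whose
// lengths are split_sizes (the checks above require them to be non-negative
// and to sum to the dim size). Its dim-0, 1-D essence:
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> split_with_sizes(
    const std::vector<float>& in,
    const std::vector<int64_t>& split_sizes) {
  std::vector<std::vector<float>> out;
  size_t offset = 0;
  for (const int64_t s : split_sizes) {
    out.emplace_back(in.begin() + offset, in.begin() + offset + s);
    offset += static_cast<size_t>(s);
  }
  return out; // split_with_sizes({1,2,3,4,5}, {2,3}) == {{1,2}, {3,4,5}}
}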
*/ +#include #include #include #include +#include + namespace torch { namespace executor { namespace native { @@ -43,25 +46,31 @@ Tensor& sum_dim_out( ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); - ET_SWITCH_REALHBBF16_TYPES( - in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] { - ET_SWITCH_REALHBBF16_TYPES( - out.scalar_type(), ctx, "sum.IntList_out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - CTYPE_OUT sum = 0; - if (in.numel() > 0) { - sum = map_reduce_over_dim_list( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); - } - out_data[out_ix] = sum; + std::optional plan; + if (in.numel() > 0) { + plan.emplace(in, dim_list); + } + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (plan.has_value()) { + sum = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); } - }); - }); + out_data[out_ix] = sum; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); + }); return out; } diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index 987e974bbf5..c56545b9235 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
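// The op_sum rewrite hoists reduction bookkeeping into a plan built once
// (the std::optional's template argument was lost in extraction; judging by
// the matching op_var hunk further down it is presumably
// MapReduceOverDimListPlan) and hands [begin, end) chunks of output indices
// to a parallel_for-style helper. A serial sketch of that shape, with a
// stand-in for parallel_for_each_reduce_over_dim_list_output_index:
#include <cstddef>
#include <functional>
#include <vector>

// Stand-in: run the body once over the full output range; report success.
bool for_each_output_chunk(size_t numel,
                           const std::function<void(size_t, size_t)>& body) {
  body(0, numel);
  return true;
}

std::vector<float> sum_rows(const std::vector<float>& in,
                            size_t rows, size_t cols) {
  std::vector<float> out(rows);
  const bool success =
      for_each_output_chunk(rows, [&](size_t begin, size_t end) {
        for (size_t r = begin; r < end; ++r) {
          float sum = 0; // the kernel's plan->execute(map, reduce, out_ix)
          for (size_t c = 0; c < cols; ++c) {
            sum += in[r * cols + c];
          }
          out[r] = sum;
        }
      });
  (void)success; // the kernel routes failure through ET_KERNEL_CHECK_MSG
  return out;
}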
*/ +#include #include #include @@ -28,7 +29,7 @@ bool check_topk_args( if (dim < 0) { dim += nonzero_dim(in); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( k >= 0 && k <= nonempty_size(in, dim), "selected index k out of range"); return true; } @@ -40,8 +41,8 @@ bool get_topk_target_size( Tensor::SizesType* target_size, size_t* target_dim) { *target_dim = in.dim(); - for (size_t i = 0; i < *target_dim; ++i) { - if (i == dim) { + for (const auto i : c10::irange(*target_dim)) { + if (static_cast(i) == dim) { target_size[i] = k; } else { target_size[i] = in.size(i); @@ -90,19 +91,19 @@ void perform_topk( const size_t outer_stride_in = dim_size * dim_stride; const size_t outer_stride_out = k * dim_stride; - bool use_partial_sort = k * 64 <= dim_size; + bool use_partial_sort = k * 64 <= static_cast(dim_size); // Loop through all outer dimensions - for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + for (const auto outer_idx : c10::irange(outer_size)) { size_t outer_in = outer_idx * outer_stride_in; size_t outer_out = outer_idx * outer_stride_out; // Loop through all inner dimensions - for (size_t inner_idx = 0; inner_idx < dim_stride; ++inner_idx) { + for (const auto inner_idx : c10::irange(dim_stride)) { size_t base_in = outer_in + inner_idx; size_t base_out = outer_out + inner_idx; // Populate the queue with the values from the input tensor - for (size_t i = 0; i < dim_size; ++i) { + for (const auto i : c10::irange(dim_size)) { size_t in_ix = base_in + i * dim_stride; queue[i].first = in_data[in_ix]; queue[i].second = i; @@ -126,7 +127,7 @@ void perform_topk( } // Write the topk values and indices to the output tensors - for (size_t i = 0; i < k; ++i) { + for (const auto i : c10::irange(k)) { size_t out_ix = base_out + i * dim_stride; values_data[out_ix] = queue[i].first; diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp index 9e28cff825f..b21c9918a99 100644 --- a/kernels/portable/cpu/op_tril.cpp +++ b/kernels/portable/cpu/op_tril.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -44,8 +45,8 @@ void apply_tril( int64_t num_cols, int64_t row_stride, int64_t col_stride) { - for (int64_t i = 0; i < num_rows; i++) { - for (int64_t j = 0; j < std::min(num_cols, i + diagonal + 1); j++) { + for (const auto i : c10::irange(num_rows)) { + for (const auto j : c10::irange(std::min(num_cols, i + diagonal + 1))) { out[i * row_stride + j * col_stride] = self[i * row_stride + j * col_stride]; } @@ -63,21 +64,21 @@ void tril_kernel( const Tensor& out) { // Dynamically compute `self` sizes and strides. 
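// perform_topk flips between sorting strategies with
// use_partial_sort = k * 64 <= dim_size: when k is far smaller than the
// dimension a partial sort wins, otherwise a full sort has better constants.
// The visible hunk only shows the heuristic, so the two branches below are a
// hedged reconstruction using the standard library:
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

std::vector<float> topk_values(std::vector<float> v, size_t k) {
  // assumes k <= v.size(), which check_topk_args enforces in the kernel
  const bool use_partial_sort = k * 64 <= v.size();
  if (use_partial_sort) {
    std::partial_sort(
        v.begin(), v.begin() + k, v.end(), std::greater<float>());
  } else {
    std::sort(v.begin(), v.end(), std::greater<float>());
  }
  v.resize(k);
  return v; // topk_values({1, 9, 5, 7}, 2) == {9, 7}
}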
- int64_t ndim = self.dim(); + size_t ndim = static_cast(self.dim()); ET_KERNEL_CHECK_MSG( ctx, ndim < kTensorDimensionLimit, InvalidArgument, , - "ndim %" PRId64 " >= %zu", + "ndim %zu >= %zu", ndim, kTensorDimensionLimit); int64_t sizes[kTensorDimensionLimit]; int64_t strides[kTensorDimensionLimit]; - for (size_t i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { sizes[i] = self.size(i); strides[i] = getTrailingDims(self, static_cast(i)); } @@ -102,7 +103,7 @@ void tril_kernel( int64_t row_stride = strides_ref[ndim - 2]; int64_t col_stride = strides_ref[ndim - 1]; - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride]; CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride]; diff --git a/kernels/portable/cpu/op_unbind_copy.cpp b/kernels/portable/cpu/op_unbind_copy.cpp index b8ab1e489f2..bcf65c673b3 100644 --- a/kernels/portable/cpu/op_unbind_copy.cpp +++ b/kernels/portable/cpu/op_unbind_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -36,7 +37,7 @@ void unbind_copy_int_out( ET_KERNEL_CHECK( ctx, check_unbind_copy_args(input, dim, out), InvalidArgument, ); - for (int i = 0; i < out.size(); ++i) { + for (const auto i : c10::irange(out.size())) { ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(input, out[i]), InvalidArgument, ); } @@ -64,8 +65,9 @@ void unbind_copy_int_out( size_t input_offset = i * trailing_dims; CTYPE_OUT* const dest = out[i].mutable_data_ptr(); size_t dest_offset = 0; - for (size_t j = 0; j < leading_dims; ++j) { - for (size_t k = 0; k < trailing_dims; ++k) { + for ([[maybe_unused]] const auto j : + c10::irange(leading_dims)) { + for (const auto k : c10::irange(trailing_dims)) { dest[dest_offset + k] = convert( input_data[input_offset + k]); } diff --git a/kernels/portable/cpu/op_unfold_copy.cpp b/kernels/portable/cpu/op_unfold_copy.cpp new file mode 100644 index 00000000000..c6f725cf6bc --- /dev/null +++ b/kernels/portable/cpu/op_unfold_copy.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; + +// unfold_copy(Tensor self, int dimension, int size, int step, *, Tensor(a!) +// out) -> Tensor(a!) 
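// apply_tril above keeps element (i, j) only while
// j < min(num_cols, i + diagonal + 1), i.e. the lower triangle shifted by
// `diagonal`. A contiguous row-major sketch (strides fixed at num_cols and
// 1; the kernel accepts arbitrary row/col strides):
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> tril(const std::vector<float>& self,
                        int64_t num_rows, int64_t num_cols, int64_t diagonal) {
  std::vector<float> out(self.size(), 0.0f); // untouched entries stay zero
  for (int64_t i = 0; i < num_rows; ++i) {
    for (int64_t j = 0; j < std::min(num_cols, i + diagonal + 1); ++j) {
      out[i * num_cols + j] = self[i * num_cols + j];
    }
  }
  return out;
}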
+Tensor& unfold_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + int64_t dim, + int64_t size, + int64_t step, + Tensor& out) { + (void)ctx; + // Check if dimension is valid + ET_KERNEL_CHECK( + ctx, check_unfold_copy_args(self, dim, size, step), InvalidArgument, out); + if (dim < 0) { + dim += nonzero_dim(self); + } + // Calculate output size + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType expected_output_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + + get_unfold_copy_out_target_size( + self, dim, size, step, expected_output_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_output_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + // Copy data + const size_t leading_dims = getLeadingDims(self, dim); + const size_t trailing_dims = getTrailingDims(self, dim); + ScalarType in_type = self.scalar_type(); + ScalarType out_type = out.scalar_type(); + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "unfold_copy.out", CTYPE_IN, [&]() { + const CTYPE_IN* input_ptr = self.const_data_ptr(); + ET_SWITCH_REALHBBF16_TYPES( + out_type, ctx, "unfold_copy.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (const auto i : c10::irange(leading_dims)) { + const CTYPE_IN* src = + input_ptr + i * self.size(dim) * trailing_dims; + for (const auto j : c10::irange(out.size(dim))) { + const CTYPE_IN* dim_src = src + j * step * trailing_dims; + for (const auto k : c10::irange(trailing_dims)) { + for (const auto l : c10::irange(size)) { + *out_ptr = convert( + dim_src[k + l * trailing_dims]); + out_ptr++; + } + } + } + } + }); + }); + return out; +} +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/op_unsqueeze_copy.cpp b/kernels/portable/cpu/op_unsqueeze_copy.cpp index e6eec2e8916..3f0e44ab2ae 100644 --- a/kernels/portable/cpu/op_unsqueeze_copy.cpp +++ b/kernels/portable/cpu/op_unsqueeze_copy.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -38,7 +39,7 @@ Tensor& unsqueeze_copy_out( ET_KERNEL_CHECK(ctx, self.dim() + 1 == out.dim(), InvalidArgument, out); ET_KERNEL_CHECK(ctx, dim <= self.dim(), InvalidArgument, out); - for (size_t i = 0; i < out.dim(); ++i) { + for (const auto i : c10::irange(out.dim())) { if (i < dim) { expected_output_size[i] = self.size(i); } else if (i > dim) { diff --git a/kernels/portable/cpu/op_upsample_bilinear2d.cpp b/kernels/portable/cpu/op_upsample_bilinear2d.cpp index c30abe5e33e..69f7917822b 100644 --- a/kernels/portable/cpu/op_upsample_bilinear2d.cpp +++ b/kernels/portable/cpu/op_upsample_bilinear2d.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -29,9 +30,9 @@ void upsample_bilinear2d_kernel_impl( auto out_data = out.mutable_data_ptr(); auto in_plane = in_data; - for (auto n = 0; n < out.size(0); n++) { - for (auto c = 0; c < out.size(1); c++) { - for (auto h = 0; h < out.size(2); h++) { + for ([[maybe_unused]] const auto n : c10::irange(out.size(0))) { + for ([[maybe_unused]] const auto c : c10::irange(out.size(1))) { + for (const auto h : c10::irange(out.size(2))) { // Compute source index and weights. 
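// The new unfold_copy kernel above materializes sliding windows: along `dim`
// the output indexes windows of length `size` taken every `step` elements,
// and the window contents land in a trailing dimension. Its 1-D core, with
// the window count following the usual PyTorch unfold rule (an assumption;
// the kernel derives it via get_unfold_copy_out_target_size):
#include <cstdint>
#include <vector>

std::vector<float> unfold_1d(const std::vector<float>& self,
                             int64_t size, int64_t step) {
  const int64_t n = static_cast<int64_t>(self.size());
  const int64_t windows = n < size ? 0 : (n - size) / step + 1;
  std::vector<float> out;
  out.reserve(static_cast<size_t>(windows * size));
  for (int64_t j = 0; j < windows; ++j) {   // the kernel's out.size(dim) loop
    for (int64_t l = 0; l < size; ++l) {    // elements within one window
      out.push_back(self[static_cast<size_t>(j * step + l)]);
    }
  }
  return out; // unfold_1d({1,2,3,4,5}, 2, 1) == {1,2, 2,3, 3,4, 4,5}
}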
int64_t in_h1, in_h2; float weight_h, inv_weight_h; @@ -47,7 +48,7 @@ void upsample_bilinear2d_kernel_impl( out.sizes()[2], align_corners); - for (auto w = 0; w < out.size(3); w++) { + for (const auto w : c10::irange(out.size(3))) { int64_t in_w1, in_w2; float weight_w, inv_weight_w; diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index c3627281481..f09f1d92bc9 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -20,6 +21,7 @@ namespace { template void compute_variance( + KernelRuntimeContext& ctx, const Tensor& in, Tensor& out, optional> dim_list, @@ -27,30 +29,31 @@ void compute_variance( const double denominator) { CTYPE_OUT* out_data = out.mutable_data_ptr(); if (num == 0 || denominator <= 0) { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + for (const auto out_ix : c10::irange(out.numel())) { out_data[out_ix] = NAN; } } else { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - CTYPE_OUT sum = map_reduce_over_dim_list( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); - CTYPE_OUT mean = sum / static_cast(num); - CTYPE_OUT sum2 = map_reduce_over_dim_list( - [mean](CTYPE_IN v) { - return ( - (static_cast(v) - mean) * - (static_cast(v) - mean)); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); - out_data[out_ix] = sum2 / denominator; - } + MapReduceOverDimListPlan plan(in, dim_list); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = plan.execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + CTYPE_OUT mean = sum / static_cast(num); + CTYPE_OUT sum2 = plan.execute( + [mean](CTYPE_IN v) { + return ( + (static_cast(v) - mean) * + (static_cast(v) - mean)); + }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + out_data[out_ix] = sum2 / denominator; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } } @@ -92,7 +95,7 @@ Tensor& var_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); @@ -137,7 +140,7 @@ Tensor& var_correction_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); diff --git a/kernels/portable/cpu/op_zeros.cpp b/kernels/portable/cpu/op_zeros.cpp index e24324e55fd..a3c70795705 100644 --- a/kernels/portable/cpu/op_zeros.cpp +++ b/kernels/portable/cpu/op_zeros.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -24,7 +25,7 @@ bool check_sizes( executorch::aten::ArrayRef size_int64_t, executorch::aten::ArrayRef size_int32_t) { ET_LOG_AND_RETURN_IF_FALSE(size_int64_t.size() == size_int32_t.size()); - for (int i = 0; i < size_int64_t.size(); i++) { + for (const auto i : c10::irange(size_int64_t.size())) { 
ET_LOG_AND_RETURN_IF_FALSE(((int64_t)size_int32_t[i] == size_int64_t[i])); } diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 20434459489..b428a5d107e 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "ATEN_OPS", "CUSTOM_OPS", "define_op_target") def define_common_targets(): @@ -29,17 +29,21 @@ def define_common_targets(): exported_deps = all_op_targets, ) - runtime.cxx_library( - name = "cpu_aten", - srcs = [], - visibility = ["//executorch/kernels/portable/..."], - exported_deps = [t + "_aten" for t in custom_op_targets], - ) + if True in get_aten_mode_options(): + runtime.cxx_library( + name = "cpu_aten", + srcs = [], + visibility = ["//executorch/kernels/portable/..."], + exported_deps = [t + "_aten" for t in custom_op_targets], + ) # Only for use by op targets under //executorch. This API needs to be # reevaluated before becoming a public API. runtime.cxx_library( name = "vec_ops", + exported_deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], srcs = [], exported_headers = ["vec_ops.h"], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."], diff --git a/kernels/portable/cpu/util/activation_ops_util.cpp b/kernels/portable/cpu/util/activation_ops_util.cpp index 908758a2e36..abde15f8740 100644 --- a/kernels/portable/cpu/util/activation_ops_util.cpp +++ b/kernels/portable/cpu/util/activation_ops_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -17,7 +18,7 @@ namespace executor { bool check_gelu_args(const Tensor& in, string_view approximate, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(in.scalar_type() != ScalarType::Bool); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( approximate == "tanh" || approximate == "none", "Invalid approximation format: %.*s for gelu", static_cast(approximate.length()), @@ -30,9 +31,9 @@ bool check_glu_args(const Tensor& in, int64_t dim, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensor_is_floating_type(in)); const size_t non_negative_dim = dim < 0 ? 
dim + in.dim() : dim; - const size_t dim_size = in.size(non_negative_dim); + const ssize_t dim_size = in.size(non_negative_dim); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim_size % 2 == 0, "Halving dimension must be even, but dimension %zd is size %zd", non_negative_dim, @@ -40,12 +41,12 @@ bool check_glu_args(const Tensor& in, int64_t dim, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensor_is_floating_type(out)); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(in, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.size(non_negative_dim) == dim_size / 2, "output tensor must have half the size of the input tensor along the specified dimension."); - for (size_t i = 0; i < in.dim(); ++i) { - if (i != non_negative_dim) { + for (const auto i : c10::irange(in.dim())) { + if (static_cast(i) != non_negative_dim) { if (out.size(i) != in.size(i)) { #if ET_LOG_ENABLED auto out_shape_str = executorch::runtime::tensor_shape_to_c_string( @@ -73,7 +74,7 @@ bool check_log_softmax_args( int64_t dim, bool half_to_float, Tensor& out) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !half_to_float, "half to float conversion is not supported on CPU"); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); @@ -94,9 +95,10 @@ Error resize_glu_out(const Tensor& in, int64_t dim, Tensor& out) { executorch::aten::SizesType expected_output_size[kTensorDimensionLimit]; const size_t non_negative_dim = dim < 0 ? dim + in.dim() : dim; - for (size_t i = 0; i < in.dim(); i++) { - expected_output_size[i] = - (i == non_negative_dim) ? (in.size(i) / 2) : in.size(i); + for (const auto i : c10::irange(in.dim())) { + expected_output_size[i] = (static_cast(i) == non_negative_dim) + ? (in.size(i) / 2) + : in.size(i); } ArrayRef output_size{ diff --git a/kernels/portable/cpu/util/advanced_index_util.cpp b/kernels/portable/cpu/util/advanced_index_util.cpp index cc205df0e43..304ba3a3f96 100644 --- a/kernels/portable/cpu/util/advanced_index_util.cpp +++ b/kernels/portable/cpu/util/advanced_index_util.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
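// The compute_variance rewrite in op_var.cpp above keeps the classic
// two-pass formula but routes both passes through one reusable plan and a
// chunked parallel_for. The math per output index, in isolation:
#include <cmath>
#include <vector>

double variance(const std::vector<double>& x, double correction) {
  const double num = static_cast<double>(x.size());
  const double denominator = num - correction;
  if (num == 0 || denominator <= 0) {
    return NAN; // mirrors the kernel's early-out filling out_data with NAN
  }
  double sum = 0.0;
  for (const double v : x) { // pass 1: plan.execute(identity, +) -> mean
    sum += v;
  }
  const double mean = sum / num;
  double sum2 = 0.0;
  for (const double v : x) { // pass 2: squared deviations from the mean
    sum2 += (v - mean) * (v - mean);
  }
  return sum2 / denominator; // correction = 1 gives the unbiased estimator
}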
*/ +#include #include #include @@ -20,11 +21,11 @@ using TensorOptList = namespace { bool check_indices_dtypes(TensorOptList indices) { - for (auto i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); ScalarType ix_type = index.scalar_type(); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( ix_type == ScalarType::Long || ix_type == ScalarType::Int || ix_type == ScalarType::Byte || ix_type == ScalarType::Bool, "Index tensors should be Long, Int, Byte or Bool"); @@ -43,13 +44,13 @@ bool is_mask_index(const Tensor& index) { bool check_mask_indices(const Tensor& in, TensorOptList indices) { size_t in_i = 0; - for (auto i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index.dim() > 0, "Zero-dimensional mask index not allowed"); - for (auto j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { if (index.size(j) != in.size(in_i + j)) { #if ET_LOG_ENABLED auto mask_shape = executorch::runtime::tensor_shape_to_c_string( @@ -82,7 +83,7 @@ template size_t _count_trues_in_mask_index(const Tensor& index) { const CTYPE_IX* const index_ptr = index.const_data_ptr(); size_t sum = 0; - for (size_t i = 0; i < index.numel(); ++i) { + for (const auto i : c10::irange(index.numel())) { if (index_ptr[i]) { sum += 1; } @@ -110,7 +111,7 @@ void _query_mask_index(const Tensor& index, size_t query_idx, size_t* res) { // true. size_t count = 0; size_t flat_ix = 0; - for (size_t i = 0; i < index.numel(); ++i) { + for (const auto i : c10::irange(index.numel())) { if (index_ptr[i]) { if (count == query_idx) { flat_ix = i; @@ -156,8 +157,9 @@ int64_t query_integral_index( bool check_index_args(const Tensor& in, TensorOptList indices, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(check_indices_dtypes(indices)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - indices.size() <= in.dim(), "Indexing too many dimensions"); + ET_CHECK_OR_RETURN_FALSE( + static_cast(indices.size()) <= in.dim(), + "Indexing too many dimensions"); ET_LOG_AND_RETURN_IF_FALSE(check_mask_indices(in, indices)); return true; } @@ -165,7 +167,7 @@ bool check_index_args(const Tensor& in, TensorOptList indices, Tensor& out) { size_t count_index_blocks(TensorOptList indices) { size_t block_count = 0; bool in_block = false; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { if (!in_block) { in_block = true; @@ -184,39 +186,40 @@ bool get_indices_broadcast_shape( size_t* ix_ndim) { // Holds the (reversed) broadcasted shape of the indices. 
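// For boolean ("mask") indices, the utilities above reduce a mask to two
// primitives: count the trues (that count becomes one broadcast dimension)
// and recover the flat position of the k-th true. Their 1-D essence:
#include <cstddef>
#include <vector>

size_t count_trues(const std::vector<bool>& mask) {
  size_t sum = 0;
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i]) {
      sum += 1;
    }
  }
  return sum;
}

// Flat position of the query_idx-th true; assumes query_idx < count_trues().
size_t query_mask_position(const std::vector<bool>& mask, size_t query_idx) {
  size_t count = 0;
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i] && count++ == query_idx) {
      return i;
    }
  }
  return mask.size(); // not reached under the stated assumption
}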
Tensor::SizesType rev_ix_sizes[kTensorDimensionLimit]; - size_t curr_ndim = 0; + ssize_t curr_ndim = 0; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { - size_t len = count_trues_in_mask_index(index); + Tensor::SizesType len = + static_cast(count_trues_in_mask_index(index)); if (curr_ndim == 0) { curr_ndim = 1; rev_ix_sizes[0] = len; } else if (rev_ix_sizes[0] == 1) { rev_ix_sizes[0] = len; } else if (len != 1 && rev_ix_sizes[0] != len) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - false, "Broadcast of mask index failed."); + ET_CHECK_OR_RETURN_FALSE(false, "Broadcast of mask index failed."); } } else { - for (size_t j = 0; j < index.dim(); j++) { - size_t rev_j_size = index.size(index.dim() - j - 1); + for (const auto j : c10::irange(index.dim())) { + Tensor::SizesType rev_j_size = + static_cast(index.size(index.dim() - j - 1)); if (j >= curr_ndim) { curr_ndim = j + 1; rev_ix_sizes[j] = rev_j_size; } else if (rev_ix_sizes[j] == 1) { rev_ix_sizes[j] = rev_j_size; } else if (rev_j_size != 1 && rev_ix_sizes[j] != rev_j_size) { - ET_LOG_MSG_AND_RETURN_IF_FALSE(false, "Broadcast of index failed."); + ET_CHECK_OR_RETURN_FALSE(false, "Broadcast of index failed."); } } } } } - for (size_t i = 0; i < curr_ndim; i++) { + for (const auto i : c10::irange(curr_ndim)) { ix_sizes[i] = rev_ix_sizes[curr_ndim - i - 1]; } (*ix_ndim) = curr_ndim; @@ -224,8 +227,8 @@ bool get_indices_broadcast_shape( } size_t get_indices_broadcast_ndim(TensorOptList indices) { - size_t ndim = 0; - for (size_t i = 0; i < indices.size(); i++) { + ssize_t ndim = 0; + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { @@ -244,7 +247,7 @@ size_t get_indices_broadcast_ndim(TensorOptList indices) { size_t get_num_indexed_dims(TensorOptList indices) { size_t num_indexed_dims = 0; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { @@ -259,7 +262,7 @@ size_t get_num_indexed_dims(TensorOptList indices) { size_t get_num_null_indices(TensorOptList indices) { size_t num_null_indices = 0; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (!indices[i].has_value()) { num_null_indices += 1; } @@ -290,11 +293,11 @@ bool get_index_out_target_size( size_t num_null_indices = get_num_null_indices(indices); size_t num_indexed_dims = get_num_indexed_dims(indices); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - num_null_indices + num_indexed_dims <= in.dim(), + ET_CHECK_OR_RETURN_FALSE( + static_cast(num_null_indices + num_indexed_dims) <= in.dim(), "Indexing too many dimensions"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.dim() + broadcast_ndim - num_indexed_dims <= kTensorDimensionLimit, "Out tensor would exceed number of allowed dimensions"); @@ -302,22 +305,22 @@ bool get_index_out_target_size( if (adjacent) { size_t start = get_num_leading_null_indices(indices); - for (size_t i = 0; i < start; i++) { + for (const auto i : c10::irange(start)) { out_sizes[i] = in.size(i); } - for (size_t i = 0; i < broadcast_ndim; i++) { + for (const auto i : c10::irange(broadcast_ndim)) { out_sizes[i + start] = broadcast_sizes[i]; } - for (size_t i = num_indexed_dims + start; i < in.dim(); i++) { + for (const 
auto i : c10::irange(num_indexed_dims + start, in.dim())) { out_sizes[i + broadcast_ndim - num_indexed_dims] = in.size(i); } } else { - for (size_t i = 0; i < broadcast_ndim; i++) { + for (const auto i : c10::irange(broadcast_ndim)) { out_sizes[i] = broadcast_sizes[i]; } size_t in_i = 0; size_t out_i = broadcast_ndim; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (!indices[i].has_value()) { out_sizes[out_i++] = in.size(in_i++); } else { @@ -329,7 +332,8 @@ bool get_index_out_target_size( } } } - for (size_t i = num_indexed_dims + num_null_indices; i < in.dim(); i++) { + for (const auto i : + c10::irange(num_indexed_dims + num_null_indices, in.dim())) { out_sizes[i + broadcast_ndim - num_indexed_dims] = in.size(i); } } @@ -349,25 +353,25 @@ void compute_dim_map( size_t num_null_indices = get_num_null_indices(indices); if (adjacent) { - for (auto i = 0; i < start; i++) { + for (const auto i : c10::irange(start)) { dim_map[i] = i; } - for (auto i = start; i < start + num_indexed_dims; i++) { + for (const auto i : c10::irange(start, start + num_indexed_dims)) { dim_map[i] = -1; } - for (auto i = start + num_indexed_dims; i < in.dim(); i++) { + for (const auto i : c10::irange(start + num_indexed_dims, in.dim())) { dim_map[i] = i - num_indexed_dims + broadcast_ndim; } } else { size_t in_i = 0; size_t out_i = broadcast_ndim; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (!indices[i].has_value()) { dim_map[in_i++] = out_i++; } else { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { - for (auto j = 0; j < index.dim(); j++) { + for ([[maybe_unused]] const auto j : c10::irange(index.dim())) { dim_map[in_i++] = -1; } } else { @@ -375,7 +379,8 @@ void compute_dim_map( } } } - for (size_t i = num_indexed_dims + num_null_indices; i < in.dim(); i++) { + for (const auto i : + c10::irange(num_indexed_dims + num_null_indices, in.dim())) { dim_map[i] = i - num_indexed_dims + broadcast_ndim; } } @@ -387,15 +392,15 @@ void compute_index_map( const Tensor& in, TensorOptList indices, int32_t* ix_map) { - for (size_t i = 0; i < in.dim(); i++) { + for (const auto i : c10::irange(in.dim())) { ix_map[i] = -1; } size_t in_i = 0; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { if (indices[i].has_value()) { const Tensor& index = indices[i].value(); if (is_mask_index(index)) { - for (auto j = 0; j < index.dim(); j++) { + for ([[maybe_unused]] const auto j : c10::irange(index.dim())) { ix_map[in_i++] = i; } } else { @@ -423,7 +428,7 @@ bool get_in_coord( const Tensor& index = indices[ix_map[i]].value(); size_t ix_coord[kTensorDimensionLimit]; - for (auto j = 0; j < broadcast_ndim; j++) { + for (const auto j : c10::irange(broadcast_ndim)) { ix_coord[j] = out_coord[j + start]; } @@ -431,7 +436,7 @@ bool get_in_coord( size_t query_ix = ix_coord[broadcast_ndim - 1]; size_t query_result[kTensorDimensionLimit]; query_mask_index(index, query_ix, query_result); - for (auto j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { in_coord[i + j] = query_result[j]; } i += index.dim() - 1; @@ -441,7 +446,7 @@ bool get_in_coord( if (index_val < 0) { index_val += in.size(i); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index_val >= 0 && index_val < in.size(i), "Index %" PRId64 " is out of bounds for input dimension %zd with size %zd.", diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h 
b/kernels/portable/cpu/util/broadcast_indexes_range.h new file mode 100644 index 00000000000..aaf7207d0c9 --- /dev/null +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace torch::executor { + +namespace internal { +template +class BroadcastIndexesIterator { + public: + using difference_type = ssize_t; + using value_type = std::array; + using reference = const value_type&; + using pointer = const value_type*; + using iterator_category = std::forward_iterator_tag; + + BroadcastIndexesIterator() = default; + + template + explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args) + : output_dim_or_zero_if_no_broadcasting_( + ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()), + output_shape_(output.sizes()) { + static_assert( + sizeof...(args) == kNumInputs && (std::is_same_v && ...), + "BroadcastIndexesIterator constructor requires kNumInputs input tensor" + "arguments!"); + if (output_dim_or_zero_if_no_broadcasting_ != 0) { + effective_input_broadcast_strides_ = { + effective_input_broadcast_stride(output, args)...}; + } + } + + struct make_end_t { + explicit constexpr make_end_t() = default; + }; + + template + BroadcastIndexesIterator(make_end_t, const Tensor& t, const Args&... args) + : current_indexes_{ + t.numel(), + 0, + } {} + + bool operator==(const BroadcastIndexesIterator& rhs) const { + return output_index() == rhs.output_index(); + } + + bool operator!=(const BroadcastIndexesIterator& rhs) const { + return !operator==(rhs); + } + + reference operator*() const { + return current_indexes_; + } + + pointer operator->() const { + return ¤t_indexes_; + } + + BroadcastIndexesIterator& operator++() { + output_index()++; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } + // TODO: add optimization for particular input tensors not being + // broadcasted? + for (auto ii = output_dim_or_zero_if_no_broadcasting_ - 1; ii >= 0; --ii) { + // You might wonder what happens if output_shape_[ii] == 0. In + // that case, output.numel() would be 0, and thus we would have + // begin() == end() and no iteration. 
+ if ET_UNLIKELY ( + static_cast(delinearized_output_index_[ii]) == + output_shape_[ii] - 1) { + const auto old_delinearized_output_index_item = + delinearized_output_index_[ii]; + delinearized_output_index_[ii] = 0; + for (const auto jj : c10::irange(1, kNumInputs + 1)) { + current_indexes_[jj] -= old_delinearized_output_index_item * + effective_input_broadcast_strides_[jj - 1][ii]; + } + } else { + delinearized_output_index_[ii]++; + for (const auto jj : c10::irange(1, kNumInputs + 1)) { + current_indexes_.at(jj) += + effective_input_broadcast_strides_[jj - 1][ii]; + } + break; + } + } + return *this; + } + + BroadcastIndexesIterator operator++(int) { + auto it = *this; + operator++(); + return it; + } + + BroadcastIndexesIterator& operator+=(difference_type n) { + if (n <= 3) { + std::advance(*this, n); + return *this; + } + + output_index() += n; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } + delinearize_index( + output_index(), + output_shape_, + delinearized_output_index_.data(), + delinearized_output_index_.size()); + for (const auto ii : c10::irange(1, kNumInputs + 1)) { + current_indexes_[ii] = 0; + for (const auto jj : + c10::irange(output_dim_or_zero_if_no_broadcasting_)) { + current_indexes_[ii] += delinearized_output_index_[jj] * + effective_input_broadcast_strides_[ii - 1][jj]; + } + } + return *this; + } + + BroadcastIndexesIterator operator+(difference_type n) { + auto it = *this; + it += n; + return it; + } + + difference_type operator-(const BroadcastIndexesIterator& rhs) const { + return difference_type(output_index() - rhs.output_index()); + } + + private: + using ShapeType = + std::array; + + ssize_t output_index() const { + return current_indexes_[0]; + } + + ssize_t& output_index() { + return current_indexes_[0]; + } + + ShapeType effective_input_broadcast_stride( + const Tensor& output, + const Tensor& t) const { + ShapeType result = {0}; + ET_CHECK_MSG( + t.dim() <= output.dim(), + "input to broadcasting op should have dim at most output dim, but %d > %d!", + (int)t.dim(), + (int)output.dim()); + + const auto num_leading_ones = output.dim() - t.dim(); + for (const auto idx : c10::irange(num_leading_ones)) { + result[idx] = 0; + } + const auto t_sizes = t.sizes(); + const auto t_strides = t.strides(); + for (const auto idx : + c10::irange(num_leading_ones, num_leading_ones + t.dim())) { + result[idx] = t_sizes[idx - num_leading_ones] == 1 + ? 0 + : t_strides[idx - num_leading_ones]; + } + return result; + } + + // The 0th entry is the current linear index into the output, + // followed by kNumInputs input indexes. + std::array current_indexes_ = {0}; + ShapeType delinearized_output_index_ = {0}; + ssize_t output_dim_or_zero_if_no_broadcasting_; + ArrayRef output_shape_; + // The linear index for a broadcast tensor is + // sum(delinearized_output_index_[i] * input_stride_[i] if + // padded_input_shape_[i] != 1 else 0), where padded_input_shape is + // input.sizes() with leading 1s added to make its size equal to + // output_dim. This is straightforwardly implementable with an + // adjusted stride array that contains 0s where the padded input + // shape would contain 1s. + std::array effective_input_broadcast_strides_; +}; +} // namespace internal + +/** + * Efficient mechanism for looping over the index space for an output + * tensor and kNumInputs possibly-broadcasted input tensors. 
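// The iterator above trades the old per-element delinearize/linearize (a
// division and a modulo per dimension per element) for "effective" input
// strides that are zeroed wherever the input size is 1, so broadcast
// dimensions simply never advance the input index. A 2-D sketch of that
// stride-zeroing idea (plain loops, not the iterator):
#include <cstddef>
#include <vector>

// out/b are rows x cols; a is arows x acols where each dim matches or is 1.
std::vector<float> broadcast_add(const std::vector<float>& a,
                                 size_t arows, size_t acols,
                                 const std::vector<float>& b,
                                 size_t rows, size_t cols) {
  const size_t a_row_stride = arows == 1 ? 0 : acols; // 0: index "sticks"
  const size_t a_col_stride = acols == 1 ? 0 : 1;
  std::vector<float> out(rows * cols);
  for (size_t r = 0; r < rows; ++r) {
    for (size_t c = 0; c < cols; ++c) {
      out[r * cols + c] =
          a[r * a_row_stride + c * a_col_stride] + b[r * cols + c];
    }
  }
  return out;
}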
Use as follows: + * + * auto* output_data = output.mutable_data_ptr(); + * const auto* a_data = a.mutable_data_ptr(); + * const auto* b_data = b.mutable_data_ptr(); + * for (const auto [output_index, a_index, b_index] : + * BroadcastIndexesRange<2>(output, a, b)) { + * // Access output_data[output_index], a_data[a_index], and b_data[b_index]. + * } + * + * (where OutputType, AType, and BType are known concrete types.) + * + * Unlike looping using delinearize_index() and + * linearize_access_indexes(), BroadcastIndexesRange avoids expensive + * division and modulo operations on each iteration. + */ +template +class BroadcastIndexesRange { + public: + using iterator = internal::BroadcastIndexesIterator; + + template + BroadcastIndexesRange(const Tensor& output, const Args&... args) + : tensors_{&output, (&args)...} {} + + iterator begin() const { + return std::apply( + [](const auto&... args) { return iterator((*args)...); }, tensors_); + } + + iterator end() const { + return std::apply( + [](const auto&... args) { + return iterator(typename iterator::make_end_t(), (*args)...); + }, + tensors_); + } + + private: + std::array tensors_; +}; +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index d8569d23c2f..28a34426b23 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ b/kernels/portable/cpu/util/broadcast_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include @@ -268,28 +269,6 @@ ET_NODISCARD Error get_broadcast_target_size( a.sizes(), b.sizes(), out_sizes, out_sizes_len, out_dim); } -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len) { - ET_CHECK(shape.size() <= out_indexes_len); - for (auto i = 0; i < shape.size(); ++i) { - auto dim = shape.size() - 1 - i; - auto dim_size = shape[dim]; - out_indexes[dim] = linear_index % dim_size; - linear_index /= dim_size; - } -} - -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len) { - delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); -} - size_t linearize_access_indexes( ArrayRef indexes_broadcast_to, ssize_t broadcast_to_ndim, @@ -304,7 +283,8 @@ size_t linearize_access_indexes( size_t linear_index = 0; for (size_t i = 0; i < indexes_broadcast_from.size(); ++i) { // If this dimension is broadcasted, add zero to the linear address. - if (indexes_broadcast_from[i] >= broadcast_from_shape[i]) { + if (indexes_broadcast_from[i] >= + static_cast(broadcast_from_shape[i])) { ET_CHECK_MSG( broadcast_from_shape[i] == 1, "Expected dim size == 1 if broadcasted, but actual dim size is %zu", diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index 35344345242..2b10ee24411 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -8,6 +8,9 @@ #pragma once +#include +#include +#include #include #include @@ -205,36 +208,6 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size( ET_DEPRECATED void free_broadcast_tensor( const executorch::aten::Tensor& broadcast_tensor); -/** - * Delinearize a flattened index to per-dimension indexes. 
- * - * @param[in] linear_index The flattened index - * @param[in] shape The tensor shape - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len); - -/** - * Delinearize a flattened index to per-dimension indexes. - * - * @param[in] linear_index The flattened index - * @param[in] t The tensor object - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len); - /** * Return the linear index for broatcast_from tensor, given the indexes and * number of dimensions of broadcast_to tensor, and the shape and strides @@ -281,31 +254,13 @@ inline void apply_binary_elementwise_fn( const Tensor& a, const Tensor& b, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - const CTYPE_A* const data_a = a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); ++i) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - if (any_is_broadcasted) { - size_t out_indexes[kTensorDimensionLimit]; - delinearize_index(i, out, out_indexes, kTensorDimensionLimit); - - if (a_is_broadcasted) { - a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a); - } - if (b_is_broadcasted) { - b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b); - } - } - - data_out[i] = compute_fun(data_a[a_linear_index], data_b[b_linear_index]); + for (const auto [out_index, a_index, b_index] : + BroadcastIndexesRange<2>(out, a, b)) { + data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]); } } @@ -326,39 +281,15 @@ inline void apply_ternary_elementwise_fn( const Tensor& b, const Tensor& c, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - const CTYPE_A* const data_a = a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); const CTYPE_C* const data_c = c.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - for (size_t i = 0; i < out.numel(); ++i) { - size_t a_linear_index = i; - size_t b_linear_index = i; - size_t c_linear_index = i; - - if (any_is_broadcasted) { - size_t out_indexes[kTensorDimensionLimit]; - delinearize_index(i, out, out_indexes, kTensorDimensionLimit); - - if (a_is_broadcasted) { - a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a); - } - if (b_is_broadcasted) { - b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b); - } - if (c_is_broadcasted) { - c_linear_index = linearize_access_indexes(out_indexes, out.dim(), c); - } - } - - data_out[i] = compute_fun( - data_a[a_linear_index], data_b[b_linear_index], data_c[c_linear_index]); + for (const auto [out_index, a_index, b_index, c_index] : + BroadcastIndexesRange<3>(out, a, b, c)) { + 
data_out[out_index] = + compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]); } } diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 78b66b05f22..229fba2dad0 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -26,7 +27,7 @@ size_t as_strided_copy_compute_storage_nbytes( // size of the underlying storage is 1 bigger than the offset // of the last element according to stride size_t size = 1; - for (size_t i = 0; i < sizes.size(); ++i) { + for (const auto i : c10::irange(sizes.size())) { if (sizes[i] == 0) { return 0; } @@ -44,16 +45,16 @@ bool check_as_strided_copy_args( optional storage_offset, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( size.size() == stride.size(), "mismatch in length of strides and shape"); for (const auto& val : stride) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( val >= 0, "as_strided: Negative strides are not supported at the moment"); } int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0; - ET_LOG_MSG_AND_RETURN_IF_FALSE(offset >= 0, "Negative storage offset"); + ET_CHECK_OR_RETURN_FALSE(offset >= 0, "Negative storage offset"); // Check that the requested storage is within bounds of input storage size_t storage_size_bytes = @@ -63,7 +64,7 @@ bool check_as_strided_copy_args( return true; } size_t new_storage_size_bytes = in.nbytes(); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes, "Requiring a storage size of %zd are out of bounds for storage of size %zd", storage_size_bytes + storage_offset_bytes, @@ -80,7 +81,7 @@ bool check_cat_args( // Find the first non-empty tensor in the list to use as a reference size_t ref_i = 0; - for (size_t i = 0; i < tensors.size(); ++i) { + for (const auto i : c10::irange(tensors.size())) { if (tensors[i].numel() > 0) { ref_i = i; break; @@ -90,7 +91,7 @@ bool check_cat_args( // "All tensors must either have the same shape (except in the concatenating // dimension) or be empty." // https://pytorch.org/docs/stable/generated/torch.cat.html - for (size_t i = 0; i < tensors.size(); ++i) { + for (const auto i : c10::irange(tensors.size())) { // All input dtypes must be castable to the output dtype. 
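// as_strided_copy_compute_storage_nbytes above sizes the backing storage as
// one element past the farthest reachable offset:
// 1 + sum_i (sizes[i] - 1) * strides[i], or zero if any dim is empty. In
// element units (the kernel multiplies by the item size for bytes):
#include <cstdint>
#include <vector>

int64_t storage_elems(const std::vector<int64_t>& sizes,
                      const std::vector<int64_t>& strides) {
  int64_t size = 1;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) {
      return 0; // an empty tensor touches no storage
    }
    size += strides[i] * (sizes[i] - 1);
  }
  return size; // storage_elems({2, 3}, {3, 1}) == 6
}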
ET_LOG_AND_RETURN_IF_FALSE( canCast(tensors[i].scalar_type(), out.scalar_type())); @@ -106,7 +107,7 @@ bool check_cat_args( ET_LOG_AND_RETURN_IF_FALSE( tensor_is_rank(tensors[ref_i], tensors[i].dim())); - for (size_t d = 0; d < tensors[i].dim(); ++d) { + for (const auto d : c10::irange(tensors[i].dim())) { if (d != dim) { ET_LOG_AND_RETURN_IF_FALSE( tensors_have_same_size_at_dims(tensors[i], d, tensors[ref_i], d)); @@ -132,7 +133,7 @@ void get_cat_out_target_size( // calculate out dim size_t ref_i = 0; size_t cat_dim_size = 0; - for (size_t i = 0; i < tensors.size(); ++i) { + for (const auto i : c10::irange(tensors.size())) { if (tensors[i].numel() > 0) { cat_dim_size += tensors[i].size(dim); } @@ -143,15 +144,14 @@ void get_cat_out_target_size( *out_ndim = tensors[ref_i].dim(); - for (size_t d = 0; d < *out_ndim; ++d) { - if (d != dim) { + for (const auto d : c10::irange(*out_ndim)) { + if (static_cast(d) != dim) { out_sizes[d] = tensors[ref_i].size(d); } else { out_sizes[d] = cat_dim_size; } } } - bool check_expand_copy_args( const Tensor& input, ArrayRef expand_sizes, @@ -159,17 +159,17 @@ bool check_expand_copy_args( Tensor& out) { (void)out; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( implicit == false, "This operator is not implemented for when implicit == true."); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( expand_sizes.size() >= input.sizes().size(), "The number of sizes provided (%zu) must at least be equal to the number of dimensions in the tensor (%zu)", expand_sizes.size(), input.sizes().size()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( expand_sizes.size() <= kTensorDimensionLimit, "The number of expanded dims (%zu) exceeds the configured maximum (%zu). Increase this limit.", expand_sizes.size(), @@ -198,7 +198,7 @@ bool get_expand_copy_out_target_size( // -1 can use for replacing any corresponding dimension output_sizes[j] = self_sizes[i]; } else if (self_sizes[i] != 1) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( expand_sizes[j] == self_sizes[i], "The expanded size of the tensor (%zu) must match the existing size (%zu) at non-singleton dimension %zu.", (size_t)expand_sizes[j], @@ -211,7 +211,7 @@ bool get_expand_copy_out_target_size( while (j > 0) { --j; output_sizes[j] = expand_sizes[j]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( expand_sizes[j] >= 0, "The expanded size of the tensor (%zu) isn't allowed in a leading, non-existing dimension %zu", (size_t)expand_sizes[j], @@ -231,7 +231,7 @@ bool check_permute_copy_args(const Tensor& in, IntArrayRef dims, Tensor& out) { bool dim_exist[kTensorDimensionLimit]; memset(dim_exist, false, sizeof(dim_exist)); - for (int i = 0; i < dims.size(); i++) { + for (const auto i : c10::irange(dims.size())) { ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dims[i])); // Convert dimension to a non-negative number in the range // [0 .. in.dim() - 1]. @@ -241,7 +241,7 @@ bool check_permute_copy_args(const Tensor& in, IntArrayRef dims, Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(dim < kTensorDimensionLimit && dim >= 0); // Check that the dimension hasn't been seen previously. 
- ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim_exist[dim] == false, "duplicate dims are not allowed."); dim_exist[dim] = true; @@ -251,14 +251,14 @@ bool check_permute_copy_args(const Tensor& in, IntArrayRef dims, Tensor& out) { } bool check_unbind_copy_args(const Tensor& in, int64_t dim, TensorList out) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.dim() > 0, "in must have at least one dimension; saw %zd", in.dim()); ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, in.dim())); const ssize_t dim_size = in.size(dim); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - dim_size == out.size(), + ET_CHECK_OR_RETURN_FALSE( + dim_size == static_cast(out.size()), "out tensorlist's length %zd must equal unbind dim %" PRId64 " size = %zd.", out.size(), @@ -266,9 +266,9 @@ bool check_unbind_copy_args(const Tensor& in, int64_t dim, TensorList out) { dim_size); // Validate each output. - for (size_t i = 0; i < out.size(); ++i) { + for (const auto i : c10::irange(out.size())) { // All output dtypes must be the same. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].scalar_type() == out[0].scalar_type(), "out[%zu] dtype %" PRId8 " != out[0] dtype %" PRId8, i, @@ -276,7 +276,7 @@ bool check_unbind_copy_args(const Tensor& in, int64_t dim, TensorList out) { static_cast(out[0].scalar_type())); // output tensor must have # of dims = in.dim() -1 - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].dim() == (in.dim() - 1), "out[%zu] dim %zd != in dim %zd", i, @@ -284,9 +284,10 @@ bool check_unbind_copy_args(const Tensor& in, int64_t dim, TensorList out) { in.dim() - 1); // Check the shape of the output. - for (ssize_t d = 0, out_d = 0; d < in.dim(); ++d) { + ssize_t out_d = 0; + for (const auto d : c10::irange(in.dim())) { if (d != dim) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].size(out_d) == in.size(d), "out[%zu].size(%zd) %zd != in.size(%zd) %zd", i, @@ -309,7 +310,7 @@ void get_permute_copy_out_target_size( size_t* out_ndim) { *out_ndim = in.dim(); - for (size_t i = 0; i < in.dim(); ++i) { + for (const auto i : c10::irange(in.dim())) { out_sizes[i] = in.size(dims[i] >= 0 ? dims[i] : dims[i] + in.dim()); } } @@ -348,7 +349,7 @@ void get_pixel_shuffle_out_target_size( *out_ndim = in.dim(); const executorch::aten::SizesType casted_upscale_factor = upscale_factor; - size_t i = 0; + ssize_t i = 0; for (; i < in.dim() - 3; ++i) { // Copy all leading dimensions in. out_sizes[i] = in.size(i); @@ -370,7 +371,7 @@ void get_pixel_unshuffle_out_target_size( *out_ndim = in.dim(); const executorch::aten::SizesType casted_factor = downscale_factor; - size_t i = 0; + ssize_t i = 0; for (; i < in.dim() - 3; ++i) { // Copy all leading dimensions in. 
out_sizes[i] = in.size(i); @@ -404,7 +405,7 @@ void get_select_copy_out_target_size( size_t* out_ndim) { *out_ndim = in.dim() - 1; - for (size_t d = 0; d < in.dim() - 1; ++d) { + for (const auto d : c10::irange(in.dim() - 1)) { if (d < dim) { out_sizes[d] = in.size(d); } else { @@ -421,19 +422,19 @@ bool check_split_with_sizes_copy_args( ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(in, 1)); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( split_sizes.size() == out.size(), "Number of split sizes must match the number of output tensors"); int64_t sum = 0; - for (int i = 0; i < split_sizes.size(); i++) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(split_sizes.size())) { + ET_CHECK_OR_RETURN_FALSE( split_sizes[i] >= 0, "All split sizes must be non negative."); sum += split_sizes[i]; } const ssize_t dim_size = in.size(dim); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( sum == dim_size, "Sum of split sizes does not match input size at given dim"); @@ -448,7 +449,7 @@ void get_split_with_sizes_copy_out_target_size( size_t* out_ndim) { *out_ndim = in.dim(); - for (size_t d = 0; d < in.dim(); ++d) { + for (const auto d : c10::irange(in.dim())) { out_sizes[d] = in.size(d); } out_sizes[dim] = split_size; @@ -483,7 +484,7 @@ void get_squeeze_copy_dim_out_target_size( } size_t out_d = 0; - for (size_t in_d = 0; in_d < in.dim(); ++in_d) { + for (const auto in_d : c10::irange(in.dim())) { if (in_d != dim || in.size(in_d) != 1) { out_sizes[out_d] = in.size(in_d); ++out_d; @@ -497,16 +498,16 @@ bool check_squeeze_copy_dims_args( const Tensor out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - for (size_t i = 0; i < dims.size(); ++i) { + for (const auto i : c10::irange(dims.size())) { const int64_t dim = dims[i] < 0 ? dims[i] + nonzero_dim(in) : dims[i]; ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); // Check that a dim does not appear twice in dims - for (size_t j = 0; j < dims.size(); ++j) { + for (const auto j : c10::irange(dims.size())) { if (i != j) { const int64_t dim_temp = dims[j] < 0 ? dims[j] + nonzero_dim(in) : dims[j]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim != dim_temp, "dim %" PRId64 " appears multiple times in dims!", dim); @@ -530,7 +531,7 @@ void get_squeeze_copy_dims_out_target_size( // A dim is only removed if the size at the given dim is 1. executorch::aten::SizesType dims_to_remove = 0; - for (size_t i = 0; i < dims.size(); ++i) { + for (const auto i : c10::irange(dims.size())) { int64_t dim = dims[i] < 0 ? dims[i] + nonzero_dim(in) : dims[i]; if (in.size(dim) == 1) { ++dims_to_remove; @@ -539,9 +540,9 @@ void get_squeeze_copy_dims_out_target_size( *out_ndim = in.dim() - dims_to_remove; size_t out_d = 0; - for (size_t in_d = 0; in_d < in.dim(); ++in_d) { + for (const auto in_d : c10::irange(in.dim())) { bool in_d_in_dims = false; - for (size_t i = 0; i < dims.size(); ++i) { + for (const auto i : c10::irange(dims.size())) { int64_t dim = dims[i] < 0 ? dims[i] + nonzero_dim(in) : dims[i]; if (in_d == dim) { in_d_in_dims = true; @@ -564,13 +565,13 @@ bool check_stack_args( // All input tensors need to be of the same size // https://pytorch.org/docs/stable/generated/torch.stack.html - for (size_t i = 0; i < tensors.size(); i++) { + for (const auto i : c10::irange(tensors.size())) { // All input dtypes must be castable to the output dtype. 
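// get_squeeze_copy_dims_out_target_size above drops a listed dim only when
// its size is actually 1; everything else is kept in order. Sketch of the
// shape computation (dims assumed already normalized to be non-negative):
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> squeeze_dims(const std::vector<int64_t>& in_sizes,
                                  const std::vector<int64_t>& dims) {
  std::vector<int64_t> out;
  for (int64_t d = 0; d < static_cast<int64_t>(in_sizes.size()); ++d) {
    const bool listed = std::find(dims.begin(), dims.end(), d) != dims.end();
    if (!listed || in_sizes[d] != 1) {
      out.push_back(in_sizes[d]);
    }
  }
  return out; // squeeze_dims({2, 1, 3, 1}, {1, 3}) == {2, 3}
}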
ET_LOG_AND_RETURN_IF_FALSE( canCast(tensors[i].scalar_type(), out.scalar_type())); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(tensors[i], tensors[0].dim())); - for (size_t d = 0; d < tensors[i].dim(); d++) { + for (const auto d : c10::irange(tensors[i].dim())) { ET_LOG_AND_RETURN_IF_FALSE( tensors_have_same_size_at_dims(tensors[i], d, tensors[0], d)); } @@ -590,13 +591,14 @@ void get_stack_out_target_size( size_t* out_ndim) { *out_ndim = tensors[0].dim() + 1; - for (size_t d = 0; d < *out_ndim; ++d) { - if (d < dim) { - out_sizes[d] = tensors[0].size(d); - } else if (d == dim) { - out_sizes[d] = tensors.size(); + for (const auto d : c10::irange(*out_ndim)) { + int64_t d_ = static_cast<int64_t>(d); + if (d_ < dim) { + out_sizes[d_] = tensors[0].size(d_); + } else if (d_ == dim) { + out_sizes[d_] = tensors.size(); } else { - out_sizes[d] = tensors[0].size(d - 1); + out_sizes[d_] = tensors[0].size(d_ - 1); } } } @@ -612,22 +614,22 @@ bool check_split_copy_args( int64_t split_size, int64_t dim, TensorList out) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( input.dim() > 0, "input must have at least one dimension; saw %zd", input.dim()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim >= 0 && dim < input.dim(), "dim %" PRId64 " out of range [0,%zd)", dim, input.dim()); const ssize_t dim_size = input.size(dim); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( split_size >= 0, "split_size %" PRId64 " must be non-negative", split_size); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( split_size > 0 || dim_size == 0, "split_size is zero but input.size(%" PRId64 ") %zd is non-zero", dim, @@ -646,7 +648,7 @@ bool check_split_copy_args( // Note that this also handles the case where split_size == 0, avoiding a // division by zero in the other branch. When dim_size == 0 && split_size == // 0, core PyTorch expects 1 output element. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.size() == 1, "Unexpected out.size() %zu: should be 1 because split_size %" PRId64 " >= input.size(%" PRId64 ") %zd", @@ -657,8 +659,8 @@ bool check_split_copy_args( remainder = dim_size; } else { int64_t expected_out_len = (dim_size + split_size - 1) / split_size; - ET_LOG_MSG_AND_RETURN_IF_FALSE( - out.size() == expected_out_len, + ET_CHECK_OR_RETURN_FALSE( + static_cast<int64_t>(out.size()) == expected_out_len, "Unexpected out.size() %zu: ceil(input.size(%" PRId64 ")=%zd" " / split_size=%" PRId64 ") is %" PRId64, @@ -674,9 +676,9 @@ bool check_split_copy_args( } // Validate each output. - for (size_t i = 0; i < out.size(); ++i) { + for (const auto i : c10::irange(out.size())) { // All output dtypes must be the same. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].scalar_type() == out[0].scalar_type(), "out[%zu] dtype %" PRId8 " != out[0] dtype %" PRId8, i, @@ -684,7 +686,7 @@ bool check_split_copy_args( static_cast<int8_t>(out[0].scalar_type())); // All outputs must have the same number of dimensions as the input. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].dim() == input.dim(), "out[%zu] dim %zd != input dim %zd", i, @@ -692,13 +694,13 @@ bool check_split_copy_args( input.dim()); // Check the shape of the output. - for (ssize_t d = 0; d < out[i].dim(); ++d) { + for (const auto d : c10::irange(out[i].dim())) { if (d == dim) { // This is the split dimension, which may be different. if (i < out.size() - 1) { // All outputs except the final one: split dimension should be // split_size.
- ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].size(d) == split_size, "out[%zu].size(%zd) %zd != split_size %" PRId64, i, @@ -708,7 +710,7 @@ bool check_split_copy_args( } else { // The final output: split dimension should be the remainder of // split_size. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out[i].size(d) == remainder, "out[%zu].size(%zd) %zd != remainder %" PRId64, i, @@ -759,7 +761,8 @@ bool check__to_dim_order_copy_args( executorch::aten::ArrayRef dim_order_ref = dim_order.value(); // dim order size shall equal to input dim - ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); + ET_LOG_AND_RETURN_IF_FALSE( + static_cast(dim_order_ref.size()) == input.dim()); ET_LOG_AND_RETURN_IF_FALSE( is_channels_last_dim_order( @@ -770,7 +773,7 @@ bool check__to_dim_order_copy_args( // Out tensor shall have same dim order as dim_order auto out_dim_order = out.dim_order(); ET_LOG_AND_RETURN_IF_FALSE(out_dim_order.size() == dim_order_ref.size()); - for (size_t i = 0; i < dim_order_ref.size(); i++) { + for (const auto i : c10::irange(dim_order_ref.size())) { ET_LOG_AND_RETURN_IF_FALSE(out_dim_order[i] == dim_order_ref[i]); } } else { // dim_order is not set, preserve the dim order of input @@ -779,7 +782,7 @@ bool check__to_dim_order_copy_args( auto out_dim_order = out.dim_order(); auto input_dim_order = input.dim_order(); ET_LOG_AND_RETURN_IF_FALSE(out_dim_order.size() == input_dim_order.size()); - for (size_t i = 0; i < input_dim_order.size(); i++) { + for (const auto i : c10::irange(input_dim_order.size())) { ET_LOG_AND_RETURN_IF_FALSE(out_dim_order[i] == input_dim_order[i]); } } @@ -804,14 +807,14 @@ bool check_unsqueeze_copy_args( // 4. out.size(dim) == 1 ET_LOG_AND_RETURN_IF_FALSE(input.dim() == out.dim() - 1); - for (size_t d = 0; d < out.dim(); d++) { + for (auto const d : c10::irange(out.dim())) { auto dim_normalized = dim; if (dim_normalized < 0) { dim_normalized += out.dim(); } if (d < dim_normalized) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( input.size(d) == out.size(d), "input.size(%zu) %zd != out.size(%zu) %zd | dim = %" PRId64, d, @@ -820,7 +823,7 @@ bool check_unsqueeze_copy_args( out.size(d), dim); } else if (d > dim_normalized) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( input.size(d - 1) == out.size(d), "input.size(%zu) %zd != out.size(%zu) %zd | dim = %" PRId64, d - 1, @@ -829,7 +832,7 @@ bool check_unsqueeze_copy_args( out.size(d), dim); } else { // d == dim - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.size(d) == 1, "out.size(%zu) %zd shall equal 1 | dim = %" PRId64, d, @@ -848,7 +851,7 @@ bool check_view_copy_args( ET_LOG_AND_RETURN_IF_FALSE(size_int64_t.size() == out.sizes().size()); // The input and out shall share same dtype and numel - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( self.numel() == out.numel(), "self.numel() %zd != out.numel() %zd", self.numel(), @@ -857,10 +860,10 @@ bool check_view_copy_args( // The size of out should equal target size. bool size_inferred = false; - for (int i = 0; i < size_int64_t.size(); i++) { + for (auto const i : c10::irange(size_int64_t.size())) { // If this value is -1 it implies that this dimension is inferred. 
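Worked example of the -1 inference being validated here: viewing a tensor with numel 12 as {3, -1} infers the flagged dimension as 12 / 3 == 4, so the target size becomes {3, 4}; a second -1 would trip the "Multiple dimensions cannot be inferred" check just below.

static_assert(12 / 3 == 4, "view of numel=12 as {3, -1} infers {3, 4}");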
if (size_int64_t[i] == -1) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !size_inferred, "Multiple dimensions cannot be inferred."); size_inferred = true; } @@ -880,15 +883,15 @@ bool get_view_copy_target_size( size_t out_numels_without_minus_1 = 1; int32_t minus_1_dim = -1; - ET_LOG_AND_RETURN_IF_FALSE(size_int64_t.size() == dim); + ET_LOG_AND_RETURN_IF_FALSE(static_cast<int64_t>(size_int64_t.size()) == dim); - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { if (size_int64_t[i] != -1) { out_sizes[i] = static_cast<executorch::aten::SizesType>(size_int64_t[i]); out_numels_without_minus_1 = out_numels_without_minus_1 * size_int64_t[i]; } else { // TODO(kimishpatel): Add test to hit this line - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( minus_1_dim == -1, "At most one view copy dim can be -1."); minus_1_dim = i; } @@ -951,7 +954,7 @@ void get_diagonal_copy_out_target_size( } size_t shift = 0; - for (size_t d = 0; d < in.dim(); ++d) { + for (const auto d : c10::irange(in.dim())) { if (d == dim1 || d == dim2) { shift++; } else { @@ -961,5 +964,46 @@ void get_diagonal_copy_out_target_size( out_sizes[in.dim() - 2] = diagonal_size; } +bool check_unfold_copy_args( + const Tensor& self, + int64_t dim, + int64_t size, + int64_t step) { + if (dim < 0) { + dim += nonzero_dim(self); + } + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(self, dim)); + ET_CHECK_OR_RETURN_FALSE( + size >= 0, "size is %" PRId64 " but must be >= 0", size); + ET_CHECK_OR_RETURN_FALSE( + size <= self.size(dim), + "maximum size for tensor at dimension %" PRId64 + " is %zd but size is %" PRId64, + dim, + self.size(dim), + size); + ET_CHECK_OR_RETURN_FALSE( + step > 0, "step is %" PRId64 " but must be > 0", step); + return true; +} + +void get_unfold_copy_out_target_size( + const Tensor& self, + int64_t dim, + int64_t size, + int64_t step, + executorch::aten::SizesType* out_sizes, + size_t* out_ndim) { + for (auto i : c10::irange(self.dim())) { + out_sizes[i] = self.size(i); + } + // At dimension `dim`, the input is split into overlapping windows of length + // `size`, with a new window starting every `step` elements.
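Worked example of the window count computed on the next line: unfolding a dimension of length 7 with size 3 and step 2 yields (7 - 3 + 2) / 2 == 3 windows (elements {0,1,2}, {2,3,4}, {4,5,6}), and the window length 3 is appended as a new trailing dimension.

static_assert((7 - 3 + 2) / 2 == 3, "unfold: len=7, size=3, step=2 -> 3 windows");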
+ out_sizes[dim] = (self.size(dim) - size + step) / step; + + out_sizes[self.dim()] = size; + *out_ndim = self.dim() + 1; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index 8efd6057dba..edcc6eb0021 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include @@ -26,8 +27,8 @@ void _as_strided_copy( ArrayRef stride, int64_t dim) { // the last dimension, copy data - if (dim == size.size() - 1) { - for (size_t i = 0; i < size.at(dim); ++i) { + if (dim == static_cast(size.size()) - 1) { + for (const auto i : c10::irange(size.at(dim))) { output_data[i] = *input_data; input_data += stride.at(dim); } @@ -35,7 +36,7 @@ void _as_strided_copy( } size_t trailing_dims = getTrailingDims(out, dim); // recursively set data for the next dimension - for (size_t i = 0; i < size.at(dim); ++i) { + for ([[maybe_unused]] const auto i : c10::irange(size.at(dim))) { _as_strided_copy( input_data, output_data, out, size, stride, dim + 1); input_data += stride.at(dim); @@ -232,5 +233,19 @@ void get_diagonal_copy_out_target_size( executorch::aten::SizesType* out_sizes, size_t* out_ndim); +bool check_unfold_copy_args( + const Tensor& self, + int64_t dim, + int64_t size, + int64_t step); + +void get_unfold_copy_out_target_size( + const Tensor& self, + int64_t dim, + int64_t size, + int64_t step, + executorch::aten::SizesType* out_sizes, + size_t* out_ndim); + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/delinearize_index.cpp b/kernels/portable/cpu/util/delinearize_index.cpp new file mode 100644 index 00000000000..45378e6b05d --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +namespace torch::executor { +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len) { + ET_CHECK(shape.size() <= out_indexes_len); + for (size_t i = 0; i < shape.size(); ++i) { + auto dim = shape.size() - 1 - i; + auto dim_size = shape[dim]; + out_indexes[dim] = linear_index % dim_size; + linear_index /= dim_size; + } +} + +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len) { + delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); +} +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/delinearize_index.h b/kernels/portable/cpu/util/delinearize_index.h new file mode 100644 index 00000000000..3441aa6083f --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor { +/** + * Delinearize a flattened index to per-dimension indexes. 
+ * + * @param[in] linear_index The flattened index + * @param[in] shape The tensor shape + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len); + +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] t The tensor object + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len); +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/distance_util.cpp b/kernels/portable/cpu/util/distance_util.cpp index f8dc2f71216..21a111d2c47 100644 --- a/kernels/portable/cpu/util/distance_util.cpp +++ b/kernels/portable/cpu/util/distance_util.cpp @@ -14,8 +14,7 @@ namespace executor { bool check_pdist_args(const Tensor& in, double p, const Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 2)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - p >= 0, "pdist only supports non-negative p values"); + ET_CHECK_OR_RETURN_FALSE(p >= 0, "pdist only supports non-negative p values"); return true; } @@ -40,11 +39,10 @@ bool check_cdist_args( ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(x2, 2)); ET_LOG_AND_RETURN_IF_FALSE( tensors_have_same_size_at_dims(x1, x1.dim() - 1, x2, x2.dim() - 1)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - p >= 0, "cdist only supports non-negative p values"); + ET_CHECK_OR_RETURN_FALSE(p >= 0, "cdist only supports non-negative p values"); if (compute_mode.has_value()) { int64_t mode = compute_mode.value(); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( mode >= 0 && mode <= 2, "possible modes: 0, 1, 2, but was: %" PRId64, mode); diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index 299910da746..d240b9f83bc 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -28,17 +28,14 @@ bool check_tensor_dtype( case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); case SupportedTensorDtypes::BOOL_OR_BYTE: - return ( - executorch::runtime::tensor_is_type(t, ScalarType::Bool) || - executorch::runtime::tensor_is_type(t, ScalarType::Byte)); + return (executorch::runtime::tensor_is_type( + t, ScalarType::Bool, ScalarType::Byte)); case SupportedTensorDtypes::SAME_AS_COMPUTE: return executorch::runtime::tensor_is_type(t, compute_type); case SupportedTensorDtypes::SAME_AS_COMMON: { if (compute_type == ScalarType::Float) { - return ( - executorch::runtime::tensor_is_type(t, ScalarType::Float) || - executorch::runtime::tensor_is_type(t, ScalarType::Half) || - executorch::runtime::tensor_is_type(t, ScalarType::BFloat16)); + return (executorch::runtime::tensor_is_type( + t, ScalarType::Float, ScalarType::Half, ScalarType::BFloat16)); } else { return executorch::runtime::tensor_is_type(t, compute_type); } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 3d06c7a3283..f5932069005 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -8,9 +8,15 
@@ #pragma once +#include +#include #include #include #include +#include + +#include +#include namespace torch { namespace executor { @@ -44,37 +50,86 @@ inline int64_t scalar_to(const Scalar& s) { : s.to(); } -template -inline void apply_unitensor_elementwise_fn( +namespace internal { +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, - const Tensor& a, - SupportedTensorDtypes a_dtypes, const Tensor& out, - SupportedTensorDtypes out_dtypes) { + SupportedTensorDtypes out_dtypes, + Args... inputs) { + static_assert( + (std::is_same_v> && + ...)); + constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; - + const auto check_input_dtype = [](auto input, auto compute_type) { + return internal::check_tensor_dtype( + *input.first, input.second, compute_type); + }; ET_KERNEL_CHECK( ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), + (check_input_dtype(inputs, compute_type) && ...) && + internal::check_tensor_dtype(out, out_dtypes, compute_type), InvalidArgument, ); - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); + struct InputInfo { + load_to_common_fn load_to_common; + const char* data_ptr; + ssize_t element_size; + }; + std::array inputs_info = {(InputInfo{ + internal::get_load_to_common_fn( + *inputs.first, inputs.second), + reinterpret_cast(inputs.first->const_data_ptr()), + inputs.first->element_size(), + })...}; + const auto store_common_to_out = internal::get_store_common_to_tensor_fn( out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto out_element_size = out.element_size(); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); + const auto out_element_size = out.element_size(); - auto out_numel = out.numel(); - for (size_t i = 0; i < out_numel; ++i) { - auto result = compute_fun(load_a_to_common(&data_a[i * a_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); - } + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + const auto range = + BroadcastIndexesRange(out, (*inputs.first)...); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto& indexes = *begin_it; + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info + .data_ptr[indexes[idx + 1] * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + } + }); +} +} // namespace internal + +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } /** @@ -92,55 +147,13 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - 
ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto out_numel = out.numel(); - for (size_t i = 0; i < out_numel; ++i) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - if (any_is_broadcasted) { - size_t out_indexes[kTensorDimensionLimit]; - delinearize_index(i, out, out_indexes, kTensorDimensionLimit); - - if (a_is_broadcasted) { - a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a); - } - if (b_is_broadcasted) { - b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b); - } - } - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); } /** @@ -175,67 +188,14 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(c, c_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto load_c_to_common = - internal::get_load_to_common_fn(c, c_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const char* const data_c = reinterpret_cast(c.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto c_element_size = c.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto 
out_numel = out.numel(); - for (size_t i = 0; i < out_numel; ++i) { - size_t a_linear_index = i; - size_t b_linear_index = i; - size_t c_linear_index = i; - - if (any_is_broadcasted) { - size_t out_indexes[kTensorDimensionLimit]; - delinearize_index(i, out, out_indexes, kTensorDimensionLimit); - - if (a_is_broadcasted) { - a_linear_index = linearize_access_indexes(out_indexes, out.dim(), a); - } - if (b_is_broadcasted) { - b_linear_index = linearize_access_indexes(out_indexes, out.dim(), b); - } - if (c_is_broadcasted) { - c_linear_index = linearize_access_indexes(out_indexes, out.dim(), c); - } - } - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size]), - load_c_to_common(&data_c[c_linear_index * c_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } inline ScalarType get_compute_type(ScalarType& common_type) { diff --git a/kernels/portable/cpu/util/functional_util.h b/kernels/portable/cpu/util/functional_util.h index cdf90813772..d7ea201dbd2 100644 --- a/kernels/portable/cpu/util/functional_util.h +++ b/kernels/portable/cpu/util/functional_util.h @@ -8,8 +8,11 @@ #pragma once +#include + #include #include +#include namespace torch { namespace executor { @@ -30,7 +33,7 @@ inline CTYPE apply_unary_reduce_fn( const int64_t size, const int64_t stride = 1) { CTYPE acc_val = data_in[0]; - for (size_t i = 1; i < size; i++) { + for (const auto i : c10::irange(1, size)) { acc_val = reduce_fun(data_in[i * stride], acc_val); } return acc_val; @@ -51,9 +54,15 @@ inline void apply_unary_map_fn( CTYPE_OUT* const data_out, const int64_t size, const int64_t stride = 1) { - for (size_t i = 0; i < size; i++) { - data_out[i * stride] = map_fun(data_in[i * stride]); - } + executorch::extension::parallel_for( + 0, + size, + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + data_out[i * stride] = map_fun(data_in[i * stride]); + } + }); } // @@ -77,7 +86,7 @@ inline CTYPE_OUT apply_unary_map_reduce_fn( const int64_t size, const int64_t stride = 1) { CTYPE_OUT acc_val = map_fun(data_in[0]); - for (size_t i = 1; i < size; ++i) { + for (const auto i : c10::irange(1, size)) { acc_val = reduce_fun(map_fun(data_in[i * stride]), acc_val); } return acc_val; diff --git a/kernels/portable/cpu/util/index_util.cpp b/kernels/portable/cpu/util/index_util.cpp index fb54980bb47..bcf15c4bb4c 100644 --- a/kernels/portable/cpu/util/index_util.cpp +++ b/kernels/portable/cpu/util/index_util.cpp @@ -5,6 +5,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
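The apply_unary_map_fn change above adopts the same parallel_for idiom used in apply_elementwise_fn: split [0, size) into grains and run the loop body once per grain. A hedged sketch of the shape of such a call (data_in, data_out, map_fun, and size are assumed to be in scope; GRAIN_SIZE is the threshold the patch imports from the extension threadpool utilities):

::executorch::extension::parallel_for(
    0,
    size,
    ::executorch::extension::internal::GRAIN_SIZE,
    [&](const auto begin, const auto end) {
      // Each grain processes its half-open subrange [begin, end).
      for (const auto i : c10::irange(begin, end)) {
        data_out[i] = map_fun(data_in[i]);
      }
    });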
*/ +#include #include #include @@ -20,11 +21,11 @@ bool check_gather_args( Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index.scalar_type() == ScalarType::Long, "Expected dypte int64 for index"); if (index.numel() != 0) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( nonzero_dim(in) == nonzero_dim(index), "self and index should have the same dimensionality when index is not empty " "except for the case when one has dimension 0 and the other has dimension 1"); @@ -35,9 +36,9 @@ bool check_gather_args( dim += nonzero_dim(in); } - for (size_t d = 0; d < nonzero_dim(in); ++d) { + for (const auto d : c10::irange(nonzero_dim(in))) { if (d != dim) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( nonempty_size(index, d) <= nonempty_size(in, d), "size of dimension %zd of index should be smaller than the size of that dimension of input if dimension %zd != dim %zd", d, @@ -46,8 +47,8 @@ bool check_gather_args( } } const long* index_data = index.const_data_ptr(); - for (size_t i = 0; i < index.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(index.numel())) { + ET_CHECK_OR_RETURN_FALSE( index_data[i] >= 0 && index_data[i] < nonempty_size(in, dim), "Index is out of bounds for dimension %zd with size %zd", (size_t)dim, @@ -64,12 +65,12 @@ bool check_index_select_args( Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); dim = dim < 0 ? dim + nonzero_dim(in) : dim; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( nonempty_size(in, dim) > 0, "index_select: Indexing axis dim should be positive"); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "Expected index to have type of Long or Int, but found %s", @@ -77,15 +78,15 @@ bool check_index_select_args( ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_smaller_or_equal_to(index, 1)); if (index.dim() > 0 && in.dim() == 0) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index.numel() == 1, "index_select: Index to scalar must have exactly 1 value"); } if (index.scalar_type() == ScalarType::Long) { const int64_t* const index_ptr = index.const_data_ptr(); - for (size_t i = 0; i < index.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(index.numel())) { + ET_CHECK_OR_RETURN_FALSE( index_ptr[i] >= 0 && index_ptr[i] < nonempty_size(in, dim), "index[%zu] = %" PRId64 " is out of range [0, %zd)", i, @@ -94,8 +95,8 @@ bool check_index_select_args( } } else { const int32_t* const index_ptr = index.const_data_ptr(); - for (size_t i = 0; i < index.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(index.numel())) { + ET_CHECK_OR_RETURN_FALSE( index_ptr[i] >= 0 && index_ptr[i] < nonempty_size(in, dim), "index[%zu] = %" PRId32 " is out of range [0, %zd)", i, @@ -114,7 +115,7 @@ void get_index_select_out_target_size( executorch::aten::SizesType* out_sizes, size_t* out_ndim) { *out_ndim = in.dim(); - for (size_t i = 0; i < in.dim(); ++i) { + for (const auto i : c10::irange(in.dim())) { if (i == dim) { out_sizes[i] = index.numel(); } else { @@ -126,12 +127,12 @@ void get_index_select_out_target_size( bool check_nonzero_args(const Tensor& in, const Tensor& out) { (void)in; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + 
ET_CHECK_OR_RETURN_FALSE( out.scalar_type() == ScalarType::Long, "Expected out to be a Long tensor but received %" PRId8, static_cast<int8_t>(out.scalar_type())); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.dim() == 2, "Expected out to be a 2d tensor received %zd", ssize_t(out.dim())); @@ -147,7 +148,7 @@ bool check_scatter_add_args( Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(self, out)); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(self, src)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index.scalar_type() == ScalarType::Long, "Expected dtype int64 for index"); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(self, dim)); @@ -156,7 +157,7 @@ bool check_scatter_add_args( return true; } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( nonzero_dim(self) == nonzero_dim(src) && nonzero_dim(self) == nonzero_dim(index), "self, index and src should have same number of dimensions."); @@ -166,13 +167,13 @@ bool check_scatter_add_args( dim += nonzero_dim(self); } - for (size_t d = 0; d < nonzero_dim(self); ++d) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto d : c10::irange(nonzero_dim(self))) { + ET_CHECK_OR_RETURN_FALSE( nonempty_size(index, d) <= nonempty_size(src, d), "size of dimension %zd of index should be smaller than the size of that dimension of src", d); if (d != dim) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( nonempty_size(index, d) <= nonempty_size(self, d), "size of dimension %zd of index should be smaller than the size of that dimension of self if dimension %zd != dim %zd", d, @@ -181,8 +182,8 @@ bool check_scatter_add_args( } } const long* index_data = index.const_data_ptr<long>(); - for (size_t i = 0; i < index.numel(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto i : c10::irange(index.numel())) { + ET_CHECK_OR_RETURN_FALSE( index_data[i] >= 0 && index_data[i] < nonempty_size(self, dim), "Index is out of bounds for dimension %zd with size %zd", (size_t)dim, @@ -228,7 +229,7 @@ bool check_select_scatter_args( ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(dim, in.dim())); // The index shall be valid in the given dimension - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( index >= 0 && index < in.size(dim), "index %" PRId64 " out of range [-%zd,%zd) at in.size( %" PRId64 ")", index, @@ -239,7 +240,7 @@ bool check_select_scatter_args( // The src.dim() shall be one lower than in.dim() since src needs to fit // into the selected data on one dim of input // https://pytorch.org/docs/stable/generated/torch.select_scatter.html - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.dim() == src.dim() + 1, "in.dim() %zd != src.dim() + 1 %zd", in.dim(), diff --git a/kernels/portable/cpu/util/kernel_ops_util.cpp b/kernels/portable/cpu/util/kernel_ops_util.cpp index 2e267b57715..00b088a5cec 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.cpp +++ b/kernels/portable/cpu/util/kernel_ops_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree.
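Worked example of the index-bounds rule enforced in check_gather_args and check_scatter_add_args above: with self.size(dim) == 4, every value stored in index must lie in [0, 4). The concrete values below are illustrative only:

#include <array>
// {0, 3, 1, 2} passes the per-element check; replacing the 3 with a 4 would
// log "Index is out of bounds for dimension ..." and fail the kernel check.
constexpr std::array<long, 4> kExampleIndex{0, 3, 1, 2};
static_assert(kExampleIndex[1] >= 0 && kExampleIndex[1] < 4, "in bounds");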
*/ +#include #include #include @@ -26,14 +27,14 @@ bool param_array_is_valid( bool allow_empty) { auto size = array.size(); if (allow_empty) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( size == 0 || size == 1 || size == length, "Expected %s to have size 0, 1 or %zu but got %zd", name, length, size); } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( size == 1 || size == length, "Expected %s to have size 1 or %zu but got %zd", name, @@ -47,7 +48,7 @@ bool param_array_is_valid( } // namespace bool int_array_all_ge(IntArrayRef array, int64_t val) { - for (size_t i = 0; i < array.size(); ++i) { + for (const auto i : c10::irange(array.size())) { if (array[i] < val) { ET_LOG( Error, @@ -88,7 +89,7 @@ bool padding_is_valid( if (enforce_half_kernel) { // Padding must be at most half of kernel size. - for (size_t i = 0; i < padding.size(); i++) { + for (const auto i : c10::irange(padding.size())) { if (padding[i] > val_at(kernel_size, i) / 2) { ET_LOG( Error, @@ -122,11 +123,11 @@ bool output_padding_is_valid( kernel_ndim, /*allow_empty=*/false)); - for (size_t i = 0; i < kernel_ndim; i++) { + for (const auto i : c10::irange(kernel_ndim)) { const int64_t op_i = val_at(output_padding, i); const int64_t s_i = val_at(stride, i); const int64_t d_i = val_at(dilation, i); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( op_i < s_i || op_i < d_i, "output padding must be smaller than either stride or dilation"); } @@ -138,12 +139,12 @@ bool output_size_is_valid( size_t kernel_ndim) { bool valid = true; size_t out_dim = output_size.size(); - for (size_t i = 0; i < out_dim - kernel_ndim; i++) { + for (const auto i : c10::irange(out_dim - kernel_ndim)) { if (output_size[i] < 0) { valid = false; } } - for (size_t i = out_dim - kernel_ndim; i < out_dim; i++) { + for (const auto i : c10::irange(out_dim - kernel_ndim, out_dim)) { if (output_size[i] <= 0) { valid = false; } @@ -153,7 +154,7 @@ bool output_size_is_valid( Error, "The provided combination of input and kernel parameters " "produces an invalid output size:"); - for (size_t d = 0; d < output_size.size(); ++d) { + for ([[maybe_unused]] const auto d : c10::irange(output_size.size())) { ET_LOG( Error, " size(%zu): %zu", d, static_cast(output_size[d])); } @@ -167,11 +168,11 @@ void get_unsqueezed_sizes( executorch::aten::SizesType* sizes_arr, size_t& ndim) { ndim = t.dim() + 1; - for (int d = 0; d < unsqueeze_dim; ++d) { + for (const auto d : c10::irange(unsqueeze_dim)) { sizes_arr[d] = t.size(d); } sizes_arr[unsqueeze_dim] = 1; - for (int d = (unsqueeze_dim + 1); d < ndim; d++) { + for (const auto d : c10::irange(unsqueeze_dim + 1, ndim)) { sizes_arr[d] = t.size(d - 1); } } @@ -181,7 +182,7 @@ void get_unsqueezed_dim_order( executorch::aten::DimOrderType unsqueeze_dim, executorch::aten::DimOrderType* dim_order_arr) { int offset = 0; - for (int i = 0; i < t.dim(); ++i) { + for (const auto i : c10::irange(t.dim())) { executorch::aten::DimOrderType dim = t.dim_order()[i]; if (dim == unsqueeze_dim) { dim_order_arr[i] = dim; @@ -213,7 +214,7 @@ int64_t _kernel_output_size_helper( if (ceil_mode) { // ensure that the last pooling starts inside the image // needed to avoid problems in ceil mode - if ((outputSize - 1) * stride >= inputSize + pad) { + if ((outputSize - 1) * stride >= static_cast(inputSize) + pad) { --outputSize; } } @@ -231,7 +232,7 @@ void calculate_kernel_output_sizes( bool ceil_mode, bool transposed, IntArrayRef output_padding) { - for (size_t i = 0; i < kernel_ndim; ++i) { + for (const auto i : 
c10::irange(kernel_ndim)) { auto dim = in.dim() - (kernel_ndim - i); int64_t k = val_at(kernel_size, i); int64_t s = val_at(stride, i, /*default_value=*/k); @@ -246,12 +247,12 @@ void calculate_kernel_output_sizes( } bool check_arange_args(double start, double end, double step, Tensor& out) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.dim() == 1, "out should be a 1-d tensor, but got a %zu-d tensor", out.dim()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (step > 0 && (end >= start)) || (step < 0 && (end <= start)), "upper bound and larger bound inconsistent with step sign"); @@ -272,7 +273,7 @@ bool check_avg_pool2d_args( ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(in)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (in.dim() == 3 && in.size(0) > 0 && in.size(1) > 0 && in.size(2) > 0) || (in.dim() == 4 && in.size(1) > 0 && in.size(2) > 0 && in.size(3) > 0), "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input"); @@ -285,7 +286,7 @@ bool check_avg_pool2d_args( padding, kernel_size, /*kernel_ndim=*/2, /*enforce_half_kernel=*/true)); if (divisor_override.has_value()) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( divisor_override.value() != 0, "divisor_override must be non-zero, but found %" PRId64, divisor_override.value()); @@ -334,7 +335,7 @@ bool check_convolution_args( tensor_is_default_or_channels_last_dim_order(weight)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.dim() == 3 || in.dim() == 4, "Expect input tensor to be 3-D or 4-D, but got, %zu.", static_cast(in.dim())); @@ -343,7 +344,7 @@ bool check_convolution_args( if (bias.has_value()) { ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(bias.value(), 1)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( bias.value().size(0) == transposed ? 
groups * weight.size(1) : weight.size(0), "bias length must equal number of output channels, but got %zd", @@ -369,14 +370,14 @@ bool check_convolution_args( output_padding_is_valid(output_padding, stride, dilation, kernel_ndim)); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.size(0) >= groups, "Given groups=%" PRId64 ", expected weight to be at least %" PRId64 " at dimension 0, but got weight.size(0) = %zd instead", groups, groups, weight.size(0)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.size(0) % groups == 0, "Given groups=%" PRId64 ", expected weight to be divisible by %" PRId64 " at dimension 0, but got weight.size(0) = %zd instead", @@ -385,7 +386,7 @@ bool check_convolution_args( weight.size(0)); if (!transposed) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.size(1) == groups * weight.size(1), "Given groups=%" PRId64 " and weight.size(1) = %zd, expected input to have %" PRId64 @@ -395,7 +396,7 @@ bool check_convolution_args( groups * weight.size(1), in.size(1)); } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.size(1) == weight.size(0), "input channels must match weight.size(0) in transposed convolution"); } @@ -469,10 +470,10 @@ bool check_max_pool2d_with_indices_args( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& out, - Tensor& indices) { + const Tensor& out, + const Tensor& indices) { ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( indices.scalar_type() == ScalarType::Long, "Expected indices to have type of Long, but found %s", toString(indices.scalar_type())); @@ -480,7 +481,7 @@ bool check_max_pool2d_with_indices_args( ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(in)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_default_or_channels_last_dim_order(out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( (in.dim() == 3 && in.size(0) > 0 && in.size(1) > 0 && in.size(2) > 0) || (in.dim() == 4 && in.size(1) > 0 && in.size(2) > 0 && in.size(3) > 0), "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input"); @@ -543,11 +544,12 @@ bool check_constant_pad_args( ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(in, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( pad.size() % 2 == 0, "Padding array must be a multiple of 2"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - pad.size() / 2 <= in.dim(), "Padding array contains too many elements"); + ET_CHECK_OR_RETURN_FALSE( + static_cast(pad.size() / 2) <= in.dim(), + "Padding array contains too many elements"); return true; } @@ -559,11 +561,12 @@ Error resize_constant_pad_output( Tensor::SizesType expected_output_size[kTensorDimensionLimit]; int pad_i = in.dim() - 1; - for (size_t i = 0; i < in.dim(); ++i, --pad_i) { + for (const auto i : c10::irange(in.dim())) { expected_output_size[i] = in.size(i); - if (pad_i >= 0 && pad_i < pad.size() / 2) { + if (pad_i >= 0 && static_cast(pad_i) < pad.size() / 2) { expected_output_size[i] += pad[2 * pad_i] + pad[2 * pad_i + 1]; } + --pad_i; } ArrayRef output_size{ @@ -578,13 +581,13 @@ bool check_embedding_args( const Tensor& indices, const Tensor& out) { // Ensure weight is 2-D. It could be empty. 
- ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.dim() == 2, "weight.dim() %zd != 2", weight.dim()); // Ensure out is k+1 dimension tensor where k is the indices.dim() // out's first k dimension shall be same as indices, and the last dim shall // equal weight's last dim - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( out.dim() == indices.dim() + 1, "out.dim() %zd != indices.dim() %zd + 1", out.dim(), @@ -601,7 +604,7 @@ Error resize_embedding_output( const Tensor& indices, const Tensor& out) { Tensor::SizesType expected_output_size[kTensorDimensionLimit]; - for (size_t i = 0; i < indices.dim(); i++) { + for (const auto i : c10::irange(indices.dim())) { expected_output_size[i] = indices.size(i); } const size_t embedding_dim = weight.size(1); diff --git a/kernels/portable/cpu/util/kernel_ops_util.h b/kernels/portable/cpu/util/kernel_ops_util.h index 812e887111b..8028f254eb4 100644 --- a/kernels/portable/cpu/util/kernel_ops_util.h +++ b/kernels/portable/cpu/util/kernel_ops_util.h @@ -10,6 +10,7 @@ #include +#include #include namespace torch { @@ -182,9 +183,9 @@ void kernel_reduction_then_map_2d( int64_t d_W = val_at(dilation, 1, /*default_value=*/1); // Compute 2D output region - for (size_t out_y = 0; out_y < out_H; ++out_y) { + for (const auto out_y : c10::irange(out_H)) { out_coord[in_dim - 2] = out_y; - for (size_t out_x = 0; out_x < out_W; ++out_x) { + for (const auto out_x : c10::irange(out_W)) { out_coord[in_dim - 1] = out_x; bool accum_initialized = false; @@ -212,7 +213,7 @@ void kernel_reduction_then_map_2d( count = (ih1 - ih0) * (iw1 - iw0); } - for (size_t w_y = 0; w_y < k_H; ++w_y) { + for (const auto w_y : c10::irange(k_H)) { int64_t stride_y = s_H; int64_t padding_y = p_H; int64_t dilation_y = d_H; @@ -220,7 +221,7 @@ void kernel_reduction_then_map_2d( size_t in_y = stride_y * out_y + dilation_y * w_y - padding_y; in_coord[in_dim - 2] = in_y; - for (size_t w_x = 0; w_x < k_W; ++w_x) { + for (const auto w_x : c10::irange(k_W)) { int64_t stride_x = s_W; int64_t padding_x = p_W; int64_t dilation_x = d_W; @@ -356,8 +357,8 @@ void apply_kernel_2d_reduce_then_map_fn( if (in.dim() == 4) { batch_size = in_sizes[0]; } - for (size_t batch = 0; batch < batch_size; ++batch) { - for (size_t channel = 0; channel < in_sizes[in.dim() - 3]; ++channel) { + for (const auto batch : c10::irange(batch_size)) { + for (const auto channel : c10::irange(in_sizes[in.dim() - 3])) { kernel_reduction_then_map_2d( reduce_fn, map_fn, @@ -441,8 +442,8 @@ bool check_max_pool2d_with_indices_args( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - Tensor& out, - Tensor& indices); + const Tensor& out, + const Tensor& indices); void get_max_pool2d_with_indices_out_target_size( const Tensor& in, diff --git a/kernels/portable/cpu/util/normalization_ops_util.cpp b/kernels/portable/cpu/util/normalization_ops_util.cpp index 684417f448a..db18cf0c053 100644 --- a/kernels/portable/cpu/util/normalization_ops_util.cpp +++ b/kernels/portable/cpu/util/normalization_ops_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
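A hedged shape example for the embedding checks above: check_embedding_args and resize_embedding_output require out to carry indices' shape with weight's embedding width appended. The sizes below are illustrative only:

// weight: [10, 4] (10 rows, embedding_dim 4); indices: [2, 3]
// expected out: indices' sizes plus embedding_dim appended -> [2, 3, 4]
constexpr int64_t kWeightSizes[] = {10, 4};
constexpr int64_t kIndicesSizes[] = {2, 3};
constexpr int64_t kExpectedOut[] = {2, 3, 4};
static_assert(
    kExpectedOut[0] == kIndicesSizes[0] && kExpectedOut[1] == kIndicesSizes[1] &&
        kExpectedOut[2] == kWeightSizes[1],
    "out = indices sizes + [embedding_dim]");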
*/ +#include #include #include @@ -81,20 +82,20 @@ bool check_layer_norm_args( Tensor& mean_out, Tensor& rstd_out) { size_t ndim = normalized_shape.size(); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( ndim >= 1, "Expected normalized_shape to be at least 1-dimensional, i.e., containing at least one element."); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - in.dim() >= ndim, + ET_CHECK_OR_RETURN_FALSE( + in.dim() >= static_cast(ndim), "Expected input tensor to have rank >= the length of normalized_shape."); size_t shift = in.dim() - ndim; - for (size_t d = 0; d < ndim; ++d) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + for (const auto d : c10::irange(ndim)) { + ET_CHECK_OR_RETURN_FALSE( in.size(d + shift) == normalized_shape[d], "Expected normalized_shape to match the sizes of input's rightmost dimensions."); } executorch::aten::SizesType shape[ndim]; - for (size_t i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { shape[i] = static_cast(normalized_shape[i]); } @@ -121,8 +122,8 @@ void get_layer_norm_out_target_size( size_t* mean_rstd_ndim) { *mean_rstd_ndim = in.dim(); - for (size_t d = 0; d < in.dim(); ++d) { - if (d < in.dim() - normalized_shape.size()) { + for (const auto d : c10::irange(in.dim())) { + if (d < static_cast(in.dim() - normalized_shape.size())) { mean_rstd_sizes[d] = in.size(d); } else { mean_rstd_sizes[d] = 1; @@ -144,16 +145,16 @@ bool check_group_norm_args( ET_LOG_AND_RETURN_IF_FALSE(in.size(0) == N); ET_LOG_AND_RETURN_IF_FALSE(in.size(1) == C); ET_LOG_AND_RETURN_IF_FALSE(in.numel() == N * C * HxW); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( group > 0, "Expected number of groups to be greater than 0"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( C % group == 0, "Expected number of channels in input to be divisible by number of groups"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !weight.has_value() || (weight.value().dim() == 1 && weight.value().size(0) == C), "Expected weight to be a vector of size equal to the number of channels in input"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !bias.has_value() || (bias.value().dim() == 1 && bias.value().size(0) == C), "Expected bias to be a vector of size equal to the number of channels in input"); diff --git a/kernels/portable/cpu/util/padding_util.cpp b/kernels/portable/cpu/util/padding_util.cpp index 251c7f1c44b..d5b6e26784b 100644 --- a/kernels/portable/cpu/util/padding_util.cpp +++ b/kernels/portable/cpu/util/padding_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -21,10 +22,10 @@ bool check_padding_args( executorch::aten::ArrayRef padding, Tensor& out, bool reflection) { - ET_LOG_AND_RETURN_IF_FALSE(padding.size() == 2 * n); + ET_LOG_AND_RETURN_IF_FALSE(static_cast(padding.size()) == 2 * n); ET_LOG_AND_RETURN_IF_FALSE(in.dim() == n + 1 || in.dim() == n + 2); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); - for (size_t i = 1; i <= n; ++i) { + for (const auto i : c10::irange(1, n + 1)) { ET_LOG_AND_RETURN_IF_FALSE( in.size(in.dim() - i) + padding[2 * i - 2] + padding[2 * i - 1] >= 0); if (reflection) { @@ -43,10 +44,10 @@ void get_padding_out_target_size( Tensor::SizesType* out_sizes, size_t* out_ndim) { *out_ndim = in.dim(); - for (size_t i = 0; i < in.dim(); ++i) { + for (const auto i : c10::irange(in.dim())) { out_sizes[i] = in.size(i); } - for (size_t i = 1; i <= n; ++i) { + for (const auto i : c10::irange(1, n + 1)) { out_sizes[in.dim() - i] = in.size(in.dim() - i) + padding[2 * i - 2] + padding[2 * i - 1]; } diff --git a/kernels/portable/cpu/util/padding_util.h b/kernels/portable/cpu/util/padding_util.h index f8aa367a94b..50cfcc65643 100644 --- a/kernels/portable/cpu/util/padding_util.h +++ b/kernels/portable/cpu/util/padding_util.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include @@ -51,11 +52,10 @@ void pad1d( const auto in_width = in.size(dim); const auto out_width = out.size(dim); const auto pad_left = padding[0]; - - for (size_t i = 0; i < outer; i++) { + for (const auto i : c10::irange(outer)) { size_t out_i_base = i * out_width; size_t in_i_base = i * in_width; - for (size_t w = 0; w < out_width; w++) { + for (const auto w : c10::irange(out_width)) { out_data[out_i_base + w] = in_data[in_i_base + padding_ix(w, in_width, pad_left)]; } @@ -80,14 +80,14 @@ void pad2d( const auto pad_left = padding[0]; const auto pad_top = padding[2]; - for (size_t i = 0; i < outer; i++) { + for (const auto i : c10::irange(outer)) { size_t out_i_base = i * out_height * out_width; size_t in_i_base = i * in_height * in_width; - for (size_t h = 0; h < out_height; h++) { + for (const auto h : c10::irange(out_height)) { size_t out_h_base = out_i_base + h * out_width; size_t in_h_base = in_i_base + padding_ix(h, in_height, pad_top) * in_width; - for (size_t w = 0; w < out_width; w++) { + for (const auto w : c10::irange(out_width)) { out_data[out_h_base + w] = in_data[in_h_base + padding_ix(w, in_width, pad_left)]; } @@ -116,18 +116,18 @@ void pad3d( const auto pad_top = padding[2]; const auto pad_front = padding[4]; - for (size_t i = 0; i < outer; i++) { + for (const auto i : c10::irange(outer)) { size_t out_i_base = i * out_depth * out_height * out_width; size_t in_i_base = i * in_depth * in_height * in_width; - for (size_t d = 0; d < out_depth; d++) { + for (const auto d : c10::irange(out_depth)) { size_t out_d_base = out_i_base + d * out_height * out_width; size_t in_d_base = in_i_base + padding_ix(d, in_depth, pad_front) * in_height * in_width; - for (size_t h = 0; h < out_height; h++) { + for (const auto h : c10::irange(out_height)) { size_t out_h_base = out_d_base + h * out_width; size_t in_h_base = in_d_base + padding_ix(h, in_height, pad_top) * in_width; - for (size_t w = 0; w < out_width; w++) { + for (const auto w : c10::irange(out_width)) { out_data[out_h_base + w] = in_data[in_h_base + padding_ix(w, in_width, pad_left)]; } diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 65140fc6643..31296d67ee7 100644 --- 
a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -48,10 +48,9 @@ ET_NODISCARD bool check_dim_list_is_valid( } const size_t non_neg_d = _normalize_non_neg_d(d, in.dim()); - ET_LOG_AND_RETURN_IF_FALSE( - non_neg_d < kTensorDimensionLimit && non_neg_d >= 0); + ET_LOG_AND_RETURN_IF_FALSE(non_neg_d < kTensorDimensionLimit); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim_exist[non_neg_d] == false, "dim %zd appears multiple times in the list of dims", non_neg_d); @@ -84,12 +83,8 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < in.dim(); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } const size_t d = _normalize_non_neg_d(dim.value(), in.dim()); return in.size(d); @@ -105,16 +100,12 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; - const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < in.dim(); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } + size_t dim_product = 1; for (const auto& d : dim_list.value()) { - const size_t non_neg_d = _normalize_non_neg_d(d, in_dim); + const size_t non_neg_d = _normalize_non_neg_d(d, in.dim()); dim_product *= in.size(non_neg_d); } return dim_product; @@ -136,7 +127,7 @@ size_t get_out_numel( ET_CHECK_VALID_DIM(dim_val, in.dim()); } const size_t non_neg_dim = _normalize_non_neg_d(dim_val, in.dim()); - for (size_t d = 0; d < in.dim(); ++d) { + for (size_t d = 0; d < static_cast(in.dim()); ++d) { if (d != non_neg_dim) { out_numel *= in.size(d); } @@ -155,7 +146,7 @@ size_t get_out_numel( dim_list) { size_t out_numel = 1; if (dim_list.has_value() && dim_list.value().size() != 0) { - for (size_t d = 0; d < in.dim(); ++d) { + for (size_t d = 0; d < static_cast(in.dim()); ++d) { if (!check_dim_in_dim_list(d, in.dim(), dim_list.value())) { out_numel *= in.size(d); } @@ -186,7 +177,7 @@ size_t get_init_index( size_t mutable_out_ix = out_ix; auto strides = in.strides(); for (int64_t d = in.dim() - 1; d >= 0; d--) { - if (d != non_neg_dim) { + if (d != static_cast(non_neg_dim)) { init_ix += (mutable_out_ix % in.size(d)) * strides[d]; mutable_out_ix /= in.size(d); } @@ -234,7 +225,7 @@ size_t compute_reduced_out_size( if (dim.has_value()) { const auto dim_val = dim.value(); const size_t non_neg_dim = _normalize_non_neg_d(dim_val, in_dim); - for (ssize_t i = 0; i < non_neg_dim; ++i) { + for (size_t i = 0; i < non_neg_dim; ++i) { sizes_arr[i] = in.size(i); } if (keepdim) { @@ -250,7 +241,7 @@ size_t compute_reduced_out_size( } } else { if (keepdim) { - for (size_t i = 0; i < in_dim; ++i) { + for (size_t i = 0; i < static_cast(in_dim); ++i) { sizes_arr[i] = 1; } } else { @@ -266,7 +257,9 @@ size_t compute_reduced_out_size( dim_list, bool keepdim, executorch::aten::SizesType* sizes_arr) { - const auto in_dim = in.dim(); + // check_dim_in_dim_list and later comparisons + // expect in_dim to be size_t, so cast it here + const size_t in_dim = static_cast(in.dim()); size_t out_dim = in_dim; if (dim_list.has_value() && dim_list.value().size() != 0) { diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 25a2c0b44c4..9319ab01142 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -45,12 
+46,12 @@ template void apply_on_flat_ix_with_dim_mask_and_base( const Fn& fn, const Tensor& in, - bool* dim_mask, + const bool* dim_mask, const size_t base, const size_t start, const size_t end) { // Compute innermost dim from dim list - size_t inner_dim = in.dim() - 1; + int64_t inner_dim = in.dim() - 1; while (!dim_mask[inner_dim]) { inner_dim--; } @@ -58,7 +59,7 @@ void apply_on_flat_ix_with_dim_mask_and_base( // Initialize array of indices per dimension. This array is used to maintain // the per-dimension index of the element in `in` that is being reduced over // Only the dims that are in the dim list are relevant. - size_t dim_index[kTensorDimensionLimit]; + int64_t dim_index[kTensorDimensionLimit]; for (int64_t d = 0; d < in.dim(); d++) { dim_index[d] = 0; } @@ -163,6 +164,14 @@ size_t get_reduced_dim_product( const executorch::aten::optional>& dim_list); +// Resolve ambiguity between the above two overloads -- ArrayRef and +// optional are both implicitly constructible from int64_t. +inline size_t get_reduced_dim_product( + const executorch::aten::Tensor& in, + int64_t dim) { + return get_reduced_dim_product(in, executorch::aten::optional(dim)); +} + size_t get_out_numel( const executorch::aten::Tensor& in, const executorch::aten::optional& dim); @@ -172,6 +181,12 @@ size_t get_out_numel( const executorch::aten::optional>& dim_list); +// Resolve ambiguity between the above two overloads -- ArrayRef and +// optional are both implicitly constructible from int64_t. +inline size_t get_out_numel(const executorch::aten::Tensor& in, int64_t dim) { + return get_out_numel(in, executorch::aten::optional(dim)); +} + size_t get_init_index( const executorch::aten::Tensor& in, const executorch::aten::optional& dim, @@ -183,6 +198,12 @@ size_t get_init_index( dim_list, const size_t out_ix); +inline size_t get_init_index( + const executorch::aten::Tensor& in, + int64_t dim, + const size_t out_ix) { + return get_init_index(in, executorch::aten::optional(dim), out_ix); +} // // Iteration Functions // @@ -295,6 +316,116 @@ void apply_over_dim( } } +/** + * Execution plan for repeated apply_over_dim_list with the same + * function, input tensor, dim list, start, and end but varying + * out_ix, as done (via {map_,}reduce_over_dim_list) in reductions. + */ +class ApplyOverDimListPlan { + public: + ApplyOverDimListPlan( + const executorch::aten::Tensor& in, + // If set, lifetime must last until execute() returns. + const executorch::aten::optional>& + dim_list, + const int64_t start = 0, + const int64_t end = -1) + : dim_list_(dim_list), in_(in) { + ET_CHECK(check_dim_list_is_valid(in, dim_list)); + out_numel_ = get_out_numel(in_, dim_list); + if (in.numel() == 0) { + mode_ = ExecutionMode::NothingToDo; + return; + } + const size_t iter_length = get_reduced_dim_product(in, dim_list); + const size_t normalized_start = ET_NORMALIZE_IX(start, iter_length); + const size_t normalized_end = ET_NORMALIZE_IX(end, iter_length); + ustart_ = std::max(normalized_start, size_t(0)); + uend_ = std::min(normalized_end, iter_length - 1); + if (!dim_list.has_value() || dim_list.value().size() == 0 || + in.dim() == 0) { + mode_ = ExecutionMode::NoDimMaskOrZeroDimension; + return; + } + dim_list_ = dim_list.value(); + if (dim_list_.value().size() == 1) { + mode_ = ExecutionMode::OnlyOneDim; + return; + } + is_in_dim_list_.fill(0); + for (const auto& d : dim_list.value()) { + const size_t non_neg_d = d < 0 ? 
+ /** * Useful to reduce a tensor `in` over a given list of dimensions `dim_list` * for the output element at index `out_ix` using the reduce function @@ -311,42 +442,8 @@ void apply_over_dim_list( const size_t out_ix, const int64_t start = 0, const int64_t end = -1) { - ET_CHECK(check_dim_list_is_valid(in, dim_list)); - ET_CHECK_MSG( - out_ix < get_out_numel(in, dim_list), - "Out index %zd is out of bounds", - out_ix); - - if (in.numel() == 0) { - return; - } - - const size_t iter_length = get_reduced_dim_product(in, dim_list); - const size_t normalized_start = ET_NORMALIZE_IX(start, iter_length); - const size_t normalized_end = ET_NORMALIZE_IX(end, iter_length); - const size_t ustart = std::max(normalized_start, size_t(0)); - const size_t uend = std::min(normalized_end, iter_length - 1); - - // If dim_list is null or empty, or in is 0-D, iterate over the entire tensor - if (!dim_list.has_value() || dim_list.value().size() == 0 || in.dim() == 0) { - apply_on_flat_ix_with_stride_and_base( - fn, /*stride=*/1, /*base=*/0, ustart, uend); - return; - } - - // Create is_in_dims to check whether each dimension is in the dim list - bool is_in_dim_list[kTensorDimensionLimit]; - memset(is_in_dim_list, false, sizeof(is_in_dim_list)); - for (const auto& d : dim_list.value()) { - const size_t non_neg_d = d < 0 ? d + in.dim() : d; - is_in_dim_list[non_neg_d] = true; - } - - // Compute the starting base index - const size_t base = get_init_index(in, dim_list, out_ix); - - apply_on_flat_ix_with_dim_mask_and_base( - fn, in, is_in_dim_list, base, ustart, uend); + ApplyOverDimListPlan plan(in, dim_list, start, end); + plan.execute(fn, out_ix); }
// @@ -430,6 +527,52 @@ std::tuple<CTYPE_OUT, int64_t> map_reduce_over_dim( return std::tuple<CTYPE_OUT, int64_t>{acc_val, acc_ix}; } +/** + * Execution plan for repeated map_reduce_over_dim_list with the same + * function, input tensor, and dim_list but varying out_ix. + */ +class MapReduceOverDimListPlan { + public: + MapReduceOverDimListPlan( + const executorch::aten::Tensor& in, + const executorch::aten::optional<executorch::aten::ArrayRef<int64_t>>& + dim_list) + : plan_(in, dim_list, 1, -1) { + ET_CHECK_MSG(in.numel() > 0, "Input tensor must be nonempty"); + } + + template < + typename CTYPE_IN, + typename CTYPE_OUT, + typename MapOp, + typename ReduceOp> + CTYPE_OUT execute( + const MapOp& map_fun, + const ReduceOp& reduce_fun, + const size_t out_ix) const { + const size_t init_index = + get_init_index(plan_.get_input_tensor(), plan_.get_dim_list(), out_ix); + + const CTYPE_IN* const in_data = + plan_.get_input_tensor().const_data_ptr<CTYPE_IN>(); + CTYPE_OUT acc_val = map_fun(in_data[init_index]); + + if (plan_.get_input_tensor().numel() == 1) { + return acc_val; + } + + plan_.execute( + [&acc_val, reduce_fun, map_fun, in_data](const size_t in_ix) { + acc_val = reduce_fun(map_fun(in_data[in_ix]), acc_val); + }, + out_ix); + return acc_val; + } + + private: + ApplyOverDimListPlan plan_; +}; + /** * Useful to reduce a tensor `in` over a given list of dimensions `dim_list` * for the output element at index `out_ix`, first applying the map `map_fun` @@ -465,35 +608,8 @@ CTYPE_OUT map_reduce_over_dim_list( const executorch::aten::optional<executorch::aten::ArrayRef<int64_t>>& dim_list, const size_t out_ix) { - ET_CHECK(check_dim_list_is_valid(in, dim_list)); - - ET_CHECK_MSG( - out_ix < get_out_numel(in, dim_list), - "Out index %zd is out of bounds", - out_ix); - - ET_CHECK_MSG(in.numel() > 0, "Input tensor must be nonempty"); - - const size_t init_index = get_init_index(in, dim_list, out_ix); - - const CTYPE_IN* const in_data = in.const_data_ptr<CTYPE_IN>(); - CTYPE_OUT acc_val = map_fun(in_data[init_index]); - - if (in.numel() == 1) { - return acc_val; - } - - apply_over_dim_list( - [&acc_val, reduce_fun, map_fun, in_data](const size_t in_ix) { - acc_val = reduce_fun(map_fun(in_data[in_ix]), acc_val); - }, - in, - dim_list, - out_ix, - 1, - -1); - - return acc_val; + MapReduceOverDimListPlan plan(in, dim_list); + return plan.execute<CTYPE_IN, CTYPE_OUT>(map_fun, reduce_fun, out_ix); } /** @@ -526,6 +642,28 @@ std::tuple<CTYPE, int64_t> reduce_over_dim( [](CTYPE v) { return v; }, reduce_fun, in, dim, out_ix); }
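Note that execute() cannot deduce CTYPE_IN and CTYPE_OUT from its arguments, so callers pass them explicitly, exactly as the rewritten map_reduce_over_dim_list does above. A short sketch (hypothetical fragment; the float dtype and sum-of-squares operation are assumptions):

// Hypothetical sum-of-squares reduction: map squares each element,
// reduce accumulates. One plan serves every output element.
torch::executor::MapReduceOverDimListPlan plan(in, dim_list);
float* out_data = out.mutable_data_ptr<float>();
for (size_t out_ix = 0; out_ix < static_cast<size_t>(out.numel()); ++out_ix) {
  out_data[out_ix] = plan.execute<float, float>(
      [](float v) { return v * v; },           // map
      [](float a, float b) { return a + b; },  // reduce
      out_ix);
}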
+ */ +class ReduceOverDimListPlan { + public: + ReduceOverDimListPlan( + const executorch::aten::Tensor& in, + const executorch::aten::optional<executorch::aten::ArrayRef<int64_t>>& + dim_list) + : plan_(in, dim_list) {} + + template <typename CTYPE, typename ReduceOp> + CTYPE execute(const ReduceOp& reduce_fun, const size_t out_ix) { + return plan_.execute<CTYPE, CTYPE>( + [](CTYPE v) { return v; }, reduce_fun, out_ix); + } + + private: + MapReduceOverDimListPlan plan_; +}; + /** * Useful to reduce a tensor `in` over a given list of dimensions `dim_list` * for the output element at index `out_ix` using the reduce function @@ -552,8 +690,8 @@ CTYPE reduce_over_dim_list( const executorch::aten::optional<executorch::aten::ArrayRef<int64_t>>& dim_list, const size_t out_ix) { - return map_reduce_over_dim_list<CTYPE, CTYPE>( - [](CTYPE v) { return v; }, reduce_fun, in, dim_list, out_ix); + ReduceOverDimListPlan plan(in, dim_list); + return plan.execute<CTYPE>(reduce_fun, out_ix); } // @@ -614,6 +752,17 @@ Error resize_reduction_out( bool keepdim, executorch::aten::Tensor& out); +// Resolve ambiguity between the above two overloads -- ArrayRef and +// optional are both implicitly constructible from int64_t. +inline Error resize_reduction_out( + const executorch::aten::Tensor& in, + int64_t dim, + bool keepdim, + executorch::aten::Tensor& out) { + return resize_reduction_out( + in, executorch::aten::optional<int64_t>(dim), keepdim, out); +} + #ifndef USE_ATEN_LIB bool check_reduction_args( const Tensor& in, @@ -663,5 +812,51 @@ bool check_prod_out_args( #endif +/** + * parallel_for wrapper for reductions that call reduce_over_dim or + * map_reduce_over_dim for each output element. Automatically + * calculates appropriate grain size. + */ +template <typename Func> +[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index( + const Tensor& in, + executorch::aten::optional<int64_t> dim, + const Tensor& out, + const Func& func) { +#ifdef ET_USE_THREADPOOL + const ssize_t reduction_size = get_reduced_dim_product(in, dim); + const auto grain_size = std::max( + static_cast<ssize_t>(1), + static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 1; +#endif // ET_USE_THREADPOOL + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} + +/** + * parallel_for wrapper for reductions that call reduce_over_dim_list or + * map_reduce_over_dim_list for each output element. Automatically + * calculates appropriate grain size. + */ +template <typename Func> +[[nodiscard]] bool parallel_for_each_reduce_over_dim_list_output_index( + const Tensor& in, + executorch::aten::optional<executorch::aten::ArrayRef<int64_t>> dim_list, + const Tensor& out, + const Func& func) { +#ifdef ET_USE_THREADPOOL + const ssize_t reduction_size = get_reduced_dim_product(in, dim_list); + const auto grain_size = std::max( + static_cast<ssize_t>(1), + static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 1; +#endif // ET_USE_THREADPOOL + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} + } // namespace executor } // namespace torch
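The [[nodiscard]] return propagates parallel_for's success flag, so kernels are expected to check it. A sketch of the intended call shape (hypothetical kernel fragment; ctx, the float dtype, and the max reduction are assumptions, not from the diff):

// Hypothetical reduction kernel: the wrapper picks a grain size from
// GRAIN_SIZE / reduction_size and splits output indexes across threads.
torch::executor::ReduceOverDimListPlan plan(in, dim_list);
float* out_data = out.mutable_data_ptr<float>();
const bool ok =
    torch::executor::parallel_for_each_reduce_over_dim_list_output_index(
        in, dim_list, out, [&](const auto begin, const auto end) {
          for (auto out_ix = begin; out_ix < end; ++out_ix) {
            out_data[out_ix] = plan.execute<float>(
                [](float a, float b) { return a > b ? a : b; }, out_ix);
          }
        });
ET_KERNEL_CHECK(ctx, ok, Internal, );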
diff --git a/kernels/portable/cpu/util/repeat_util.cpp b/kernels/portable/cpu/util/repeat_util.cpp index d373a86c16c..be7231cb621 100644 --- a/kernels/portable/cpu/util/repeat_util.cpp +++ b/kernels/portable/cpu/util/repeat_util.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -25,8 +26,8 @@ bool check_repeat_args( executorch::aten::ArrayRef<int64_t> repeats, Tensor& out) { // Ensure the self tensors list is non-empty. - ET_LOG_MSG_AND_RETURN_IF_FALSE( - repeats.size() >= self.dim(), + ET_CHECK_OR_RETURN_FALSE( + static_cast<ssize_t>(repeats.size()) >= self.dim(), "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Repeat arrayref shall not contain negative element. bool all_non_negative = true; for (auto repeat : repeats) { all_non_negative = all_non_negative && (repeat >= 0); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( all_non_negative, "Trying to create tensor with negative dimension"); /// Check if out.size() is legal. - ET_LOG_MSG_AND_RETURN_IF_FALSE( - out.dim() == repeats.size(), + ET_CHECK_OR_RETURN_FALSE( + static_cast<size_t>(out.dim()) == repeats.size(), "The dimension of out shall equal size of repeats, but now is %zd and %zd", out.dim(), repeats.size()); @@ -47,8 +48,8 @@ // Right now we only support the tensors whose dimension is no greater than // kTensorDimensionLimit. Only check out tensor because the number of // dimension of out tensor shall have more than or equal to self tensor - ET_LOG_MSG_AND_RETURN_IF_FALSE( - out.dim() <= kTensorDimensionLimit, + ET_CHECK_OR_RETURN_FALSE( + static_cast<size_t>(out.dim()) <= kTensorDimensionLimit, "The dimension of input and output should not be larger than %zd", kTensorDimensionLimit); @@ -58,7 +59,7 @@ // repeats, and called it reformat_self_size. We then make point-to-point mul // of reformat_self_size and repeats. The result should equal out.size(). size_t reformat_self_size[kTensorDimensionLimit]; - for (size_t i = 0; i < out.dim() - self.dim(); i++) { + for (ssize_t i = 0; i < out.dim() - self.dim(); i++) { reformat_self_size[i] = 1; } @@ -66,8 +67,9 @@ reformat_self_size[out.dim() - 1 - i] = self.size(self.dim() - 1 - i); } for (size_t i = 0; i < repeats.size(); i++) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - reformat_self_size[i] * repeats[i] == out.size(i), + ET_CHECK_OR_RETURN_FALSE( + reformat_self_size[i] * repeats[i] == + static_cast<size_t>(out.size(i)), "Expect out size at dimension %zu is %" PRId64 ", but now is %zd", i, reformat_self_size[i] * repeats[i],
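The size rule being enforced: self's sizes are right-aligned under repeats, left-padded with 1s, then multiplied elementwise to give the expected out sizes. A worked instance (hypothetical shapes, not from the diff):

#include <array>
#include <cstddef>
#include <cstdint>

// self.sizes() = {3, 2}, repeats = {4, 1, 5}:
// reformat_self_size = {1, 3, 2} and the expected out size = {4, 3, 10}.
std::array<int64_t, 3> expected_repeat_out_size(
    const std::array<int64_t, 2>& self_sizes,
    const std::array<int64_t, 3>& repeats) {
  const std::array<int64_t, 3> reformat = {1, self_sizes[0], self_sizes[1]};
  std::array<int64_t, 3> out{};
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = reformat[i] * repeats[i];
  }
  return out;
}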
@@ -130,7 +132,7 @@ void repeat_internal( // The increment along index of slot array to reach the next possible valid // value. int64_t incr[kTensorDimensionLimit]; - for (size_t i = 0; i < self_dim; i++) { + for (size_t i = 0; i < static_cast<size_t>(self_dim); i++) { incr[i] = self_size[i]; } @@ -140,7 +142,7 @@ // than self). size_t index = self_dim - 1; size_t start = out.dim() - self_dim; - while (slots[0] != out.size(start)) { + while (slots[0] != static_cast<size_t>(out.size(start))) { // Compute the offset (from origin) in the out tensor where this self // data will be copied to. size_t offset = compute_access_offset(slots, strides, self_dim); @@ -150,7 +152,7 @@ slots[index] += incr[index]; // If we have reached the limit in the innermost dimension, successively // increment the slot index of outer dimensions. - while (slots[index] == out.size(start + index)) { + while (slots[index] == static_cast<size_t>(out.size(start + index))) { if (index == 0) { break; } @@ -226,7 +228,7 @@ Error repeat_tensor( // so we reset the upper bound of innermost dim to 1. 'in_incr' indicates // the size (in bytes) of the self data. int64_t limits[kTensorDimensionLimit]; - for (size_t i = 0; i < self_dim; i++) { + for (ssize_t i = 0; i < self_dim; i++) { limits[i] = self_size[i]; } @@ -242,7 +244,7 @@ // one array a time. To do so, we iterate over all the valid values of slots // array. The repeat_internal() takes care of replicating the array along the // coordinates specified by repeats array. - while (slots[0] != limits[0]) { + while (static_cast<int64_t>(slots[0]) != limits[0]) { // Compute the offset (from origin) in the out tensor where the self // array (with indices in self tensor indicated by slots) will be copied. size_t out_offset = compute_access_offset(slots, strides, self_dim); @@ -256,7 +258,7 @@ slots[index]++; // If we have reached the limit in the innermost dimension, successively // increment the slot index of outer dimensions. - while (slots[index] == limits[index]) { + while (static_cast<int64_t>(slots[index]) == limits[index]) { if (index == 0) { break; } diff --git a/kernels/portable/cpu/util/slice_util.cpp b/kernels/portable/cpu/util/slice_util.cpp index a948a370de2..5761dee0ba7 100644 --- a/kernels/portable/cpu/util/slice_util.cpp +++ b/kernels/portable/cpu/util/slice_util.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include <c10/util/irange.h> #include #include #include @@ -24,7 +25,7 @@ bool check_narrow_copy_args( ET_LOG_AND_RETURN_IF_FALSE(in.dim() > 0); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); - ET_LOG_MSG_AND_RETURN_IF_FALSE(lenth >= 0, "lenth must be non-negative"); + ET_CHECK_OR_RETURN_FALSE(lenth >= 0, "lenth must be non-negative"); ET_LOG_AND_RETURN_IF_FALSE(start >= -in.size(dim)); ET_LOG_AND_RETURN_IF_FALSE(start <= in.size(dim)); if (start < 0) { @@ -42,7 +43,7 @@ void get_narrow_copy_out_target_size( size_t* out_ndim) { *out_ndim = in.dim(); - for (size_t d = 0; d < in.dim(); ++d) { + for (const auto d : c10::irange(in.dim())) { out_sizes[d] = in.size(d); } out_sizes[dim] = length; @@ -56,8 +57,7 @@ bool check_slice_copy_args( ET_LOG_AND_RETURN_IF_FALSE(in.dim() > 0); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - step > 0, "slice step must be greater than zero"); + ET_CHECK_OR_RETURN_FALSE(step > 0, "slice step must be greater than zero"); return true; } @@ -89,18 +89,17 @@ bool check_slice_scatter_args( ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_rank(input, src)); // Check step.
Step must be greater than zero - ET_LOG_MSG_AND_RETURN_IF_FALSE( - step > 0, "slice step must be greater than zero"); + ET_CHECK_OR_RETURN_FALSE(step > 0, "slice step must be greater than zero"); // The size of src tensor should follow these rules: // - src.size(i) shall equal to input.size(i) if i != dim, // - src.size(dim) shall equal to num_values - for (size_t d = 0; d < input.dim() - 1; d++) { + for (const auto d : c10::irange(input.dim() - 1)) { if (d != dim) { ET_LOG_AND_RETURN_IF_FALSE( tensors_have_same_size_at_dims(input, d, src, d)); } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( src.size(d) == num_values, "input.size(%zu) %zd != num_values %" PRId64 " | dim = %" PRId64 ")", d, @@ -168,9 +167,9 @@ void compute_slice( const char* input_data = in.const_data_ptr(); char* dest = out.mutable_data_ptr(); - for (int i = 0; i < leading_dims; i++) { + for (const auto i : c10::irange(leading_dims)) { const char* src = input_data + (i * dim_length + start) * length_per_step; - for (int j = 0; j < length; j++) { + for ([[maybe_unused]] const auto j : c10::irange(length)) { memcpy(dest, src, length_per_step); src += step * length_per_step; dest += length_per_step; diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 26f55a91e8d..a623b9d4d7a 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -32,6 +32,7 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:slice_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", + "//executorch/runtime/kernel:thread_parallel_interface", ], visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], ) @@ -61,17 +62,22 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", ], - compiler_flags = ["-Wno-missing-prototypes"], visibility = ["//executorch/kernels/portable/cpu/..."], ) runtime.cxx_library( name = "broadcast_util", - srcs = ["broadcast_util.cpp"], + srcs = [ + "broadcast_util.cpp", + "delinearize_index.cpp", + ], exported_headers = [ "broadcast_util.h", + "delinearize_index.h", + ], + exported_deps = [ + ":broadcast_indexes_range", ], - compiler_flags = ["-Wno-missing-prototypes"], deps = [ ":repeat_util", "//executorch/runtime/kernel:kernel_includes", @@ -100,9 +106,14 @@ def define_common_targets(): "elementwise_util.h", ], compiler_flags = ["-Wno-missing-prototypes"], - deps = [ + exported_deps = [ + ":broadcast_indexes_range", ":broadcast_util", ":dtype_util", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/kernel:thread_parallel_interface", + ], + deps = [ "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/runtime/kernel:kernel_includes", ], @@ -233,6 +244,9 @@ def define_common_targets(): name = "functional_util", srcs = [], exported_headers = ["functional_util.h"], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/runtime/core/exec_aten/util:tensor_util", @@ -280,8 +294,21 @@ def define_common_targets(): visibility = 
["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "broadcast_indexes_range", + exported_headers = ["broadcast_indexes_range.h"], + deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/util:tensor_dimension_limit", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + # Utility functions that can be used by operators that perform reduction - for aten_mode in [True, False]: + for aten_mode in get_aten_mode_options(): suffix = "_aten" if aten_mode else "" runtime.cxx_library( name = "reduce_util{}".format(suffix), @@ -291,6 +318,9 @@ def define_common_targets(): "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ "//executorch/extension/llm/custom_ops/...", diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt index 5f81e4b6aec..d95b3a81b5c 100644 --- a/kernels/portable/cpu/util/test/CMakeLists.txt +++ b/kernels/portable/cpu/util/test/CMakeLists.txt @@ -17,9 +17,11 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_test_srcs broadcast_test.cpp reduce_test.cpp) +set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp + reduce_test.cpp +) et_cxx_test( kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp new file mode 100644 index 00000000000..1023915ea66 --- /dev/null +++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp @@ -0,0 +1,284 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h> +#include <executorch/kernels/portable/cpu/util/broadcast_util.h> +#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h> + +#include <gtest/gtest.h> + +using executorch::aten::ScalarType; using executorch::aten::Tensor; using executorch::runtime::testing::TensorFactory; using torch::executor::BroadcastIndexesRange; using torch::executor::delinearize_index; using torch::executor::linearize_access_indexes; + +namespace { +template <typename Range> +auto range_to_vec(const Range& rng) { + return std::vector( + rng.begin(), rng.end()); +} +} // namespace +TEST(BroadcastIndexesRangeTest, Empty) { + TensorFactory<ScalarType::Int> tf; + + Tensor a = tf.make({0}, {}); + ASSERT_EQ(a.numel(), 0); + bool loop_entered = false; + for (auto _ : BroadcastIndexesRange<1>(a, a)) { + loop_entered = true; + } + EXPECT_FALSE(loop_entered); +} + +// [W] -> [W] +TEST(BroadcastIndexesRangeTest, OneDNotBroadcasted) { + TensorFactory<ScalarType::Int> tf; + + Tensor out = tf.zeros({5}); + int idx = 0; + const auto range = BroadcastIndexesRange<1>(out, out); + for (const auto& elem : range_to_vec(range)) { + EXPECT_EQ(*(range.begin() + idx), elem); + EXPECT_EQ(elem[0], idx++); + EXPECT_EQ(elem[0], elem[1]); + } +} + +// [1] -> [W] +TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { + TensorFactory<ScalarType::Int> tf; + + Tensor out = tf.zeros({5}); + Tensor in = tf.zeros({1}); + + auto actual = range_to_vec(BroadcastIndexesRange<1>(out, in)); + decltype(actual) expected = { + {0, 0}, + {1, 0}, + {2, 0}, + {3, 0}, + {4, 0}, + }; + EXPECT_EQ(expected, actual); +}
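These first tests pin down the contract: each range element is a small array of flat indexes, entry 0 for the output and one entry per input, already adjusted for broadcasting. A sketch of how a kernel would consume the range (hypothetical fragment; tensors a, b, out and the float dtype are assumptions):

// Hypothetical broadcasting elementwise add built on the range.
const float* a_data = a.const_data_ptr<float>();
const float* b_data = b.const_data_ptr<float>();
float* out_data = out.mutable_data_ptr<float>();
for (const auto [out_ix, a_ix, b_ix] :
     torch::executor::BroadcastIndexesRange<2>(out, a, b)) {
  out_data[out_ix] = a_data[a_ix] + b_data[b_ix];
}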
+ +template <typename Range> +void test_operator_plus(const Range& range) { + size_t idx = 0; + for (const auto& indexes : range) { + EXPECT_EQ(*(range.begin() + idx), indexes); + idx++; + } +} + +// [1] -> [H, W] +// [W] -> [H, W] +// [1, 1] -> [H, W] +// [1, W] -> [H, W] +// [H, 1] -> [H, W] +// [H, W] -> [H, W] +// Cover all these at the same time to also exercise multiple input tensors. +TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { + TensorFactory<ScalarType::Int> tf; + Tensor out = tf.zeros({3, 4}); + Tensor in_0d_scalar = tf.zeros({}); + Tensor in_1d_scalar = tf.zeros({1}); + Tensor in_2d_scalar = tf.zeros({1, 1}); + + Tensor in_row = tf.zeros({4}); + Tensor in_col = tf.zeros({3, 1}); + + Tensor in_not_broadcast = tf.zeros({3, 4}); + + const auto range = BroadcastIndexesRange<6>( + out, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_col, + in_not_broadcast); + auto actual = range_to_vec(range); + decltype(actual) expected = { + {0, 0, 0, 0, 0, 0, 0}, + {1, 0, 0, 0, 1, 0, 1}, + {2, 0, 0, 0, 2, 0, 2}, + {3, 0, 0, 0, 3, 0, 3}, + {4, 0, 0, 0, 0, 1, 4}, + {5, 0, 0, 0, 1, 1, 5}, + {6, 0, 0, 0, 2, 1, 6}, + {7, 0, 0, 0, 3, 1, 7}, + {8, 0, 0, 0, 0, 2, 8}, + {9, 0, 0, 0, 1, 2, 9}, + {10, 0, 0, 0, 2, 2, 10}, + {11, 0, 0, 0, 3, 2, 11}, + }; + EXPECT_EQ(expected, actual); + + test_operator_plus(range); +} + +// Make sure nothing is thrown off by a size-1 dim in the output: +// [] -> [1, W] +// [] -> [H, 1] +// [1] -> [1, W] +// [1] -> [H, 1] +// [W] -> [1, W] +// [1, 1] -> [1, W] +// [1, 1] -> [H, 1] +// [1, W] -> [1, W] +// [H, 1] -> [H, 1] +TEST(BroadcastIndexesRangeTest, OneAndTwoDWith1InOutputShapeExhaustive) { + TensorFactory<ScalarType::Int> tf; + constexpr auto H = 2; + constexpr auto W = 3; + Tensor out_row = tf.zeros({1, W}); + Tensor out_col = tf.zeros({H, 1}); + Tensor in_0d_scalar = tf.zeros({}); + Tensor in_1d_scalar = tf.zeros({1}); + Tensor in_2d_scalar = tf.zeros({1, 1}); + + Tensor in_row = tf.zeros({W}); + Tensor in_leading_one_row = tf.zeros({1, W}); + + Tensor in_col = tf.zeros({H, 1}); + + size_t idx = 0; + const auto range_row = BroadcastIndexesRange<5>( + out_row, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_leading_one_row); + for (const auto + [out_idx, + in_0d_idx, + in_1d_idx, + in_2d_idx, + in_row_idx, + in_leading_one_row_idx] : range_row) { + EXPECT_EQ(out_idx, idx++); + EXPECT_EQ(in_0d_idx, 0); + EXPECT_EQ(in_1d_idx, 0); + EXPECT_EQ(in_2d_idx, 0); + EXPECT_EQ(in_row_idx, out_idx); + EXPECT_EQ(in_leading_one_row_idx, out_idx); + } + + test_operator_plus(range_row); + + idx = 0; + const auto range_col = BroadcastIndexesRange<4>( + out_col, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_col); + for (const auto [out_idx, in_0d_idx, in_1d_idx, in_2d_idx, in_col_idx] : + range_col) { + EXPECT_EQ(out_idx, idx++); + EXPECT_EQ(in_0d_idx, 0); + EXPECT_EQ(in_1d_idx, 0); + EXPECT_EQ(in_2d_idx, 0); + EXPECT_EQ(in_col_idx, out_idx); + } + + test_operator_plus(range_col); +}
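The remaining tests mutation-test the range against the two lower-level helpers. The relationship they rely on, in isolation (fragment under the same includes as the test; out_ix and in are assumed to be a valid flat output index and an input tensor broadcastable to out):

// Recover per-dimension coordinates for a flat output index, then map
// them onto a (possibly broadcast) input: the result must match the
// input entry that BroadcastIndexesRange yields for out_ix.
size_t out_indexes[executorch::runtime::kTensorDimensionLimit];
torch::executor::delinearize_index(
    out_ix, out, out_indexes, executorch::runtime::kTensorDimensionLimit);
const size_t in_ix = torch::executor::linearize_access_indexes(
    out_indexes, out.dim(), in);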
+ +// [1, 1, 1] -> [C, H, W] +// [C, H, 1] -> [C, H, W] +// [C, 1, W] -> [C, H, W] +// [1, H, W] -> [C, H, W] +// [C, 1, 1] -> [C, H, W] +// [1, H, 1] -> [C, H, W] +// [1, 1, W] -> [C, H, W] +// [C, H, W] -> [C, H, W] +TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { + TensorFactory<ScalarType::Int> tf; + Tensor out = tf.zeros({2, 3, 4}); + std::array<Tensor, 8> input_tensors = { + tf.zeros({2, 3, 1}), + tf.zeros({2, 1, 4}), + tf.zeros({1, 3, 4}), + tf.zeros({2, 1, 1}), + tf.zeros({1, 3, 1}), + tf.zeros({1, 1, 4}), + tf.zeros({1, 1, 1}), + tf.zeros({2, 3, 4}), + }; + // Writing out all the indexes would be too cumbersome, so here we + // take the opportunity to mutation test against delinearize_index + // and linearize_access_indexes. + int idx = 0; + const auto range = BroadcastIndexesRange<8>( + out, + input_tensors[0], + input_tensors[1], + input_tensors[2], + input_tensors[3], + input_tensors[4], + input_tensors[5], + input_tensors[6], + input_tensors[7]); + for (const auto indexes : range) { + const auto out_idx = indexes[0]; + EXPECT_EQ(out_idx, idx++); + size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; + delinearize_index( + out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit); + for (const auto tensor_idx : c10::irange(0, input_tensors.size())) { + EXPECT_EQ( + indexes[tensor_idx + 1], + linearize_access_indexes( + out_indexes, out.dim(), input_tensors[tensor_idx])); + } + } + test_operator_plus(range); +} + +// 4-D should generalize, but we will go ahead and test: +// [N, 1, H, 1] -> [N, C, H, W] +// [1, C, 1, W] -> [N, C, H, W] +template <int32_t N, int32_t C, int32_t H, int32_t W> +void four_d_broadcasting_test() { + TensorFactory<ScalarType::Int> tf; + Tensor out = tf.zeros({N, C, H, W}); + Tensor in_broadcast_cw = tf.zeros({N, 1, H, 1}); + Tensor in_broadcast_nh = tf.zeros({1, C, 1, W}); + + // Writing out all the indexes would be too cumbersome, so here we + // take the opportunity to mutation test against delinearize_index + // and linearize_access_indexes. + int idx = 0; + const auto range = + BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh); + for (const auto [out_idx, in_cw_idx, in_nh_idx] : range) { + EXPECT_EQ(out_idx, idx++); + size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; + delinearize_index( + out_idx, out, out_indexes, executorch::runtime::kTensorDimensionLimit); + EXPECT_EQ( + in_cw_idx, + linearize_access_indexes(out_indexes, out.dim(), in_broadcast_cw)); + EXPECT_EQ( + in_nh_idx, + linearize_access_indexes(out_indexes, out.dim(), in_broadcast_nh)); + } + + test_operator_plus(range); +} + +TEST(BroadcastIndexesRangeTest, FourDBroadcasting) { + four_d_broadcasting_test<2, 3, 4, 5>(); +} + +TEST(BroadcastIndexesRangeTest, FourDBroadcastingWithOneDimsInOutput) { + four_d_broadcasting_test<2, 3, 1, 5>(); + four_d_broadcasting_test<2, 1, 3, 1>(); +} diff --git a/kernels/portable/cpu/util/test/broadcast_test.cpp b/kernels/portable/cpu/util/test/broadcast_test.cpp index 679296f112c..7ffd95b6c52 100644 --- a/kernels/portable/cpu/util/test/broadcast_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_test.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -131,6 +132,7 @@ TEST(BroadcastUtilTest, GetBroadcastTargetSize) { .equals(ArrayRef({5, 2, 2}))); Tensor c = tf.zeros({4, 5}); + et_pal_init(); err = get_broadcast_target_size( a, c, diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl index 28988b90dcc..178eb25a79b 100644 --- a/kernels/portable/cpu/util/test/targets.bzl +++ b/kernels/portable/cpu/util/test/targets.bzl @@ -12,6 +12,17 @@ def define_common_targets(): ], ) + runtime.cxx_test( + name = "broadcast_indexes_range_test", + srcs = ["broadcast_indexes_range_test.cpp"], + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/kernels/portable/cpu/util:broadcast_indexes_range", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + ) + runtime.cxx_test( name = "reduce_test", srcs = ["reduce_test.cpp"], diff --git a/kernels/portable/cpu/util/transpose_util.h b/kernels/portable/cpu/util/transpose_util.h index 453446fd842..acd6a762d11 100644 --- a/kernels/portable/cpu/util/transpose_util.h +++
b/kernels/portable/cpu/util/transpose_util.h @@ -7,6 +7,7 @@ */ #pragma once +#include #include #include @@ -66,7 +67,7 @@ inline void increment_index_and_offset( // Impossible to happen at i = 0 due to precondition check before this // function is called offset += new_strides[i]; - if (index[i] == new_sizes[i]) { + if (static_cast(index[i]) == new_sizes[i]) { offset -= new_sizes[i] * new_strides[i]; index[i] = 0; } else { @@ -118,7 +119,7 @@ void transpose_tensors( // tensor in output tensor order. size_t non_1_dim_indices[kTensorDimensionLimit]; size_t num_non_1_dim_indices = 0; - for (size_t cur_dim = 0; cur_dim < dim; cur_dim++) { + for (const auto cur_dim : c10::irange(dim)) { if (new_sizes[cur_dim] != 1) { non_1_dim_indices[num_non_1_dim_indices++] = cur_dim; } @@ -128,7 +129,7 @@ void transpose_tensors( // Loop over and copy input elements into output size_t a_offset = 0; - for (ssize_t out_offset = 0; out_offset < a.numel(); out_offset++) { + for (const auto out_offset : c10::irange(a.numel())) { data_out[out_offset] = data_a[a_offset]; increment_index_and_offset( out_index, new_sizes, new_strides, indices, a_offset); @@ -164,7 +165,7 @@ inline void get_transpose_out_target_size( return; } - for (size_t i = 0; i < in.dim(); ++i) { + for (const auto i : c10::irange(in.dim())) { out_sizes[i] = in.size(i); } out_sizes[dim0] = in.size(dim1); diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h index 617fa0b0652..7a1a488701b 100644 --- a/kernels/portable/cpu/vec_ops.h +++ b/kernels/portable/cpu/vec_ops.h @@ -9,6 +9,7 @@ #pragma once +#include #include #include #include @@ -48,7 +49,7 @@ inline void vec_addf( const float* __restrict__ x, const float* __restrict__ y, size_t size) { - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { z[i] = x[i] + y[i]; } } @@ -60,7 +61,7 @@ inline void vec_scalef( const float* __restrict__ x, float scale, size_t size) { - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { y[i] = x[i] * scale; } } @@ -75,10 +76,10 @@ inline void vec_matmul( int64_t m, int64_t n, int64_t p) { - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < p; ++j) { + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { T sum = 0; - for (size_t k = 0; k < n; ++k) { + for (const auto k : c10::irange(n)) { sum += x[i * n + k] * y[k * p + j]; } z[i * p + j] = sum; @@ -95,10 +96,10 @@ inline void vec_quantized_matmul_int8( int64_t m, int64_t n, int64_t p) { - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < p; ++j) { + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { T sum = 0; - for (size_t k = 0; k < n; ++k) { + for (const auto k : c10::irange(n)) { sum += x[i * n + k] * static_cast(y[k * p + j]) * s[k]; } z[i * p + j] = sum; @@ -124,13 +125,13 @@ inline void vec_quantized_matmul_transb_int8( int64_t g) { int64_t n_over_g = (n + g - 1) / g; - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < p; ++j) { + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { T sum = 0; - for (size_t k = 0; k < n; k += g) { + for (int64_t k = 0; k < n; k += g) { T psum = 0; // the last group may have fewer than g elements - for (size_t k2 = k; k2 < bounds_min(k + g, n); k2++) { + for (const auto k2 : c10::irange(k, bounds_min(k + g, n))) { psum += x[i * n + k2] * static_cast(y[j * n + k2]); } sum += psum * s[j * n_over_g + k / g]; @@ -154,10 +155,10 @@ inline void vec_addmm( int64_t p, U beta, U alpha) { - for (size_t i = 0; i 
< m; ++i) { - for (size_t j = 0; j < p; ++j) { + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(p)) { T sum = 0; - for (size_t k = 0; k < n; ++k) { + for (const auto k : c10::irange(n)) { sum += mat1_data[i * n + k] * mat2_data[k * p + j]; } out_data[i * p + j] = sum * alpha + self_data[i * p + j] * beta; @@ -176,7 +177,7 @@ inline float reduce_add(const T* x, size_t size) { template inline float vec_powerf(const T* x, size_t size) { float sum = 0; - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { sum += x[i] * x[i]; } return sum; @@ -198,12 +199,12 @@ inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) { U max_x = *std::max_element(x, x + n); T sum = 0; - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { y[i] = expf(x[i] - max_x); sum += y[i]; } - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { y[i] /= sum; } } @@ -227,7 +228,7 @@ inline void quantize_i8_f32( float scale, int32_t zero_point, size_t size) { - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { float tmp = roundf(x[i] * scale + zero_point); y[i] = internal::clamp(tmp, -128.f, 127.f); } @@ -241,7 +242,7 @@ inline void dequantize_i8_f32( float scale, int32_t zero_point, size_t size) { - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { y[i] = scale * (x[i] - zero_point); } } diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 3221b8fe349..29dfe8b1a0c 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -572,6 +572,11 @@ - arg_meta: null kernel_name: torch::executor::max_pool2d_with_indices_out +- op: max_pool2d_with_indices_backward.grad_input + kernels: + - arg_meta: null + kernel_name: torch::executor::max_pool2d_with_indices_backward_out + - op: mean.out kernels: - arg_meta: null @@ -917,6 +922,11 @@ - arg_meta: null kernel_name: torch::executor::unbind_copy_int_out +- op: unfold_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::unfold_copy_out + - op: unsqueeze_copy.out kernels: - arg_meta: null diff --git a/kernels/portable/targets.bzl b/kernels/portable/targets.bzl index 9e96de61c91..759e5c96ae8 100644 --- a/kernels/portable/targets.bzl +++ b/kernels/portable/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib") def define_common_targets(): @@ -20,17 +20,18 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "operators_aten", - srcs = [], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - "//executorch/kernels/portable/cpu:cpu_aten", - ], - ) + if True in get_aten_mode_options(): + runtime.cxx_library( + name = "operators_aten", + srcs = [], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/kernels/portable/cpu:cpu_aten", + ], + ) runtime.export_file( name = "functions.yaml", @@ -79,9 +80,6 @@ def define_common_targets(): ) generated_lib_common_args = { - "custom_ops_aten_kernel_deps": [ - "//executorch/kernels/portable:operators_aten", - ], "custom_ops_yaml_target": "//executorch/kernels/portable:custom_ops.yaml", # size_test expects _static targets to be available for these libraries. 
"define_static_targets": True, @@ -102,21 +100,22 @@ def define_common_targets(): **generated_lib_common_args ) - executorch_generated_lib( - name = "generated_lib_aten", - deps = [ - ":executorch_aten_ops", - ":executorch_custom_ops", - "//executorch/kernels/portable:operators_aten", - ], - custom_ops_aten_kernel_deps = [ - "//executorch/kernels/portable:operators_aten", - ], - custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", - aten_mode = True, - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - define_static_targets = True, - ) + if True in get_aten_mode_options(): + executorch_generated_lib( + name = "generated_lib_aten", + deps = [ + ":executorch_aten_ops", + ":executorch_custom_ops", + "//executorch/kernels/portable:operators_aten", + ], + custom_ops_aten_kernel_deps = [ + "//executorch/kernels/portable:operators_aten", + ], + custom_ops_yaml_target = "//executorch/kernels/portable:custom_ops.yaml", + aten_mode = True, + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + define_static_targets = True, + ) diff --git a/kernels/portable/test/TARGETS b/kernels/portable/test/TARGETS index adf6636be4f..f7b89818c98 100644 --- a/kernels/portable/test/TARGETS +++ b/kernels/portable/test/TARGETS @@ -2,7 +2,6 @@ # targets.bzl. This file can contain fbcode-only targets. load(":targets.bzl", "define_common_targets") -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") oncall("executorch") @@ -28,29 +27,3 @@ runtime.cxx_library( "libtorch", ], ) - -python_unittest( - name = "op_upsample_bilinear2d_test", - srcs = [ - "op_upsample_bilinear2d_test.py", - ], - preload_deps = [ - ":aot_ops_test_lib", - ], - deps = [ - "//caffe2:torch", - ], -) - -python_unittest( - name = "op_upsample_nearest2d_test", - srcs = [ - "op_upsample_nearest2d_test.py", - ], - preload_deps = [ - ":aot_ops_test_lib", - ], - deps = [ - "//caffe2:torch", - ], -) diff --git a/kernels/portable/test/targets.bzl b/kernels/portable/test/targets.bzl index 23f179ab690..1da276ce3f8 100644 --- a/kernels/portable/test/targets.bzl +++ b/kernels/portable/test/targets.bzl @@ -1,3 +1,4 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime") load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib") load("@fbsource//xplat/executorch/kernels/test:util.bzl", "define_supported_features_lib", "op_test") @@ -8,12 +9,40 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. 
""" - define_supported_features_lib() + if not runtime.is_oss: + define_supported_features_lib() - op_test(name = "op_allclose_test") - op_test(name = "op_div_test") - op_test(name = "op_gelu_test") - op_test(name = "op_mul_test") + if not is_xplat(): + python_unittest( + name = "op_upsample_bilinear2d_test", + srcs = [ + "op_upsample_bilinear2d_test.py", + ], + preload_deps = [ + ":aot_ops_test_lib", + ], + deps = [ + "//caffe2:torch", + ], + ) + + python_unittest( + name = "op_upsample_nearest2d_test", + srcs = [ + "op_upsample_nearest2d_test.py", + ], + preload_deps = [ + ":aot_ops_test_lib", + ], + deps = [ + "//caffe2:torch", + ], + ) + + op_test(name = "op_allclose_test") + op_test(name = "op_div_test") + op_test(name = "op_gelu_test") + op_test(name = "op_mul_test") if is_xplat(): et_operator_library( diff --git a/kernels/prim_ops/et_view.cpp b/kernels/prim_ops/et_view.cpp index 66aa9ac87e2..44ac7470193 100644 --- a/kernels/prim_ops/et_view.cpp +++ b/kernels/prim_ops/et_view.cpp @@ -32,19 +32,20 @@ bool get_view_target_size( executorch::aten::ArrayRef size, int64_t dim, executorch::aten::SizesType* out_size) { - ET_LOG_AND_RETURN_IF_FALSE(size.size() == dim); + ET_LOG_AND_RETURN_IF_FALSE( + dim >= 0 && size.size() == static_cast(dim)); int minus1_dim = -1; int n_zero = 0; int64_t numel_without_minus_1 = 1; for (int i = 0; i < dim; i++) { if (size[i] == -1) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( minus1_dim == -1, "At most one view dim can be -1."); minus1_dim = i; } else { // The size[i] must be non-negative now, but we check size[i] >= -1 // in case code is reordered in the future. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( size[i] >= -1, "Negative sizes are not allowed."); numel_without_minus_1 *= size[i]; @@ -56,7 +57,7 @@ bool get_view_target_size( } } if (minus1_dim >= 0) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( n_zero == 0, "Cannot infer dimension size if there is a zero dim."); out_size[minus1_dim] = self.numel() / numel_without_minus_1; } @@ -71,28 +72,38 @@ void et_view(KernelRuntimeContext& context, EValue** stack) { auto size = (*stack[1]).toIntList(); auto out = (*stack[2]).toTensor(); - ET_CHECK(tensors_have_same_dtype(self, out)); + ET_KERNEL_CHECK( + context, tensors_have_same_dtype(self, out), InvalidArgument, ); // Compute output size SizesType expected_output_size[kTensorDimensionLimit]; - ET_CHECK(get_view_target_size(self, size, out.dim(), expected_output_size)); + ET_KERNEL_CHECK( + context, + get_view_target_size(self, size, out.dim(), expected_output_size), + InvalidArgument, ); // Resize for dynamic shape - ET_CHECK_MSG( + ET_KERNEL_CHECK_MSG( + context, resize_tensor( out, {expected_output_size, static_cast(out.dim())}) == Error::Ok, + Internal, + , "Failed to resize output tensor."); // Do some checks - ET_CHECK(self.numel() == out.numel()); + ET_KERNEL_CHECK(context, self.numel() == out.numel(), InvalidArgument, ); // Update data ptr - ET_CHECK_MSG( + ET_KERNEL_CHECK_MSG( + context, internal::set_tensor_data( out, /*buffer=*/self.mutable_data_ptr(), /*buffer_size=*/out.nbytes()) == Error::Ok, + Internal, + , "Failed to set data_ptr for out to self."); } diff --git a/kernels/prim_ops/targets.bzl b/kernels/prim_ops/targets.bzl index 9a753b50faa..c1af21a7e73 100644 --- a/kernels/prim_ops/targets.bzl +++ b/kernels/prim_ops/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", 
"get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/kernels/prim_ops/test/CMakeLists.txt b/kernels/prim_ops/test/CMakeLists.txt index 46588c7b436..e20441909b6 100644 --- a/kernels/prim_ops/test/CMakeLists.txt +++ b/kernels/prim_ops/test/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs prim_ops_test.cpp) diff --git a/kernels/prim_ops/test/TARGETS b/kernels/prim_ops/test/TARGETS index e6e5f639ca5..d910e242ede 100644 --- a/kernels/prim_ops/test/TARGETS +++ b/kernels/prim_ops/test/TARGETS @@ -1,4 +1,4 @@ -load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") # Any targets that should be shared between fbcode and xplat must be defined in # targets.bzl. This file can contain fbcode-only targets. @@ -17,13 +17,14 @@ python_unittest( ], ) -cpp_unittest( +runtime.cxx_test( name = "prim_ops_test_cpp", srcs = [ "prim_ops_test.cpp", ], deps = [ "//executorch/kernels/prim_ops:prim_ops_registry", # @manual + "//executorch/kernels/test:test_util", # @manual "//executorch/runtime/core:evalue", # @manual "//executorch/runtime/core/exec_aten:lib", # @manual "//executorch/runtime/core/exec_aten/testing_util:tensor_util", # @manual @@ -31,6 +32,6 @@ cpp_unittest( "//executorch/runtime/kernel:kernel_runtime_context", # @manual "//executorch/runtime/kernel:operator_registry", "//executorch/runtime/platform:platform", - "//executorch/test/utils:utils_aten", + "//executorch/test/utils:utils", ], ) diff --git a/kernels/prim_ops/test/prim_ops_test.cpp b/kernels/prim_ops/test/prim_ops_test.cpp index 2efcb53eea4..646d248cf79 100644 --- a/kernels/prim_ops/test/prim_ops_test.cpp +++ b/kernels/prim_ops/test/prim_ops_test.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include #include -#include #include #include @@ -27,12 +27,10 @@ using torch::executor::resize_tensor; namespace torch { namespace executor { -class RegisterPrimOpsTest : public ::testing::Test { +class RegisterPrimOpsTest : public OperatorTest { protected: - KernelRuntimeContext context; void SetUp() override { - torch::executor::runtime_init(); - context = KernelRuntimeContext(); + context_ = KernelRuntimeContext(); } }; @@ -57,7 +55,7 @@ TEST_F(RegisterPrimOpsTest, SymSizeReturnsCorrectValue) { stack[i] = &values[i]; } - getOpsFn("aten::sym_size.int")(context, stack); + getOpsFn("aten::sym_size.int")(context_, stack); int64_t expected = 5; EXPECT_EQ(stack[2]->toInt(), expected); @@ -77,7 +75,7 @@ TEST_F(RegisterPrimOpsTest, SymNumelReturnsCorrectValue) { stack[i] = &values[i]; } - getOpsFn("aten::sym_numel")(context, stack); + getOpsFn("aten::sym_numel")(context_, stack); int64_t expected = 15; EXPECT_EQ(stack[1]->toInt(), expected); @@ -97,28 +95,28 @@ TEST_F(RegisterPrimOpsTest, TestAlgebraOps) { stack[i] = &values[i]; } - getOpsFn("executorch_prim::add.Scalar")(context, stack); + getOpsFn("executorch_prim::add.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toInt(), 7); - 
getOpsFn("executorch_prim::sub.Scalar")(context, stack); + getOpsFn("executorch_prim::sub.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toInt(), -1); - getOpsFn("executorch_prim::mul.Scalar")(context, stack); + getOpsFn("executorch_prim::mul.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toInt(), 12); - getOpsFn("executorch_prim::floordiv.Scalar")(context, stack); + getOpsFn("executorch_prim::floordiv.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toInt(), 0); - getOpsFn("executorch_prim::truediv.Scalar")(context, stack); + getOpsFn("executorch_prim::truediv.Scalar")(context_, stack); EXPECT_FLOAT_EQ(stack[2]->toDouble(), 0.75); - getOpsFn("executorch_prim::mod.int")(context, stack); + getOpsFn("executorch_prim::mod.int")(context_, stack); EXPECT_EQ(stack[2]->toInt(), 3); - getOpsFn("executorch_prim::mod.Scalar")(context, stack); + getOpsFn("executorch_prim::mod.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toInt(), 3); - getOpsFn("executorch_prim::sym_float.Scalar")(context, stack); + getOpsFn("executorch_prim::sym_float.Scalar")(context_, stack); EXPECT_FLOAT_EQ(stack[1]->toDouble(), 3.0); } @@ -155,7 +153,7 @@ TEST_F(RegisterPrimOpsTest, TestETCopyIndex) { stack[2] = &values[2]; // Simple test to copy to index 0. - getOpsFn("executorch_prim::et_copy_index.tensor")(context, stack); + getOpsFn("executorch_prim::et_copy_index.tensor")(context_, stack); EXPECT_EQ(copy_to.sizes()[0], 1); EXPECT_EQ(copy_to.sizes()[1], 2); @@ -164,7 +162,7 @@ TEST_F(RegisterPrimOpsTest, TestETCopyIndex) { values[1] = tf.make({2}, {5, 6}); values[2] = EValue((int64_t)1); // Copy to the next index, 1. - getOpsFn("executorch_prim::et_copy_index.tensor")(context, stack); + getOpsFn("executorch_prim::et_copy_index.tensor")(context_, stack); EXPECT_EQ(copy_to.sizes()[0], 2); EXPECT_EQ(copy_to.sizes()[1], 2); @@ -193,7 +191,7 @@ TEST_F(RegisterPrimOpsTest, TestETCopyIndexMismatchShape) { // copy_to.sizes[1:] and to_copy.sizes[:] don't match each other // which is a pre-requisite for this operator. ET_EXPECT_DEATH( - getOpsFn("executorch_prim::et_copy_index.tensor")(context, stack), ""); + getOpsFn("executorch_prim::et_copy_index.tensor")(context_, stack), ""); } TEST_F(RegisterPrimOpsTest, TestETCopyIndexStaticShape) { @@ -217,7 +215,7 @@ TEST_F(RegisterPrimOpsTest, TestETCopyIndexStaticShape) { stack[2] = &values[2]; // Copy and replace at index 1. 
- getOpsFn("executorch_prim::et_copy_index.tensor")(context, stack); + getOpsFn("executorch_prim::et_copy_index.tensor")(context_, stack); EXPECT_EQ(copy_to.sizes()[0], 2); EXPECT_EQ(copy_to.sizes()[1], 2); EXPECT_TENSOR_EQ(copy_to, tf.make({2, 2}, {1, 2, 5, 6})); @@ -228,7 +226,7 @@ TEST_F(RegisterPrimOpsTest, TestETCopyIndexStaticShape) { index = 2; values[2] = EValue(index); ET_EXPECT_DEATH( - getOpsFn("executorch_prim::et_copy_index.tensor")(context, stack), ""); + getOpsFn("executorch_prim::et_copy_index.tensor")(context_, stack), ""); #endif } @@ -246,19 +244,19 @@ TEST_F(RegisterPrimOpsTest, TestBooleanOps) { stack[i] = &values[i]; } - getOpsFn("executorch_prim::ge.Scalar")(context, stack); + getOpsFn("executorch_prim::ge.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toBool(), false); - getOpsFn("executorch_prim::gt.Scalar")(context, stack); + getOpsFn("executorch_prim::gt.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toBool(), false); - getOpsFn("executorch_prim::le.Scalar")(context, stack); + getOpsFn("executorch_prim::le.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toBool(), true); - getOpsFn("executorch_prim::lt.Scalar")(context, stack); + getOpsFn("executorch_prim::lt.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toBool(), true); - getOpsFn("executorch_prim::eq.Scalar")(context, stack); + getOpsFn("executorch_prim::eq.Scalar")(context_, stack); EXPECT_EQ(stack[2]->toBool(), false); } @@ -277,7 +275,7 @@ TEST_F(RegisterPrimOpsTest, LocalScalarDenseReturnsCorrectValue) { stack[i] = &values[i]; } - getOpsFn("aten::_local_scalar_dense")(context, stack); + getOpsFn("aten::_local_scalar_dense")(context_, stack); int64_t expected = 1; EXPECT_EQ(stack[1]->toInt(), expected); @@ -295,7 +293,7 @@ TEST_F(RegisterPrimOpsTest, NegScalarReturnsCorrectValue) { stack[i] = &values[i]; } - getOpsFn("executorch_prim::neg.Scalar")(context, stack); + getOpsFn("executorch_prim::neg.Scalar")(context_, stack); EXPECT_EQ(stack[1]->toDouble(), -5.0f); @@ -305,7 +303,7 @@ TEST_F(RegisterPrimOpsTest, NegScalarReturnsCorrectValue) { values[0] = EValue(a); values[1] = EValue(b); - getOpsFn("executorch_prim::neg.Scalar")(context, stack); + getOpsFn("executorch_prim::neg.Scalar")(context_, stack); EXPECT_EQ(stack[1]->toInt(), -5l); } @@ -327,7 +325,7 @@ TEST_F(RegisterPrimOpsTest, TestNegScalarWithTensorDies) { } // Try to negate a tensor, which should cause a runtime error. 
- ET_EXPECT_DEATH(getOpsFn("executorch_prim::neg.Scalar")(context, stack), ""); + ET_EXPECT_DEATH(getOpsFn("executorch_prim::neg.Scalar")(context_, stack), ""); } TEST_F(RegisterPrimOpsTest, TestETView) { @@ -410,9 +408,9 @@ TEST_F(RegisterPrimOpsTest, TestETView) { // Bad stacks expect death for (int i = 0; i < N_BAD_STACKS; i++) { - ET_EXPECT_DEATH( - getOpsFn("executorch_prim::et_view.default")(context, bad_stacks[i]), - ""); + ET_EXPECT_KERNEL_FAILURE( + context_, + getOpsFn("executorch_prim::et_view.default")(context_, bad_stacks[i])); } constexpr int N_GOOD_STACKS = N_GOOD_OUTS; @@ -422,7 +420,7 @@ TEST_F(RegisterPrimOpsTest, TestETView) { // Good outs expect no death and correct output for (int i = 0; i < N_GOOD_STACKS; i++) { - getOpsFn("executorch_prim::et_view.default")(context, good_out_stacks[i]); + getOpsFn("executorch_prim::et_view.default")(context_, good_out_stacks[i]); EXPECT_TENSOR_EQ(good_outs[i], tf.make({1, 3, 2}, {1, 2, 3, 4, 5, 6})); EXPECT_EQ(good_outs[i].const_data_ptr(), self.const_data_ptr()); } @@ -456,7 +454,7 @@ TEST_F(RegisterPrimOpsTest, TestETViewDynamic) { EValue* stack[3] = {&self_evalue, &size_int_list_evalue, &out_evalue}; - getOpsFn("executorch_prim::et_view.default")(context, stack); + getOpsFn("executorch_prim::et_view.default")(context_, stack); EXPECT_TENSOR_EQ(out, tf.make({1, 3, 1}, {1, 2, 3})); EXPECT_EQ(out.const_data_ptr(), self.const_data_ptr()); @@ -493,14 +491,15 @@ TEST_F(RegisterPrimOpsTest, TestETViewEmpty) { // good size test EValue* stack[3] = {&self_evalue, &size_int_list_evalue, &out_evalue}; - getOpsFn("executorch_prim::et_view.default")(context, stack); + getOpsFn("executorch_prim::et_view.default")(context_, stack); EXPECT_TENSOR_EQ(out, tf.make({3, 1, 0}, {})); EXPECT_EQ(out.const_data_ptr(), self.const_data_ptr()); // bad size test EValue* bad_stack[3] = {&self_evalue, &bad_size_int_list_evalue, &out_evalue}; - ET_EXPECT_DEATH( - getOpsFn("executorch_prim::et_view.default")(context, bad_stack), ""); + ET_EXPECT_KERNEL_FAILURE( + context_, + getOpsFn("executorch_prim::et_view.default")(context_, bad_stack)); } TEST_F(RegisterPrimOpsTest, TestCeil) { @@ -518,7 +517,7 @@ TEST_F(RegisterPrimOpsTest, TestCeil) { stack[j] = &values[j]; } - getOpsFn("executorch_prim::ceil.Scalar")(context, stack); + getOpsFn("executorch_prim::ceil.Scalar")(context_, stack); EXPECT_EQ(stack[1]->toInt(), expected[i]); } } @@ -539,7 +538,7 @@ TEST_F(RegisterPrimOpsTest, TestRound) { stack[j] = &values[j]; } - getOpsFn("executorch_prim::round.Scalar")(context, stack); + getOpsFn("executorch_prim::round.Scalar")(context_, stack); EXPECT_EQ(stack[1]->toInt(), expected[i]); } } @@ -559,7 +558,7 @@ TEST_F(RegisterPrimOpsTest, TestTrunc) { stack[j] = &values[j]; } - getOpsFn("executorch_prim::trunc.Scalar")(context, stack); + getOpsFn("executorch_prim::trunc.Scalar")(context_, stack); EXPECT_EQ(stack[1]->toInt(), expected[i]); } } diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt index 6b01ba4fc27..29058e9b11d 100644 --- a/kernels/quantized/CMakeLists.txt +++ b/kernels/quantized/CMakeLists.txt @@ -26,8 +26,8 @@ endif() set(_common_compile_options -Wno-deprecated-declarations) -include(${EXECUTORCH_ROOT}/build/Utils.cmake) -include(${EXECUTORCH_ROOT}/build/Codegen.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) if(NOT PYTHON_EXECUTABLE) resolve_python_executable() diff --git a/kernels/quantized/cpu/op_mixed_linear.cpp b/kernels/quantized/cpu/op_mixed_linear.cpp 
index d09d0bdd5e1..c97ed2cb7c9 100644 --- a/kernels/quantized/cpu/op_mixed_linear.cpp +++ b/kernels/quantized/cpu/op_mixed_linear.cpp @@ -36,13 +36,13 @@ bool check_quantized_mixed_linear_args( ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight_scales)); if (dtype.has_value()) { ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dtype.value() == ScalarType::Float || dtype.value() == ScalarType::Half, "dtype must be Float or Half"); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.scalar_type() == ScalarType::Char, "weight dtype must be int8"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.scalar_type() == ScalarType::Float || in.scalar_type() == ScalarType::Half, "input dtype must be Float or Half"); @@ -55,7 +55,7 @@ bool check_quantized_mixed_linear_args( } // Support for non-null zero points is not implemented yet. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !opt_weight_zero_points.has_value(), "zero points not supported yet."); return true; } diff --git a/kernels/quantized/cpu/op_mixed_mm.cpp b/kernels/quantized/cpu/op_mixed_mm.cpp index 044e110bf5c..564de74dfde 100644 --- a/kernels/quantized/cpu/op_mixed_mm.cpp +++ b/kernels/quantized/cpu/op_mixed_mm.cpp @@ -31,9 +31,9 @@ bool check_quantized_mixed_mm_args( tensors_have_same_size_at_dims(weight_scales, 0, weight, 0)); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight_scales, out)); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( weight.scalar_type() == ScalarType::Char, "weight dtype must be int8"); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( in.scalar_type() == ScalarType::Float || in.scalar_type() == ScalarType::Half, "input dtype must be Float or Half"); @@ -46,7 +46,7 @@ bool check_quantized_mixed_mm_args( } // Support for non-null zero points is not implemented yet. - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( !opt_weight_zero_points.has_value(), "zero points not supported yet."); return true; } diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl index fde6698099e..a2533cb003a 100644 --- a/kernels/quantized/targets.bzl +++ b/kernels/quantized/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib", "exir_custom_ops_aot_lib") def define_common_targets(): @@ -77,7 +77,7 @@ def define_common_targets(): ], ) - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_library( diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 4250f1f7581..b9f48f0c9a1 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_kernels portable optimized quantized) foreach(kernel ${_kernels}) @@ -23,11 +23,11 @@ foreach(kernel ${_kernels}) "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test" ) set(_wrapper_path "${_wrapper_dir}/FunctionHeaderWrapper.h") + set(_functions_include "#include <executorch/kernels/${kernel}/Functions.h>") add_custom_command( OUTPUT "${_wrapper_path}" COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo "#include <executorch/kernels/${kernel}/Functions.h>" > - "${_wrapper_path}" + COMMAND echo ${_functions_include} > "${_wrapper_path}" DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" @@ -53,7 +53,17 @@ foreach(kernel ${_kernels}) COMMENT "Generating ${_wrapper_dir}/supported_features.cpp and header" VERBATIM ) - + if(${kernel} STREQUAL "optimized") + set(_kernel_ops_lib "optimized_native_cpu_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib" + ) + else() + set(_kernel_ops_lib "${kernel}_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib" + ) + endif() add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" @@ -63,10 +73,9 @@ foreach(kernel ${_kernels}) mkdir -p "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" COMMAND - cp - "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h" + cp "${_kernel_ops_lib_path}/*.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - DEPENDS "${kernel}_ops_lib" + DEPENDS ${_kernel_ops_lib} ) endforeach() @@ -270,18 +279,20 @@ set(_optimized_kernels_test_sources "op_le_test.cpp" "op_linear_test.cpp" "op_log_softmax_test.cpp" + "op_mm_test.cpp" "op_mul_test.cpp" "op_native_layer_norm_test.cpp" "op_neg_test.cpp" "op_sub_test.cpp" + "op_where_test.cpp" "UnaryUfuncRealHBBF16ToFloatHBF16Test.cpp" - ${CMAKE_CURRENT_BINARY_DIR}/include/portable/executorch/kernels/test/supported_features.cpp + ${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp ) -# We don't have sleef on OSS so we don't have gelu and log_softmax -list(REMOVE_ITEM _optimized_kernels_test_sources "op_gelu_test.cpp" - "op_log_softmax_test.cpp" -) +if(TARGET optimized_portable_kernels) + list(APPEND _optimized_kernels_test_sources ${all_test_sources}) + list(REMOVE_DUPLICATES _optimized_kernels_test_sources) +endif() et_cxx_test( optimized_kernels_test @@ -290,16 +301,16 @@ et_cxx_test( EXTRA_LIBS cpuinfo extension_threadpool - optimized_kernels - optimized_ops_lib - portable_kernels + optimized_native_cpu_ops_lib pthreadpool eigen_blas ) add_dependencies(optimized_kernels_test generate_wrapper) target_include_directories( - optimized_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" - "${CMAKE_INSTALL_PREFIX}/include" + optimized_kernels_test + PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" + "${CMAKE_CURRENT_BINARY_DIR}/include/portable" + "${CMAKE_INSTALL_PREFIX}/include" ) if(TARGET quantized_kernels) diff --git a/kernels/test/op_argmax_test.cpp b/kernels/test/op_argmax_test.cpp index 66c79cefff7..4d68dfe88be 100644 --- a/kernels/test/op_argmax_test.cpp +++ b/kernels/test/op_argmax_test.cpp @@ -90,3 +90,16 @@ TEST_F(OpArgmaxTest, SanityCheckNullDim) { EXPECT_TENSOR_EQ(out, expected); // clang-format
on } + +TEST_F(OpArgmaxTest, FirstNaNWins) { + TensorFactory<ScalarType::Float> tf_float; + Tensor in = tf_float.make({4}, {1, NAN, -4, NAN}); + + TensorFactory<ScalarType::Long> tf_long; + Tensor out = tf_long.zeros({}); + Tensor expected = tf_long.make({}, {1}); + + Tensor ret = op_argmax_out(in, {}, false, out); + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); +} diff --git a/kernels/test/op_argmin_test.cpp b/kernels/test/op_argmin_test.cpp index 250fe4f7e1e..a0b2699a28f 100644 --- a/kernels/test/op_argmin_test.cpp +++ b/kernels/test/op_argmin_test.cpp @@ -90,3 +90,16 @@ TEST_F(OpArgminTest, SanityCheckNullDim) { EXPECT_TENSOR_EQ(out, expected); // clang-format on } + +TEST_F(OpArgminTest, FirstNaNWins) { + TensorFactory<ScalarType::Float> tf_float; + Tensor in = tf_float.make({4}, {1, NAN, -4, NAN}); + + TensorFactory<ScalarType::Long> tf_long; + Tensor out = tf_long.zeros({}); + Tensor expected = tf_long.make({}, {1}); + + Tensor ret = op_argmin_out(in, {}, false, out); + EXPECT_TENSOR_EQ(out, ret); + EXPECT_TENSOR_EQ(out, expected); +} diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp index 97d538971c5..8f41419a8e0 100644 --- a/kernels/test/op_div_test.cpp +++ b/kernels/test/op_div_test.cpp @@ -83,6 +83,52 @@ class OpDivOutTest : public OperatorTest { ET_EXPECT_KERNEL_FAILURE(context_, op_div_out(a, b, out)); } + template <ScalarType DTYPE> + void test_broadcast_3D() { + TensorFactory<DTYPE> tf_a; + + Tensor a = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor b = tf_a.make({2, 1, 3}, /*data=*/{2, 3, 4, 5, 6, 7}); + + // Destination for output of div. + Tensor out = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor expected = tf_a.make( + {2, 2, 3}, + /*data=*/ + {0.5000, + 0.6667, + 0.75002, + 2.0000, + 1.6667, + 1.5000, + 1.4000, + 1.3333, + 1.2857, + 2.0000, + 1.8333, + 1.7143}); + // Check that it matches the expected output.
+ EXPECT_TENSOR_CLOSE_WITH_TOL(op_div_out(a, b, out), expected, 1e-4, 1e-4); + expected = tf_a.make( + {2, 2, 3}, + /*data=*/ + {2.0000, + 1.5000, + 1.3333, + 0.5000, + 0.6000, + 0.6667, + 0.7143, + 0.7500, + 0.7778, + 0.5000, + 0.5455, + 0.5833}); + EXPECT_TENSOR_CLOSE_WITH_TOL(op_div_out(b, a, out), expected, 1e-4, 1e-4); + } + /** * Common testing for div operator, for float output types */ @@ -457,6 +503,14 @@ TEST_F(OpDivOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } +TEST_F(OpDivOutTest, BroadcastNDTest) { + // Test 3D tensors + test_broadcast_3D<ScalarType::Float>(); + // half and bfloat16 are not supported for div quite yet + // test_broadcast_3D<ScalarType::Half>(); + // test_broadcast_3D<ScalarType::BFloat16>(); +} + TEST_F(OpDivOutTest, DynamicShapeUnbound) { GTEST_SKIP() << "Dynamic shape not supported"; TensorFactory<ScalarType::Float> tf; diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp index ab766d2c6ce..49ef5235d0f 100644 --- a/kernels/test/op_le_test.cpp +++ b/kernels/test/op_le_test.cpp @@ -174,3 +174,15 @@ TEST_F(OpLeTensorOutTest, DynamicOutShapeTest) { op_le_tensor_out(a, b, out); EXPECT_TENSOR_EQ(out, tf.make({2, 2}, {false, true, true, false})); } + +TEST_F(OpLeTensorOutTest, BroadcastTest) { + TensorFactory<ScalarType::Int> tf; + + Tensor a = tf.make(/*sizes=*/{4}, /*data=*/{2, 3, 2, 4}); + Tensor b = tf.make({1, 1}, {3}); + + Tensor out = tf.zeros({1, 4}); + + op_le_tensor_out(a, b, out); + EXPECT_TENSOR_EQ(out, tf.make({1, 4}, {true, true, true, false})); +} diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 94047592a80..1b01ff8a78d 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -72,6 +72,59 @@ class OpLogSoftmaxOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(out, expected); } } + + template <ScalarType DTYPE> + void test_dtype_noncontiguous_dim() { + TensorFactory<DTYPE> tf; + + // Dim 0 must be longer than the vector width of the machine (for + // float, this is 4 for ARM64 and 8 for AVX2) to exhibit problems.
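The vector-width comment above is the point of this regression test: the kernel must still reduce correctly over a dim that is both non-contiguous and longer than one SIMD register. For reference, the expected values in the test body that follows are simply x - logsumexp(x) taken along dim 0. A plain scalar sketch that reproduces them (an assumed standalone helper, not code from this patch):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Computes log-softmax over dim 0 of a row-major [rows, cols] buffer.
// For the test input below, column {0, 1, ..., 8} has
// logsumexp = 8 + log(sum of exp(k - 8)) which is roughly 8.45855,
// so the first output row is -8.45855, matching the expected tensor.
void log_softmax_dim0(const float* x, float* out, size_t rows, size_t cols) {
  for (size_t c = 0; c < cols; ++c) {
    // Max-stabilized logsumexp of the column.
    float max_v = x[c];
    for (size_t r = 1; r < rows; ++r) {
      max_v = std::max(max_v, x[r * cols + c]);
    }
    float sum = 0.0f;
    for (size_t r = 0; r < rows; ++r) {
      sum += std::exp(x[r * cols + c] - max_v);
    }
    const float lse = max_v + std::log(sum);
    for (size_t r = 0; r < rows; ++r) {
      out[r * cols + c] = x[r * cols + c] - lse;
    }
  }
}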
+ // clang-format off + Tensor x = tf.make( + {9, 3}, + { + 0, 9, 18, + 1, 10, 19, + 2, 11, 20, + 3, 12, 21, + 4, 13, 22, + 5, 14, 23, + 6, 15, 24, + 7, 16, 25, + 8, 17, 26, + }); + // clang-format on + + Tensor out = tf.zeros({9, 3}); + + op_log_softmax_out(x, /*dim=*/0, /*half_to_float*/ false, out); + + // clang-format off + Tensor expected = tf.make( + {9, 3}, + { + -8.45855, -8.45855, -8.45855, + -7.45855, -7.45855, -7.45855, + -6.45855, -6.45855, -6.45855, + -5.45855, -5.45855, -5.45855, + -4.45855, -4.45855, -4.45855, + -3.45855, -3.45855, -3.45855, + -2.45855, -2.45855, -2.45855, + -1.45855, -1.45855, -1.45855, + -0.458552, -0.458552, -0.458552 + }); + // clang-format on + + if constexpr (DTYPE == ScalarType::BFloat16) { + EXPECT_TENSOR_CLOSE_WITH_TOL( + out, + expected, + 1e-2, + executorch::runtime::testing::internal::kDefaultAtol); + } else { + EXPECT_TENSOR_CLOSE(out, expected); + } + } }; TEST_F(OpLogSoftmaxOutTest, Smoke) { @@ -101,6 +154,10 @@ TEST_F(OpLogSoftmaxOutTest, AllDtypesSupported) { #undef TEST_ENTRY } +TEST_F(OpLogSoftmaxOutTest, NonContiguous) { + test_dtype_noncontiguous_dim<ScalarType::Float>(); +} + TEST_F(OpLogSoftmaxOutTest, MismatchedDimensionsDies) { if (SupportedFeatures::get()->is_aten) { GTEST_SKIP() << "ATen currently supports mismatched dimensions"; diff --git a/kernels/test/op_max_pool2d_with_indices_backward_test.cpp b/kernels/test/op_max_pool2d_with_indices_backward_test.cpp new file mode 100644 index 00000000000..c647ad05c5f --- /dev/null +++ b/kernels/test/op_max_pool2d_with_indices_backward_test.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; + +class OpMaxPool2DWithIndicesBackwardOutTest : public OperatorTest { + protected: + executorch::aten::Tensor& op_max_pool2d_with_indices_backward_out( + const executorch::aten::Tensor& grad_output, + const executorch::aten::Tensor& input, + executorch::aten::ArrayRef kernel_size, + executorch::aten::ArrayRef stride, + executorch::aten::ArrayRef padding, + executorch::aten::ArrayRef dilation, + bool ceil_mode, + const executorch::aten::Tensor& indices, + executorch::aten::Tensor& grad_input) { + return torch::executor::aten::max_pool2d_with_indices_backward_outf( + context_, + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, + grad_input); + } + + template + void test_4d_dtype() { + torch::executor::testing::TensorFactory tf; + torch::executor::testing::TensorFactory + tfLong; + + executorch::aten::Tensor grad_output = tf.make( + {2, 3, 4, 4}, + {69, 97, 97, 99, 69, 97, 97, 99, 12, 79, 85, 85, 77, 77, 85, 85, + 87, 73, 73, 68, 87, 94, 94, 68, -30, 94, 94, 8, 71, 74, 77, 77, + 4, -8, -12, -46, 87, 90, 90, -45, 87, 90, 90, 17, 63, 28, 88, 88, + 83, 83, 61, 61, 83, 83, 47, 49, 16, 47, 47, 74, 90, 90, 73, 74, + 41, 81, 81, 29, 84, 81, 81, 17, 84, 45, 99, 99, 16, 45, 99, 99, + 54, 54, 5, 29, 54, 68, 68, 29, 90, 90, 68, 90, 99, 99, 65, 90}); + + executorch::aten::Tensor input = tf.make( + {2, 3, 5, 5}, + {28, -38, -7, -13, 70, 53, 69, 97, 25, 99, -72, -87, 79, 42, + -24, -15, 12, -86, 85, 0, 67, 77, 53, -61, 50, 3, 42, -37, + 51, -60, 87, 32, 73, 68, -84, -98, -30, 94, 1, -86, -56, -68, + 74, -51, 8, 71, -53, 4, 77, -89, 4, -46, -46, -92, -85, -23, + -8, -12, -46, -88, 66, 87, 90, -45, -78, 63, 28, 28, -30, 17, + -16, 5, 11, 88, -47, 72, 32, -7, 61, -63, -22, 83, -40, -78, + 49, -39, -89, 47, -61, 7, 16, -96, -22, 8, 74, 12, 90, 73, + -71, -10, 41, 1, 10, -34, 29, -27, 26, 81, -8, 17, 84, -23, + -53, -26, -67, -90, 16, 45, 99, 56, -87, -65, -79, 31, 79, 6, + 44, -55, -5, -68, -38, 54, -3, 5, 29, -39, 26, 68, -24, -53, + 51, 90, 65, 43, 90, -41, 99, 6, -31, -94}); + + ::std::vector kernel_size_vec = {2, 2}; + executorch::aten::ArrayRef kernel_size = + executorch::aten::ArrayRef( + kernel_size_vec.data(), kernel_size_vec.size()); + ::std::vector stride_vec = {1, 1}; + executorch::aten::ArrayRef stride = + executorch::aten::ArrayRef( + stride_vec.data(), stride_vec.size()); + ::std::vector padding_vec = {0, 0}; + executorch::aten::ArrayRef padding = + executorch::aten::ArrayRef( + padding_vec.data(), padding_vec.size()); + ::std::vector dilation_vec = {1, 1}; + executorch::aten::ArrayRef dilation = + executorch::aten::ArrayRef( + dilation_vec.data(), dilation_vec.size()); + bool ceil_mode = false; + executorch::aten::Tensor indices = tfLong.make( + {2, 3, 4, 4}, + {6, 7, 7, 9, 6, 7, 7, 9, 16, 12, 18, 18, 21, 21, 18, 18, + 5, 7, 7, 8, 5, 12, 12, 8, 11, 12, 12, 19, 20, 17, 23, 23, + 0, 6, 7, 8, 11, 12, 12, 13, 11, 12, 12, 19, 15, 16, 23, 23, + 6, 6, 3, 3, 6, 6, 12, 9, 15, 12, 12, 19, 21, 21, 22, 19, + 0, 7, 7, 4, 10, 7, 7, 9, 10, 17, 18, 18, 16, 17, 18, 18, + 6, 6, 8, 9, 6, 12, 12, 9, 16, 16, 12, 19, 21, 21, 17, 19}); + executorch::aten::Tensor grad_input = tf.zeros({2, 3, 5, 5}); + executorch::aten::Tensor grad_input_expected = tf.make( + {2, 3, 5, 5}, + {0, 0, 0, 0, 0, 0, 138, 388, 0, 198, 0, 0, 79, 0, 0, + 0, 12, 0, 340, 0, 0, 154, 0, 0, 0, 0, 0, 0, 0, 0, + 174, 0, 146, 136, 0, 0, -30, 376, 0, 0, 0, 0, 74, 0, 8, + 71, 0, 
0, 154, 0, 4, 0, 0, 0, 0, 0, -8, -12, -46, 0, + 0, 174, 360, -45, 0, 63, 28, 0, 0, 17, 0, 0, 0, 176, 0, + 0, 0, 0, 122, 0, 0, 332, 0, 0, 49, 0, 0, 141, 0, 0, + 16, 0, 0, 0, 148, 0, 180, 73, 0, 0, 41, 0, 0, 0, 29, + 0, 0, 324, 0, 17, 168, 0, 0, 0, 0, 0, 16, 90, 396, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 162, 0, 5, 58, + 0, 0, 204, 0, 0, 0, 180, 65, 0, 180, 0, 198, 0, 0, 0}); + op_max_pool2d_with_indices_backward_out( + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, + grad_input); + EXPECT_TENSOR_CLOSE(grad_input, grad_input_expected); + } + + template + void test_3d_dtype() { + torch::executor::testing::TensorFactory tf; + torch::executor::testing::TensorFactory + tfLong; + + executorch::aten::Tensor grad_output = + tf.make({2, 5, 5}, {89, 89, 89, 20, 20, 89, 89, 86, 49, 80, 89, 89, 99, + 99, 99, 84, 84, 86, 86, 86, 51, 86, 86, 86, 62, 42, + 67, 85, 85, 85, 75, 75, 42, 42, 74, 75, 98, 98, 98, + 61, 95, 98, 98, 98, 93, 88, 88, 13, 13, 67}); + + executorch::aten::Tensor input = tf.make( + {2, 12, 12}, + {73, 15, 30, 89, -55, -62, 25, -50, -47, 12, -73, -89, 53, -63, + -44, 86, 53, -84, -6, 20, -24, -43, -11, -34, -7, -13, 74, 33, + -44, 49, -59, -88, -46, -33, 48, 80, 38, -58, 0, -48, -46, -87, + -66, 14, -68, -77, -50, -15, 86, 89, -37, 7, -16, -6, 55, 40, + -83, -77, -55, 32, -17, -83, 43, 17, 2, -51, 20, -77, -68, -72, + -47, -78, -49, -52, -7, -25, -77, -8, -3, 99, 71, 19, 21, -47, + 44, -90, -75, -87, 79, -42, -90, 22, 2, 73, -65, -50, -71, 19, + -60, -91, -43, -60, 16, 86, -93, -78, 82, 14, 20, 19, 33, 84, + 60, 41, 2, -4, -52, 74, -40, -60, 88, 51, -59, 49, -81, -93, + 43, -99, 40, -84, 76, 27, 59, -19, -55, -50, 81, 86, -19, 51, + 70, -90, 74, 62, 0, -31, -71, 42, 42, 67, 26, 85, -11, -34, + -97, 5, -45, -50, 74, -62, -81, -84, 70, 33, -27, -54, 94, 74, + -30, 16, 39, 0, 0, -80, 85, 42, 13, -82, -30, -95, 34, -60, + -51, -10, -30, -65, -96, -95, 60, -33, 67, -88, -26, 75, 29, -27, + -28, 21, -2, -29, 11, -68, -36, -85, -4, 9, -31, -63, 98, -1, + 17, 61, -50, 41, -18, -92, -50, -40, 14, 18, 22, 10, 58, -86, + -9, 5, -69, -50, -26, 26, 57, -94, -53, 98, 37, 35, -20, -9, + -13, -41, 41, 95, 82, -71, -43, -37, -91, -14, -55, 52, -30, 93, + -26, 83, 2, -63, 52, 31, 57, 42, -2, -45, 99, -18, 38, 88, + 36, -36, -35, 13, -31, -50, 10, -38, 1, 67, 3, -87, 42, -31, + -77, -7, -94, -99, 24, -21, -98, 15}); + ::std::vector kernel_size_vec = {4, 3}; + executorch::aten::ArrayRef kernel_size = + executorch::aten::ArrayRef( + kernel_size_vec.data(), kernel_size_vec.size()); + ::std::vector stride_vec = {3, 2}; + executorch::aten::ArrayRef stride = + executorch::aten::ArrayRef( + stride_vec.data(), stride_vec.size()); + ::std::vector padding_vec = {2, 1}; + executorch::aten::ArrayRef padding = + executorch::aten::ArrayRef( + padding_vec.data(), padding_vec.size()); + ::std::vector dilation_vec = {1, 2}; + executorch::aten::ArrayRef dilation = + executorch::aten::ArrayRef( + dilation_vec.data(), dilation_vec.size()); + bool ceil_mode = false; + executorch::aten::Tensor indices = tfLong.make( + {2, 5, 5}, + {3, 3, 3, 19, 19, 49, 49, 15, 29, 35, 49, 49, 79, + 79, 79, 111, 111, 103, 103, 103, 121, 137, 137, 137, 143, 3, + 5, 7, 7, 7, 49, 49, 31, 31, 23, 49, 89, 89, 89, + 67, 97, 89, 89, 89, 107, 121, 121, 125, 125, 131}); + executorch::aten::Tensor grad_input = tf.zeros({2, 12, 12}); + executorch::aten::Tensor grad_input_expected = tf.make( + {2, 12, 12}, + {0, 0, 0, 267, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86, 0, 0, + 0, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 0, 0, 
0, 0, 0, 80, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 356, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 297, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 258, 0, 0, 0, 0, + 0, 0, 0, 168, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 258, 0, 0, 0, 0, 0, 62, + 0, 0, 0, 42, 0, 67, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 74, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 225, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 588, + 0, 0, 0, 0, 0, 0, 0, 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 26, + 0, 0, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + + op_max_pool2d_with_indices_backward_out( + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, + grad_input); + EXPECT_TENSOR_CLOSE(grad_input, grad_input_expected); + } +}; + +TEST_F(OpMaxPool2DWithIndicesBackwardOutTest, SanityTest4D) { +#define TEST_ENTRY(ctype, dtype) \ + test_4d_dtype(); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpMaxPool2DWithIndicesBackwardOutTest, SanityTest3D) { +#define TEST_ENTRY(ctype, dtype) \ + test_3d_dtype(); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index 39fc9e14925..aafaf688b0d 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -99,6 +99,109 @@ class OpSubOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.1, 1.2, 3.4, 7.8})); } + template + void test_broadcast_3D() { + TensorFactory tf_a; + + Tensor a = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor b = tf_a.make({2, 1, 3}, /*data=*/{2, 3, 4, 5, 6, 7}); + + // Destination for output of mul. + Tensor out = + tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor expected = + tf_a.make({2, 2, 3}, /*data=*/{-1, -1, -1, 2, 2, 2, 2, 2, 2, 5, 5, 5}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + // b - a * 1.5 output should be + expected = tf_a.make( + {2, 2, 3}, + /*data=*/ + {0.5, + 0.0, + -0.5, + -4.0, + -4.5, + -5.0, + -5.5, + -6.0, + -6.5, + -10.0, + -10.5, + -11.0}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.5, out), expected); + } + + template + void test_broadcast_4D() { + TensorFactory tf_a; + + Tensor a = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60}); + Tensor b = tf_a.make( + {2, 1, 3, 5}, + /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + + // Destination for output of mul. + Tensor out = tf_a.zeros({2, 2, 3, 5}); + Tensor expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}); + + // Check that it matches the expected output. 
+ EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -15, -15, -15, -15, -15, -15, -15, -15, -15, + -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, + -15, -15, -15, -15, -15, -15, -15, -15, -15, -30, -30, -30, + -30, -30, -30, -30, -30, -30, -30, -30, -30, -30, -30, -30}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.0, out), expected); + + b = tf_a.make( + {2, 2, 1, 5}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}); + out = tf_a.zeros({2, 2, 3, 5}); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 35, 35, 35, 35, 35, 40, 40, 40, 40, 40}); + + // Check that it matches the expected output. + EXPECT_TENSOR_CLOSE(op_sub_out(a, b, 1.0, out), expected); + expected = tf_a.make( + {2, 2, 3, 5}, + /*data=*/{-0.5000, -1.0000, -1.5000, -2.0000, -2.5000, + -8.0000, -8.5000, -9.0000, -9.5000, -10.0000, + -15.5000, -16.0000, -16.5000, -17.0000, -17.5000, + + -18.0000, -18.5000, -19.0000, -19.5000, -20.0000, + -25.5000, -26.0000, -26.5000, -27.0000, -27.5000, + -33.0000, -33.5000, -34.0000, -34.5000, -35.0000, + + -35.5000, -36.0000, -36.5000, -37.0000, -37.5000, + -43.0000, -43.5000, -44.0000, -44.5000, -45.0000, + -50.5000, -51.0000, -51.5000, -52.0000, -52.5000, + + -53.0000, -53.5000, -54.0000, -54.5000, -55.0000, + -60.5000, -61.0000, -61.5000, -62.0000, -62.5000, + -68.0000, -68.5000, -69.0000, -69.5000, -70.0000}); + EXPECT_TENSOR_CLOSE(op_sub_out(b, a, 1.5, out), expected); + } + void test_sub_enumerate_a_types() { #define ENUMERATE_TEST_ENTRY(ctype, dtype) \ test_sub_enumerate_b_types(); @@ -237,6 +340,19 @@ TEST_F(OpSubOutTest, BroadcastScalarRank0Supported) { EXPECT_TENSOR_EQ(out, ret); } +TEST_F(OpSubOutTest, BroadcastNDTest) { + // Test 3D tensors + test_broadcast_3D(); + test_broadcast_3D(); + // Sub doesnt yet support BFloat16 + // test_broadcast_3D(); + + // Test 4D tensors + test_broadcast_4D(); + test_broadcast_4D(); + // test_broadcast_4D(); +} + // // Death Tests // diff --git a/kernels/test/op_unfold_copy_test.cpp b/kernels/test/op_unfold_copy_test.cpp new file mode 100644 index 00000000000..ef3c09c10e3 --- /dev/null +++ b/kernels/test/op_unfold_copy_test.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator +#include <executorch/kernels/test/TestUtil.h> +#include <executorch/kernels/test/supported_features.h> +#include <executorch/runtime/core/exec_aten/exec_aten.h> +#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h> + +#include <gtest/gtest.h> + +using namespace ::testing; using executorch::aten::ScalarType; using executorch::aten::Tensor; using torch::executor::testing::TensorFactory; class OpUnfoldTest : public OperatorTest { protected: Tensor& op_unfold_copy_out( const Tensor& self, int64_t dim, int64_t size, int64_t step, Tensor& out) { return torch::executor::aten::unfold_copy_outf( context_, self, dim, size, step, out); } + template <ScalarType DTYPE> + void test_unfold_copy_dtype() { + TensorFactory<DTYPE> tf; + + auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto expected = tf.make({3, 2, 2}, {1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9}); + auto actual_out = tf.zeros_like(expected); + op_unfold_copy_out(input, /*dim=*/1, /*size=*/2, /*step=*/1, actual_out); + EXPECT_TENSOR_CLOSE(actual_out, expected); + } +}; + +TEST_F(OpUnfoldTest, SmokeTest) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8}); + auto output = tf.zeros_like(expected); + + op_unfold_copy_out(input, /*dim=*/1, /*size=*/2, /*step=*/2, output); + EXPECT_TENSOR_CLOSE(output, expected); +} + +TEST_F(OpUnfoldTest, DType) { +#define TEST_ENTRY(ctype, dtype) \ + test_unfold_copy_dtype<ScalarType::dtype>(); + ET_FORALL_REALHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpUnfoldTest, ZeroDimension) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + const auto expected = + tf.make({2, 3, 2}, {1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9}); + auto output = tf.zeros_like(expected); + + op_unfold_copy_out(input, /*dim=*/0, /*size=*/2, /*step=*/1, output); + EXPECT_TENSOR_CLOSE(output, expected); +} + +TEST_F(OpUnfoldTest, NegativeDimension) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8}); + auto output = tf.zeros_like(expected); + + op_unfold_copy_out(input, /*dim=*/-1, /*size=*/2, /*step=*/2, output); + EXPECT_TENSOR_CLOSE(output, expected); +} + +TEST_F(OpUnfoldTest, LargeStep) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8}); + auto output = tf.zeros_like(expected); + + op_unfold_copy_out(input, /*dim=*/-1, /*size=*/2, /*step=*/5, output); + EXPECT_TENSOR_CLOSE(output, expected); +} + +TEST_F(OpUnfoldTest, ZeroSize) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + const auto expected = tf.make({3, 4, 0}, {}); + auto output = tf.zeros_like(expected); + + op_unfold_copy_out(input, /*dim=*/1, /*size=*/0, /*step=*/1, output); + EXPECT_TENSOR_CLOSE(output, expected); +} + +TEST_F(OpUnfoldTest, NegativeSizeAndNegativeStepDies) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto output = tf.zeros({3, 1, 2}); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op_unfold_copy_out(input, /*dim=*/1, /*size=*/-1, /*step=*/1, output)); + ET_EXPECT_KERNEL_FAILURE( + context_, + op_unfold_copy_out(input, /*dim=*/1, /*size=*/1, /*step=*/-1, output)); +} + +TEST_F(OpUnfoldTest, InvalidDimAndSizeTooLargeDies) { + TensorFactory<ScalarType::Float> tf; + const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); + auto output = tf.zeros({3, 1, 2}); + ET_EXPECT_KERNEL_FAILURE( + context_, + op_unfold_copy_out(input, /*dim=*/3, /*size=*/2, /*step=*/1, output)); + ET_EXPECT_KERNEL_FAILURE(
+ context_, + op_unfold_copy_out(input, /*dim=*/1, /*size=*/10, /*step=*/1, output)); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 91f2121bebc..18ab0ac2e28 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -37,7 +37,7 @@ def define_common_targets(): ], fbcode_exported_deps = [ "//common/gtest:gtest", - ], + ] if not runtime.is_oss else [], xplat_exported_deps = [ "//third-party/googletest:gtest_main", ], @@ -68,7 +68,7 @@ def define_common_targets(): fbcode_exported_deps = [ "//common/init:init", "//common/gtest:gtest", - ], + ] if not runtime.is_oss else [], xplat_exported_deps = [ "//xplat/folly:init_init", "//third-party/googletest:gtest_main", @@ -261,6 +261,7 @@ def define_common_targets(): _common_op_test("op_masked_select_test", ["aten", "portable"]) _common_op_test("op_max_test", ["aten", "portable"]) _common_op_test("op_max_pool2d_with_indices_test", ["aten", "portable"]) + _common_op_test("op_max_pool2d_with_indices_backward_test", ["aten", "portable"]) _common_op_test("op_maximum_test", ["aten", "portable"]) _common_op_test("op_mean_test", ["aten", "portable"]) _common_op_test("op_min_test", ["aten", "portable"]) @@ -324,6 +325,7 @@ def define_common_targets(): _common_op_test("op_tril_test", ["aten", "portable"]) _common_op_test("op_trunc_test", ["aten", "portable"]) _common_op_test("op_unbind_copy_test", ["aten", "portable"]) + _common_op_test("op_unfold_copy_test", ["aten", "portable"]) _common_op_test("op_unsqueeze_copy_test", ["aten", "portable"]) _common_op_test("op_upsample_bilinear2d_test", ["aten", "portable"]) _common_op_test("op_upsample_nearest2d_test", ["aten", "portable"]) diff --git a/pyproject.toml b/pyproject.toml index fb4196d99bc..90640c8dbdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,28 @@ Changelog = "https://github.com/pytorch/executorch/releases" [project.scripts] flatc = "executorch.data.bin:flatc" +# TODO(dbort): Could use py_modules to restrict the set of modules we +# package, and package_data to restrict the set up non-python files we +# include. See also setuptools/discovery.py for custom finders. +[tool.setuptools.package-dir] +"executorch.backends" = "backends" +"executorch.codegen" = "codegen" +"executorch.data.bin" = "data/bin" +# TODO(mnachin T180504136): Do not put examples/models +# into core pip packages. Refactor out the necessary utils +# or core models files into a separate package. +"executorch.examples.apple.coreml.llama" = "examples/apple/coreml/llama" +"executorch.examples.llm_pte_finetuning" = "examples/llm_pte_finetuning" +"executorch.examples.models" = "examples/models" +"executorch.exir" = "exir" +"executorch.extension" = "extension" +"executorch.kernels.quantized" = "kernels/quantized" +"executorch.schema" = "schema" +"executorch.devtools" = "devtools" +"executorch.devtools.bundled_program" = "devtools/bundled_program" +"executorch.runtime" = "runtime" +"executorch.util" = "util" + [tool.setuptools.package-data] # TODO(dbort): Prune /test[s]/ dirs, /third-party/ dirs, yaml files that we # don't need. diff --git a/requirements-dev.txt b/requirements-dev.txt index fb59bf3d21a..c79f93d0568 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -cmake # For building binary targets in the wheel. +cmake>=3.19 # For building binary targets in the wheel. pip>=23 # For building the pip package. pyyaml # Imported by the kernel codegen tools. setuptools>=63 # For building the pip package contents. 
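The op_unfold_copy tests above all follow the standard unfold semantics: along the chosen dim, the output holds consecutive windows of `size` elements taken every `step` elements, so a length-L dim yields (L - size) / step + 1 windows. A 1-D sketch of that reference behavior (a hypothetical helper, not from this patch):

#include <cstdint>
#include <vector>

// Returns the unfold windows of a 1-D sequence: windows of `size`
// elements, starting every `step` elements (assumes step > 0).
// With in = {1, 2, 3}, size = 2, step = 2 this yields {{1, 2}}, matching
// each row of the SmokeTest expectation above; size = 0 yields four
// empty windows for a length-3 input, as in the ZeroSize test.
std::vector<std::vector<int64_t>> unfold_1d(
    const std::vector<int64_t>& in,
    int64_t size,
    int64_t step) {
  std::vector<std::vector<int64_t>> windows;
  const int64_t len = static_cast<int64_t>(in.size());
  for (int64_t start = 0; start + size <= len; start += step) {
    windows.emplace_back(in.begin() + start, in.begin() + start + size);
  }
  return windows;
}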
diff --git a/runtime/__init__.py b/runtime/__init__.py index 4ed99ddae01..ed315316c9c 100644 --- a/runtime/__init__.py +++ b/runtime/__init__.py @@ -42,7 +42,7 @@ import functools from pathlib import Path from types import ModuleType -from typing import Any, BinaryIO, Dict, Optional, Sequence, Set, Union +from typing import Any, BinaryIO, Dict, List, Optional, Sequence, Set, Union try: from executorch.extension.pybindings.portable_lib import ( @@ -125,6 +125,27 @@ def load_method(self, name: str) -> Optional[Method]: return self._methods.get(name, None) +class BackendRegistry: + """The registry of backends that are available to the runtime.""" + + def __init__(self, legacy_module: ModuleType) -> None: + # TODO: Expose the kernel callables to Python. + self._legacy_module = legacy_module + + @property + def registered_backend_names(self) -> List[str]: + """ + Returns the names of all registered backends as a list of strings. + """ + return self._legacy_module._get_registered_backend_names() + + def is_available(self, backend_name: str) -> bool: + """ + Returns whether a backend with the given name is registered and + available to the runtime. + """ + return self._legacy_module._is_available(backend_name) + + class OperatorRegistry: """The registry of operators that are available to the runtime.""" @@ -157,6 +178,7 @@ def get() -> "Runtime": def __init__(self, *, legacy_module: ModuleType) -> None: # Public attributes. + self.backend_registry = BackendRegistry(legacy_module) self.operator_registry = OperatorRegistry(legacy_module) # Private attributes. self._legacy_module = legacy_module diff --git a/runtime/backend/backend_init_context.h b/runtime/backend/backend_init_context.h index 49bf4adbf9a..de1661c3af0 100644 --- a/runtime/backend/backend_init_context.h +++ b/runtime/backend/backend_init_context.h @@ -8,6 +8,7 @@ #pragma once #include <executorch/runtime/core/memory_allocator.h> +#include <executorch/runtime/core/named_data_map.h> namespace executorch { namespace runtime { @@ -21,8 +22,11 @@ class BackendInitContext final { explicit BackendInitContext( MemoryAllocator* runtime_allocator, EventTracer* event_tracer = nullptr, - const char* method_name = nullptr) - : runtime_allocator_(runtime_allocator), method_name_(method_name) {} + const char* method_name = nullptr, + const NamedDataMap* named_data_map = nullptr) + : runtime_allocator_(runtime_allocator), + method_name_(method_name), + named_data_map_(named_data_map) {} /** Get the runtime allocator passed from Method. It's the same runtime * executor used by the standard executor runtime and the life span is the @@ -52,10 +56,18 @@ class BackendInitContext final { return method_name_; } + /** Get the named data map from ExecuTorch runtime. + * This provides a way for backends to retrieve data blobs by key.
+ */ + const NamedDataMap* get_named_data_map() const { + return named_data_map_; + } + private: MemoryAllocator* runtime_allocator_ = nullptr; EventTracer* event_tracer_ = nullptr; const char* method_name_ = nullptr; + const NamedDataMap* named_data_map_ = nullptr; }; } // namespace runtime diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp index 84c0bb82d43..4fb1eadfa87 100644 --- a/runtime/backend/interface.cpp +++ b/runtime/backend/interface.cpp @@ -55,5 +55,16 @@ Error register_backend(const Backend& backend) { return Error::Ok; } +size_t get_num_registered_backends() { + return num_registered_backends; +} + +Result get_backend_name(size_t index) { + if (index >= num_registered_backends) { + return Error::InvalidArgument; + } + return registered_backends[index].name; +} + } // namespace runtime } // namespace executorch diff --git a/runtime/backend/interface.h b/runtime/backend/interface.h index c0305f68cd3..0a3c069a201 100644 --- a/runtime/backend/interface.h +++ b/runtime/backend/interface.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -139,6 +140,16 @@ struct Backend { */ ET_NODISCARD Error register_backend(const Backend& backend); +/** + * Returns the number of registered backends. + */ +size_t get_num_registered_backends(); + +/** + * Returns the backend name at the given index. + */ +Result get_backend_name(size_t index); + } // namespace runtime } // namespace executorch diff --git a/runtime/backend/targets.bzl b/runtime/backend/targets.bzl index fe7ce489b56..d2187afb5fc 100644 --- a/runtime/backend/targets.bzl +++ b/runtime/backend/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( name = "interface" + aten_suffix, @@ -29,5 +29,6 @@ def define_common_targets(): "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/core:event_tracer" + aten_suffix, "//executorch/runtime/core:memory_allocator", + "//executorch/runtime/core:named_data_map", ], ) diff --git a/runtime/core/array_ref.h b/runtime/core/array_ref.h index d02aac955ce..a23509e8698 100644 --- a/runtime/core/array_ref.h +++ b/runtime/core/array_ref.h @@ -29,6 +29,7 @@ #include #include +#include #include namespace executorch { @@ -149,7 +150,7 @@ class ArrayRef final { if (Length != RHS.Length) { return false; } - for (size_t i = 0; i < this->Length; i++) { + for (const auto i : c10::irange(this->Length)) { if (Data[i] != RHS.Data[i]) { return false; } diff --git a/runtime/core/data_loader.h b/runtime/core/data_loader.h index 45fd1bc8189..3dda5516908 100644 --- a/runtime/core/data_loader.h +++ b/runtime/core/data_loader.h @@ -69,12 +69,12 @@ class DataLoader { SegmentInfo() = default; explicit SegmentInfo( - Type segment_type, - size_t segment_index = 0, - const char* descriptor = nullptr) - : segment_type(segment_type), - segment_index(segment_index), - descriptor(descriptor) {} + Type segment_type_, + size_t segment_index_ = 0, + const char* descriptor_ = nullptr) + : segment_type(segment_type_), + segment_index(segment_index_), + descriptor(descriptor_) {} }; virtual ~DataLoader() = default; diff --git a/runtime/core/error.h b/runtime/core/error.h index 7fbd92b7c08..73e343a5c45 100644 --- a/runtime/core/error.h +++ b/runtime/core/error.h @@ -82,6 +82,9 @@ enum class Error : error_code_t { /// Error caused by the contents of external data. InvalidExternalData = 0x24, + /// Does not have enough resources to perform the requested operation. + OutOfResources = 0x25, + /* * Delegate errors. */ diff --git a/runtime/core/event_tracer.h b/runtime/core/event_tracer.h index 5cf4f74a38a..b55e90f0b7a 100644 --- a/runtime/core/event_tracer.h +++ b/runtime/core/event_tracer.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -67,6 +68,43 @@ enum class EventTracerDebugLogLevel { kIntermediateOutputs, }; +/** + * EventTracerFilterBase is an abstract base class that provides an interface + * for filtering events based on their name or delegate debug index. + * Derived classes should implement the filter method to define specific + * filtering logic. + */ +class EventTracerFilterBase { + public: + /** + * Filters events based on the given name or delegate debug index. + * + * Note that only one of either the name or delegate_debug_index should be + * passed in. + * + * @param[in] name A pointer to a string representing the `name` of the + * event. If `delegate_debug_index` is not set to kUnsetDebugHandle, `name` + * should be set to nullptr. + * + * @param[in] delegate_debug_index A DebugHandle representing the debug index + * of the delegate. If `name` is not nullptr, this should be set to + * kUnsetDebugHandle. + * + * @return A Result indicating whether the event matches the filter + * criteria. + * - True if the event matches the filter. + * - False if the event does not match or is unknown. + * - An error code if an error occurs during filtering. + */ + virtual Result filter(char* name, DebugHandle delegate_debug_index); + + /** + * Virtual destructor for the EventTracerFilterBase class. 
+ * Ensures proper cleanup of derived class objects. + */ + virtual ~EventTracerFilterBase(); +}; + /** * Indicates the level of profiling that should be enabled. Profiling * events will be logged in increasing order of verbosity as we go down the @@ -283,8 +321,12 @@ class EventTracer { * based names are used by this delegate to identify ops executed in the * backend then kUnsetDebugHandle should be passed in here. * @param[in] output The tensor type output to be logged. + * @return A Result<bool> indicating the status of the logging operation. + * - True if the tensor type output was successfully logged. + * - False if the tensor type output was filtered out and not logged. + * - An error code if an error occurs during logging. */ - virtual void log_intermediate_output_delegate( + virtual Result<bool> log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const executorch::aten::Tensor& output) = 0; @@ -303,8 +345,13 @@ * based names are used by this delegate to identify ops executed in the * backend then kUnsetDebugHandle should be passed in here. * @param[in] output The tensor array type output to be logged. + * @return A Result<bool> indicating the status of the logging operation. + * - True if the tensor array type output was successfully logged. + * - False if the tensor array type output was filtered out and not + * logged. + * - An error code if an error occurs during logging. */ - virtual void log_intermediate_output_delegate( + virtual Result<bool> log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const ArrayRef<executorch::aten::Tensor> output) = 0; @@ -323,8 +370,12 @@ * based names are used by this delegate to identify ops executed in the * backend then kUnsetDebugHandle should be passed in here. * @param[in] output The int type output to be logged. + * @return A Result<bool> indicating the status of the logging operation. + * - True if the int type output was successfully logged. + * - False if the int type output was filtered out and not logged. + * - An error code if an error occurs during logging. */ - virtual void log_intermediate_output_delegate( + virtual Result<bool> log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const int& output) = 0; @@ -343,8 +394,12 @@ * based names are used by this delegate to identify ops executed in the * backend then kUnsetDebugHandle should be passed in here. * @param[in] output The bool type output to be logged. + * @return A Result<bool> indicating the status of the logging operation. + * - True if the bool type output was successfully logged. + * - False if the bool type output was filtered out and not logged. + * - An error code if an error occurs during logging. */ - virtual void log_intermediate_output_delegate( + virtual Result<bool> log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const bool& output) = 0; @@ -363,8 +418,12 @@ * based names are used by this delegate to identify ops executed in the * backend then kUnsetDebugHandle should be passed in here. * @param[in] output The double type output to be logged. + * @return A Result<bool> indicating the status of the logging operation. + * - True if the double type output was successfully logged. + * - False if the double type output was filtered out and not logged. + * - An error code if an error occurs during logging.
*/ - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const double& output) = 0; @@ -443,6 +502,12 @@ class EventTracer { event_tracer_profiling_level_ = profiling_level; } + /** + * Set the filter of event tracer for delegation intermediate outputs. + */ + void set_delegation_intermediate_output_filter( + EventTracerFilterBase* event_tracer_filter); + /** * Return the current level of event tracer profiling. */ diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 828aa779b0f..0d84e88d64d 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -106,7 +106,7 @@ template using optional = torch::executor::optional; using nullopt_t = torch::executor::nullopt_t; // NOLINTNEXTLINE(facebook-hte-NamespaceScopedStaticDeclaration) -static constexpr nullopt_t nullopt{0}; +using std::nullopt; using ScalarType = torch::executor::ScalarType; using TensorList = ArrayRef; using Scalar = torch::executor::Scalar; diff --git a/runtime/core/exec_aten/targets.bzl b/runtime/core/exec_aten/targets.bzl index 5664a2aea41..9114be639c0 100644 --- a/runtime/core/exec_aten/targets.bzl +++ b/runtime/core/exec_aten/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" # Depend on this target if your types (Tensor, ArrayRef, etc) should be flexible between ATen and executor diff --git a/runtime/core/exec_aten/testing_util/targets.bzl b/runtime/core/exec_aten/testing_util/targets.bzl index 446d0a97769..ed130c8706c 100644 --- a/runtime/core/exec_aten/testing_util/targets.bzl +++ b/runtime/core/exec_aten/testing_util/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -6,7 +6,7 @@ def define_common_targets(): The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/runtime/core/exec_aten/testing_util/test/CMakeLists.txt b/runtime/core/exec_aten/testing_util/test/CMakeLists.txt index b4c4cee0e16..a51717f43a2 100644 --- a/runtime/core/exec_aten/testing_util/test/CMakeLists.txt +++ b/runtime/core/exec_aten/testing_util/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) 
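The filtering hooks above let a profiler skip logging for selected delegate events: set_delegation_intermediate_output_filter() installs an EventTracerFilterBase, and the Result-returning log_intermediate_output_delegate() overloads report False when an output was filtered out rather than logged. A sketch of a name-prefix filter, assuming the filter hook returns Result<bool> to match those overloads (the class and its behavior below are illustrative, not part of this patch):

#include <cstring>

#include <executorch/runtime/core/event_tracer.h>

using executorch::runtime::DebugHandle;
using executorch::runtime::EventTracerFilterBase;
using executorch::runtime::Result;

// Matches events whose name starts with a fixed prefix; events identified
// only by a delegate debug index are not matched in this sketch.
class PrefixEventFilter final : public EventTracerFilterBase {
 public:
  explicit PrefixEventFilter(const char* prefix) : prefix_(prefix) {}

  Result<bool> filter(char* name, DebugHandle delegate_debug_index) override {
    (void)delegate_debug_index; // unused: this sketch filters by name only
    if (name == nullptr) {
      return false; // index-only event: no name to compare against
    }
    return std::strncmp(name, prefix_, std::strlen(prefix_)) == 0;
  }

 private:
  const char* prefix_;
};

// Usage sketch: event_tracer.set_delegation_intermediate_output_filter(&filter);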
-include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs tensor_util_test.cpp tensor_factory_test.cpp) diff --git a/runtime/core/exec_aten/testing_util/test/targets.bzl b/runtime/core/exec_aten/testing_util/test/targets.bzl index 52a6dd0b7da..a37d08ecb22 100644 --- a/runtime/core/exec_aten/testing_util/test/targets.bzl +++ b/runtime/core/exec_aten/testing_util/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -15,19 +15,15 @@ def define_common_targets(): ], ) - runtime.cxx_test( - name = "tensor_factory_test", - srcs = ["tensor_factory_test.cpp"], - deps = [ - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - ], - ) + for aten_mode in get_aten_mode_options(): + aten_suffix = "_aten" if aten_mode else "" + preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [] - runtime.cxx_test( - name = "tensor_factory_test_aten", - srcs = ["tensor_factory_test.cpp"], - preprocessor_flags = ["-DUSE_ATEN_LIB"], - deps = [ - "//executorch/runtime/core/exec_aten/testing_util:tensor_util_aten", - ], - ) + runtime.cxx_test( + name = "tensor_factory_test" + aten_suffix, + srcs = ["tensor_factory_test.cpp"], + preprocessor_flags = preprocessor_flags, + deps = [ + "//executorch/runtime/core/exec_aten/testing_util:tensor_util" + aten_suffix, + ], + ) diff --git a/runtime/core/exec_aten/util/dim_order_util.h b/runtime/core/exec_aten/util/dim_order_util.h index 0aef3e5c6c9..07b3d5c2a97 100644 --- a/runtime/core/exec_aten/util/dim_order_util.h +++ b/runtime/core/exec_aten/util/dim_order_util.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -22,8 +23,8 @@ namespace runtime { namespace { template bool validate_dim_order(const DimOrderType* dim_order, const size_t dims) { - for (int32_t i = 0; i < dims; ++i) { - if (dim_order[i] >= dims) { + for (size_t i = 0; i < dims; ++i) { + if (dim_order[i] >= static_cast(dims)) { return false; } } @@ -42,8 +43,8 @@ template inline bool is_contiguous_dim_order( const DimOrderType* dim_order, const size_t dims) { - for (int i = 0; i < dims; ++i) { - if (dim_order[i] != i) { + for (size_t i = 0; i < dims; ++i) { + if (dim_order[i] != static_cast(i)) { return false; } } @@ -65,7 +66,7 @@ bool is_channels_last_dim_order( return false; } // 4-dim tensor is interpreted as NCHW, 5-dim tensor is interpreted as NCHWD - size_t channels_dim = 1; + DimOrderType channels_dim = 1; // Last value in the dim order should be the channels dim if (dim_order[dims - 1] != channels_dim) { return false; @@ -74,8 +75,8 @@ bool is_channels_last_dim_order( if (dim_order[0] != 0) { return false; } - int d = 1; - while (d < dims - 1) { + DimOrderType d = 1; + while (d < static_cast(dims) - 1) { if (dim_order[d] != d + 1) { return false; } @@ -162,8 +163,8 @@ struct StrideDimOrder { StridesType stride; DimOrderType dim_order; - StrideDimOrder(StridesType stride, DimOrderType dim_order) - : stride(stride), dim_order(dim_order) {} + StrideDimOrder(StridesType stride_, DimOrderType dim_order_) + : stride(stride_), dim_order(dim_order_) {} StrideDimOrder() = default; bool operator>(const StrideDimOrder& other) const { // descending order @@ -254,7 +255,7 @@ ET_NODISCARD inline Error stride_to_dim_order( sorter.quick_sort(array, 0, dims - 1); - for (auto i = 0; i < 
dims; i++) { + for (const auto i : c10::irange(dims)) { dim_order[i] = array[i].dim_order; } return Error::Ok; diff --git a/runtime/core/exec_aten/util/scalar_type_util.h b/runtime/core/exec_aten/util/scalar_type_util.h index 7c8162b3cdc..d07052c2ec2 100644 --- a/runtime/core/exec_aten/util/scalar_type_util.h +++ b/runtime/core/exec_aten/util/scalar_type_util.h @@ -998,11 +998,14 @@ struct promote_types { ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) -#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ - ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) \ - ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND2( \ - ADDITIONAL1, ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ - ET_INTERNAL_SWITCH_CASE( \ +#define ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + ADDITIONAL1, ADDITIONAL2, ADDITIONAL3, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES(CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ADDITIONAL1, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::ADDITIONAL2, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ ::executorch::aten::ScalarType::ADDITIONAL3, CTYPE_ALIAS, __VA_ARGS__) #define ET_INTERNAL_SWITCH_CASE_INT_TYPES(CTYPE_ALIAS, ...) \ @@ -1201,6 +1204,27 @@ struct promote_types { ET_SWITCH_REAL_TYPES_AND3( \ Half, Bool, BFloat16, TYPE, CONTEXT, NAME, CTYPE_ALIAS, __VA_ARGS__) +#define ET_SWITCH_REALHBBF16_AND_UINT_TYPES( \ + TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ + ET_INTERNAL_SWITCH( \ + TYPE, \ + CONTEXT, \ + NAME, \ + ET_INTERNAL_SWITCH_CASE_REAL_TYPES_AND3( \ + Half, Bool, BFloat16, CTYPE_ALIAS, __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt16, \ + CTYPE_ALIAS, \ + __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt32, \ + CTYPE_ALIAS, \ + __VA_ARGS__) \ + ET_INTERNAL_SWITCH_CASE( \ + ::executorch::aten::ScalarType::UInt64, \ + CTYPE_ALIAS, \ + __VA_ARGS__)) + #define ET_SWITCH_INT_TYPES(TYPE, CONTEXT, NAME, CTYPE_ALIAS, ...) \ ET_INTERNAL_SWITCH( \ TYPE, \ diff --git a/runtime/core/exec_aten/util/targets.bzl b/runtime/core/exec_aten/util/targets.bzl index 55e38d882fd..ac46da052ca 100644 --- a/runtime/core/exec_aten/util/targets.bzl +++ b/runtime/core/exec_aten/util/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -7,7 +7,7 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" exported_preprocessor_flags_ = [] diff --git a/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp b/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp index cfd416285c5..02155a4d9b4 100644 --- a/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp +++ b/runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp @@ -30,7 +30,9 @@ std::array tensor_shape_to_c_string_impl( } *p++ = '('; for (const auto elem : shape) { - if (elem < 0 || elem > internal::kMaximumPrintableTensorShapeElement) { + if (elem < 0 || + static_cast(elem) > + internal::kMaximumPrintableTensorShapeElement) { static_assert( internal::kMaximumPrintableTensorShapeElement > 99999, "must have room for error string!"); diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index d7edcfd21d5..4e5a0cebb07 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -8,11 +8,13 @@ #pragma once +#include #include #include // std::array #include // PRId64 #include #include // size_t + #include #include @@ -406,7 +408,7 @@ namespace runtime { * upper_bound - 1, inclusive. */ inline bool dim_is_valid(int64_t dim, int64_t upper_bound) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( dim >= -upper_bound && dim < upper_bound, "Dimension %" PRId64 " is out of range. Dimension should be between %" PRId64 " and %" PRId64 @@ -443,7 +445,7 @@ inline ssize_t nonempty_size( inline bool tensor_can_cast_to( executorch::aten::Tensor a, executorch::aten::ScalarType dtype) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::canCast(a.scalar_type(), dtype), "Tensor of dtype %s cannot cast to dtype %s", torch::executor::toString(a.scalar_type()), @@ -453,7 +455,7 @@ inline bool tensor_can_cast_to( } inline bool tensor_is_bool_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == executorch::aten::ScalarType::Bool, "Expected to find bool type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -464,7 +466,7 @@ inline bool tensor_is_bool_type(executorch::aten::Tensor t) { inline bool tensor_is_type( executorch::aten::Tensor t, executorch::aten::ScalarType dtype) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( t.scalar_type() == dtype, "Expected to find %s type, but tensor has type %s", torch::executor::toString(dtype), @@ -473,10 +475,41 @@ inline bool tensor_is_type( return true; } +inline bool tensor_is_type( + executorch::aten::Tensor t, + executorch::aten::ScalarType dtype, + executorch::aten::ScalarType dtype2) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + t.scalar_type() == dtype || t.scalar_type() == dtype2, + "Expected to find %s or %s type, but tensor has type %s", + torch::executor::toString(dtype), + torch::executor::toString(dtype2), + torch::executor::toString(t.scalar_type())); + + return true; +} + +inline bool tensor_is_type( + executorch::aten::Tensor t, + executorch::aten::ScalarType dtype, + executorch::aten::ScalarType dtype2, + executorch::aten::ScalarType dtype3) { + ET_LOG_MSG_AND_RETURN_IF_FALSE( + t.scalar_type() == dtype || t.scalar_type() == dtype2 || + t.scalar_type() == dtype3, + "Expected to find %s, %s, or %s type, but tensor has type %s", + torch::executor::toString(dtype), + torch::executor::toString(dtype2), + torch::executor::toString(dtype3), + 
torch::executor::toString(t.scalar_type())); + + return true; +} + inline bool tensor_is_integral_type( executorch::aten::Tensor t, bool includeBool = false) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isIntegralType(t.scalar_type(), includeBool), "Expected to find a integral type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -485,7 +518,7 @@ inline bool tensor_is_integral_type( } inline bool tensor_is_floating_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isFloatingType(t.scalar_type()), "Expected to find a floating type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -494,7 +527,7 @@ inline bool tensor_is_floating_type(executorch::aten::Tensor t) { } inline bool tensor_is_real_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isRealType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -503,7 +536,7 @@ inline bool tensor_is_real_type(executorch::aten::Tensor t) { } inline bool tensor_is_realh_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isRealHType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -512,7 +545,7 @@ inline bool tensor_is_realh_type(executorch::aten::Tensor t) { } inline bool tensor_is_realhbf16_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -521,7 +554,7 @@ inline bool tensor_is_realhbf16_type(executorch::aten::Tensor t) { } inline bool tensor_is_realhb_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isRealHBType(t.scalar_type()), "Expected to find a real type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -530,7 +563,7 @@ inline bool tensor_is_realhb_type(executorch::aten::Tensor t) { } inline bool tensor_is_realhbbf16_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( executorch::runtime::isRealHBBF16Type(t.scalar_type()), "Expected to find a real type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -539,7 +572,7 @@ inline bool tensor_is_realhbbf16_type(executorch::aten::Tensor t) { } inline bool tensor_is_complex_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isComplexType(t.scalar_type()), "Expected to find a complex type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -548,7 +581,7 @@ inline bool tensor_is_complex_type(executorch::aten::Tensor t) { } inline bool tensor_is_bits_type(executorch::aten::Tensor t) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( torch::executor::isBitsType(t.scalar_type()), "Expected to find a bits type, but tensor has type %s", torch::executor::toString(t.scalar_type())); @@ -559,7 +592,7 @@ inline bool tensor_is_bits_type(executorch::aten::Tensor t) { inline bool tensors_have_same_dtype( executorch::aten::Tensor a, executorch::aten::Tensor b) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == 
b.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s}", torch::executor::toString(a.scalar_type()), @@ -571,7 +604,7 @@ inline bool tensors_have_same_dtype( executorch::aten::Tensor a, executorch::aten::Tensor b, executorch::aten::Tensor c) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( a.scalar_type() == b.scalar_type() && b.scalar_type() == c.scalar_type(), ET_TENSOR_CHECK_PREFIX__ ": dtype={%s, %s, %s}", torch::executor::toString(a.scalar_type()), @@ -581,8 +614,8 @@ inline bool tensors_have_same_dtype( } inline bool tensor_is_rank(executorch::aten::Tensor t, size_t rank) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - t.dim() == rank, + ET_CHECK_OR_RETURN_FALSE( + static_cast(t.dim()) == rank, "Expected tensor.dim() to be %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -593,8 +626,8 @@ inline bool tensor_is_rank(executorch::aten::Tensor t, size_t rank) { inline bool tensor_has_rank_greater_or_equal_to( executorch::aten::Tensor t, size_t rank) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - t.dim() >= rank, + ET_CHECK_OR_RETURN_FALSE( + static_cast(t.dim()) >= rank, "Expected tensor.dim() to be >= %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -605,8 +638,8 @@ inline bool tensor_has_rank_greater_or_equal_to( inline bool tensor_has_rank_smaller_or_equal_to( executorch::aten::Tensor t, size_t rank) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - t.dim() <= rank, + ET_CHECK_OR_RETURN_FALSE( + static_cast(t.dim()) <= rank, "Expected tensor.dim() to be <= %zu, but got %zu", static_cast(rank), static_cast(t.dim())); @@ -616,12 +649,12 @@ inline bool tensor_has_rank_smaller_or_equal_to( inline bool tensor_has_dim(executorch::aten::Tensor t, int64_t d) { if (t.dim() == 0) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( d == 0 || d == -1, "dim must be 0 or -1 for 0-dim tensor, got %" PRId64, d); } else { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( d > 0 ? 
d < t.dim() : t.dim() + d >= 0, "%zu-dim tensor does not have dim at index %zu", static_cast(t.dim()), @@ -647,7 +680,7 @@ tensor_dim_has_index(executorch::aten::Tensor t, int64_t d, int64_t ix) { // Dimension must have been already checked by tensor_has_dim ET_CHECK(d >= 0 && d < t.dim()); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( ix >= -t.size(d) && ix < t.size(d), "index %" PRId64 " out of range [-%zu,%zu) at dimension %" PRId64 ")", ix, @@ -662,17 +695,17 @@ inline bool tensors_have_same_size_at_dims( size_t dim_a, executorch::aten::Tensor b, size_t dim_b) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( - dim_a < a.dim(), + ET_CHECK_OR_RETURN_FALSE( + dim_a < static_cast(a.dim()), "Cannot retrieve dim %zu from tensor with dim %zu", static_cast(dim_a), static_cast(a.dim())); - ET_LOG_MSG_AND_RETURN_IF_FALSE( - dim_b < b.dim(), + ET_CHECK_OR_RETURN_FALSE( + dim_b < static_cast(b.dim()), "Cannot retrieve dim %zu from tensor with dim %zu", static_cast(dim_b), static_cast(b.dim())); - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( a.size(dim_a) == b.size(dim_b), ET_TENSOR_CHECK_PREFIX__ ": a.size(%zu) = %zu does not match b.size(%zu) = %zu", @@ -700,7 +733,9 @@ inline bool tensors_have_same_shape( static_cast(b.numel()), static_cast(a.dim()), static_cast(b.dim())); - for (size_t d = 0; d < ET_MIN2(a.dim(), b.dim()); ++d) { + // Using [[maybe_unused]] as ET_LOG may not trigger based on verbosity + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a.dim(), b.dim()))) { ET_LOG( Error, " size(%zu): (%zu, %zu)", @@ -737,7 +772,8 @@ inline bool tensors_have_same_shape( static_cast(a.dim()), static_cast(b.dim()), static_cast(c.dim())); - for (size_t d = 0; d < ET_MIN3(a.dim(), b.dim(), c.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) { ET_LOG( Error, " size(%zu): (%zu, %zu, %zu)", @@ -777,7 +813,8 @@ inline bool tensor_has_expected_size( static_cast(expected_sizes.size())); size_t a_dim = static_cast(a.dim()); size_t expected_dim = static_cast(expected_sizes.size()); - for (size_t d = 0; d < ET_MIN2(a_dim, expected_dim); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a_dim, expected_dim))) { ET_LOG( Error, " size(%zu): (%zu, %zu)", @@ -800,7 +837,8 @@ inline bool tensors_have_same_strides( ET_TENSOR_CHECK_PREFIX__ ": dim=(%zu, %zu)", static_cast(a.dim()), static_cast(b.dim())); - for (size_t d = 0; d < ET_MIN2(a.dim(), b.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN2(a.dim(), b.dim()))) { ET_LOG( Error, " stride(%zu): (%zu, %zu)", @@ -825,7 +863,8 @@ inline bool tensors_have_same_strides( static_cast(a.dim()), static_cast(b.dim()), static_cast(c.dim())); - for (size_t d = 0; d < ET_MIN3(a.dim(), b.dim(), c.dim()); ++d) { + for ([[maybe_unused]] const auto d : + c10::irange(ET_MIN3(a.dim(), b.dim(), c.dim()))) { ET_LOG( Error, " stride(%zu): (%zu, %zu, %zu)", @@ -847,13 +886,13 @@ inline bool tensor_is_contiguous(executorch::aten::Tensor t) { if (strides.size() == 0) { return true; } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( strides[strides.size() - 1] == 1, "Tensor is not contiguous; the stride of the last dimension must be 1, " "but got %zu", static_cast(strides[strides.size() - 1])); for (int i = strides.size() - 1; i > 0; --i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( strides[i - 1] == strides[i] * sizes[i], "Tensor is not contiguous; the stride of dim %zu should be equal to " "strides[%zu] * sizes[%zu] = %zu, but 
found %zu", @@ -869,7 +908,7 @@ inline bool tensor_is_contiguous(executorch::aten::Tensor t) { inline bool tensors_have_same_rank( executorch::aten::Tensor a, executorch::aten::Tensor b) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( a.dim() == b.dim(), ET_TENSOR_CHECK_PREFIX__ ": rank={%zd, %zd}", ssize_t(a.dim()), @@ -892,7 +931,7 @@ inline size_t getLeadingDims( dim, ssize_t(tensor.dim())); size_t dims = 1; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { dims *= static_cast(tensor.size(i)); } return dims; @@ -909,7 +948,7 @@ inline size_t getTrailingDims( dim, ssize_t(tensor.dim())); size_t dims = 1; - for (size_t i = dim + 1; i < tensor.dim(); ++i) { + for (size_t i = dim + 1; i < static_cast(tensor.dim()); ++i) { dims *= static_cast(tensor.size(i)); } return dims; @@ -982,7 +1021,7 @@ inline void indexToCoordinate( const executorch::aten::Tensor& tensor, size_t index, size_t* coordinate) { - ET_CHECK(index < tensor.numel()); + ET_CHECK(index < static_cast(tensor.numel())); for (auto i = 0; i < tensor.dim(); ++i) { auto dim = tensor.dim() - 1 - i; size_t dim_size = tensor.size(dim); diff --git a/runtime/core/exec_aten/util/tensor_util_aten.cpp b/runtime/core/exec_aten/util/tensor_util_aten.cpp index d768f66d05f..4df273d4dbb 100644 --- a/runtime/core/exec_aten/util/tensor_util_aten.cpp +++ b/runtime/core/exec_aten/util/tensor_util_aten.cpp @@ -35,7 +35,7 @@ Error get_dim_order( bool tensor_has_valid_dim_order(at::Tensor t) { executorch::aten::DimOrderType dim_order[kTensorDimensionLimit]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( get_dim_order(t, dim_order, t.dim()) == Error::Ok, "Failed to retrieve dim order from tensor!"); @@ -55,7 +55,7 @@ bool tensor_has_valid_dim_order(at::Tensor t) { inline bool tensor_is_default_or_channels_last_dim_order(at::Tensor t) { executorch::aten::DimOrderType dim_order[kTensorDimensionLimit]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( get_dim_order(t, dim_order, t.dim()) == Error::Ok, "Failed to retrieve dim order from tensor!"); @@ -86,7 +86,7 @@ bool tensors_have_same_dim_order( executorch::aten::DimOrderType first_dim_order[kTensorDimensionLimit]; executorch::aten::DimOrderType other_dim_order[kTensorDimensionLimit]; - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( get_dim_order(tensor_list[0], first_dim_order, tensor_list[0].dim()) == Error::Ok, "Failed to retrieve dim order from 1st input tensor!"); @@ -97,7 +97,7 @@ bool tensors_have_same_dim_order( is_channels_last_dim_order(first_dim_order, tensor_list[0].dim()); for (size_t i = 1; i < tensor_list.size(); ++i) { - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( get_dim_order(tensor_list[i], other_dim_order, tensor_list[i].dim()) == Error::Ok, "Failed to retrieve dim order from %zd-th input tensor!", @@ -109,7 +109,7 @@ bool tensors_have_same_dim_order( is_channels_last_dim_order(other_dim_order, tensor_list[i].dim()); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( all_contiguous || all_channels_last, "%zd input tensors have different dim orders", tensor_list.size()); diff --git a/runtime/core/exec_aten/util/tensor_util_portable.cpp b/runtime/core/exec_aten/util/tensor_util_portable.cpp index 3350445db73..e4aa875aed4 100644 --- a/runtime/core/exec_aten/util/tensor_util_portable.cpp +++ b/runtime/core/exec_aten/util/tensor_util_portable.cpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -41,11 +42,11 @@ Error get_dim_order( bool 
tensor_has_valid_dim_order(torch::executor::Tensor t) { if (!validate_dim_order(t.dim_order().data(), t.dim_order().size())) { ET_LOG(Error, "Tensor dim order is not valid:"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } return false; @@ -62,11 +63,11 @@ bool tensor_is_default_or_channels_last_dim_order(torch::executor::Tensor t) { ET_LOG( Error, "Expected tensor to have default or channels last dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } @@ -79,11 +80,11 @@ bool tensor_is_default_dim_order(torch::executor::Tensor t) { if (!ret_val) { ET_LOG(Error, "Expected tensor to have default dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } @@ -96,11 +97,11 @@ bool tensor_is_channels_last_dim_order(torch::executor::Tensor t) { if (!ret_val) { ET_LOG(Error, "Expected tensor to have channels last dim order, but got"); - for (size_t d = 0; d < t.dim(); ++d) { + for (size_t d = 0; d < static_cast(t.dim()); ++d) { ET_LOG( Error, " dim_order(%zu): %zu", - static_cast(d), + d, static_cast(t.dim_order()[d])); } } @@ -125,7 +126,7 @@ bool tensors_have_same_dim_order( tensor_list[i].dim_order().size()); } - ET_LOG_MSG_AND_RETURN_IF_FALSE( + ET_CHECK_OR_RETURN_FALSE( all_contiguous || all_channels_last, "%zd input tensors have different dim orders", tensor_list.size()); diff --git a/runtime/core/exec_aten/util/test/CMakeLists.txt b/runtime/core/exec_aten/util/test/CMakeLists.txt index 6123827e1ff..e806419e21e 100644 --- a/runtime/core/exec_aten/util/test/CMakeLists.txt +++ b/runtime/core/exec_aten/util/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs dim_order_util_test.cpp operator_impl_example_test.cpp diff --git a/runtime/core/exec_aten/util/test/targets.bzl b/runtime/core/exec_aten/util/test/targets.bzl index 357e91eea3a..1fcf984e034 100644 --- a/runtime/core/exec_aten/util/test/targets.bzl +++ b/runtime/core/exec_aten/util/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -35,7 +35,7 @@ def define_common_targets(): ], ) - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_test( name = "tensor_util_test" + aten_suffix, diff --git a/runtime/core/hierarchical_allocator.h b/runtime/core/hierarchical_allocator.h index f2f5fd18fb5..b5031fa38e5 100644 --- a/runtime/core/hierarchical_allocator.h +++ b/runtime/core/hierarchical_allocator.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -96,7 +97,7 @@ class HierarchicalAllocator final { "n_allocators %" PRIu32 " > %zu", n_allocators, kSpanArraySize); - for (uint32_t i = 0; i < n_allocators; ++i) { + for (const auto i : c10::irange(n_allocators)) { span_array_[i] = Span(allocators[i].base_address(), allocators[i].size()); } diff --git a/runtime/core/memory_allocator.h b/runtime/core/memory_allocator.h index 5149abdaa1a..6f4496513a7 100644 --- a/runtime/core/memory_allocator.h +++ b/runtime/core/memory_allocator.h @@ -198,176 +198,6 @@ class MemoryAllocator { int32_t prof_id_ = -1; }; -#if ET_HAVE_GNU_STATEMENT_EXPRESSIONS -/** - * Tries allocating from the specified MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. - * - On failure, executes the provided code block, which must return or panic. - * - * Example: - * @code - * char* buf = ET_TRY_ALLOCATE_OR( - * memory_allocator, bufsize, { - * *out_err = Error::MemoryAllocationFailed; - * return nullopt; - * }); - * @endcode - */ -#define ET_TRY_ALLOCATE_OR(memory_allocator__, nbytes__, ...) \ - ({ \ - void* et_try_allocate_result = memory_allocator__->allocate(nbytes__); \ - if (et_try_allocate_result == nullptr && nbytes__ > 0) { \ - __VA_ARGS__ \ - /* The args must return. */ \ - ET_UNREACHABLE(); \ - } \ - et_try_allocate_result; \ - }) - -/** - * Tries allocating an instance of type__ from the specified MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. Note that the memory - * will not be initialized. - * - On failure, executes the provided code block, which must return or panic. - * - * Example: - * @code - * char* buf = ET_TRY_ALLOCATE_INSTANCE_OR( - * memory_allocator, - * MyType, - * { *out_err = Error::MemoryAllocationFailed; return nullopt; }); - * @endcode - */ -#define ET_TRY_ALLOCATE_INSTANCE_OR(memory_allocator__, type__, ...) \ - ({ \ - type__* et_try_allocate_result = \ - memory_allocator__->allocateInstance(); \ - if (et_try_allocate_result == nullptr) { \ - __VA_ARGS__ \ - /* The args must return. */ \ - ET_UNREACHABLE(); \ - } \ - et_try_allocate_result; \ - }) - -/** - * Tries allocating multiple elements of a given type from the specified - * MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. - * - On failure, executes the provided code block, which must return or panic. - * - * Example: - * @code - * Tensor* tensor_list = ET_TRY_ALLOCATE_LIST_OR( - * memory_allocator, Tensor, num_tensors, { - * *out_err = Error::MemoryAllocationFailed; - * return nullopt; - * }); - * @endcode - */ -#define ET_TRY_ALLOCATE_LIST_OR(memory_allocator__, type__, nelem__, ...) \ - ({ \ - type__* et_try_allocate_result = \ - memory_allocator__->allocateList(nelem__); \ - if (et_try_allocate_result == nullptr && nelem__ > 0) { \ - __VA_ARGS__ \ - /* The args must return. 
*/ \ - ET_UNREACHABLE(); \ - } \ - et_try_allocate_result; \ - }) -#else // !ET_HAVE_GNU_STATEMENT_EXPRESSIONS -/** - * The recommended alternative for statement expression-incompatible compilers - * is to directly allocate the memory. - * e.g. memory_allocator__->allocate(nbytes__); - */ -#define ET_TRY_ALLOCATE_OR(memory_allocator__, nbytes__, ...) \ - static_assert( \ - false, \ - "ET_TRY_ALLOCATE_OR uses statement expressions and \ - thus is not available for use with this compiler."); - -/** - * The recommended alternative for statement expression-incompatible compilers - * is to directly allocate the memory. - * e.g. memory_allocator__->allocateInstance(); - */ -#define ET_TRY_ALLOCATE_INSTANCE_OR(memory_allocator__, type__, ...) \ - static_assert( \ - false, \ - "ET_TRY_ALLOCATE_INSTANCE_OR uses statement \ - expressions and thus is not available for use with this compiler."); - -/** - * The recommended alternative for statement expression-incompatible compilers - * is to directly use allocate the memory. - * e.g. memory_allocator__->allocateList(nelem__); - */ -#define ET_TRY_ALLOCATE_LIST_OR(memory_allocator__, type__, nelem__, ...) \ - static_assert( \ - false, \ - "ET_TRY_ALLOCATE_LIST_OR uses statement \ - expressions and thus is not available for use with this compiler."); -#endif // !ET_HAVE_GNU_STATEMENT_EXPRESSIONS - -/** - * Tries allocating from the specified MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. - * - On failure, returns `Error::MemoryAllocationFailed` from the calling - * function, which must be declared to return `executorch::runtime::Error`. - * - * Example: - * @code - * char* buf = ET_ALLOCATE_OR_RETURN_ERROR(memory_allocator, bufsize); - * @endcode - */ -#define ET_ALLOCATE_OR_RETURN_ERROR(memory_allocator__, nbytes__) \ - ET_TRY_ALLOCATE_OR(memory_allocator__, nbytes__, { \ - return ::executorch::runtime::Error::MemoryAllocationFailed; \ - }) - -/** - * Tries allocating an instance of type__ from the specified MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. Note that the memory - * will not be initialized. - * - On failure, returns `Error::MemoryAllocationFailed` from the calling - * function, which must be declared to return `executorch::runtime::Error`. - * - * Example: - * @code - * char* buf = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(memory_allocator, MyType); - * @endcode - */ -#define ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(memory_allocator__, type__) \ - ET_TRY_ALLOCATE_INSTANCE_OR(memory_allocator__, type__, { \ - return ::executorch::runtime::Error::MemoryAllocationFailed; \ - }) - -/** - * Tries allocating multiple elements of a given type from the specified - * MemoryAllocator*. - * - * - On success, returns a pointer to the allocated buffer. - * - On failure, returns `Error::MemoryAllocationFailed` from the calling - * function, which must be declared to return `executorch::runtime::Error`. 
- * - * Example: - * @code - * Tensor* tensor_list = ET_ALLOCATE_LIST_OR_RETURN_ERROR( - * memory_allocator, Tensor, num_tensors); - * @endcode - */ -#define ET_ALLOCATE_LIST_OR_RETURN_ERROR(memory_allocator__, type__, nelem__) \ - ET_TRY_ALLOCATE_LIST_OR(memory_allocator__, type__, nelem__, { \ - return ::executorch::runtime::Error::MemoryAllocationFailed; \ - }) - } // namespace runtime } // namespace executorch diff --git a/runtime/core/named_data_map.h b/runtime/core/named_data_map.h index 68639ed872a..e79c7035989 100644 --- a/runtime/core/named_data_map.h +++ b/runtime/core/named_data_map.h @@ -7,9 +7,12 @@ */ #pragma once + +#ifdef __GNUC__ // Disable -Wdeprecated-declarations, as some builds use 'Werror'. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif #include #include @@ -53,10 +56,9 @@ class ET_EXPERIMENTAL NamedDataMap { * size of the data for a given key. * @param buffer The buffer to load the data into. Must point to at least * `size` bytes of memory. - * @return Result containing the number of bytes written on success. This will - * fail if the buffer is too small. + * @returns an Error indicating if the load was successful. */ - ET_NODISCARD virtual Result + ET_NODISCARD virtual Error load_data_into(const char* key, void* buffer, size_t size) const = 0; /** @@ -79,4 +81,6 @@ class ET_EXPERIMENTAL NamedDataMap { } // namespace runtime } // namespace executorch +#ifdef __GNUC__ #pragma GCC diagnostic pop +#endif diff --git a/runtime/core/portable_type/c10/README.md b/runtime/core/portable_type/c10/README.md index df14d22a4cf..104a6717ba7 100644 --- a/runtime/core/portable_type/c10/README.md +++ b/runtime/core/portable_type/c10/README.md @@ -1,7 +1,13 @@ -We added an extra c10 directory so that runtime/core/portable_type/c10 +This directory contains header files from `c10` in PyTorch core that +need to be used in ExecuTorch core. They are copied here rather than +being found through the torch pip package to keep the core build +hermetic for embedded use cases. The headers should be exact copies +from PyTorch core; if they are out of sync, please send a PR! + +We added an extra c10 directory so that `runtime/core/portable_type/c10` can be the directory to put on your include path, rather than -runtime/core/portable_type, because using runtime/core/portable_type +`runtime/core/portable_type`, because using `runtime/core/portable_type` would cause all headers in that directory to be includeable with `#include `. In particular, that includes -runtime/core/portable_type/complex.h, which would shadow the C99 -complex.h standard header. +`runtime/core/portable_type/complex.h`, which would shadow the C99 +`complex.h` standard header. 
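For reference, a minimal usage sketch of the `c10::irange` helper whose copy is added later in this diff (runtime/core/portable_type/c10/c10/util/irange.h), assuming runtime/core/portable_type/c10 is on the include path as the README above describes; this snippet is illustrative only and is not part of the change:

#include <cstdio>

#include <c10/util/irange.h>

int main() {
  // irange(n) yields 0, 1, ..., n-1 and is empty when n <= 0, which is
  // why the rewritten loops in this diff no longer need hand-written
  // signed/unsigned bounds handling.
  for (const auto i : c10::irange(5)) {
    std::printf("i = %d\n", i);
  }
  // irange(begin, end) covers the half-open interval [begin, end).
  for (const auto j : c10::irange(2, 5)) {
    std::printf("j = %d\n", j);
  }
  return 0;
}

This is the loop shape that replaces `for (size_t i = 0; i < n; ++i)` throughout the runtime sources in this diff.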
diff --git a/runtime/core/portable_type/c10/c10/macros/Export.h b/runtime/core/portable_type/c10/c10/macros/Export.h index cb68060ed81..21808de77a3 100644 --- a/runtime/core/portable_type/c10/c10/macros/Export.h +++ b/runtime/core/portable_type/c10/c10/macros/Export.h @@ -139,8 +139,10 @@ #endif #if defined(TORCH_HIP_BUILD_MAIN_LIB) +#define TORCH_HIP_CPP_API C10_EXPORT #define TORCH_HIP_API C10_EXPORT #else +#define TORCH_HIP_CPP_API C10_IMPORT #define TORCH_HIP_API C10_IMPORT #endif diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 1e60b70a4b8..dbe35f8eefd 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime", "is_arvr_mode") def get_sleef_preprocessor_flags(): if runtime.is_oss: @@ -26,25 +26,16 @@ def define_common_targets(): "util/TypeSafeSignMath.h", "util/bit_cast.h", "util/floating_point_utils.h", + "util/irange.h", ], exported_preprocessor_flags = [ - # NOTE: If we define C10_EMBEDDED to prevent Half and - # BFloat16 from supporting streams, non-ExecuTorch-core - # uses of other ATen headers that try to print ATen - # primitive types fail to build because, apparently, there - # are implicit conversions from Half/BFloat16 to a variety - # of primitive types, not just float. Since merely - # including shouldn't result in any runtime - # artifacts if stream code is never actually called, it - # seems best to just not define C10_EMBEDDED, but if you - # need it, it's there. - # "-DC10_EMBEDDED", + "-DC10_USING_CUSTOM_GENERATED_MACROS", + ] + ([] if runtime.is_oss else [ "-DC10_USE_GLOG", "-DC10_USE_MINIMAL_GLOG", - "-DC10_USING_CUSTOM_GENERATED_MACROS", - ], + ]), visibility = [ - "//executorch/runtime/core/portable_type/...", + "//executorch/...", ], deps = select({ "DEFAULT": [], @@ -87,9 +78,8 @@ def define_common_targets(): ] + get_sleef_preprocessor_flags(), xplat_exported_deps = [ "//xplat/caffe2:aten_header", - "//xplat/caffe2:generated_aten_config_header", "//xplat/caffe2/c10:c10_headers", - ], + ] + ["//xplat/caffe2:ovrsource_aten_Config.h" if is_arvr_mode() else "//xplat/caffe2:generated_aten_config_header",], exported_preprocessor_flags = select({ # Intentionally punting on non-fbcode x86 sleef support # for now because of fbsource//third-party/sleef:sleef diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h index ad1271fc729..09d3051ab71 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16.h @@ -8,9 +8,7 @@ #include #include #include -#ifndef C10_EMBEDDED #include -#endif // C10_EMBEDDED #if defined(__CUDACC__) && !defined(USE_ROCM) #include @@ -116,14 +114,12 @@ struct alignas(2) BFloat16 { #endif }; -#ifndef C10_EMBEDDED C10_API inline std::ostream& operator<<( std::ostream& out, const BFloat16& value) { out << (float)value; return out; } -#endif // C10_EMBEDDED } // namespace c10 diff --git a/runtime/core/portable_type/c10/c10/util/Half.h b/runtime/core/portable_type/c10/c10/util/Half.h index 5625d4c3403..b77cf7b1f4a 100644 --- a/runtime/core/portable_type/c10/c10/util/Half.h +++ b/runtime/core/portable_type/c10/c10/util/Half.h @@ -29,9 +29,7 @@ #include #include #include -#ifndef C10_EMBEDDED #include -#endif // C10_EMBEDDED #ifdef __CUDACC__ #include @@ 
-411,12 +409,10 @@ struct alignas(2) Half { #endif }; -#ifndef C10_EMBEDDED C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) { out << (float)value; return out; } -#endif // C10_EMBEDDED } // namespace c10 diff --git a/runtime/core/portable_type/c10/c10/util/irange.h b/runtime/core/portable_type/c10/c10/util/irange.h new file mode 100644 index 00000000000..81104d9568f --- /dev/null +++ b/runtime/core/portable_type/c10/c10/util/irange.h @@ -0,0 +1,123 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#pragma once + +#include <c10/util/TypeSafeSignMath.h> + +#include <algorithm> +#include <cstddef> +#include <iterator> +#include <type_traits> + +namespace c10 { + +namespace detail { + +template < + typename I, + bool one_sided = false, + std::enable_if_t<std::is_integral_v<I>, int> = 0> +struct integer_iterator { + using iterator_category = std::input_iterator_tag; + using value_type = I; + using difference_type = std::ptrdiff_t; + using pointer = I*; + using reference = I&; + + explicit constexpr integer_iterator(I value_) : value(value_) {} + + constexpr I operator*() const { + return value; + } + + constexpr I const* operator->() const { + return &value; + } + + constexpr integer_iterator& operator++() { + ++value; + return *this; + } + + constexpr integer_iterator operator++(int) { + const auto copy = *this; + ++*this; + return copy; + } + + constexpr bool operator==(const integer_iterator& other) const { + if constexpr (one_sided) { + // Range-for loops' end test is `begin != end`, not `begin < + // end`. To handle `c10::irange(n)` where n < 0 (which should be + // empty), we just make `begin != end` fail whenever `end` is + // negative. + return is_negative(other.value) || value == other.value; + } else { + return value == other.value; + } + // Suppress "warning: missing return statement at end of non-void function" + // which Nvidia's Robert Crovella confirms is an NVCC compiler error + // here https://stackoverflow.com/a/64561686/752843 on 2020-10-27 + // `__builtin_unreachable();` would be best here, but it's not + // available with all compilers. So we instead return an arbitrary + // value trusting that this line will, in fact, never be reached. + return false; // Horrible hack + } + + constexpr bool operator!=(const integer_iterator& other) const { + return !(*this == other); + } + + protected: + I value; +}; + +} // namespace detail + +template < + typename I, + bool one_sided = false, + std::enable_if_t<std::is_integral_v<I>, bool> = true> +struct integer_range { + public: + constexpr integer_range(I begin, I end) : begin_(begin), end_(end) {} + using iterator = detail::integer_iterator<I, one_sided>; + constexpr iterator begin() const { + return begin_; + } + constexpr iterator end() const { + return end_; + } + + private: + iterator begin_; + iterator end_; +}; + +/// Creates an integer range for the half-open interval [begin, end) +/// If end<=begin, then the range is empty. +/// The range has the type of the `end` integer; `begin` integer is +/// cast to this type.
+template < + typename Integer1, + typename Integer2, + std::enable_if_t<std::is_integral_v<Integer1>, bool> = true, + std::enable_if_t<std::is_integral_v<Integer2>, bool> = true> +integer_range<Integer2> irange(Integer1 begin, Integer2 end) { + // If end<=begin then the range is empty; we can achieve this effect by + // choosing the larger of {begin, end} as the loop terminator + return { + static_cast<Integer2>(begin), + std::max(static_cast<Integer2>(begin), end)}; +} + +/// Creates an integer range for the half-open interval [0, end) +/// If end<=begin, then the range is empty +template < + typename Integer, + std::enable_if_t<std::is_integral_v<Integer>, bool> = true> +constexpr integer_range<Integer, true> irange(Integer end) { + return {Integer(), end}; +} + +} // namespace c10 diff --git a/runtime/core/portable_type/optional.h b/runtime/core/portable_type/optional.h index 21fe0d39267..31ad06fd093 100644 --- a/runtime/core/portable_type/optional.h +++ b/runtime/core/portable_type/optional.h @@ -8,175 +8,18 @@ #pragma once -#include -#include -#include <utility> // std::forward and other template magic checks +#include <optional> namespace executorch { namespace runtime { namespace etensor { -/// Used to indicate an optional type with uninitialized state. -struct nullopt_t final { - constexpr explicit nullopt_t(int32_t) {} -}; - -/// A constant of type nullopt_t that is used to indicate an optional type with -/// uninitialized state. -constexpr nullopt_t nullopt{0}; - -/// Leaner optional class, subset of c10, std, and boost optional APIs. -template <class T> -class optional final { - public: - /// The type wrapped by the optional class. - using value_type = T; - - /// Constructs an optional object that does not contain a value. - /* implicit */ optional() noexcept : storage_(trivial_init), init_(false) {} - - /// Constructs an optional object that does not contain a value. - /* implicit */ optional(nullopt_t) noexcept - : storage_(trivial_init), init_(false) {} - - /// Constructs an optional object that matches the state of v. - /* implicit */ optional(const optional& v) - : storage_(trivial_init), init_(v.init_) { - if (init_) { - new (&storage_.value_) T(v.storage_.value_); - } - } - - /// Constructs an optional object that contains the specified value. - /* implicit */ optional(const T& v) : storage_(v), init_(true) {} - - /// Constructs an optional object from v. - /* implicit */ optional(optional&& v) noexcept( - std::is_nothrow_move_constructible<T>::value) - : storage_(trivial_init), init_(v.init_) { - if (init_) { - new (&storage_.value_) T(std::forward<T>(v.storage_.value_)); - } - } - - /// Constructs an optional object that contains the specified value.
- /* implicit */ optional(T&& v) : storage_(std::forward<T>(v)), init_(true) {} - - optional& operator=(const optional& rhs) { - if (init_ && !rhs.init_) { - clear(); - } else if (!init_ && rhs.init_) { - init_ = true; - new (&storage_.value_) T(rhs.storage_.value_); - } else if (init_ && rhs.init_) { - storage_.value_ = rhs.storage_.value_; - } - return *this; - } - - optional& operator=(optional&& rhs) noexcept( - std::is_nothrow_move_assignable<T>::value && - std::is_nothrow_move_constructible<T>::value) { - if (init_ && !rhs.init_) { - clear(); - } else if (!init_ && rhs.init_) { - init_ = true; - new (&storage_.value_) T(std::forward<T>(rhs.storage_.value_)); - } else if (init_ && rhs.init_) { - storage_.value_ = std::forward<T>(rhs.storage_.value_); - } - return *this; - } - - /// Destroys the stored value if there is one - ~optional() { - if (init_) { - storage_.value_.~T(); - } - } - - optional& operator=(nullopt_t) noexcept { - clear(); - return *this; - } - - /// Returns true if the object contains a value, false otherwise - explicit operator bool() const noexcept { - return init_; - } - - /// Returns true if the object contains a value, false otherwise - bool has_value() const noexcept { - return init_; - } - - /// Returns a constant reference to the contained value. Calls ET_CHECK if - /// the object does not contain a value. - T const& value() const& { - ET_CHECK(init_); - return contained_val(); - } - - /// Returns a mutable reference to the contained value. Calls ET_CHECK if the - /// object does not contain a value. - T& value() & { - ET_CHECK(init_); - return contained_val(); - } - - /// Returns an rvalue of the contained value. Calls ET_CHECK if the object - /// does not contain a value. - T&& value() && { - ET_CHECK(init_); - return std::forward<T>(contained_val()); - } - - private: - // Used to invoke the dummy ctor of storage_t in the initializer lists of - // optional_base as default ctor is implicitly deleted because T is nontrivial - struct trivial_init_t { - } trivial_init{}; - - /** - * A wrapper type that lets us avoid constructing a T when there is no value. - * If there is a value present, the optional class must destroy it. - */ - union storage_t { - /// A small, trivially-constructable alternative to T. - unsigned char dummy_; - /// The constructed value itself, if optional::has_value_ is true. - T value_; - - /* implicit */ storage_t(trivial_init_t) { - dummy_ = 0; - } - - template <typename... Args> - storage_t(Args&&... args) : value_(std::forward<Args>(args)...)
{} - - ~storage_t() {} - }; - - const T& contained_val() const& { - return storage_.value_; - } - T&& contained_val() && { - return std::move(storage_.value_); - } - T& contained_val() & { - return storage_.value_; - } - - void clear() noexcept { - if (init_) { - storage_.value_.~T(); - } - init_ = false; - } - - storage_t storage_; - bool init_; -}; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt_t; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::optional; } // namespace etensor } // namespace runtime diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 43efeca208c..6178f2c0f9a 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -28,6 +28,9 @@ def define_common_targets(): "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], exported_deps = [ ":scalar_type", "//executorch/runtime/core:core", diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index b978e23cbd6..ede5a3d4101 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -11,6 +11,8 @@ #include #include +#include <c10/util/irange.h> + #include #include #include @@ -30,11 +32,11 @@ ssize_t compute_numel(const TensorImpl::SizesType* sizes, ssize_t dim) { dim == 0 || sizes != nullptr, "Sizes must be provided for non-scalar tensors"); ssize_t numel = 1; // Zero-dimensional tensors (scalars) have numel == 1. - for (ssize_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { ET_CHECK_MSG( sizes[i] >= 0, - "Size must be non-negative, got %d at dimension %zd", - sizes[i], + "Size must be non-negative, got %zd at dimension %zd", + static_cast<ssize_t>(sizes[i]), i); numel *= sizes[i]; } @@ -74,7 +76,7 @@ ssize_t TensorImpl::element_size() const { Error TensorImpl::internal_resize_contiguous(ArrayRef<TensorImpl::SizesType> new_sizes) { ET_CHECK_OR_RETURN_ERROR( - new_sizes.size() == dim_, + static_cast<ssize_t>(new_sizes.size()) == dim_, NotSupported, "Attempted to change the tensor rank which is immutable: old=%zu, new=%zu", dim_, @@ -118,7 +120,7 @@ Error TensorImpl::internal_resize_contiguous(ArrayRef<TensorImpl::SizesType> new_sizes) { const auto new_numel = compute_numel(new_sizes.data(), dim_); ET_CHECK_OR_RETURN_ERROR( - new_numel <= numel_bound_, + static_cast<size_t>(new_numel) <= numel_bound_, NotSupported, "Attempted to resize a bounded tensor with a maximum capacity of %zu elements to %zu elements.", numel_bound_, diff --git a/runtime/core/portable_type/test/CMakeLists.txt b/runtime/core/portable_type/test/CMakeLists.txt index f21e4fc791c..b1f57a93ab5 100644 --- a/runtime/core/portable_type/test/CMakeLists.txt +++ b/runtime/core/portable_type/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs bfloat16_test.cpp diff --git a/runtime/core/portable_type/test/bfloat16_test.cpp b/runtime/core/portable_type/test/bfloat16_test.cpp index 6b42a6e4a5e..505f80e770f 100644 --- a/runtime/core/portable_type/test/bfloat16_test.cpp +++ b/runtime/core/portable_type/test/bfloat16_test.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree.
*/ +#include #include #include @@ -41,7 +42,7 @@ uint16_t bits_from_f32(float src) { TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -51,7 +52,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = bits_from_f32(in[i]); out[i] = f32_from_bits(bfloats[i].x); @@ -64,7 +65,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -74,7 +75,7 @@ TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = round_to_nearest_even(in[i]); out[i] = f32_from_bits(bfloats[i].x); diff --git a/runtime/core/portable_type/test/optional_test.cpp b/runtime/core/portable_type/test/optional_test.cpp index fe27186bbf2..60d835d7439 100644 --- a/runtime/core/portable_type/test/optional_test.cpp +++ b/runtime/core/portable_type/test/optional_test.cpp @@ -36,11 +36,11 @@ TEST(TestOptional, NulloptHasNoValue) { EXPECT_FALSE(o.has_value()); } -TEST(TestOptional, ValueOfEmptyOptionalShouldDie) { +TEST(TestOptional, ValueOfEmptyOptionalShouldThrow) { optional o; EXPECT_FALSE(o.has_value()); - ET_EXPECT_DEATH({ (void)o.value(); }, ""); + EXPECT_THROW({ (void)o.value(); }, std::bad_optional_access); } TEST(TestOptional, IntValue) { diff --git a/runtime/core/portable_type/test/targets.bzl b/runtime/core/portable_type/test/targets.bzl index c0b4ef00c78..d8e82a15fba 100644 --- a/runtime/core/portable_type/test/targets.bzl +++ b/runtime/core/portable_type/test/targets.bzl @@ -11,6 +11,7 @@ def define_common_targets(): srcs = ["bfloat16_test.cpp"], deps = [ "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) @@ -52,5 +53,6 @@ def define_common_targets(): deps = [ "//executorch/runtime/core/exec_aten/util:tensor_util", "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index bd5f82c5d1f..0b8ae05f4da 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include @@ -76,7 +77,7 @@ TEST_F(TensorImplTest, TestSetSizesContigContract) { SizesType new_sizes[RANK] = {0, 0, 0, 0, 0}; // assign random sizes between 1 and 100 - for (int i = 0; i < RANK; i++) { + for (const auto i : c10::irange(RANK)) { new_sizes[i] = distribution(generator); } Error err = resize_tensor_impl(&t, {new_sizes, RANK}); diff --git a/runtime/core/result.h b/runtime/core/result.h index 7b404bca946..377573e6dfa 100644 --- a/runtime/core/result.h +++ b/runtime/core/result.h @@ -59,8 +59,13 @@ class Result final { * a non-Ok value. */ /* implicit */ Result(Error error) - : error_(error == Error::Ok ? Error::Internal : error), - hasValue_(false) {} + : error_(error == Error::Ok ? Error::Internal : error), hasValue_(false) { + if ET_UNLIKELY (error == Error::Ok) { + ET_LOG( + Debug, + "Attempted to create Result from Error::Ok, this has been converted to Error::Internal."); + } + } /// Value copy constructor. /* implicit */ Result(const T& val) : value_(val), hasValue_(true) {} diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index c3535688f63..3195e727d96 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def event_tracer_enabled(): return native.read_config("executorch", "event_tracer_enabled", "false") == "true" @@ -50,6 +50,7 @@ def define_common_targets(): ], exported_preprocessor_flags = get_core_flags(), exported_deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", "//executorch/runtime/platform:platform", ], ) @@ -73,6 +74,7 @@ def define_common_targets(): ], exported_deps = [ ":core", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], visibility = [ "//executorch/...", @@ -80,7 +82,7 @@ def define_common_targets(): ], ) - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( name = "evalue" + aten_suffix, @@ -145,13 +147,16 @@ def define_common_targets(): ":tensor_layout", ], ) - + runtime.cxx_library( name = "tensor_layout", srcs = ["tensor_layout.cpp"], exported_headers = ["tensor_layout.h"], + deps = [ + "//executorch/runtime/core/portable_type/c10/c10:c10", + ], exported_deps = [ - ":core", + ":core", "//executorch/runtime/core/exec_aten:lib", ], visibility = ["//executorch/..."], diff --git a/runtime/core/tensor_layout.cpp b/runtime/core/tensor_layout.cpp index 748a43fc5d6..2b862e6dc14 100644 --- a/runtime/core/tensor_layout.cpp +++ b/runtime/core/tensor_layout.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -19,7 +20,7 @@ Result calculate_nbytes( const Span& sizes, const executorch::aten::ScalarType& scalar_type) { ssize_t n = 1; - for (ssize_t i = 0; i < sizes.size(); i++) { + for (const auto i : c10::irange(sizes.size())) { if (sizes[i] < 0) { return Error::InvalidArgument; } @@ -43,7 +44,7 @@ Result TensorLayout::create( return Error::InvalidArgument; } - for (size_t i = 0; i < dim_order.size(); i++) { + for (const auto i : c10::irange(dim_order.size())) { if (dim_order[i] >= sizes.size()) { return Error::InvalidArgument; } diff --git a/runtime/core/test/CMakeLists.txt b/runtime/core/test/CMakeLists.txt index b2da466193c..70f7cbf4bfd 100644 --- a/runtime/core/test/CMakeLists.txt +++ b/runtime/core/test/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) set(_test_srcs span_test.cpp diff --git a/runtime/core/test/error_handling_test.cpp b/runtime/core/test/error_handling_test.cpp index b6b58623984..ef270cad1ed 100644 --- a/runtime/core/test/error_handling_test.cpp +++ b/runtime/core/test/error_handling_test.cpp @@ -110,6 +110,7 @@ TEST(ErrorHandlingTest, ResultBasic) { } TEST(ErrorHandlingTest, OkErrorNotPossible) { + executorch::runtime::runtime_init(); Result r(Error::Ok); ASSERT_FALSE(r.ok()); ASSERT_NE(r.error(), Error::Ok); diff --git a/runtime/core/test/event_tracer_test.cpp b/runtime/core/test/event_tracer_test.cpp index 622de1ff9fa..310483fab49 100644 --- a/runtime/core/test/event_tracer_test.cpp +++ b/runtime/core/test/event_tracer_test.cpp @@ -8,9 +8,11 @@ #include +#include #include #include #include +#include // Enable flag for test #define ET_EVENT_TRACER_ENABLED #include @@ -28,6 +30,7 @@ using executorch::runtime::EventTracerEntry; using executorch::runtime::kUnsetChainId; using executorch::runtime::kUnsetDebugHandle; using executorch::runtime::LoggedEValueType; +using executorch::runtime::Result; class DummyEventTracer : public EventTracer { public: @@ -100,49 +103,54 @@ class DummyEventTracer : public EventTracer { (void)metadata_len; } - void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const Tensor& output) override { (void)name; (void)delegate_debug_index; (void)output; + return true; } - void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const ArrayRef output) override { (void)name; (void)delegate_debug_index; (void)output; + return true; } - void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const int& output) override { (void)name; (void)delegate_debug_index; (void)output; + return true; } - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const bool& output) override { (void)name; (void)delegate_debug_index; (void)output; + return true; } - virtual void log_intermediate_output_delegate( + virtual Result log_intermediate_output_delegate( const char* name, DebugHandle delegate_debug_index, const double& output) override { (void)name; (void)delegate_debug_index; (void)output; + return true; } void log_evalue(const EValue& evalue, LoggedEValueType evalue_type) override { @@ -207,7 +215,7 @@ 
TEST(TestEventTracer, SimpleEventTracerTest) { // and also with a null pointer (to test that the null case works). DummyEventTracer dummy; std::vector dummy_event_tracer_arr = {&dummy, nullptr}; - for (size_t i = 0; i < dummy_event_tracer_arr.size(); i++) { + for (const auto i : c10::irange(dummy_event_tracer_arr.size())) { RunSimpleTracerTest(&dummy); RunSimpleTracerTest(nullptr); } @@ -234,7 +242,7 @@ TEST(TestEventTracer, SimpleEventTracerTestDelegate) { // and also with a null pointer (to test that the null case works). DummyEventTracer dummy; std::vector dummy_event_tracer_arr = {&dummy, nullptr}; - for (size_t i = 0; i < dummy_event_tracer_arr.size(); i++) { + for (const auto i : c10::irange(dummy_event_tracer_arr.size())) { RunSimpleTracerTestDelegate(&dummy); RunSimpleTracerTestDelegate(nullptr); } diff --git a/runtime/core/test/memory_allocator_test.cpp b/runtime/core/test/memory_allocator_test.cpp index dfd2f23a488..fee95a6407e 100644 --- a/runtime/core/test/memory_allocator_test.cpp +++ b/runtime/core/test/memory_allocator_test.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -62,12 +63,12 @@ TEST_F(MemoryAllocatorTest, MemoryAllocatorAlignment) { 128, 2}; - for (int i = 0; i < arr_size; i++) { + for (const auto i : c10::irange(arr_size)) { auto align_size = alignment[i]; constexpr size_t mem_size = 1000; uint8_t mem_pool[mem_size]; MemoryAllocator allocator = MemoryAllocator(mem_size, mem_pool); - for (int j = 0; j < arr_size; j++) { + for (const auto j : c10::irange(arr_size)) { auto size = allocation[j]; void* start = allocator.allocate(size, align_size); EXPECT_ALIGNED(start, align_size); @@ -81,7 +82,7 @@ TEST_F(MemoryAllocatorTest, MemoryAllocatorNonPowerOfTwoAlignment) { MemoryAllocator allocator(mem_size, mem_pool); size_t alignment[5] = {0, 5, 6, 12, 34}; - for (int i = 0; i < 5; i++) { + for (const auto i : c10::irange(5)) { ASSERT_EQ(nullptr, allocator.allocate(8, alignment[i])); } } @@ -195,7 +196,6 @@ TEST_F(MemoryAllocatorTest, AllocateListFailure) { EXPECT_EQ(p, nullptr); } -#if ET_HAVE_GNU_STATEMENT_EXPRESSIONS class HelperMacrosTest : public ::testing::Test { protected: void SetUp() override { @@ -212,25 +212,12 @@ class HelperMacrosTest : public ::testing::Test { */ static const Error kTestFailureValue = static_cast(12345); -void* try_allocate_helper( - MemoryAllocator* allocator, - size_t nbytes, - Error* out_error) { - return ET_TRY_ALLOCATE_OR(allocator, nbytes, { - // An example that doesn't simply return. - *out_error = kTestFailureValue; - return nullptr; - }); -} - TEST_F(HelperMacrosTest, TryAllocateSuccess) { std::array buffer; MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate less memory than the allocator provides, which should succeed. - Error err = Error::Ok; - void* p = try_allocate_helper(&allocator, allocator.size() / 2, &err); - EXPECT_EQ(err, Error::Ok); + void* p = allocator.allocate(allocator.size() / 2); EXPECT_NE(p, nullptr); } @@ -239,60 +226,16 @@ TEST_F(HelperMacrosTest, TryAllocateFailure) { MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate more memory than the allocator provides, which should fail. 
- Error err = Error::Ok; - void* p = try_allocate_helper(&allocator, allocator.size() * 2, &err); - EXPECT_EQ(err, kTestFailureValue); + void* p = allocator.allocate(allocator.size() * 2); EXPECT_EQ(p, nullptr); } -Error allocate_or_return_error_helper( - MemoryAllocator* allocator, - size_t nbytes, - void** out_pointer) { - *out_pointer = ET_ALLOCATE_OR_RETURN_ERROR(allocator, nbytes); - return Error::Ok; -} - -TEST_F(HelperMacrosTest, AllocateOrReturnSuccess) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate less memory than the allocator provides, which should succeed. - void* p; - Error err = - allocate_or_return_error_helper(&allocator, allocator.size() / 2, &p); - EXPECT_EQ(err, Error::Ok); - EXPECT_NE(p, nullptr); -} - -TEST_F(HelperMacrosTest, AllocateOrReturnFailure) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate more memory than the allocator provides, which should fail. - void* p; - Error err = - allocate_or_return_error_helper(&allocator, allocator.size() * 2, &p); - EXPECT_EQ(err, Error::MemoryAllocationFailed); -} - -template -T* try_allocate_instance_helper(MemoryAllocator* allocator, Error* out_error) { - return ET_TRY_ALLOCATE_INSTANCE_OR(allocator, T, { - // An example that doesn't simply return. - *out_error = kTestFailureValue; - return nullptr; - }); -} - TEST_F(HelperMacrosTest, TryAllocateInstanceSuccess) { std::array buffer; MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate less memory than the allocator provides, which should succeed. - Error err = Error::Ok; - TestType8* p = try_allocate_instance_helper(&allocator, &err); - EXPECT_EQ(err, Error::Ok); + TestType8* p = allocator.allocateInstance(); EXPECT_NE(p, nullptr); } @@ -301,53 +244,8 @@ TEST_F(HelperMacrosTest, TryAllocateInstanceFailure) { MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate more memory than the allocator provides, which should fail. - Error err = Error::Ok; - TestType1024* p = - try_allocate_instance_helper(&allocator, &err); - EXPECT_EQ(err, kTestFailureValue); -} - -template -Error allocate_instance_or_return_error_helper( - MemoryAllocator* allocator, - void** out_pointer) { - *out_pointer = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, T); - return Error::Ok; -} - -TEST_F(HelperMacrosTest, AllocateInstanceOrReturnSuccess) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate less memory than the allocator provides, which should succeed. - void* p; - Error err = - allocate_instance_or_return_error_helper(&allocator, &p); - EXPECT_EQ(err, Error::Ok); - EXPECT_NE(p, nullptr); -} - -TEST_F(HelperMacrosTest, AllocateInstanceOrReturnFailure) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate more memory than the allocator provides, which should fail. - void* p; - Error err = - allocate_instance_or_return_error_helper(&allocator, &p); - EXPECT_EQ(err, Error::MemoryAllocationFailed); -} - -void* try_allocate_list_helper( - MemoryAllocator* allocator, - size_t nbytes, - Error* out_error) { - // Use a 1-sized type so that nbytes == nelem. - return ET_TRY_ALLOCATE_LIST_OR(allocator, uint8_t, nbytes, { - // An example that doesn't simply return. 
- *out_error = kTestFailureValue; - return nullptr; - }); + TestType1024* p = allocator.allocateInstance(); + EXPECT_EQ(p, nullptr); } TEST_F(HelperMacrosTest, TryAllocateListSuccess) { @@ -355,9 +253,7 @@ TEST_F(HelperMacrosTest, TryAllocateListSuccess) { MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate less memory than the allocator provides, which should succeed. - Error err = Error::Ok; - void* p = try_allocate_list_helper(&allocator, allocator.size() / 2, &err); - EXPECT_EQ(err, Error::Ok); + void* p = allocator.allocateList(allocator.size() / 2); EXPECT_NE(p, nullptr); } @@ -366,41 +262,6 @@ TEST_F(HelperMacrosTest, TryAllocateListFailure) { MemoryAllocator allocator(buffer.size(), buffer.data()); // Allocate more memory than the allocator provides, which should fail. - Error err = Error::Ok; - void* p = try_allocate_list_helper(&allocator, allocator.size() * 2, &err); - EXPECT_EQ(err, kTestFailureValue); + void* p = allocator.allocateList(allocator.size() * 2); EXPECT_EQ(p, nullptr); } - -Error allocate_list_or_return_error_helper( - MemoryAllocator* allocator, - size_t nbytes, - void** out_pointer) { - // Use a 1-sized type so that nbytes == nelem. - *out_pointer = ET_ALLOCATE_LIST_OR_RETURN_ERROR(allocator, uint8_t, nbytes); - return Error::Ok; -} - -TEST_F(HelperMacrosTest, AllocateListOrReturnSuccess) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate less memory than the allocator provides, which should succeed. - void* p; - Error err = allocate_list_or_return_error_helper( - &allocator, allocator.size() / 2, &p); - EXPECT_EQ(err, Error::Ok); - EXPECT_NE(p, nullptr); -} - -TEST_F(HelperMacrosTest, AllocateListOrReturnFailure) { - std::array buffer; - MemoryAllocator allocator(buffer.size(), buffer.data()); - - // Allocate more memory than the allocator provides, which should fail. - void* p; - Error err = allocate_list_or_return_error_helper( - &allocator, allocator.size() * 2, &p); - EXPECT_EQ(err, Error::MemoryAllocationFailed); -} -#endif // ET_HAVE_GNU_STATEMENT_EXPRESSIONS diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 7332aad8a3d..180e4eb0a0d 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -40,6 +40,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core:event_tracer", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) @@ -68,6 +69,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core:memory_allocator", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], ) @@ -91,16 +93,17 @@ def define_common_targets(): ], ) - runtime.cxx_test( - name = "tensor_shape_dynamism_test_aten", - srcs = ["tensor_shape_dynamism_test_aten.cpp"], - deps = [ - "//executorch/runtime/core/exec_aten:lib_aten", - "//executorch/runtime/core/exec_aten/testing_util:tensor_util_aten", - ], - ) + if True in get_aten_mode_options(): + runtime.cxx_test( + name = "tensor_shape_dynamism_test_aten", + srcs = ["tensor_shape_dynamism_test_aten.cpp"], + deps = [ + "//executorch/runtime/core/exec_aten:lib_aten", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util_aten", + ], + ) - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_test( diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index d435678ca2b..41d44522a22 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -8,6 +8,7 @@ #include +#include #include #include // @donotremove #include @@ -33,6 +34,7 @@ namespace executorch { namespace runtime { +using deserialization::NamedData; using internal::PlatformMemoryAllocator; /** @@ -238,10 +240,10 @@ Result gen_instruction_arguments( for (size_t i = 0; i < num_args; ++i) { int32_t arg_idx = arg_idxs[i]; ET_CHECK_OR_RETURN_ERROR( - arg_idx < num_values, + static_cast(arg_idx) < num_values, InvalidProgram, - "Arg index %d >= %" ET_PRIsize_t, - arg_idx, + "Arg index %zd >= %" ET_PRIsize_t, + static_cast(arg_idx), num_values); arg_list[i] = &values[arg_idx]; } @@ -269,7 +271,7 @@ Result parse_cond_value(const EValue& cond_value) { static_cast(cond_val.scalar_type())); const bool* cond_data = cond_val.const_data_ptr(); - for (size_t i = 0; i < cond_val.numel(); i++) { + for (size_t i = 0; i < static_cast(cond_val.numel()); i++) { if (!cond_data[i]) { return false; } @@ -289,6 +291,113 @@ Result parse_cond_value(const EValue& cond_value) { } // namespace +Result Method::get_num_external_constants() { + auto flatbuffer_values = serialization_plan_->values(); + size_t n_value = flatbuffer_values->size(); + + size_t n_external_constants = 0; + for (size_t i = 0; i < n_value; ++i) { + auto serialization_value = flatbuffer_values->Get(i); + // Ensure values are non-null. + // Note that as a side-effect of this check, we're guaranteed that all + // values are non-null, so later loops can skip that check. + ET_CHECK_OR_RETURN_ERROR( + serialization_value != nullptr && + (serialization_value->val_type() == + executorch_flatbuffer::KernelTypes::Null || + serialization_value->val() != nullptr), + InvalidProgram, + "Null value at index %" ET_PRIsize_t, + i); + // Ignore non-tensor types. + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + const auto s_tensor = static_cast( + serialization_value->val()); + + // An external constant is tagged with EXTERNAL and has no + // allocation_info. 
+ if (s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->location() == + executorch_flatbuffer::TensorDataLocation::EXTERNAL && + s_tensor->allocation_info() == nullptr) { + n_external_constants++; + } + } + return n_external_constants; +} + +Error Method::parse_external_constants(const NamedDataMap* named_data_map) { + auto flatbuffer_values = serialization_plan_->values(); + size_t n_value = flatbuffer_values->size(); + + // n_external_constants_ counts the number of successfully-initialized + // external constants for ~Method() to clean up, and is incremented at the + // bottom of the loop. This makes it safe for errors to return without + // updating any state. + n_external_constants_ = 0; + for (size_t i = 0; i < n_value; ++i) { + auto serialization_value = flatbuffer_values->Get(i); + // Ignore non-tensor types. + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + const auto s_tensor = static_cast( + serialization_value->val()); + // Constant tensors are resolved here; tensors with allocation_info are + // mutable and are resolved in parse_values. + if (s_tensor->extra_tensor_info() == nullptr || + s_tensor->extra_tensor_info()->location() != + executorch_flatbuffer::TensorDataLocation::EXTERNAL || + s_tensor->allocation_info() != nullptr) { + continue; + } + ET_CHECK_OR_RETURN_ERROR( + s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr, + InvalidExternalData, + "Fully qualified name of external tensor is null at index %zu", + i); + + const char* key = + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(); + + // Check if this tensor has already been resolved. + if (get_data_by_key( + key, Span(external_constants_, n_external_constants_)) != + nullptr) { + continue; + } + Result tensor_layout = + named_data_map->get_metadata(key); + if (!tensor_layout.ok()) { + return tensor_layout.error(); + } + // Check external tensor compatibility. + Error err = + deserialization::validateTensorLayout(s_tensor, tensor_layout.get()); + if (err != Error::Ok) { + return err; + } + // Save the key. + external_constants_[n_external_constants_].key = key; + + // Save the buffer. + Result buffer = named_data_map->get_data(key); + ET_CHECK_OR_RETURN_ERROR( + buffer.ok(), + InvalidExternalData, + "Buffer retrieved from get_data is not valid"); + new (&external_constants_[n_external_constants_].buffer) + FreeableBuffer(std::move(buffer.get())); + + n_external_constants_ += 1; + } + return Error::Ok; +} + Error Method::parse_values(const NamedDataMap* named_data_map) { auto flatbuffer_values = serialization_plan_->values(); ET_CHECK_OR_RETURN_ERROR( @@ -299,6 +408,30 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { return Error::MemoryAllocationFailed; } + // Count the number of tensors marked as EXTERNAL for this method. The actual + // number of external constants may be smaller, eg. if multiple tensors point + // to the same underlying data buffer. + // This function also ensures that all flatbuffer_values entries + // are non-null, so `val_as_X()` calls below are guaranteed to return + // non-null pointers. + Result max_external_constants = get_num_external_constants(); + if (!max_external_constants.ok()) { + return max_external_constants.error(); + } + if (max_external_constants.get() > 0) { + // Allocate space for external tensors. 
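+    // The array is sized to the upper bound computed above and comes from
+    // the method allocator, so it lives exactly as long as the Method;
+    // ~Method() destroys only the first n_external_constants_ buffers.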
+ external_constants_ = + memory_manager_->method_allocator()->allocateList( + max_external_constants.get()); + if (external_constants_ == nullptr) { + return Error::MemoryAllocationFailed; + } + Error err = parse_external_constants(named_data_map); + if (err != Error::Ok) { + return err; + } + } + // n_value_ counts the number of successfully-initialized values for ~Method() // to clean up, and is incremented at the bottom of the loop. This makes it // safe for errors to return without updating any state. @@ -306,16 +439,6 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { for (size_t i = 0; i < n_value; ++i) { auto serialization_value = flatbuffer_values->Get(i); - // Ensure that the `val_as_X()` calls will return non-null pointers. - ET_CHECK_OR_RETURN_ERROR( - serialization_value != nullptr && - (serialization_value->val_type() == - executorch_flatbuffer::KernelTypes::Null || - serialization_value->val() != nullptr), - InvalidProgram, - "Null value at index %" ET_PRIsize_t, - i); - const auto val = serialization_value->val(); switch (serialization_value->val_type()) { @@ -359,7 +482,7 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { for (size_t j = 0; j < items->size(); j++) { auto value_index = items->Get(j); ET_CHECK_OR_RETURN_ERROR( - value_index >= 0 && value_index < n_value, + value_index >= 0 && static_cast(value_index) < n_value, InvalidProgram, "Invalid value index %" PRId64 " for IntList %" ET_PRIsize_t " index %" ET_PRIsize_t, @@ -416,7 +539,8 @@ Error Method::parse_values(const NamedDataMap* named_data_map) { program_, memory_manager_, static_cast(val), - named_data_map); + named_data_map, + Span(external_constants_, n_external_constants_)); if (!t.ok()) { ET_LOG( Error, @@ -521,7 +645,7 @@ Error populate_operator_name( has_overload ? op->overload()->c_str() : ""); ET_CHECK_OR_RETURN_ERROR(cx >= 0, Internal, "snprintf failed: %d", cx); ET_CHECK_OR_RETURN_ERROR( - cx < operator_name_size, + static_cast(cx) < operator_name_size, Internal, "Operator name %s%s%s with length %d " "truncated to %" ET_PRIsize_t " due to internal buffer limit.", @@ -549,7 +673,8 @@ Error Method::resolve_operator( char operator_name[kTempBufferSizeForName]; const auto ops = serialization_plan_->operators(); ET_CHECK_OR_RETURN_ERROR( - ops != nullptr && op_index < ops->size(), + ops != nullptr && + static_cast(op_index) < ops->size(), InvalidProgram, "Op index %" PRIu32 " out of range", op_index); @@ -598,7 +723,11 @@ Error Method::resolve_operator( Result op_function = get_op_function_from_registry(operator_name, {meta, count}); if (!op_function.ok()) { - ET_LOG(Error, "Missing operator: [%d] %s", op_index, operator_name); + ET_LOG( + Error, + "Missing operator: [%zd] %s", + static_cast(op_index), + operator_name); return op_function.error(); } kernels[kernel_index] = op_function.get(); @@ -669,6 +798,14 @@ Error Method::init( return Error::MemoryAllocationFailed; } + // Get NamedDataMap, if it exists. + const NamedDataMap* pte_data_map = nullptr; + Result pte_data_map_res = + program_->get_named_data_map(); + if (pte_data_map_res.ok()) { + pte_data_map = pte_data_map_res.get(); + } + // n_delegate_ counts the number of successfully-initialized delegates for // ~Method() to clean up, and is incremented at the bottom of the loop. This // makes it safe for errors to return without updating any state. 
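
Between the hunks above, a note on ownership: parse_external_constants() adopts each FreeableBuffer by placement-new into uninitialized NamedData slots carved from the method allocator, and ~Method() destroys exactly n_external_constants_ of them. The pattern in isolation (a sketch using a stand-in struct, not the real deserialization::NamedData):

#include <executorch/runtime/core/freeable_buffer.h>

#include <cstddef>
#include <new>
#include <utility>

// Stand-in for deserialization::NamedData: a key plus an owned buffer.
struct Slot {
  const char* key;
  executorch::runtime::FreeableBuffer buffer;
};

// `slots` points at uninitialized memory from an allocator; slot `index`
// takes ownership of `buf` via placement new, mirroring
// Method::parse_external_constants().
void adopt(
    Slot* slots,
    size_t index,
    const char* key,
    executorch::runtime::FreeableBuffer&& buf) {
  slots[index].key = key;
  new (&slots[index].buffer)
      executorch::runtime::FreeableBuffer(std::move(buf));
}

// Mirrors ~Method(): only the first `count` slots were ever initialized,
// so only those destructors may run.
void release(Slot* slots, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    slots[i].buffer.~FreeableBuffer();
  }
}
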
@@ -679,7 +816,8 @@ Error Method::init( BackendInitContext backend_init_context( method_allocator, /*event_tracer=*/event_tracer_, - /*method_name=*/serialization_plan_->name()->c_str()); + /*method_name=*/serialization_plan_->name()->c_str(), + /*named_data_map=*/pte_data_map); Error err = BackendDelegate::Init( delegate, program_, backend_init_context, &delegates_[i]); if (err != Error::Ok) { @@ -800,10 +938,10 @@ Error Method::init( instr_args) ->cond_value_index(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < n_value_, + index >= 0 && static_cast(index) < n_value_, InvalidProgram, - "Index %d negative or >= %" ET_PRIsize_t, - index, + "Index %zd negative or >= %" ET_PRIsize_t, + static_cast(index), n_value_); chain_instruction_arg_lists[instr_idx] = InstructionArgs(); } break; @@ -821,9 +959,9 @@ Error Method::init( ET_CHECK_OR_RETURN_ERROR( num_instructions_missing_op == 0, OperatorMissing, - "There are %d instructions don't have corresponding operator registered. " + "There are %zu instructions don't have corresponding operator registered. " "See logs for details", - num_instructions_missing_op); + static_cast(num_instructions_missing_op)); if (delayed_error != Error::Ok) { return delayed_error; } @@ -1192,7 +1330,7 @@ Error Method::execute_instruction() { auto delegate_idx = instruction->instr_args_as_DelegateCall()->delegate_index(); ET_CHECK_OR_RETURN_ERROR( - delegate_idx < n_delegate_, + static_cast(delegate_idx) < n_delegate_, Internal, "DELEGATE_CALL index %" PRIu32 " >= num delegates %" ET_PRIsize_t " at instruction %" ET_PRIsize_t, @@ -1486,16 +1624,20 @@ Method::~Method() { // Destroy the values. It's necessary in ATen mode, where the refcount of // Tensors needs to be decremented properly. if (values_ != nullptr) { - for (int i = 0; i < n_value_; ++i) { + for (size_t i = 0; i < n_value_; ++i) { values_[i].~EValue(); } } // Free any resources associated with delegate backends. if (delegates_ != nullptr) { - for (int i = 0; i < n_delegate_; i++) { + for (size_t i = 0; i < n_delegate_; i++) { delegates_[i].~BackendDelegate(); } } + // Free resources associated with external constants. + for (const auto i : c10::irange(n_external_constants_)) { + external_constants_[i].buffer.~FreeableBuffer(); + } // All other fields are trivially destructible. } } // namespace runtime diff --git a/runtime/executor/method.h b/runtime/executor/method.h index dff4e818f9f..0ca2df440ad 100644 --- a/runtime/executor/method.h +++ b/runtime/executor/method.h @@ -7,9 +7,12 @@ */ #pragma once + +#ifdef __GNUC__ // Disable -Wdeprecated-declarations, as some builds use 'Werror'. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif #include #include @@ -31,6 +34,12 @@ struct EValue; namespace executorch { namespace runtime { +// Forward declare NamedData. This is a public header and must not include +// internal data types. +namespace deserialization { +struct NamedData; +} // namespace deserialization + // Forward declare Program to avoid a circular reference. class Program; @@ -42,6 +51,7 @@ using OpFunction = void (*)(KernelRuntimeContext&, EValue**); /// A list of pointers into the master values table that together compose the /// argument list for a single instruction using InstructionArgs = Span; +using deserialization::NamedData; /** * An executable method of an executorch program. 
Maps to a python method like @@ -66,6 +76,8 @@ class Method final { delegates_(rhs.delegates_), n_chains_(rhs.n_chains_), chains_(rhs.chains_), + external_constants_(rhs.external_constants_), + n_external_constants_(rhs.n_external_constants_), init_state_(rhs.init_state_) { // Required: clear out fields that the dtor looks at, so that we don't free // anything twice. @@ -73,6 +85,8 @@ class Method final { rhs.values_ = nullptr; rhs.n_delegate_ = 0; rhs.delegates_ = nullptr; + rhs.n_external_constants_ = 0; + rhs.external_constants_ = nullptr; // Helpful: Try to ensure that any other interactions with the old object // result in failures. @@ -288,6 +302,8 @@ class Method final { delegates_(nullptr), n_chains_(0), chains_(nullptr), + external_constants_(nullptr), + n_external_constants_(0), init_state_(InitializationState::Uninitialized) {} /// Static factory used by Program. @@ -336,8 +352,31 @@ class Method final { size_t n_chains_; Chain* chains_; + NamedData* external_constants_; + size_t n_external_constants_ = 0; + InitializationState init_state_; + /** + * Counts the number of tensors marked as EXTERNAL in the flatbuffer + * for this method. + */ + ET_NODISCARD Result get_num_external_constants(); + + /** + * Parses the flatbuffer for constant tensors tagged as EXTERNAL. + * Retrieves the external constants using the named_data_map and places them + * into `external_constants_`. Updates `n_external_constants_` to count the + * number of successfully-initialized external constants. + * FreeableBuffers returned by the named_data_map are owned by the + * method and are freed on method destruction. + * + * @param[in] named_data_map, to retrieve external constants from. + * @returns Error::Ok on success, non-Ok on failure. + */ + ET_NODISCARD Error + parse_external_constants(const NamedDataMap* named_data_map); + /** * Parses the elements of the values_ array. On error, n_value_ will be set to * the number of successfully-initialized entries so that ~Method doesn't try @@ -366,4 +405,6 @@ using ::executorch::runtime::Method; } // namespace executor } // namespace torch +#ifdef __GNUC__ #pragma GCC diagnostic pop +#endif diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index bcc2390d2bd..eb019f64e71 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -56,7 +56,7 @@ size_t calculate_nbytes( Span sizes, executorch::aten::ScalarType scalar_type) { ssize_t n = 1; - for (ssize_t i = 0; i < sizes.size(); i++) { + for (size_t i = 0; i < sizes.size(); i++) { n *= sizes[i]; } // Use the full namespace to disambiguate from c10::elementSize. @@ -110,7 +110,7 @@ size_t MethodMeta::num_inputs() const { Result MethodMeta::input_tag(size_t index) const { auto num_inputs = this->num_inputs(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_inputs, + index < num_inputs, InvalidArgument, "index %zu out of range. 
num_inputs: %zu", index, @@ -118,10 +118,10 @@ Result MethodMeta::input_tag(size_t index) const { auto input_index = s_plan_->inputs()->Get(index); size_t num_values = s_plan_->values()->size(); ET_CHECK_OR_RETURN_ERROR( - input_index >= 0 && input_index < num_values, + input_index >= 0 && static_cast(input_index) < num_values, InvalidProgram, - "internal value index %d out of range [0,%zu) for input %zu", - input_index, + "internal value index %zd out of range [0,%zu) for input %zu", + static_cast(input_index), num_values, index); auto serialization_value = s_plan_->values()->Get(input_index); @@ -160,7 +160,7 @@ size_t MethodMeta::num_outputs() const { Result MethodMeta::output_tag(size_t index) const { auto num_outputs = this->num_outputs(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_outputs, + index < num_outputs, InvalidArgument, "index %zu out of range. num_outputs: %zu", index, @@ -168,10 +168,10 @@ Result MethodMeta::output_tag(size_t index) const { auto output_index = s_plan_->outputs()->Get(index); size_t num_values = s_plan_->values()->size(); ET_CHECK_OR_RETURN_ERROR( - output_index >= 0 && output_index < num_values, + output_index >= 0 && static_cast(output_index) < num_values, InvalidProgram, - "internal value index %d out of range [0,%zu) for output %zu", - output_index, + "internal value index %zd out of range [0,%zu) for output %zu", + static_cast(output_index), num_values, index); auto serialization_value = s_plan_->values()->Get(output_index); @@ -218,7 +218,7 @@ size_t MethodMeta::num_memory_planned_buffers() const { Result MethodMeta::memory_planned_buffer_size(size_t index) const { auto num_buffers = this->num_memory_planned_buffers(); ET_CHECK_OR_RETURN_ERROR( - index >= 0 && index < num_buffers, + index < num_buffers, InvalidArgument, "index %zu out of range. num_buffers: %zu", index, @@ -229,16 +229,33 @@ Result MethodMeta::memory_planned_buffer_size(size_t index) const { } bool MethodMeta::uses_backend(const char* backend_name) const { + ET_CHECK_MSG(backend_name, "backend name is null"); const auto delegates = s_plan_->delegates(); for (size_t i = 0; i < delegates->size(); i++) { auto delegate = delegates->Get(i); - if (strcmp(delegate->id()->c_str(), backend_name) == 0) { + if (std::strcmp(delegate->id()->c_str(), backend_name) == 0) { return true; } } return false; } +size_t MethodMeta::num_backends() const { + const auto delegates = s_plan_->delegates(); + return delegates ? delegates->size() : 0; +} + +Result MethodMeta::get_backend_name(size_t index) const { + const auto count = num_backends(); + ET_CHECK_OR_RETURN_ERROR( + index < count, + InvalidArgument, + "Index %zu out of range. num_backends: %zu", + index, + count); + return s_plan_->delegates()->Get(index)->id()->c_str(); +} + size_t MethodMeta::num_instructions() const { const auto chains = s_plan_->chains(); if (chains == nullptr) { diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h index ea3b39ba9df..d9bb64d68a7 100644 --- a/runtime/executor/method_meta.h +++ b/runtime/executor/method_meta.h @@ -193,6 +193,22 @@ class MethodMeta final { */ bool uses_backend(const char* backend_name) const; + /** + * Get the number of backends used in this method. + * + * @returns The total number of backend names. + */ + size_t num_backends() const; + + /** + * Get the backend name at the given index. + * + * @param[in] index The index of the backend name. + * @returns A Result wrapping the backend name as a C-style string + * on success, or an error if the index is invalid. 
+ */ + Result get_backend_name(size_t index) const; + /** * Get the number of instructions in this method. * diff --git a/runtime/executor/program.cpp b/runtime/executor/program.cpp index 964b8c8bdac..14e0b83d8aa 100644 --- a/runtime/executor/program.cpp +++ b/runtime/executor/program.cpp @@ -150,6 +150,22 @@ Result get_execution_plan( const executorch_flatbuffer::Program* flatbuffer_program = executorch_flatbuffer::GetProgram(program_data->data()); + // Instantiate PteDataMap if named_data is present. + const auto named_data = flatbuffer_program->named_data(); + std::optional pte_data_map = std::nullopt; + if (named_data != nullptr) { + Result pte_data_map_result = + internal::PteDataMap::create( + loader, + segment_base_offset, + named_data, + flatbuffer_program->segments()); + if (!pte_data_map_result.ok()) { + return pte_data_map_result.error(); + } + pte_data_map.emplace(std::move(pte_data_map_result.get())); + } + // Constant data may live inside the flatbuffer data (constant_buffer) or in a // separate segment (constant_segment). It should not be in both. // Check constant_segment->offsets()->size() > 1, as the offsets list will @@ -163,10 +179,10 @@ Result get_execution_plan( ET_CHECK_OR_RETURN_ERROR( constant_buffer == nullptr || constant_buffer->size() == 0, InvalidProgram, - "constant_buffer contains %u items, " - "constant_segment.offsets contains %u items. Only one should be used.", - constant_buffer->size(), - constant_segment->offsets()->size()); + "constant_buffer contains %zu items, " + "constant_segment.offsets contains %zu items. Only one should be used.", + static_cast(constant_buffer->size()), + static_cast(constant_segment->offsets()->size())); const auto* segments = flatbuffer_program->segments(); ET_CHECK_OR_RETURN_ERROR( segments != nullptr, InvalidProgram, "No segments in program"); @@ -176,9 +192,9 @@ Result get_execution_plan( ET_CHECK_OR_RETURN_ERROR( constant_segment->segment_index() < segments->size(), InvalidProgram, - "Constant segment index %d invalid for program segments range %d", - constant_segment->segment_index(), - segments->size()); + "Constant segment index %zu invalid for program segments range %zu", + static_cast(constant_segment->segment_index()), + static_cast(segments->size())); const executorch_flatbuffer::DataSegment* data_segment = segments->Get(constant_segment->segment_index()); @@ -199,7 +215,8 @@ Result get_execution_plan( segment_base_offset, std::move(program_data.get()), flatbuffer_program, - std::move(constant_segment_data.get())); + std::move(constant_segment_data.get()), + std::move(pte_data_map)); } else { // The constant data is stored inside the flatbuffer, so this program does // not contain a separate segment for it. 
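
As an aside, the MethodMeta additions above compose like this (a minimal sketch, not part of the patch; assumes `program` is a loaded executorch::runtime::Program):

auto method_meta = program->method_meta("forward");
if (method_meta.ok()) {
  for (size_t i = 0; i < method_meta->num_backends(); ++i) {
    auto name = method_meta->get_backend_name(i);
    if (name.ok()) {
      // Prints each delegate's id, e.g. "StubBackend" in the tests below.
      ET_LOG(Info, "delegate %zu: %s", i, name.get());
    }
  }
}
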
@@ -208,7 +225,8 @@ Result get_execution_plan( segment_base_offset, std::move(program_data.get()), flatbuffer_program, - /*constant_segment_data=*/FreeableBuffer{}); + /*constant_segment_data=*/FreeableBuffer{}, + std::move(pte_data_map)); } } @@ -347,14 +365,21 @@ Result Program::get_constant_buffer_data( ET_CHECK_OR_RETURN_ERROR( storage_size <= nbytes, InvalidArgument, - "Constant buffer size %u larger than allocated nbytes %zu", - storage_size, + "Constant buffer size %zu larger than allocated nbytes %zu", + static_cast(constant_buffer[buffer_index]->storage()->size()), nbytes); return storage->data(); } } +Result Program::get_named_data_map() const { + if (pte_data_map_.has_value()) { + return &pte_data_map_.value(); + } + return Error::NotFound; +} + Result Program::get_output_flattening_encoding( const char* method_name) const { auto plan = get_execution_plan(internal_program_, method_name); @@ -479,8 +504,8 @@ Error Program::load_mutable_subsegment_into( if (segment_offsets->segment_index() >= num_segments) { ET_LOG( Error, - "Segment index %u out of range (>= %zu)", - segment_offsets->segment_index(), + "Segment index %zu out of range (>= %zu)", + static_cast(segment_offsets->segment_index()), num_segments); return Error::NotFound; } diff --git a/runtime/executor/program.h b/runtime/executor/program.h index 7313b19d66d..0932e51619f 100644 --- a/runtime/executor/program.h +++ b/runtime/executor/program.h @@ -7,12 +7,16 @@ */ #pragma once + +#ifdef __GNUC__ // Disable -Wdeprecated-declarations, as some builds use 'Werror'. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif #include #include +#include #include #include @@ -22,6 +26,7 @@ #include #include #include +#include #include // Forward declare flatbuffer types. This is a public header and must not @@ -105,6 +110,12 @@ class Program final { Result get_constant_buffer_data(size_t buffer_idx, size_t nbytes) const; + /** + * Get the named data map from the program. + * @return The named data map. + */ + Result get_named_data_map() const; + /** * Returns the number of methods in the program. */ @@ -266,13 +277,15 @@ class Program final { size_t segment_base_offset, FreeableBuffer&& program_data, const executorch_flatbuffer::Program* internal_program, - FreeableBuffer&& constant_segment_data) + FreeableBuffer&& constant_segment_data, + std::optional&& pte_data_map) : program_data_(std::move(program_data)), // Don't need the loader if there are no segments. loader_(segment_base_offset > 0 ? loader : nullptr), internal_program_(internal_program), segment_base_offset_(segment_base_offset), - constant_segment_data_(std::move(constant_segment_data)) {} + constant_segment_data_(std::move(constant_segment_data)), + pte_data_map_(std::move(pte_data_map)) {} // Not copyable or assignable. Program(const Program& rhs) = delete; @@ -295,6 +308,9 @@ class Program final { /// Constant segment data. FreeableBuffer constant_segment_data_; + + /// NamedDataMap holding named data from the program. + std::optional pte_data_map_; }; } // namespace runtime @@ -308,4 +324,6 @@ using ::executorch::runtime::Program; } // namespace executor } // namespace torch +#ifdef __GNUC__ #pragma GCC diagnostic pop +#endif diff --git a/runtime/executor/pte_data_map.cpp b/runtime/executor/pte_data_map.cpp new file mode 100644 index 00000000000..5829395028a --- /dev/null +++ b/runtime/executor/pte_data_map.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/executor/pte_data_map.h>
+#include <executorch/schema/program_generated.h>
+
+namespace executorch {
+namespace runtime {
+namespace internal {
+
+/* static */ executorch::runtime::Result<PteDataMap> PteDataMap::create(
+    executorch::runtime::DataLoader* loader,
+    size_t segment_base_offset,
+    const flatbuffers::FlatbufferNamedData* named_data,
+    const flatbuffers::FlatbufferDataSegment* segments) {
+  ET_CHECK_OR_RETURN_ERROR(
+      loader != nullptr && named_data != nullptr && segments != nullptr,
+      InvalidArgument,
+      "PteDataMap loader, named_data or segments is null; most likely the program does not have any named_data segments");
+  return PteDataMap(loader, segment_base_offset, named_data, segments);
+}
+
+ET_NODISCARD
+executorch::runtime::Result<FreeableBuffer>
+PteDataMap::get_data(const char* key) const {
+  for (size_t i = 0; i < named_data_->size(); i++) {
+    ET_CHECK_OR_RETURN_ERROR(
+        named_data_->Get(i) != nullptr && named_data_->Get(i)->key() != nullptr,
+        InvalidArgument,
+        "Searching for key %s: NamedData at index %zu is null",
+        key,
+        i);
+    if (strcmp(named_data_->Get(i)->key()->c_str(), key) == 0) {
+      // Get the segment index.
+      size_t segment_index = named_data_->Get(i)->segment_index();
+
+      // Get the segment offset and size.
+      ET_CHECK_OR_RETURN_ERROR(
+          segment_index < segments_->size(),
+          InvalidArgument,
+          "Segment index %zu for key %s is out of range for segments size %u",
+          segment_index,
+          key,
+          segments_->size());
+      size_t segment_offset = segments_->Get(segment_index)->offset();
+      size_t segment_size = segments_->Get(segment_index)->size();
+
+      return loader_->load(
+          /*offset=*/segment_base_offset_ + segment_offset,
+          segment_size,
+          DataLoader::SegmentInfo(DataLoader::SegmentInfo::Type::External));
+    }
+  }
+  return Error::NotFound;
+}
+
+ET_NODISCARD executorch::runtime::Result<uint32_t> PteDataMap::get_num_keys()
+    const {
+  return named_data_->size();
+}
+
+ET_NODISCARD executorch::runtime::Result<const char*> PteDataMap::get_key(
+    size_t index) const {
+  ET_CHECK_OR_RETURN_ERROR(
+      index < named_data_->size(),
+      InvalidArgument,
+      "Index out of range: named_data size is %u, received index %zu",
+      named_data_->size(),
+      index);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      named_data_->Get(index) != nullptr &&
+          named_data_->Get(index)->key() != nullptr,
+      InvalidArgument,
+      "NamedData at index %zu is null",
+      index);
+  return named_data_->Get(index)->key()->c_str();
+}
+
+} // namespace internal
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/executor/pte_data_map.h b/runtime/executor/pte_data_map.h
new file mode 100644
index 00000000000..01c15555786
--- /dev/null
+++ b/runtime/executor/pte_data_map.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/data_loader.h>
+#include <executorch/runtime/core/named_data_map.h>
+
+// Forward declare flatbuffer types. This is a public header and must not
+// include the generated flatbuffer header.
+namespace executorch_flatbuffer {
+struct NamedData;
+struct DataSegment;
+} // namespace executorch_flatbuffer
+
+namespace flatbuffers {
+template <typename T>
+struct Offset;
+} // namespace flatbuffers
+
+// @lint-ignore CLANGTIDY facebook-modularize-issue-check
+#if EXECUTORCH_INTERNAL_FLATBUFFERS == 1
+// TODO(T216992074): update internal flatbuffers (v1.12) to match OSS (v24.3.5).
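+// Internal flatbuffers v1.12 declares Vector with a single template
+// parameter, while OSS v24.x adds a size-type parameter, so the forward
+// declarations below differ per branch. Only declarations are needed:
+// this header stores the vectors exclusively behind pointers.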
+namespace flatbuffers {
+template <typename T>
+class Vector;
+using FlatbufferNamedData =
+    flatbuffers::Vector<flatbuffers::Offset<executorch_flatbuffer::NamedData>>;
+using FlatbufferDataSegment = flatbuffers::Vector<
+    flatbuffers::Offset<executorch_flatbuffer::DataSegment>>;
+} // namespace flatbuffers
+#else
+namespace flatbuffers {
+template <typename T, typename SizeT>
+class Vector;
+using FlatbufferNamedData = flatbuffers::
+    Vector<flatbuffers::Offset<executorch_flatbuffer::NamedData>, uint32_t>;
+using FlatbufferDataSegment = flatbuffers::
+    Vector<flatbuffers::Offset<executorch_flatbuffer::DataSegment>, uint32_t>;
+} // namespace flatbuffers
+#endif
+
+namespace executorch {
+namespace runtime {
+namespace internal {
+
+/**
+ * A NamedDataMap implementation for Flatbuffer-serialized named data
+ * originating from a PTE file.
+ */
+class PteDataMap final : public NamedDataMap {
+ public:
+  /**
+   * Creates a new DataMap that wraps named_data from the PTE file.
+   *
+   * @param[in] loader The DataLoader that accesses the PTE file.
+   * Note: the loader must outlive the PteDataMap instance.
+   * @param[in] segment_base_offset The offset to the first segment in the PTE
+   * file, in bytes.
+   * @param[in] named_data The named_data from the PTE file. Note: the pointer
+   * passed here must outlive the PteDataMap instance.
+   * @param[in] segments The segments from the PTE file. Note: the pointer
+   * passed here must outlive the PteDataMap instance.
+   */
+  static Result<PteDataMap> create(
+      DataLoader* loader,
+      size_t segment_base_offset,
+      const flatbuffers::FlatbufferNamedData* named_data,
+      const flatbuffers::FlatbufferDataSegment* segments);
+
+  /**
+   * The PteDataMap currently only handles opaque data that does not contain
+   * tensor-specific metadata.
+   */
+  ET_NODISCARD
+  Result<const TensorLayout> get_metadata(
+      ET_UNUSED const char* key) const override {
+    return Error::NotImplemented;
+  }
+
+  /**
+   * Retrieve read-only data for the specified key.
+   *
+   * @param[in] key The name of the blob to get data on.
+   *
+   * @return error if the key is not present or data cannot be loaded.
+   */
+  ET_NODISCARD
+  Result<FreeableBuffer> get_data(const char* key) const override;
+
+  /**
+   * The PteDataMap currently does not implement load_data_into.
+   */
+  ET_NODISCARD Error load_data_into(
+      ET_UNUSED const char* key,
+      ET_UNUSED void* buffer,
+      ET_UNUSED size_t size) const override {
+    return Error::NotImplemented;
+  }
+
+  /**
+   * @returns The number of keys in the map.
+   */
+  ET_NODISCARD Result<uint32_t> get_num_keys() const override;
+
+  /**
+   * @returns The key at the specified index, error if index out of bounds.
+   */
+  ET_NODISCARD Result<const char*> get_key(size_t index) const override;
+
+  // Moveable, to be compatible with Result.
+  PteDataMap(PteDataMap&&) noexcept = default;
+  ~PteDataMap() override = default;
+
+ private:
+  PteDataMap(
+      DataLoader* loader,
+      size_t segment_base_offset,
+      const flatbuffers::FlatbufferNamedData* named_data,
+      const flatbuffers::FlatbufferDataSegment* segments)
+      : loader_(loader),
+        segment_base_offset_(segment_base_offset),
+        named_data_(named_data),
+        segments_(segments) {}
+
+  // Not copyable or assignable.
+  PteDataMap(const PteDataMap& rhs) = delete;
+  PteDataMap& operator=(PteDataMap&& rhs) noexcept = delete;
+  PteDataMap& operator=(const PteDataMap& rhs) = delete;
+
+  // Data loader, used to load segment data.
+  DataLoader* loader_;
+
+  // The offset to the first segment in the PTE file, in bytes.
+  size_t segment_base_offset_;
+
+  // Named data, containing name and segment index.
+  const flatbuffers::FlatbufferNamedData* named_data_;
+
+  // Segments, to retrieve offset and size for the loader.
+ const flatbuffers::FlatbufferDataSegment* segments_; +}; + +} // namespace internal +} // namespace runtime +} // namespace executorch diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 67163ed8789..cfb6c607359 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") def _program_preprocessor_flags(): """Returns the preprocessor_flags to use when building Program.cpp""" @@ -42,7 +42,29 @@ def define_common_targets(): ], ) - for aten_mode in (True, False): + runtime.cxx_library( + name = "pte_data_map", + srcs = [ + "pte_data_map.cpp", + ], + exported_headers = [ + "pte_data_map.h", + ], + visibility = [ + "//executorch/runtime/executor/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core:named_data_map", + ], + deps = [ + "//executorch/schema:program", + ], + exported_preprocessor_flags = [] if runtime.is_oss else ["-DEXECUTORCH_INTERNAL_FLATBUFFERS=1"], + ) + + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_library( name = "program" + aten_suffix, @@ -74,9 +96,14 @@ def define_common_targets(): "program.h", "tensor_parser.h", ], + compiler_flags = select({ + "ovr_config//os:windows": [], + "DEFAULT" :["-Wno-error=deprecated-declarations"] + }), preprocessor_flags = _program_preprocessor_flags(), exported_deps = [ ":memory_manager", + ":pte_data_map", "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/runtime/core:named_data_map", diff --git a/runtime/executor/tensor_parser.h b/runtime/executor/tensor_parser.h index 2ffb473544d..1fae84cfb05 100644 --- a/runtime/executor/tensor_parser.h +++ b/runtime/executor/tensor_parser.h @@ -7,9 +7,12 @@ */ #pragma once + +#ifdef __GNUC__ // Disable -Wdeprecated-declarations, as some builds use 'Werror'. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif #include #include @@ -21,11 +24,21 @@ namespace executorch { namespace runtime { namespace deserialization { +/// Data structure to hold key and data buffer for external data used +/// in a method. +struct NamedData { + const char* key; + FreeableBuffer buffer; +}; + +NamedData* get_data_by_key(const char* key, Span entries); + ET_NODISCARD Result parseTensor( const Program* program, MemoryManager* memory_manager, const executorch_flatbuffer::Tensor* s_tensor, - const NamedDataMap* named_data_map = nullptr); + const NamedDataMap* named_data_map = nullptr, + Span external_constants = {}); ET_NODISCARD Result> parseTensorList( const flatbuffers::Vector* tensor_indices, @@ -33,6 +46,12 @@ ET_NODISCARD Result> parseTensorList( size_t values_len, MemoryManager* memory_manager); +// Checks that the sizes, dim_order and scalar_type match between tensors +// stored in the PTE and externally. +ET_NODISCARD Error validateTensorLayout( + const executorch_flatbuffer::Tensor* s_tensor, + const TensorLayout& expected_layout); + // Deserializes a List of optional type. The code here is the same between all // list of optionals: list of optional Tensor, list of optional float etc, so we // just use a template to avoid boilerplate. 
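
The NamedData struct and span parameters above let parseTensor() consult constants that the Method already owns instead of re-querying the NamedDataMap. What the lookup amounts to (a toy sketch; the real get_data_by_key() lives in tensor_parser_exec_aten.cpp further down):

#include <executorch/runtime/core/span.h>

#include <cstring>

using executorch::runtime::Span;
using executorch::runtime::deserialization::NamedData;

// Linear scan over the method's resolved external constants, keyed by the
// tensor's fully qualified name.
const void* lookup_external(const char* fqn, Span<NamedData> entries) {
  for (size_t i = 0; i < entries.size(); ++i) {
    if (std::strcmp(entries[i].key, fqn) == 0) {
      return entries[i].buffer.data();
    }
  }
  return nullptr;  // Not an external constant of this method.
}
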
@@ -75,7 +94,7 @@ parseListOptionalType(
       evalp_list[output_idx] = nullptr;
     } else {
       ET_CHECK_OR_RETURN_ERROR(
-          index >= 0 && index < values_len,
+          index >= 0 && static_cast<size_t>(index) < values_len,
           InvalidProgram,
           "Invalid value index %" PRId32 " for ListOptional",
           index);
@@ -105,7 +124,11 @@ parseListOptionalType(
  * @param[in] nbytes The amount of memory to get from the allocator.
  * @param[in] allocator The source of memory for non-constant tensors.
  * @param[in] named_data_map An optional map of {name, blob} used to resolve
- * data that is external to the PTE, if any.
+ * data that is mutable and external to the PTE, if any.
+ * @param[in] external_constants An optional span containing tensor fqn to
+ * corresponding tensor data. Used to resolve data that is constant and
+ * external to the PTE, if any. Referencing data from external_constants is
+ * safe, as it has the same lifetime as the method.
  *
  * @returns On success, the data pointer to use for the tensor. On failure, a
  * non-Ok Error.
@@ -115,7 +138,8 @@ ET_NODISCARD Result<void*> getTensorDataPtr(
     const Program* program,
     size_t nbytes,
     HierarchicalAllocator* allocator,
-    const NamedDataMap* named_data_map = nullptr);
+    const NamedDataMap* named_data_map = nullptr,
+    Span<NamedData> external_constants = {});
 
 } // namespace deserialization
 } // namespace runtime
@@ -133,4 +157,7 @@ using ::executorch::runtime::deserialization::parseTensorList;
 } // namespace deserialization
 } // namespace executor
 } // namespace torch
+
+#ifdef __GNUC__
 #pragma GCC diagnostic pop
+#endif
diff --git a/runtime/executor/tensor_parser_aten.cpp b/runtime/executor/tensor_parser_aten.cpp
index ab9af3d0399..d1a2f712853 100644
--- a/runtime/executor/tensor_parser_aten.cpp
+++ b/runtime/executor/tensor_parser_aten.cpp
@@ -33,7 +33,8 @@ Result<at::Tensor> parseTensor(
     const Program* program,
     MemoryManager* memory_manager,
     const executorch_flatbuffer::Tensor* s_tensor,
-    const NamedDataMap* named_data_map) {
+    const NamedDataMap* named_data_map,
+    Span<NamedData> external_constants) {
   EXECUTORCH_SCOPE_PROF("TensorParser::parseTensor");
 
   ET_CHECK_OR_RETURN_ERROR(
@@ -108,7 +109,8 @@ Result<at::Tensor> parseTensor(
       program,
       tensor.nbytes(),
       memory_manager->planned_memory(),
-      named_data_map);
+      named_data_map,
+      external_constants);
   if (!data_ptr.ok()) {
     ET_LOG(
         Error,
diff --git a/runtime/executor/tensor_parser_exec_aten.cpp b/runtime/executor/tensor_parser_exec_aten.cpp
index 83310ff680c..002c7366be6 100644
--- a/runtime/executor/tensor_parser_exec_aten.cpp
+++ b/runtime/executor/tensor_parser_exec_aten.cpp
@@ -64,7 +64,8 @@ ET_NODISCARD Result<void*> getMemPlannedPtr(
         "size_t cannot hold memory offset 0x%08" PRIx32 ".%08" PRIx32,
         memory_offset_high,
         memory_offset_low);
-    memory_offset |= static_cast<size_t>(memory_offset_high) << 32;
+    memory_offset |= static_cast<size_t>(memory_offset_high)
+        << ((sizeof(size_t) - sizeof(uint32_t)) * 8);
   }
   return allocator->get_offset_address(memory_id, memory_offset, nbytes);
 }
@@ -94,7 +95,7 @@ ET_NODISCARD Result<BoxedEvalueList<executorch::aten::Tensor>> parseTensorList(
   size_t output_idx = 0;
   for (int32_t tensor_index : *tensor_indices) {
     ET_CHECK_OR_RETURN_ERROR(
-        tensor_index >= 0 && tensor_index < values_len,
+        tensor_index >= 0 && static_cast<size_t>(tensor_index) < values_len,
         InvalidProgram,
         "Invalid value index %" PRId32 " for TensorList",
         tensor_index);
@@ -111,34 +112,68 @@ ET_NODISCARD Result<BoxedEvalueList<executorch::aten::Tensor>> parseTensorList(
       evalp_list, tensor_list, tensor_indices->size());
 }
 
+ET_NODISCARD Error validateTensorLayout(
+    const executorch_flatbuffer::Tensor* s_tensor,
+    const TensorLayout& expected_layout) {
+  ET_CHECK_OR_RETURN_ERROR(
+
static_cast(s_tensor->scalar_type()) == + expected_layout.scalar_type(), + InvalidExternalData, + "Scalar type mismatch. Expected %hhd, got %hhd.", + static_cast(s_tensor->scalar_type()), + static_cast(expected_layout.scalar_type())); + int dim = s_tensor->sizes()->size(); + ET_CHECK_OR_RETURN_ERROR( + dim >= 0, InvalidExternalData, "Dim is negative: %d", dim) + ET_CHECK_OR_RETURN_ERROR( + static_cast(dim) == expected_layout.sizes().size(), + InvalidExternalData, + "Dim mismatch. Expected %d, got %zu.", + dim, + expected_layout.sizes().size()); + for (int i = 0; i < dim; i++) { + ET_CHECK_OR_RETURN_ERROR( + s_tensor->sizes()->Get(i) == expected_layout.sizes()[i], + InvalidExternalData, + "Sizes mismatch. Expected %d, got %d for size at index %d.", + s_tensor->sizes()->Get(i), + expected_layout.sizes()[i], + i); + ET_CHECK_OR_RETURN_ERROR( + s_tensor->dim_order()->Get(i) == expected_layout.dim_order()[i], + InvalidExternalData, + "Dim order mismatch. Expected %d, got %d for dim at index %d.", + s_tensor->dim_order()->Get(i), + expected_layout.dim_order()[i], + i); + } + return Error::Ok; +} + +// Check if key exists in entries. If it does, return a pointer to the entry +// otherwise return a nullptr. +NamedData* get_data_by_key(const char* key, Span entries) { + for (const auto i : c10::irange(entries.size())) { + if (strcmp(key, entries[i].key) == 0) { + return &entries[i]; + } + } + return nullptr; +} + ET_NODISCARD Result getTensorDataPtr( const executorch_flatbuffer::Tensor* s_tensor, const Program* program, size_t nbytes, HierarchicalAllocator* allocator, - const NamedDataMap* named_data_map) { + const NamedDataMap* named_data_map, + Span external_constants) { auto data_buffer_idx = s_tensor->data_buffer_idx(); const executorch_flatbuffer::AllocationDetails* allocation_info = s_tensor->allocation_info(); - // Memory Planned, with initial state - if (data_buffer_idx > 0 && allocation_info != nullptr) { - auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator); - if (!planned_ptr.ok()) { - return planned_ptr.error(); - } - auto err = TensorParser::load_mutable_subsegment_into( - program, 0, s_tensor->data_buffer_idx(), nbytes, planned_ptr.get()); - - if (err != Error::Ok) { - return err; - } - return planned_ptr; - } - // External tensors. - else if ( - s_tensor->extra_tensor_info() != nullptr && + if (s_tensor->extra_tensor_info() != nullptr && s_tensor->extra_tensor_info()->location() == executorch_flatbuffer::TensorDataLocation::EXTERNAL) { // Check that fqn is not null. @@ -146,91 +181,47 @@ ET_NODISCARD Result getTensorDataPtr( s_tensor->extra_tensor_info()->fully_qualified_name() != nullptr, InvalidExternalData, "Fully qualified name of external tensor is null"); - // Look up tensor in named data map. - Result tensor_layout_res = named_data_map->get_metadata( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str()); - if (!tensor_layout_res.ok()) { - return tensor_layout_res.error(); - } - const TensorLayout& tensor_layout = tensor_layout_res.get(); - - // Compatibility checking. - ET_CHECK_OR_RETURN_ERROR( - static_cast(s_tensor->scalar_type()) == - tensor_layout.scalar_type(), - InvalidExternalData, - "Scalar type mismatch. Expected %hhd, got %hhd.", - static_cast(s_tensor->scalar_type()), - static_cast(tensor_layout.scalar_type())); - ET_CHECK_OR_RETURN_ERROR( - nbytes == tensor_layout.nbytes(), - InvalidExternalData, - "Nbytes mismatch. 
Expected %zu, got %zu.", - nbytes, - tensor_layout.nbytes()); - int dim = s_tensor->sizes()->size(); - ET_CHECK_OR_RETURN_ERROR( - dim == tensor_layout.sizes().size(), - InvalidExternalData, - "Dim mismatch. Expected %d, got %zu.", - dim, - tensor_layout.sizes().size()); - for (int i = 0; i < dim; i++) { - ET_CHECK_OR_RETURN_ERROR( - s_tensor->sizes()->Get(i) == tensor_layout.sizes()[i], - InvalidExternalData, - "Sizes mismatch. Expected %d, got %d for size at index %d.", - s_tensor->sizes()->Get(i), - tensor_layout.sizes()[i], - i); - ET_CHECK_OR_RETURN_ERROR( - s_tensor->dim_order()->Get(i) == tensor_layout.dim_order()[i], - InvalidExternalData, - "Dim order mismatch. Expected %d, got %d for dim at index %d.", - s_tensor->dim_order()->Get(i), - tensor_layout.dim_order()[i], - i); - } + const char* fqn = + s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(); // Constant value. if (allocation_info == nullptr) { - Result data_res = named_data_map->get_data( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str()); - if (!data_res.ok()) { - return data_res.error(); + NamedData* data = get_data_by_key(fqn, external_constants); + if (data != nullptr) { + return const_cast(data->buffer.data()); + } + // Should never reach here; these tensors are resolved in + // Method::parse_external_constants. Any errors should be caught there. + return Error::Internal; + } else { + // Mutable value. + // Look up tensor in named data map. + Result tensor_layout_res = + named_data_map->get_metadata(fqn); + if (!tensor_layout_res.ok()) { + return tensor_layout_res.error(); + } + const TensorLayout& tensor_layout = tensor_layout_res.get(); + Error err = validateTensorLayout(s_tensor, tensor_layout); + if (err != Error::Ok) { + return err; } - // The const_cast is 'ok' here because program and runtime should - // guarantee that this data is never modified. Temporary until runtime - // takes ownership of FreeableBuffers in TODO(T214294528). - return const_cast(data_res.get().data()); - } - - // Mutable value. - else { // Call load_into. auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator); if (!planned_ptr.ok()) { return planned_ptr.error(); } - auto size = named_data_map->load_data_into( - s_tensor->extra_tensor_info()->fully_qualified_name()->c_str(), - planned_ptr.get(), - nbytes); - if (size.error() != Error::Ok) { - return size.error(); + auto load_error = + named_data_map->load_data_into(fqn, planned_ptr.get(), nbytes); + if (load_error != Error::Ok) { + return load_error; } - ET_CHECK_OR_RETURN_ERROR( - size.get() == nbytes, - InvalidExternalData, - "Expected to load %zu bytes, actually loaded %u bytes", - nbytes, - static_cast(size.get())); + return planned_ptr; } - } - // Constant, stored in PTE file. - else if (data_buffer_idx > 0 && allocation_info == nullptr) { + // Constant, stored in PTE file. + } else if (data_buffer_idx > 0 && allocation_info == nullptr) { auto const_data = program->get_constant_buffer_data(data_buffer_idx, nbytes); if (!const_data.ok()) { @@ -241,6 +232,20 @@ ET_NODISCARD Result getTensorDataPtr( // guarantee that this data is never modified. 
return const_cast(const_data.get()); + // Memory Planned, with initial state + } else if (data_buffer_idx > 0 && allocation_info != nullptr) { + auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator); + if (!planned_ptr.ok()) { + return planned_ptr.error(); + } + auto err = TensorParser::load_mutable_subsegment_into( + program, 0, s_tensor->data_buffer_idx(), nbytes, planned_ptr.get()); + + if (err != Error::Ok) { + return err; + } + return planned_ptr; + // Memory planned, no initial state } else if (data_buffer_idx == 0 && allocation_info != nullptr) { return getMemPlannedPtr(allocation_info, nbytes, allocator); diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index a53295470fc..787af8b506b 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -21,6 +21,7 @@ namespace executorch { namespace runtime { namespace deserialization { +using executorch::runtime::Span; using torch::executor::ScalarType; using torch::executor::Tensor; using torch::executor::TensorImpl; @@ -29,7 +30,8 @@ Result parseTensor( const Program* program, MemoryManager* memory_manager, const executorch_flatbuffer::Tensor* s_tensor, - const NamedDataMap* named_data_map) { + const NamedDataMap* named_data_map, + Span external_constants) { EXECUTORCH_SCOPE_PROF("TensorParser::parseTensor"); auto method_allocator = memory_manager->method_allocator(); @@ -41,11 +43,7 @@ Result parseTensor( ScalarType scalar_type = static_cast(s_tensor->scalar_type()); ET_CHECK_OR_RETURN_ERROR( - isValid(scalar_type) && - // Types that do not yet have deserialization support. - scalar_type != executorch::aten::ScalarType::ComplexHalf && - scalar_type != executorch::aten::ScalarType::ComplexFloat && - scalar_type != executorch::aten::ScalarType::ComplexDouble, + isValid(scalar_type), InvalidProgram, "Invalid or unsupported ScalarType %" PRId8, static_cast(scalar_type)); @@ -84,11 +82,18 @@ Result parseTensor( // copy sizes and dim order out of flatbuffer // kimishpate: I think dim order can remain immutable and point to fb // memory, unless we plan to implement in-place permute - executorch::aten::SizesType* sizes_buf = ET_ALLOCATE_LIST_OR_RETURN_ERROR( - method_allocator, executorch::aten::SizesType, dim); + executorch::aten::SizesType* sizes_buf = + method_allocator->allocateList(dim); + if (sizes_buf == nullptr) { + return Error::MemoryAllocationFailed; + } + executorch::aten::DimOrderType* dim_order_buf = - ET_ALLOCATE_LIST_OR_RETURN_ERROR( - method_allocator, executorch::aten::DimOrderType, dim); + method_allocator->allocateList(dim); + if (dim_order_buf == nullptr) { + return Error::MemoryAllocationFailed; + } + std::memcpy( sizes_buf, serialized_sizes, sizeof(executorch::aten::SizesType) * dim); std::memcpy( @@ -109,12 +114,12 @@ Result parseTensor( // detect bad positive values, but we can reject negative values, which would // otherwise panic in the TensorImpl ctor. dim_order_to_stride() will validate // dim_order. - for (int i = 0; i < dim; i++) { + for (flatbuffers::uoffset_t i = 0; i < dim; i++) { ET_CHECK_OR_RETURN_ERROR( sizes[i] >= 0, InvalidProgram, - "Negative size[%d] %" PRId32, - i, + "Negative size[%zu] %" PRId32, + static_cast(i), sizes[i]); } @@ -122,16 +127,23 @@ Result parseTensor( // Allocating strides buffer here and populating it. // In subsequent diffs we can remove strides accessor, however this // will introduce incompatible APIs between ATen Tensor and ETensor. 
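   // For reference: dim_order_to_stride() walks dim_order from innermost to
   // outermost, accumulating a running product of sizes. E.g.
   //   sizes = {2, 3, 4}, dim_order = {0, 1, 2}  ->  strides = {12, 4, 1}
   //   sizes = {2, 3, 4}, dim_order = {1, 0, 2}  ->  strides = {4, 8, 1}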
- executorch::aten::StridesType* strides = ET_ALLOCATE_LIST_OR_RETURN_ERROR( - method_allocator, executorch::aten::StridesType, dim); + executorch::aten::StridesType* strides = + method_allocator->allocateList(dim); + if (strides == nullptr) { + return Error::MemoryAllocationFailed; + } + auto status = dim_order_to_stride(sizes, dim_order, dim, strides); ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, Internal, "dim_order_to_stride returned invalid status"); - auto* tensor_impl = - ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(method_allocator, TensorImpl); + auto* tensor_impl = method_allocator->allocateInstance(); + if (tensor_impl == nullptr) { + return Error::MemoryAllocationFailed; + } + // Placement new on the allocated memory space. Note that we create this first // with null data so we can find its expected size before getting its memory. new (tensor_impl) TensorImpl( @@ -149,7 +161,8 @@ Result parseTensor( program, tensor_impl->nbytes(), memory_manager->planned_memory(), - named_data_map); + named_data_map, + external_constants); if (!data_ptr.ok()) { ET_LOG( Error, diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt index c9ac26ba673..2de32c9176a 100644 --- a/runtime/executor/test/CMakeLists.txt +++ b/runtime/executor/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddHalf.pte" @@ -24,7 +24,7 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleIndex.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinear.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" COMMAND @@ -48,7 +48,7 @@ add_custom_target( "${CMAKE_CURRENT_BINARY_DIR}/ModuleIndex.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinear.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" ) @@ -61,7 +61,7 @@ set(test_env "ET_MODULE_INDEX_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleIndex.pte" "ET_MODULE_LINEAR_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinear.pte" "ET_MODULE_LINEAR_PROGRAM_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.pte" - "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/_default_external_constant.ptd" + "ET_MODULE_LINEAR_DATA_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleLinearProgram.ptd" "ET_MODULE_MULTI_ENTRY_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleMultipleEntry.pte" "ET_MODULE_SIMPLE_TRAIN_PATH=${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" ) diff --git a/runtime/executor/test/backend_integration_test.cpp b/runtime/executor/test/backend_integration_test.cpp index 5c804b4ca42..587c7b353eb 100644 --- a/runtime/executor/test/backend_integration_test.cpp +++ b/runtime/executor/test/backend_integration_test.cpp @@ -347,6 +347,37 @@ TEST_P(BackendIntegrationTest, BasicInitSucceeds) { EXPECT_EQ(method_res.error(), Error::Ok); } +TEST_P(BackendIntegrationTest, GetBackendNamesSuccess) { + // Load the program from file. 
+ Result loader = FileDataLoader::from(program_path()); + ASSERT_EQ(loader.error(), Error::Ok); + + Result program = Program::load(&loader.get()); + ASSERT_EQ(program.error(), Error::Ok); + + // Get method metadata for the "forward" method. + auto method_meta = program->method_meta("forward"); + + // Ensure the StubBackend is used. + EXPECT_TRUE(method_meta->uses_backend(StubBackend::kName)); + + // Retrieve the number of backends. + const size_t num_backends = method_meta->num_backends(); + EXPECT_GT(num_backends, 0u); + + // Iterate through each backend and verify its name. + for (size_t i = 0; i < num_backends; ++i) { + auto backend_name_result = method_meta->get_backend_name(i); + ASSERT_TRUE(backend_name_result.ok()); + const char* name = backend_name_result.get(); + // For this test, we expect that the only backend is StubBackend. + EXPECT_STREQ(name, StubBackend::kName); + } + // Check that an out-of-range index returns an error. + auto out_of_range_result = method_meta->get_backend_name(num_backends); + EXPECT_FALSE(out_of_range_result.ok()); +} + TEST_P(BackendIntegrationTest, FreeingProcessedBufferSucceeds) { // Install an init() implementation that frees its processed buffer, and lets // us know that it was actually called by setting init_called. diff --git a/runtime/executor/test/program_test.cpp b/runtime/executor/test/program_test.cpp index 80f91f1af6a..d82bfb50778 100644 --- a/runtime/executor/test/program_test.cpp +++ b/runtime/executor/test/program_test.cpp @@ -371,6 +371,18 @@ TEST_F(ProgramTest, getMethods) { EXPECT_EQ(strcmp(res2.get(), "forward2"), 0); } +TEST_F(ProgramTest, GetNamedDataMap_Fail) { + Result program = + Program::load(add_loader_.get(), kDefaultVerification); + ASSERT_EQ(program.error(), Error::Ok); + + // Get the named data map. Expect to fail, as add.pte does not have any + // named data segments. + Result named_data_map = + program->get_named_data_map(); + EXPECT_EQ(named_data_map.error(), Error::NotFound); +} + // Test that the deprecated Load method (capital 'L') still works. TEST_F(ProgramTest, DEPRECATEDLoad) { // Parse the Program from the data. diff --git a/runtime/executor/test/pte_data_map_test.cpp b/runtime/executor/test/pte_data_map_test.cpp new file mode 100644 index 00000000000..b5312eb4a88 --- /dev/null +++ b/runtime/executor/test/pte_data_map_test.cpp @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using executorch::extension::FileDataLoader; +using executorch::extension::testing::TempFile; +using executorch::runtime::DataLoader; +using executorch::runtime::Error; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::Result; +using executorch::runtime::internal::PteDataMap; + +class PteDataMapTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + executorch::runtime::runtime_init(); + + // Create a sample Program with only named_data and segments. Technically + // not a valid Program; only used to test the PteDataMap. + // Create named data. 
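+    // Fixture layout (see kSegmentSizes/kSegmentOffsets below): segment 0
+    // holds 17 bytes of 1s at offset 0 and is shared by "key0" and "key2";
+    // segment 1 holds 8 bytes of 2s at offset 32 (kSegmentAlignment * 2);
+    // "key_invalid" deliberately references out-of-range segment_index=10.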
+ std::array, 4> + named_data_arr = { + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "key0", /*segment_index=*/0), + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "key1", /*segment_index=*/1), + // Note: key2 points to the same segment as key0. + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "key2", /*segment_index=*/0), + // This is invalid, as segment_index=10 is out of range when the + // number of segments is 2. + executorch_flatbuffer::CreateNamedDataDirect( + builder_, "key_invalid", /*segment_index=*/10), + }; + const auto named_data = + builder_.CreateVector(named_data_arr.data(), named_data_arr.size()); + + // Create segments. + std::array, 2> + segment_arr = {// @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, /*offset=*/0, /*size=*/kSegmentSizes[0]), + // @lint-ignore CLANGTIDY facebook-hte-BadArgumentComment + executorch_flatbuffer::CreateDataSegment( + builder_, + /*offset=*/kSegmentAlignment * 2, + /*size=*/kSegmentSizes[1])}; + const auto segments = + builder_.CreateVector(segment_arr.data(), segment_arr.size()); + + // Create Program. + const auto program = executorch_flatbuffer::CreateProgram( + builder_, 0, 0, 0, 0, segments, 0, 0, named_data); + + builder_.Finish(program); + program_ = executorch_flatbuffer::GetProgram(builder_.GetBufferPointer()); + + // Create sample segment data. + for (int i = 0; i < kSegmentSizes[0]; i++) { + sample_data_[i] = 1; + } + for (int i = kSegmentOffsets[1]; i < kSegmentOffsets[1] + kSegmentSizes[1]; + i++) { + sample_data_[i] = 2; + } + TempFile tf(sample_data_.data(), sizeof(sample_data_)); + + // Wrap the sample data in a loader. + Result loader = + FileDataLoader::from(tf.path().c_str(), kSegmentAlignment); + ASSERT_EQ(loader.error(), Error::Ok); + data_map_loader_ = + std::make_unique(std::move(loader.get())); + } + + // Program builder constants. + static constexpr int kSegmentAlignment = 16; + static constexpr std::array kSegmentSizes{17, 8}; + static constexpr std::array kSegmentOffsets{0, kSegmentAlignment * 2}; + std::array sample_data_; + + // Program builder. + flatbuffers::FlatBufferBuilder builder_; + const executorch_flatbuffer::Program* program_; + + // Data loader for the sample data. + std::unique_ptr data_map_loader_; +}; + +TEST_F(PteDataMapTest, Load) { + Result data_map = PteDataMap::create( + data_map_loader_.get(), 0, program_->named_data(), program_->segments()); + ASSERT_TRUE(data_map.ok()); +} + +TEST_F(PteDataMapTest, LoadFail) { + Result data_map = PteDataMap::create( + /*loader=*/nullptr, + /*segment_base_offset=*/0, + program_->named_data(), + program_->segments()); + EXPECT_EQ(data_map.error(), Error::InvalidArgument); +} + +TEST_F(PteDataMapTest, UnimplementedMethods) { + Result data_map = PteDataMap::create( + data_map_loader_.get(), 0, program_->named_data(), program_->segments()); + ; + + // Check get_metadata is not implemented. + auto result = data_map->get_metadata("sample_key"); + EXPECT_EQ(result.error(), Error::NotImplemented); + + // Check load_data_into is not implemented. + auto err = data_map->load_data_into("sample_key", nullptr, 0); + EXPECT_EQ(err, Error::NotImplemented); +} + +TEST_F(PteDataMapTest, Keys) { + Result data_map = PteDataMap::create( + data_map_loader_.get(), 0, program_->named_data(), program_->segments()); + ASSERT_TRUE(data_map.ok()); + + // Check get_num_keys. 
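+  // All four entries are enumerable, including "key_invalid": key
+  // enumeration does not validate segment indices; only get_data() does.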
+  auto num_keys = data_map->get_num_keys();
+  EXPECT_EQ(num_keys.error(), Error::Ok);
+  EXPECT_EQ(num_keys.get(), 4);
+
+  // Check get_key.
+  auto key0 = data_map->get_key(0);
+  EXPECT_EQ(strcmp(key0.get(), "key0"), 0);
+  auto key1 = data_map->get_key(1);
+  EXPECT_EQ(strcmp(key1.get(), "key1"), 0);
+  auto key2 = data_map->get_key(2);
+  EXPECT_EQ(strcmp(key2.get(), "key2"), 0);
+
+  // This key is invalid because it points to segment_index=10, which is out
+  // of range for this example, which has only 2 segments.
+  // Note: practically, a PTE should not have invalid keys.
+  auto key_invalid = data_map->get_key(3);
+  EXPECT_EQ(strcmp(key_invalid.get(), "key_invalid"), 0);
+
+  // Returns an error on an out-of-range index.
+  auto nonexistent_key = data_map->get_key(10);
+  EXPECT_EQ(nonexistent_key.error(), Error::InvalidArgument);
+}
+
+TEST_F(PteDataMapTest, GetData) {
+  Result<PteDataMap> data_map = PteDataMap::create(
+      data_map_loader_.get(), 0, program_->named_data(), program_->segments());
+  ASSERT_TRUE(data_map.ok());
+
+  Result<FreeableBuffer> data0 = data_map->get_data("key0");
+  EXPECT_EQ(data0.error(), Error::Ok);
+  EXPECT_EQ(data0.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(data0.get().data(), sample_data_.data(), data0.get().size()), 0);
+
+  Result<FreeableBuffer> data1 = data_map->get_data("key1");
+  EXPECT_EQ(data1.error(), Error::Ok);
+  EXPECT_EQ(data1.get().size(), kSegmentSizes[1]);
+  EXPECT_EQ(
+      memcmp(
+          data1.get().data(),
+          sample_data_.data() + kSegmentOffsets[1],
+          data1.get().size()),
+      0);
+
+  Result<FreeableBuffer> data2 = data_map->get_data("key2");
+  EXPECT_EQ(data2.error(), Error::Ok);
+  // Expect the same values as data0, as key0 and key2 point to the same
+  // segment.
+  EXPECT_EQ(data2.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(data2.get().data(), sample_data_.data(), data2.get().size()), 0);
+
+  // Free data.
+  data0->Free();
+  data1->Free();
+  data2->Free();
+
+  // Returns an error, as key_invalid contains segment_index=10, which
+  // is out of range for segments.size()=2.
+  Result<FreeableBuffer> data_invalid = data_map->get_data("key_invalid");
+  EXPECT_EQ(data_invalid.error(), Error::InvalidArgument);
+
+  // Returns an error on a nonexistent key.
+  Result<FreeableBuffer> data_nonexistent =
+      data_map->get_data("nonexistent_key");
+  EXPECT_EQ(data_nonexistent.error(), Error::NotFound);
+}
+
+TEST_F(PteDataMapTest, FreeAndReload) {
+  // Load a key, free it, and then load it again, and ensure that the
+  // core data map can return a new FreeableBuffer with the same data.
+  Result<PteDataMap> data_map = PteDataMap::create(
+      data_map_loader_.get(), 0, program_->named_data(), program_->segments());
+  ASSERT_TRUE(data_map.ok());
+
+  // Load data0.
+  Result<FreeableBuffer> data0 = data_map->get_data("key0");
+  EXPECT_EQ(data0.error(), Error::Ok);
+  EXPECT_EQ(data0.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(data0.get().data(), sample_data_.data(), data0.get().size()), 0);
+  data0->Free();
+
+  // Reload data0, ensuring that the core data map can return a new
+  // FreeableBuffer with the same data.
+  Result<FreeableBuffer> data0_reload = data_map->get_data("key0");
+  EXPECT_EQ(data0_reload.error(), Error::Ok);
+  EXPECT_EQ(data0_reload.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(
+          data0_reload.get().data(),
+          sample_data_.data(),
+          data0_reload.get().size()),
+      0);
+  data0_reload->Free();
+}
+
+TEST_F(PteDataMapTest, ReloadAndFree) {
+  // Load the same key multiple times, and then free one and ensure that the
+  // data in the other is still valid.
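+  // Each successful get_data() call hands back its own FreeableBuffer, so
+  // freeing one copy below must not invalidate the other.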
+  Result<PteDataMap> data_map = PteDataMap::create(
+      data_map_loader_.get(), 0, program_->named_data(), program_->segments());
+  ASSERT_TRUE(data_map.ok());
+
+  // Load data0.
+  Result<FreeableBuffer> data0 = data_map->get_data("key0");
+  EXPECT_EQ(data0.error(), Error::Ok);
+  EXPECT_EQ(data0.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(data0.get().data(), sample_data_.data(), data0.get().size()), 0);
+
+  // Reload data0.
+  Result<FreeableBuffer> data0_reload = data_map->get_data("key0");
+  EXPECT_EQ(data0_reload.error(), Error::Ok);
+  EXPECT_EQ(data0_reload.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(
+          data0_reload.get().data(),
+          sample_data_.data(),
+          data0_reload.get().size()),
+      0);
+
+  // Free data0 and check that data0_reload is still valid.
+  data0->Free();
+  EXPECT_EQ(data0_reload.get().size(), kSegmentSizes[0]);
+  EXPECT_EQ(
+      memcmp(
+          data0_reload.get().data(),
+          sample_data_.data(),
+          data0_reload.get().size()),
+      0);
+
+  // Free data0_reload.
+  data0_reload->Free();
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index 922fa17ba75..dd5262b5ac6 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -1,4 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
 
 def define_common_targets(is_fbcode = False):
     """Defines targets that should be shared between fbcode and xplat.
@@ -7,7 +7,7 @@ def define_common_targets(is_fbcode = False):
     TARGETS and BUCK files that call this function.
     """
 
-    for aten_mode in (True, False):
+    for aten_mode in get_aten_mode_options():
         aten_suffix = ("_aten" if aten_mode else "")
 
         runtime.cxx_library(
@@ -92,6 +92,19 @@ def define_common_targets(is_fbcode = False):
             ],
         )
 
+    runtime.cxx_test(
+        name = "pte_data_map_test",
+        srcs = [
+            "pte_data_map_test.cpp",
+        ],
+        deps = [
+            "//executorch/extension/data_loader:file_data_loader",
+            "//executorch/extension/testing_util:temp_file",
+            "//executorch/runtime/executor:pte_data_map",
+            "//executorch/schema:program",
+        ],
+    )
+
    # TODO(dbort): Find a way to make these run for ANDROID/APPLE in xplat. The
    # android and ios test determinators don't like the reference to the model
    # file in fbcode.
    # See https://fburl.com/9esapdmd
diff --git a/runtime/executor/test/test_backend_compiler_lib.cpp b/runtime/executor/test/test_backend_compiler_lib.cpp
index 7bfd7689a47..9eea6384d6f 100644
--- a/runtime/executor/test/test_backend_compiler_lib.cpp
+++ b/runtime/executor/test/test_backend_compiler_lib.cpp
@@ -144,10 +144,16 @@ class BackendWithCompiler final : public BackendInterface {
         "Instruction count must be non-negative: %ld",
         instruction_number);
 
-    auto op_list =
-        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, DemoOpList);
-    op_list->ops = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-        runtime_allocator, DemoOp, instruction_number);
+    auto op_list = runtime_allocator->allocateInstance<DemoOpList>();
+    if (op_list == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
+    op_list->ops = runtime_allocator->allocateList<DemoOp>(instruction_number);
+    if (op_list->ops == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
+    op_list->numops = static_cast<size_t>(instruction_number);
 
     parse_delegate(instruction_set_start + 1, kSignLiteral, op_list->ops);
diff --git a/runtime/executor/test/test_backend_with_delegate_mapping.cpp b/runtime/executor/test/test_backend_with_delegate_mapping.cpp
index ead99c1305a..e6d84aca189 100644
--- a/runtime/executor/test/test_backend_with_delegate_mapping.cpp
+++ b/runtime/executor/test/test_backend_with_delegate_mapping.cpp
@@ -70,8 +70,11 @@ class BackendWithDelegateMapping final : public BackendInterface {
       }
 
       if (op_name != nullptr && delegate_debug_identifier != nullptr) {
-        char* op_name_mem = (char*)ET_ALLOCATE_OR_RETURN_ERROR(
-            runtime_allocator, strlen(op_name) + 1);
+        char* op_name_mem =
+            (char*)runtime_allocator->allocate(strlen(op_name) + 1);
+        if (op_name_mem == nullptr) {
+          return Error::MemoryAllocationFailed;
+        }
         memcpy(op_name_mem, op_name, strlen(op_name) + 1);
         op_list->ops[num_ops].name = op_name_mem;
         op_list->ops[num_ops].debug_handle = atoi(delegate_debug_identifier);
@@ -106,10 +109,16 @@ class BackendWithDelegateMapping final : public BackendInterface {
         "Instruction count must be non-negative: %ld",
         instruction_number);
 
-    auto op_list =
-        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, DemoOpList);
-    op_list->ops = ET_ALLOCATE_LIST_OR_RETURN_ERROR(
-        runtime_allocator, DemoOp, instruction_number);
+    auto op_list = runtime_allocator->allocateInstance<DemoOpList>();
+    if (op_list == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
+    op_list->ops = runtime_allocator->allocateList<DemoOp>(instruction_number);
+    if (op_list->ops == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
+    op_list->numops = static_cast<size_t>(instruction_number);
 
     Error error =
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index b51c2567f0a..85705e5b3fd 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -79,7 +79,7 @@ Error register_kernels_internal(const Span<const Kernel> kernels) {
   for (const auto& kernel : kernels) {
     // Linear search. This is fine if the number of kernels is small.
-    for (int32_t i = 0; i < num_registered_kernels; i++) {
+    for (size_t i = 0; i < num_registered_kernels; i++) {
       Kernel k = registered_kernels[i];
       if (strcmp(kernel.name_, k.name_) == 0 &&
           kernel.kernel_key_ == k.kernel_key_) {
@@ -188,7 +188,7 @@ Error make_kernel_key_string(
   buf_size -= 1;
   // Add dim order.
-  for (int j = 0; j < meta.dim_order_.size(); j++) {
+  for (size_t j = 0; j < meta.dim_order_.size(); j++) {
     n = copy_char_as_number_to_buf((int)meta.dim_order_[j], buf, buf_size);
     if (n < 0) {
       return Error::InvalidArgument;
diff --git a/runtime/kernel/operator_registry.h b/runtime/kernel/operator_registry.h
index 82815852e6f..a3cdcd66cee 100644
--- a/runtime/kernel/operator_registry.h
+++ b/runtime/kernel/operator_registry.h
@@ -26,17 +26,17 @@
 
 #define ET_LOG_KERNEL_KEY(k) \
   ET_LOG(                    \
-      Error,                 \
+      Info,                  \
       "key: %s, is_fallback: %s", \
       k.data(),                   \
       k.is_fallback() ? "true" : "false");
 
-#define ET_LOG_TENSOR_META(meta_list) \
-  for (const auto& meta : meta_list) { \
-    ET_LOG(Error, "dtype: %d | dim order: [", int(meta.dtype_)); \
-    for (int i = 0; i < meta.dim_order_.size(); i++) { \
-      ET_LOG(Error, "%d,", static_cast<int>(meta.dim_order_[i])); \
-    } \
-    ET_LOG(Error, "]"); \
+#define ET_LOG_TENSOR_META(meta_list) \
+  for (const auto& meta : meta_list) { \
+    ET_LOG(Info, "dtype: %d | dim order: [", int(meta.dtype_)); \
+    for (size_t i = 0; i < meta.dim_order_.size(); i++) { \
+      ET_LOG(Info, "%d,", static_cast<int>(meta.dim_order_[i])); \
+    } \
+    ET_LOG(Info, "]"); \
   }
 
 namespace executorch {
@@ -74,7 +74,7 @@ struct TensorMeta {
     if (dim_order_.size() != other.dim_order_.size()) {
       return false;
     }
-    for (int i = 0; i < dim_order_.size(); i++) {
+    for (size_t i = 0; i < dim_order_.size(); i++) {
       if (dim_order_[i] != other.dim_order_[i]) {
         return false;
       }
diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl
index ed013260a9a..0726752d3dd 100644
--- a/runtime/kernel/targets.bzl
+++ b/runtime/kernel/targets.bzl
@@ -1,4 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
 
 def _operator_registry_preprocessor_flags():
     max_kernel_num = native.read_config("executorch", "max_kernel_num", None)
@@ -7,8 +7,9 @@ def _operator_registry_preprocessor_flags():
     elif not runtime.is_oss:
         return select({
             "DEFAULT": [],
-            "fbsource//xplat/executorch/build/constraints:executorch-max-kernel-num-256": ["-DMAX_KERNEL_NUM=256"],
-            "fbsource//xplat/executorch/build/constraints:executorch-max-kernel-num-64": ["-DMAX_KERNEL_NUM=64"],
+            "fbsource//xplat/executorch/tools/buck/constraints:executorch-max-kernel-num-256": ["-DMAX_KERNEL_NUM=256"],
+            "fbsource//xplat/executorch/tools/buck/constraints:executorch-max-kernel-num-128": ["-DMAX_KERNEL_NUM=128"],
+            "fbsource//xplat/executorch/tools/buck/constraints:executorch-max-kernel-num-64": ["-DMAX_KERNEL_NUM=64"],
         })
     else:
         return []
@@ -50,7 +51,21 @@ def define_common_targets():
         preprocessor_flags = ["-DMAX_KERNEL_NUM=1"],
     )
 
-    for aten_mode in (True, False):
+    runtime.cxx_library(
+        name = "thread_parallel_interface",
+        exported_headers = ["thread_parallel_interface.h"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/core/portable_type/c10/c10:c10",
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    for aten_mode in get_aten_mode_options():
         aten_suffix = "_aten" if aten_mode else ""
 
         runtime.cxx_library(
diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt
index 4e8c24776f1..9ff47fbefd5 100644
--- a/runtime/kernel/test/CMakeLists.txt
+++ b/runtime/kernel/test/CMakeLists.txt
@@ -16,7 +16,7 @@
 cmake_minimum_required(VERSION 3.19)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
-include(${EXECUTORCH_ROOT}/build/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 add_executable(operator_registry_test operator_registry_test.cpp)
 target_link_libraries(
diff --git a/runtime/kernel/test/targets.bzl b/runtime/kernel/test/targets.bzl
index 96e0c8c557c..bd66fc05b6f 100644
--- a/runtime/kernel/test/targets.bzl
+++ b/runtime/kernel/test/targets.bzl
@@ -1,4 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
 load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib")
 
 def define_common_targets():
@@ -88,7 +88,7 @@ def define_common_targets():
         ],
     )
 
-    for aten_mode in (True, False):
+    for aten_mode in get_aten_mode_options():
         aten_suffix = "_aten" if aten_mode else ""
 
         runtime.cxx_test(
diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h
new file mode 100644
index 00000000000..dcb8f3bd452
--- /dev/null
+++ b/runtime/kernel/thread_parallel_interface.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace executorch {
+namespace extension {
+namespace internal {
+template <typename Func>
+inline bool parallel_for_no_threadpool(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& f) {
+  ET_CHECK_OR_RETURN_FALSE(
+      begin >= 0 && end >= 0 && end >= begin,
+      "begin = %" PRId64 ", end = %" PRId64,
+      begin,
+      end);
+  ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size);
+#ifndef NDEBUG
+  // Go backwards through the range elementwise to catch code that
+  // assumes parallel_for is in order like a regular for loop.
+  for (const auto i : c10::irange(begin, end)) {
+    const auto offset = i - begin;
+    const auto idx = end - offset - 1;
+    f(idx, idx + 1);
+  }
+#else // NDEBUG
+  f(begin, end);
+#endif
+  return true;
+}
+
+// Match GRAIN_SIZE from PyTorch core.
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78
+constexpr int64_t GRAIN_SIZE = 32768;
+} // namespace internal
+
+#ifdef ET_USE_THREADPOOL
+/**
+ * A helper to run a function in parallel.
+ *
+ * begin, end: describe the extent of the workitems via the first and last
+ * workitem to be processed
+ * grain_size: number of workitems processed by the user callback, which is
+ * described below
+ * f: user function applied in parallel to the chunks, signature:
+ *   void f(int64_t begin, int64_t end)
+ * Returns true if all work items are processed successfully, false otherwise
+ *
+ * Warning: parallel_for does NOT copy thread-local states from the current
+ * thread to the worker threads. Users need to protect the access to captured
+ * data if they mutate them in f.
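+ *
+ * Illustrative usage sketch (not part of this header; `data` and `n` stand
+ * in for a caller-owned buffer and its length). Note the callback must not
+ * assume in-order execution; an order-independent reduction is safe:
+ *
+ *   std::atomic<int64_t> total{0};
+ *   executorch::extension::parallel_for(
+ *       0, n, executorch::extension::internal::GRAIN_SIZE,
+ *       [&](int64_t begin, int64_t end) {
+ *         int64_t partial = 0;
+ *         for (int64_t i = begin; i < end; ++i) {
+ *           partial += data[i];
+ *         }
+ *         total += partial;
+ *       });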
+ */
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const std::function<void(int64_t, int64_t)>& f);
+
+int64_t get_thread_num();
+
+void set_thread_num(int64_t thread_num);
+#else // ET_USE_THREADPOOL
+template <typename Func>
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& func) {
+  return internal::parallel_for_no_threadpool(begin, end, grain_size, func);
+}
+
+inline int64_t get_thread_num() {
+  return 0;
+}
+
+inline void set_thread_num(int64_t thread_num) {
+  ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!");
+}
+#endif // ET_USE_THREADPOOL
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::get_thread_num;
+using ::executorch::extension::parallel_for;
+using ::executorch::extension::set_thread_num;
+} // namespace executor
+} // namespace torch
diff --git a/runtime/platform/compat_unistd.h b/runtime/platform/compat_unistd.h
new file mode 100644
index 00000000000..c8bc4866702
--- /dev/null
+++ b/runtime/platform/compat_unistd.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * unistd.h related macros for POSIX/Windows compatibility.
+ */
+#pragma once
+
+#if defined(_WIN32) && !defined(_WIN64)
+#error \
+    "You're trying to build ExecuTorch with a too old version of Windows. We need Windows 64-bit."
+#endif
+
+#if !defined(_WIN64)
+#include <unistd.h>
+#else
+#include <io.h>
+#define O_RDONLY _O_RDONLY
+#define open _open
+#define close _close
+#define read _read
+#define write _write
+#define stat _stat64
+#define fstat _fstat64
+#define off_t _off_t
+#define lseek _lseeki64
+
+#include // For ssize_t.
+#include <windows.h>
+// To avoid conflicts with std::numeric_limits::max() in
+// file_data_loader.cpp.
+#undef max
+
+inline ssize_t pread(int fd, void* buf, size_t nbytes, size_t offset) {
+  OVERLAPPED overlapped; /* The offset for ReadFile. */
+  memset(&overlapped, 0, sizeof(overlapped));
+  overlapped.Offset = offset;
+  overlapped.OffsetHigh = offset >> 32;
+
+  BOOL result; /* The result of ReadFile. */
+  DWORD bytes_read; /* The number of bytes read. */
+  HANDLE file = (HANDLE)_get_osfhandle(fd);
+
+  result = ReadFile(file, buf, nbytes, &bytes_read, &overlapped);
+  DWORD error = GetLastError();
+  if (!result) {
+    if (error == ERROR_IO_PENDING) {
+      result = GetOverlappedResult(file, &overlapped, &bytes_read, TRUE);
+      if (!result) {
+        error = GetLastError();
+      }
+    }
+  }
+  if (!result) {
+    // Translate error into errno.
+    switch (error) {
+      case ERROR_HANDLE_EOF:
+        errno = 0;
+        break;
+      default:
+        errno = EIO;
+        break;
+    }
+    return -1;
+  }
+  return bytes_read;
+}
+
+#endif // !defined(_WIN64)
\ No newline at end of file
diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp
index 8807a62516e..837ffd02833 100644
--- a/runtime/platform/default/posix.cpp
+++ b/runtime/platform/default/posix.cpp
@@ -75,6 +75,9 @@ static bool initialized = false;
 * This function should be called before any other function provided by the PAL
 * to initialize any global state. Typically overridden by the PAL implementer.
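 *
 * A minimal override sketch in user code (illustrative only; uart_init() is
 * a hypothetical board bring-up routine a PAL implementer might call):
 *
 *   void et_pal_init(void) {
 *     uart_init();
 *   }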
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_init
+#endif // _MSC_VER
 void et_pal_init(void) {
   if (initialized) {
     return;
@@ -88,6 +91,9 @@ void et_pal_init(void) {
 * Immediately abort execution, setting the device into an error state, if
 * available.
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_abort
+#endif // _MSC_VER
 ET_NORETURN void et_pal_abort(void) {
   std::abort();
 }
@@ -97,6 +103,9 @@
 *
 * @retval Timestamp value in system ticks.
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_current_ticks
+#endif // _MSC_VER
 et_timestamp_t et_pal_current_ticks(void) {
   _ASSERT_PAL_INITIALIZED();
   auto systemCurrentTime = std::chrono::steady_clock::now();
@@ -113,6 +122,9 @@ et_timestamp_t et_pal_current_ticks(void) {
 *
 * @retval The ratio of nanoseconds to system ticks.
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_ticks_to_ns_multiplier
+#endif // _MSC_VER
 et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
   // The system tick interval is 1 nanosecond, so the conversion factor is 1.
   return {1, 1};
@@ -130,6 +142,9 @@ et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
 * @param[in] message Message string to log.
 * @param[in] length Message string length.
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_emit_log_message
+#endif // _MSC_VER
 void et_pal_emit_log_message(
     et_timestamp_t timestamp,
     et_pal_log_level_t level,
@@ -181,6 +196,9 @@ void et_pal_emit_log_message(
 * @returns the allocated memory, or nullptr on failure. Must be freed using
 * et_pal_free().
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_allocate
+#endif // _MSC_VER
 void* et_pal_allocate(size_t size) {
   return malloc(size);
 }
@@ -190,6 +208,9 @@ void* et_pal_allocate(size_t size) {
 *
 * @param[in] ptr Pointer to memory to free. May be nullptr.
 */
+#ifdef _MSC_VER
+#pragma weak et_pal_free
+#endif // _MSC_VER
 void et_pal_free(void* ptr) {
   free(ptr);
 }
diff --git a/runtime/platform/log.cpp b/runtime/platform/log.cpp
index c1ad6ddcc0d..6529c73b238 100644
--- a/runtime/platform/log.cpp
+++ b/runtime/platform/log.cpp
@@ -92,8 +92,7 @@ void vlogf(
   }
   buf[kMaxLogMessageLength - 1] = 0;
 
-  et_pal_log_level_t pal_level =
-      (int(level) >= 0 && level < LogLevel::NumLevels)
+  et_pal_log_level_t pal_level = (level < LogLevel::NumLevels)
       ? kLevelToPal[size_t(level)]
       : et_pal_log_level_t::kUnknown;
 
diff --git a/runtime/platform/log.h b/runtime/platform/log.h
index 9ad234b2520..72ea8528442 100644
--- a/runtime/platform/log.h
+++ b/runtime/platform/log.h
@@ -33,6 +33,15 @@
 #define ET_LOG_ENABLED 1
 #endif // !defined(ET_LOG_ENABLED)
 
+// Even though it is supposed to be "portable", some toolchains do not define
+// these macros, so provide fallback definitions here.
+#ifndef PRIu64
+#define PRIu64 "llu"
+#endif
+#ifndef PRId64
+#define PRId64 "lld"
+#endif
+
 namespace executorch {
 namespace runtime {
 
diff --git a/runtime/platform/profiler.cpp b/runtime/platform/profiler.cpp
index 2f514286aa1..21f68963c78 100644
--- a/runtime/platform/profiler.cpp
+++ b/runtime/platform/profiler.cpp
@@ -129,7 +129,8 @@ void track_allocation(int32_t id, uint32_t size) {
 uint32_t track_allocator(const char* name) {
   ET_CHECK_MSG(
       prof_header->allocator_entries < MEM_PROFILE_MAX_ALLOCATORS,
-      "Out of allocator tracking space, %d is needed. Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile",
+      "Out of allocator tracking space, %" PRIu32
+      " is needed.
Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile", prof_header->allocator_entries); size_t str_len = strlen(name); size_t num_allocators = prof_header->allocator_entries; @@ -151,7 +152,8 @@ void profiling_create_block(const char* name) { num_blocks += 1; ET_CHECK_MSG( num_blocks <= MAX_PROFILE_BLOCKS, - "Only %d blocks are supported and they've all been used up but %d is used. Increment MAX_PROFILE_BLOCKS and re-run", + "Only %d blocks are supported and they've all been used up but %" PRIu32 + " is used. Increment MAX_PROFILE_BLOCKS and re-run", MAX_PROFILE_BLOCKS, num_blocks); } diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl index 42bb851e2cf..68322ffe97f 100644 --- a/runtime/platform/targets.bzl +++ b/runtime/platform/targets.bzl @@ -68,6 +68,7 @@ def define_common_targets(): "log.h", "profiler.h", "runtime.h", + "compat_unistd.h", ], srcs = [ "abort.cpp", diff --git a/runtime/platform/test/CMakeLists.txt b/runtime/platform/test/CMakeLists.txt index e96953b0552..0afffaaabf0 100644 --- a/runtime/platform/test/CMakeLists.txt +++ b/runtime/platform/test/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) -include(${EXECUTORCH_ROOT}/build/Test.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) et_cxx_test(platform_test SOURCES executor_pal_test.cpp) diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt index 64f8821da1e..484363acdf5 100644 --- a/schema/CMakeLists.txt +++ b/schema/CMakeLists.txt @@ -9,10 +9,6 @@ # cmake-format -i CMakeLists.txt # ~~~ -if(NOT FLATC_EXECUTABLE) - set(FLATC_EXECUTABLE flatc) -endif() - # The include directory that will contain the generated schema headers. set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include") set(_program_schema__output_dir "${_program_schema__include_dir}/executorch/schema") @@ -37,7 +33,7 @@ function(generate_program_schema _schema_srcs _schema_name) ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o "${_program_schema__output_dir}" ${_schema_srcs} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs} + DEPENDS flatc ${_schema_srcs} COMMENT "Generating ${_schema_name} headers" VERBATIM ) @@ -49,7 +45,7 @@ function(generate_program_schema _schema_srcs _schema_name) # and some users need an alignment larger than the default, which is typically # 32. target_compile_definitions( - ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=1024 + ${_schema_name} INTERFACE FLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT} ) target_include_directories( diff --git a/schema/extended_header.cpp b/schema/extended_header.cpp index fdc463207ba..3236b040c49 100644 --- a/schema/extended_header.cpp +++ b/schema/extended_header.cpp @@ -14,8 +14,6 @@ #include #include -#pragma clang diagnostic ignored "-Wdeprecated" - namespace executorch { namespace runtime { diff --git a/schema/program.fbs b/schema/program.fbs index 7ab2175f8ac..7308cc63199 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -431,6 +431,17 @@ table SubsegmentOffsets { offsets: [uint64]; } +// Attributes a name to data referenced by Program.segments. Used when data is +// referenced by multiple users, in cases where indices are not guaranteed to +// be consistent across the users. +table NamedData { + // The unique id of the data blob. + key: string; + + // Index of the segment in Program.segments. + segment_index: uint32; +} + table Program { // Schema version. 
 version: uint;
@@ -468,6 +479,11 @@ table Program {
   // constant memory, copying it over, and then being unable to release the
   // constant segment. No two elements should point to the same segment.
   mutable_data_segments: [SubsegmentOffsets];
+
+  // [Optional] List of blobs keyed by a unique name. Note that multiple
+  // 'NamedData' entries could point to the same segment index. Stored in
+  // segments attached to the PTE file.
+  named_data: [NamedData];
 }
 
 root_type Program;
diff --git a/schema/targets.bzl b/schema/targets.bzl
index 40c6d8d5c8d..c0036c7500a 100644
--- a/schema/targets.bzl
+++ b/schema/targets.bzl
@@ -78,6 +78,10 @@ def define_common_targets():
                 # //executorch/runtime/executor/...
                 "//executorch/codegen/tools/...",
                 "//executorch/runtime/executor/...",
+                # Tests have a setup which uses raw flatbuffers.
+                # TODO: refactor these setup steps into
+                # testing utils in the runtime/executor/... path.
+                "//executorch/backends/xnnpack/test/...",
             ],
             exported_headers = {
                 OUTPUT_PROGRAM_HEADER: ":{}[{}]".format(PROGRAM_GEN_RULE_NAME, OUTPUT_PROGRAM_HEADER),
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/scripts/build_android_library.sh b/scripts/build_android_library.sh
new file mode 100755
index 00000000000..8a385ad6876
--- /dev/null
+++ b/scripts/build_android_library.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+which "${PYTHON_EXECUTABLE}"
+
+copy_src() {
+  cp -r extension/android/build.gradle extension/android/settings.gradle extension/android/gradlew extension/android/gradle extension/android/gradlew.bat extension/android/gradle.properties "${BUILD_AAR_DIR}"
+  cp -r extension/android/executorch_android "${BUILD_AAR_DIR}/executorch_android"
+}
+
+build_android_native_library() {
+  ANDROID_ABI="$1"
+  ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}"
+  CMAKE_OUT="cmake-out-android-${ANDROID_ABI}"
+  EXECUTORCH_CMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE:-Release}"
+  QNN_SDK_ROOT="${QNN_SDK_ROOT:-}"
+  if [ -n "$QNN_SDK_ROOT" ]; then
+    EXECUTORCH_BUILD_QNN=ON
+  else
+    EXECUTORCH_BUILD_QNN=OFF
+  fi
+
+  NEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB:-}"
+  NEURON_USDK_ADAPTER_LIB="${NEURON_USDK_ADAPTER_LIB:-}"
+  if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ]; then
+    EXECUTORCH_BUILD_NEURON=ON
+  else
+    EXECUTORCH_BUILD_NEURON=OFF
+  fi
+
+  EXECUTORCH_BUILD_VULKAN="${EXECUTORCH_BUILD_VULKAN:-OFF}"
+
+  cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI="${ANDROID_ABI}" \ + -DANDROID_PLATFORM=android-26 \ + -DBUILD_TESTING=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM="${EXECUTORCH_BUILD_EXTENSION_LLM:-ON}" \ + -DEXECUTORCH_BUILD_NEURON="${EXECUTORCH_BUILD_NEURON}" \ + -DNEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB}" \ + -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ + -DEXECUTORCH_BUILD_VULKAN="${EXECUTORCH_BUILD_VULKAN}" \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ + -B"${CMAKE_OUT}" + + if [ "$(uname)" == "Darwin" ]; then + CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) + else + CMAKE_JOBS=$(( $(nproc) - 1 )) + fi + cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" + + # Update tokenizers submodule + pushd extension/llm/tokenizers + echo "Update tokenizers submodule" + git submodule update --init + popd + cmake extension/android \ + -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI="${ANDROID_ABI}" \ + -DANDROID_PLATFORM=android-26 \ + -DBUILD_TESTING=OFF \ + -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ + -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM="${EXECUTORCH_BUILD_EXTENSION_LLM:-ON}" \ + -DEXECUTORCH_BUILD_LLAMA_JNI="${EXECUTORCH_BUILD_EXTENSION_LLM:-ON}" \ + -DCMAKE_BUILD_TYPE="${EXECUTORCH_CMAKE_BUILD_TYPE}" \ + -B"${CMAKE_OUT}"/extension/android + + cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}" + + # Copy artifacts to ABI specific directory + mkdir -p "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}" + cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + + # Copy QNN related so library + if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so 
"${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/" + fi + + # Copy MTK related so library + if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ + cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ + cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/ + fi +} + +build_aar() { + pushd "${BUILD_AAR_DIR}" + # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency + # between Java and JNI + find . -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; + if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then + find . -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \; + fi + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build + cp executorch_android/build/outputs/aar/executorch_android-debug.aar executorch.aar + popd +} + +main() { + if [[ -z "${BUILD_AAR_DIR:-}" ]]; then + BUILD_AAR_DIR="$(mktemp -d)" + fi + export BUILD_AAR_DIR + if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") + fi + export ANDROID_ABIS + + copy_src + for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do + build_android_native_library ${ANDROID_ABI} + done + build_aar +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/build/build_apple_frameworks.sh b/scripts/build_apple_frameworks.sh similarity index 63% rename from build/build_apple_frameworks.sh rename to scripts/build_apple_frameworks.sh index 4d793ace0ab..c473d13cf50 100755 --- a/build/build_apple_frameworks.sh +++ b/scripts/build_apple_frameworks.sh @@ -9,10 +9,9 @@ set -euo pipefail SOURCE_ROOT_DIR="" OUTPUT="cmake-out" -MODE="Release" +MODES=() TOOLCHAIN="" PYTHON=$(which python3) -FLATC=$(which flatc) COREML=OFF CUSTOM=OFF MPS=OFF @@ -46,6 +45,7 @@ libmpsdelegate.a,\ FRAMEWORK_BACKEND_XNNPACK="backend_xnnpack:\ libXNNPACK.a,\ libcpuinfo.a,\ +libextension_threadpool.a,\ libpthreadpool.a,\ libxnnpack_backend.a,\ libmicrokernels-prod.a,\ @@ -56,6 +56,7 @@ libcustom_ops.a,\ :" FRAMEWORK_KERNELS_OPTIMIZED="kernels_optimized:\ +libcpublas.a,\ liboptimized_kernels.a,\ liboptimized_native_cpu_ops_lib.a,\ :" @@ -77,20 +78,20 @@ usage() { echo echo "Options:" echo " --output=DIR Output directory. Default: 'cmake-out'" - echo " --Debug Use Debug build mode. Default: Uses Release build mode." - echo " --toolchain=FILE Cmake toolchain file. Default: '\$SOURCE_ROOT_DIR/third-party/ios-cmake/ios.toolchain.cmake'" - echo " --python=FILE Python executable path. Default: Path of python3 found in the current \$PATH" - echo " --flatc=FILE FlatBuffers Compiler executable path. Default: Path of flatc found in the current \$PATH" - echo " --coreml Include this flag to build the Core ML backend." - echo " --custom Include this flag to build the Custom kernels." - echo " --mps Include this flag to build the Metal Performance Shaders backend." - echo " --optimized Include this flag to build the Optimized kernels." - echo " --portable Include this flag to build the Portable kernels." - echo " --quantized Include this flag to build the Quantized kernels." 
- echo " --xnnpack Include this flag to build the XNNPACK backend." + echo " --Debug Build Debug version." + echo " --Release Build Release version." + echo " --toolchain=FILE CMake toolchain file. Default: '\$SOURCE_ROOT_DIR/third-party/ios-cmake/ios.toolchain.cmake'" + echo " --python=FILE Python executable path. Default: Path of python3 in \$PATH" + echo " --coreml Build the Core ML backend." + echo " --custom Build the Custom kernels." + echo " --mps Build the Metal Performance Shaders backend." + echo " --optimized Build the Optimized kernels." + echo " --portable Build the Portable kernels." + echo " --quantized Build the Quantized kernels." + echo " --xnnpack Build the XNNPACK backend." echo echo "Example:" - echo " $0 /path/to/source/root --output=cmake-out --toolchain=/path/to/cmake/toolchain --python=/path/to/python3 --coreml --mps --xnnpack" + echo " $0 /path/to/source/root --output=cmake-out --toolchain=/path/to/toolchain --python=/path/to/python3 --coreml --mps --xnnpack" exit 0 } @@ -98,10 +99,18 @@ for arg in "$@"; do case $arg in -h|--help) usage ;; --output=*) OUTPUT="${arg#*=}" ;; - --Debug) MODE="Debug" ;; + --Release) + if [[ ! " ${MODES[*]:-} " =~ \bRelease\b ]]; then + MODES+=("Release") + fi + ;; + --Debug) + if [[ ! " ${MODES[*]:-} " =~ \bDebug\b ]]; then + MODES+=("Debug") + fi + ;; --toolchain=*) TOOLCHAIN="${arg#*=}" ;; --python=*) PYTHON="${arg#*=}" ;; - --flatc=*) FLATC="${arg#*=}" ;; --coreml) COREML=ON ;; --custom) CUSTOM=ON ;; --mps) MPS=ON ;; @@ -120,6 +129,10 @@ for arg in "$@"; do esac done +if [ ${#MODES[@]} -eq 0 ]; then + MODES=("Release") +fi + if [[ -z "$SOURCE_ROOT_DIR" ]]; then SOURCE_ROOT_DIR=$(pwd) fi @@ -129,14 +142,30 @@ if [[ -z "$TOOLCHAIN" ]]; then fi [[ -f "$TOOLCHAIN" ]] || { echo >&2 "Toolchain file $TOOLCHAIN does not exist."; exit 1; } +BUCK2=$("$PYTHON" "$SOURCE_ROOT_DIR/tools/cmake/resolve_buck.py" --cache_dir="$SOURCE_ROOT_DIR/buck2-bin") + +if [[ "$BUCK2" == "buck2" ]]; then + BUCK2=$(command -v buck2) +fi + check_command() { - command -v "$1" >/dev/null 2>&1 || { echo >&2 "$1 is not installed"; exit 1; } + if [[ "$1" == */* ]]; then + if [[ ! -x "$1" ]]; then + echo "Error: Command not found or not executable at '$1'" >&2 + exit 1 + fi + else + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "Error: Command '$1' not found in PATH" >&2 + exit 1 + fi + fi } check_command cmake check_command rsync check_command "$PYTHON" -check_command "$FLATC" +check_command "$BUCK2" echo "Building libraries" @@ -146,18 +175,17 @@ cmake_build() { local platform=$1 local platform_flag=$2 local platform_target=$3 - echo "Building for $platform with flag $platform_flag" - mkdir "$platform" && cd "$platform" || exit 1 + local mode=$4 + echo "Building for $platform ($mode) with flag $platform_flag" + mkdir -p "$platform" && cd "$platform" || exit 1 cmake "$SOURCE_ROOT_DIR" -G Xcode \ - -DCMAKE_BUILD_TYPE="$MODE" \ - -DCMAKE_PREFIX_PATH="$($PYTHON -c 'import torch as _; print(_.__path__[0])')" \ + -DCMAKE_BUILD_TYPE="$mode" \ -DCMAKE_TOOLCHAIN_FILE="$TOOLCHAIN" \ -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD="c++17" \ -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY="libc++" \ -DCMAKE_C_FLAGS="-ffile-prefix-map=$SOURCE_ROOT_DIR=/executorch -fdebug-prefix-map=$SOURCE_ROOT_DIR=/executorch" \ -DCMAKE_CXX_FLAGS="-ffile-prefix-map=$SOURCE_ROOT_DIR=/executorch -fdebug-prefix-map=$SOURCE_ROOT_DIR=/executorch" \ -DPYTHON_EXECUTABLE="$PYTHON" \ - -DFLATC_EXECUTABLE="$FLATC" \ -DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ @@ -174,28 +202,22 @@ cmake_build() { ${platform_target:+-DDEPLOYMENT_TARGET=$platform_target} \ --log-level=VERBOSE cmake --build . \ - --config $MODE \ + --config "$mode" \ --verbose cd .. } for index in ${!PLATFORMS[*]}; do - cmake_build "${PLATFORMS[$index]}" "${PLATFORM_FLAGS[$index]}" "${PLATFORM_TARGET[$index]}" + for mode in "${MODES[@]}"; do + cmake_build "${PLATFORMS[$index]}" "${PLATFORM_FLAGS[$index]}" "${PLATFORM_TARGET[$index]}" "$mode" + done done echo "Exporting headers" mkdir -p "$HEADERS_PATH" -BUCK2=$(find $SOURCE_ROOT_DIR -type f -path '*/buck2-bin/buck2-*' | head -n 1) -if [[ -z "$BUCK2" ]]; then - echo "Could not find buck2 executable in any buck2-bin directory under $SOURCE_ROOT_DIR" - BUCK2=$(which buck2) -fi - -check_command "$BUCK2" - -"$SOURCE_ROOT_DIR"/build/print_exported_headers.py --buck2=$(realpath "$BUCK2") --targets \ +"$SOURCE_ROOT_DIR"/scripts/print_exported_headers.py --buck2=$(realpath "$BUCK2") --targets \ //extension/module: \ //extension/tensor: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" @@ -207,42 +229,51 @@ check_command "$BUCK2" # So, just patch our generated framework to do that. 
sed -i '' '1i\ #define C10_USING_CUSTOM_GENERATED_MACROS -' $HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h -sed -i '' '1i\ -#define C10_USING_CUSTOM_GENERATED_MACROS -' $HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10/macros/Export.h -cp -r $HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10 "$HEADERS_PATH/" +' \ +"$HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h" \ +"$HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10/macros/Export.h" +cp -r $HEADERS_PATH/executorch/runtime/core/portable_type/c10/c10 "$HEADERS_PATH/" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.modulemap "$HEADERS_PATH" echo "Creating frameworks" -for platform in "${PLATFORMS[@]}"; do - echo "Directory: $platform/$MODE" - FRAMEWORK_FLAGS+=("--directory=$platform/$MODE") -done - append_framework_flag() { local flag="$1" local framework="$2" + local mode="${3:-}" if [[ $flag == ON ]]; then + if [[ -n "$mode" && "$mode" != "Release" ]]; then + local name spec + name=$(echo "$framework" | cut -d: -f1) + spec=$(echo "$framework" | cut -d: -f2-) + framework="${name}_$(echo "$mode" | tr '[:upper:]' '[:lower:]'):${spec}" + fi echo "Framework: $framework" FRAMEWORK_FLAGS+=("--framework=$framework") fi } -append_framework_flag "ON" "$FRAMEWORK_EXECUTORCH" -append_framework_flag "$COREML" "$FRAMEWORK_BACKEND_COREML" -append_framework_flag "$MPS" "$FRAMEWORK_BACKEND_MPS" -append_framework_flag "$XNNPACK" "$FRAMEWORK_BACKEND_XNNPACK" -append_framework_flag "$CUSTOM" "$FRAMEWORK_KERNELS_CUSTOM" -append_framework_flag "$OPTIMIZED" "$FRAMEWORK_KERNELS_OPTIMIZED" -append_framework_flag "$PORTABLE" "$FRAMEWORK_KERNELS_PORTABLE" -append_framework_flag "$QUANTIZED" "$FRAMEWORK_KERNELS_QUANTIZED" - -"$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}" +for mode in "${MODES[@]}"; do + FRAMEWORK_FLAGS=() + for platform in "${PLATFORMS[@]}"; do + echo "Directory: $platform/$mode" + FRAMEWORK_FLAGS+=("--directory=$platform/$mode") + done + + append_framework_flag "ON" "$FRAMEWORK_EXECUTORCH" "$mode" + append_framework_flag "$COREML" "$FRAMEWORK_BACKEND_COREML" "$mode" + append_framework_flag "$MPS" "$FRAMEWORK_BACKEND_MPS" "$mode" + append_framework_flag "$XNNPACK" "$FRAMEWORK_BACKEND_XNNPACK" "$mode" + append_framework_flag "$CUSTOM" "$FRAMEWORK_KERNELS_CUSTOM" "$mode" + append_framework_flag "$OPTIMIZED" "$FRAMEWORK_KERNELS_OPTIMIZED" "$mode" + append_framework_flag "$PORTABLE" "$FRAMEWORK_KERNELS_PORTABLE" "$mode" + append_framework_flag "$QUANTIZED" "$FRAMEWORK_KERNELS_QUANTIZED" "$mode" + + "$SOURCE_ROOT_DIR"/scripts/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}" +done echo "Cleaning up" diff --git a/build/build_apple_llm_demo.sh b/scripts/build_apple_llm_demo.sh similarity index 100% rename from build/build_apple_llm_demo.sh rename to scripts/build_apple_llm_demo.sh diff --git a/build/create_frameworks.sh b/scripts/create_frameworks.sh similarity index 96% rename from build/create_frameworks.sh rename to scripts/create_frameworks.sh index a55c4aed1e7..804a5156e0b 100755 --- a/build/create_frameworks.sh +++ b/scripts/create_frameworks.sh @@ -76,8 +76,8 @@ create_xcframework() { fi local dir_suffix - dir_suffix=$(echo "$dir" | tr '[:upper:]' '[:lower:]' | sed 's/\//-/g') - local merged_lib="${output}/lib${target_library_name}-${dir_suffix}.a" + dir_suffix=$(echo "$dir" | cut -d'/' -f1 | tr '[:upper:]' '[:lower:]' | sed 's/[\/\.~]/_/g') + local 
merged_lib="${output}/lib${target_library_name}_${dir_suffix}.a"
 
   # Remove the existing .a file if it exists.
   if [ -f "${merged_lib}" ]; then
diff --git a/build/pick_doc_commits.py b/scripts/pick_doc_commits.py
similarity index 100%
rename from build/pick_doc_commits.py
rename to scripts/pick_doc_commits.py
diff --git a/build/print_exported_headers.py b/scripts/print_exported_headers.py
similarity index 88%
rename from build/print_exported_headers.py
rename to scripts/print_exported_headers.py
index b24100c7a94..31f70fb9ba7 100755
--- a/build/print_exported_headers.py
+++ b/scripts/print_exported_headers.py
@@ -38,8 +38,15 @@ def query(buck2: str, target: str, attribute: str) -> str:
         raise SystemExit("Error: " + str(e))
 
 
+# Cache to store results for exported headers per target.
+_exported_headers_cache: dict[str, Set[str]] = {}
+
+
 def exported_headers(buck2: str, target: str) -> Set[str]:
-    """Get all exported headers of a target and its dependencies."""
+    """Get all exported headers of a target and its dependencies, with caching."""
+    if target in _exported_headers_cache:
+        return _exported_headers_cache[target]
+
     deps = query(buck2, target, "exported_deps")
     headers = set(query(buck2, target, "exported_headers"))
     headers.update(
@@ -48,13 +55,14 @@ def exported_headers(buck2: str, target: str) -> Set[str]:
         for header in exported_headers(buck2, dep.split()[0])
         if header.endswith(".h")
     )
+    _exported_headers_cache[target] = headers
     return headers
 
 
 def expand_target(buck2: str, target: str) -> List[str]:
     """Expand a target into a list of targets if applicable."""
     output = run([buck2, "cquery", target])
-    # Buck's output format is "<target> (<config>)", we take only the target part.
+    # Buck's output format is "<target> (<config>)", so we take only the target part.
     targets = [line.split(" ")[0] for line in output.strip().split("\n")]
     return targets
diff --git a/build/print_public_headers.py b/scripts/print_public_headers.py
similarity index 100%
rename from build/print_public_headers.py
rename to scripts/print_public_headers.py
diff --git a/build/run_android_emulator.sh b/scripts/run_android_emulator.sh
similarity index 93%
rename from build/run_android_emulator.sh
rename to scripts/run_android_emulator.sh
index cd625988ec4..fe73ec8a1d7 100755
--- a/build/run_android_emulator.sh
+++ b/scripts/run_android_emulator.sh
@@ -28,9 +28,7 @@
 adb push model.pte /data/local/tmp/llama
 adb push tokenizer.bin /data/local/tmp/llama
 adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner
 
-adb uninstall org.pytorch.executorch || true
 adb uninstall org.pytorch.executorch.test || true
-adb install -t android-test-debug.apk
 adb install -t android-test-debug-androidTest.apk
 
 adb shell am instrument -w -r org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner
diff --git a/build/test_ios.sh b/scripts/test_ios.sh
similarity index 92%
rename from build/test_ios.sh
rename to scripts/test_ios.sh
index cdc02098cee..09461e0953e 100755
--- a/build/test_ios.sh
+++ b/scripts/test_ios.sh
@@ -64,8 +64,6 @@
 say "Installing Requirements"
 
 pip install --upgrade cmake pip setuptools wheel zstd
 ./install_executorch.sh --pybind coreml mps xnnpack
-export PATH="$(realpath third-party/flatbuffers/cmake-out):$PATH"
-./build/install_flatc.sh
 
 say "Installing CoreML Backend Requirements"
 
@@ -92,7 +90,7 @@
 curl https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt \
 
 say "Building Frameworks"
 
-./build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized 
--xnnpack +./scripts/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack mv cmake-out "$APP_PATH/Frameworks" say "Creating Simulator" diff --git a/setup.py b/setup.py index 2a05d1eac5c..b40d6f4738c 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ import os import platform import re +import site import sys # Import this before distutils so that setuptools can intercept the distuils @@ -169,7 +170,7 @@ def write_to_python_file(cls, path: str) -> None: # set to a non-empty value, the build type is Debug. Otherwise, the build type # is Release. def get_build_type(is_debug=None) -> str: - debug = int(os.environ.get("DEBUG", 0)) if is_debug is None else is_debug + debug = int(os.environ.get("DEBUG", 0) or 0) if is_debug is None else is_debug cfg = "Debug" if debug else "Release" return cfg @@ -219,37 +220,50 @@ def src_path(self, installer: "InstallerBuildExt") -> Path: """ # Share the cmake-out location with CustomBuild. build_cmd = installer.get_finalized_command("build") - if hasattr(build_cmd, "cmake_cache_dir"): - cmake_cache_dir = Path(build_cmd.cmake_cache_dir) + if "%CMAKE_CACHE_DIR%" in self.src: + if not hasattr(build_cmd, "cmake_cache_dir"): + raise RuntimeError( + f"Extension {self.name} has a src {self.src} that contains" + " %CMAKE_CACHE_DIR% but CMake does not run in the `build` " + "command. Please double check if the command is correct." + ) + else: + build_dir = Path(build_cmd.cmake_cache_dir) else: - # If we're in editable mode, use a default or fallback value for cmake_cache_dir - # This could be a hardcoded path, or a path derived from the current working directory - cmake_cache_dir = Path(".") + # If the src path doesn't contain %CMAKE_CACHE_DIR% placeholder, + # try to find it under the current directory. + build_dir = Path(".") + + src_path = self.src.replace("%CMAKE_CACHE_DIR%/", "") + cfg = get_build_type(installer.debug) if os.name == "nt": # Replace %BUILD_TYPE% with the current build type. - self.src = self.src.replace("%BUILD_TYPE%", cfg) + src_path = src_path.replace("%BUILD_TYPE%", cfg) else: # Remove %BUILD_TYPE% from the path. - self.src = self.src.replace("/%BUILD_TYPE%", "") + src_path = src_path.replace("/%BUILD_TYPE%", "") # Construct the full source path, resolving globs. If there are no glob # pattern characters, this will just ensure that the source file exists. - srcs = tuple(cmake_cache_dir.glob(self.src)) + srcs = tuple(build_dir.glob(src_path)) if len(srcs) != 1: raise ValueError( - f"""Expected exactly one file matching '{self.src}'; found {repr(srcs)}. - -If that file is a CMake-built extension module file, and we are installing in editable mode, please disable the corresponding build option since it's not supported yet. - -Try: - -EXECUTORCH_BUILD_FLATC=OFF EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=OFF pip install -e . -""" + f"Expecting exactly 1 file matching {self.src} in {build_dir}, " + f"found {repr(srcs)}. Resolved src pattern: {src_path}." ) return srcs[0] + def inplace_dir(self, installer: "InstallerBuildExt") -> Path: + """Returns the path of this file to be installed to, under inplace mode. + + It will be a relative path to the project root directory. For more info + related to inplace/editable mode, please checkout this doc: + https://setuptools.pypa.io/en/latest/userguide/development_mode.html + """ + raise NotImplementedError() + class BuiltFile(_BaseExtension): """An extension that installs a single file that was built by cmake. 
@@ -315,26 +329,46 @@ def dst_path(self, installer: "InstallerBuildExt") -> Path:
             # Destination looks like a file.
             return dst_root / Path(self.dst)
 
+    def inplace_dir(self, installer: "InstallerBuildExt") -> Path:
+        """For a `BuiltFile`, we use self.dst as its inplace directory path.
+        Need to handle directory vs file.
+        """
+        # HACK: get rid of the leading "executorch" in ext.dst.
+        # This is because we don't have a root level "executorch" module.
+        package_dir = self.dst.removeprefix("executorch/")
+        # If dst is a file, use its directory.
+        if not package_dir.endswith("/"):
+            package_dir = os.path.dirname(package_dir)
+        return Path(package_dir)
+
 
 class BuiltExtension(_BaseExtension):
     """An extension that installs a python extension that was built by cmake."""
 
-    def __init__(self, src: str, modpath: str):
+    def __init__(self, src: str, modpath: str, src_dir: Optional[str] = None):
         """Initializes a BuiltExtension.
 
         Args:
-            src: The path to the file to install (typically a shared library),
-                relative to the cmake-out directory. May be an fnmatch-style
-                glob that matches exactly one file. If the path ends in `.so`,
-                this class will also look for similarly-named `.dylib` files.
+            src_dir: The directory of the file to install, relative to the cmake-out
+                directory. A placeholder %BUILD_TYPE% will be replaced with the build
+                type for multi-config generators (like Visual Studio) where the build
+                output is in a subdirectory named after the build type. For single-
+                config generators (like Makefile Generators or Ninja), this placeholder
+                will be removed.
+            src: The name of the file to install. If the path ends in `.so`,
+                this class will also look for similarly-named `.dylib` files.
             modpath: The dotted path of the python module that maps to the
                 extension.
         """
         assert (
             "/" not in modpath
         ), f"modpath must be a dotted python module path: saw '{modpath}'"
+        full_src = src
+        if src_dir is None and platform.system() == "Windows":
+            src_dir = "%BUILD_TYPE%/"
+        if src_dir is not None:
+            full_src = os.path.join(src_dir, src)
         # This is a real extension, so use the modpath as the name.
-        super().__init__(src=src, dst=modpath, name=modpath)
+        super().__init__(src=f"%CMAKE_CACHE_DIR%/{full_src}", dst=modpath, name=modpath)
 
     def src_path(self, installer: "InstallerBuildExt") -> Path:
         """Returns the path to the source file, resolving globs.
@@ -368,10 +402,68 @@ def dst_path(self, installer: "InstallerBuildExt") -> Path:
         # path: that's the file we're creating.
         return Path(installer.get_ext_fullpath(self.dst))
 
+    def inplace_dir(self, installer: "InstallerBuildExt") -> Path:
+        """For BuiltExtension, deduce the inplace dir path from the extension name."""
+        build_py = installer.get_finalized_command("build_py")
+        modpath = self.name.split(".")
+        package = ".".join(modpath[:-1])
+        package_dir = os.path.abspath(build_py.get_package_dir(package))
+
+        return Path(package_dir)
+
 
 class InstallerBuildExt(build_ext):
     """Installs files that were built by cmake."""
 
+    def __init__(self, *args, **kwargs):
+        self._ran_build = False
+        super().__init__(*args, **kwargs)
+
+    def run(self):
+        # Run the build command first in editable mode. Since the `build`
+        # command will also trigger the `build_ext` command, only run this once.
+        if self._ran_build:
+            return
+
+        if self.editable_mode:
+            self._ran_build = True
+            self.run_command("build")
+        super().run()
+
+    def copy_extensions_to_source(self) -> None:
+        """For each extension in `ext_modules`, we need to copy the extension
+        file from the build directory to the correct location in the local
+        directory.
+ + This should only be triggered when inplace mode (editable mode) is enabled. + + Args: + + Returns: + """ + for ext in self.extensions: + package_dir = ext.inplace_dir(self) + + # Ensure that the destination directory exists. + self.mkpath(os.fspath(package_dir)) + + regular_file = ext.src_path(self) + inplace_file = os.path.join( + package_dir, os.path.basename(ext.dst_path(self)) + ) + + # Always copy, even if source is older than destination, to ensure + # that the right extensions for the current Python/platform are + # used. + if os.path.exists(regular_file) or not ext.optional: + self.copy_file(regular_file, inplace_file, level=self.verbose) + + if ext._needs_stub: + inplace_stub = self._get_equivalent_stub(ext, inplace_file) + self._write_stub_file(inplace_stub, ext, compile=True) + # Always compile stub and remove the original (leave the cache behind) + # (this behaviour was observed in previous iterations of the code) + # TODO(dbort): Depend on the "build" command to ensure it runs first def build_extension(self, ext: _BaseExtension) -> None: @@ -379,7 +471,8 @@ def build_extension(self, ext: _BaseExtension) -> None: dst_file: Path = ext.dst_path(self) # Ensure that the destination directory exists. - self.mkpath(os.fspath(dst_file.parent)) + if not dst_file.parent.exists(): + self.mkpath(os.fspath(dst_file.parent)) # Copy the file. self.copy_file(os.fspath(src_file), os.fspath(dst_file)) @@ -443,7 +536,7 @@ def run(self): ), # Install executorch-wheel-config.cmake to pip package. ( - "build/executorch-wheel-config.cmake", + "tools/cmake/executorch-wheel-config.cmake", "share/cmake/executorch-config.cmake", ), ] @@ -567,10 +660,6 @@ def run(self): build_args = [f"-j{self.parallel}"] - # TODO(dbort): Try to manage these targets and the cmake args from the - # extension entries themselves instead of hard-coding them here. - build_args += ["--target", "flatc"] - if ShouldBuild.pybindings(): cmake_args += [ "-DEXECUTORCH_BUILD_PYBIND=ON", @@ -630,6 +719,10 @@ def run(self): if not self.dry_run: # Dry run should log the command but not actually run it. (Path(cmake_cache_dir) / "CMakeCache.txt").unlink(missing_ok=True) + # Set PYTHONPATH to the location of the pip package. + os.environ["PYTHONPATH"] = ( + site.getsitepackages()[0] + ";" + os.environ.get("PYTHONPATH", "") + ) with Buck2EnvironmentFixer(): # The context manager may patch the environment while running this # cmake command, which happens to run buck2 to get some source @@ -662,20 +755,6 @@ def run(self): # Build the system. self.spawn(["cmake", "--build", cmake_cache_dir, *build_args]) - # Non-python files should live under this data directory. - data_root = os.path.join(self.build_lib, "executorch", "data") - - # Directories like bin/ and lib/ live under data/. - bin_dir = os.path.join(data_root, "bin") - - # Copy the bin wrapper so that users can run any executables under - # data/bin, as long as they are listed in the [project.scripts] section - # of pyproject.toml. - self.mkpath(bin_dir) - self.copy_file( - "build/pip_data_bin_init.py.in", - os.path.join(bin_dir, "__init__.py"), - ) # Share the cmake-out location with _BaseExtension. 
self.cmake_cache_dir = cmake_cache_dir @@ -687,13 +766,20 @@ def get_ext_modules() -> List[Extension]: """Returns the set of extension modules to build.""" ext_modules = [] if ShouldBuild.flatc(): - ext_modules.append( - BuiltFile( - src_dir="third-party/flatbuffers/%BUILD_TYPE%/", - src_name="flatc", - dst="executorch/data/bin/", - is_executable=True, - ) + ext_modules.extend( + [ + BuiltFile( + src_dir="%CMAKE_CACHE_DIR%/third-party/flatbuffers/%BUILD_TYPE%/", + src_name="flatc", + dst="executorch/data/bin/", + is_executable=True, + ), + BuiltFile( + src_dir="tools/wheel", + src_name="pip_data_bin_init.py.in", + dst="executorch/data/bin/__init__.py", + ), + ] ) if ShouldBuild.pybindings(): @@ -702,7 +788,12 @@ def get_ext_modules() -> List[Extension]: # portable kernels, and a selection of backends. This lets users # load and execute .pte files from python. BuiltExtension( - "_portable_lib.*", "executorch.extension.pybindings._portable_lib" + ( + "_portable_lib.cp*" + if platform.system() == "Windows" + else "_portable_lib.*" + ), + "executorch.extension.pybindings._portable_lib", ) ) if ShouldBuild.training(): @@ -716,16 +807,16 @@ def get_ext_modules() -> List[Extension]: if ShouldBuild.llama_custom_ops(): ext_modules.append( BuiltFile( - src_dir="extension/llm/custom_ops/%BUILD_TYPE%/", + src_dir="%CMAKE_CACHE_DIR%/extension/llm/custom_ops/%BUILD_TYPE%/", src_name="custom_ops_aot_lib", - dst="executorch/extension/llm/custom_ops", + dst="executorch/extension/llm/custom_ops/", is_dynamic_lib=True, ) ) ext_modules.append( # Install the prebuilt library for quantized ops required by custom ops. BuiltFile( - src_dir="kernels/quantized/%BUILD_TYPE%/", + src_dir="%CMAKE_CACHE_DIR%/kernels/quantized/%BUILD_TYPE%/", src_name="quantized_ops_aot_lib", dst="executorch/kernels/quantized/", is_dynamic_lib=True, @@ -741,25 +832,6 @@ def get_ext_modules() -> List[Extension]: setup( version=Version.string(), - # TODO(dbort): Could use py_modules to restrict the set of modules we - # package, and package_data to restrict the set up non-python files we - # include. See also setuptools/discovery.py for custom finders. - package_dir={ - "executorch/backends": "backends", - "executorch/codegen": "codegen", - # TODO(mnachin T180504136): Do not put examples/models - # into core pip packages. Refactor out the necessary utils - # or core models files into a separate package. 
- "executorch/examples/models": "examples/models", - "executorch/exir": "exir", - "executorch/extension": "extension", - "executorch/kernels/quantized": "kernels/quantized", - "executorch/schema": "schema", - "executorch/devtools": "devtools", - "executorch/devtools/bundled_program": "devtools/bundled_program", - "executorch/runtime": "runtime", - "executorch/util": "util", - }, cmdclass={ "build": CustomBuild, "build_ext": InstallerBuildExt, diff --git a/shim b/shim new file mode 160000 index 00000000000..cf6a954aae4 --- /dev/null +++ b/shim @@ -0,0 +1 @@ +Subproject commit cf6a954aae4bee7b4515e13475878460115027d1 diff --git a/shim/.gitignore b/shim/.gitignore deleted file mode 100644 index a1412f7fa8e..00000000000 --- a/shim/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# We currently expect end users to run reindeer vendor themselves -# so mark these things as to ignore -/third-party/rust/.cargo/ -/third-party/rust/BUCK -/third-party/rust/vendor/ diff --git a/shim/README.md b/shim/README.md deleted file mode 100644 index 85933e51b2c..00000000000 --- a/shim/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# buck2 shims - -The `shim/` subtree helps the Meta-internal buck2 build system also work in the -open-source repo. - -Shims are how open-source buck2 supports a [line -like](https://github.com/pytorch/executorch/blob/50aa517549d10324147534d91d04a923b76421d6/kernels/optimized/targets.bzl#L1): - -``` -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -``` - -In the open-source repo, `fbsource//xplat` (a Meta-internal root) doesn't exist. -The `shim = shim` line in `../.buckconfig` tells buck2 to look in -[`shim/xplat/executorch/build/runtime_wrapper.bzl`](https://github.com/pytorch/executorch/blob/main/shim/xplat/executorch/build/runtime_wrapper.bzl) -instead. diff --git a/shim/build_defs/native_rules.bzl b/shim/build_defs/native_rules.bzl deleted file mode 100644 index a3e3a7039b0..00000000000 --- a/shim/build_defs/native_rules.bzl +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under both the MIT license found in the -# LICENSE-MIT file in the root directory of this source tree and the Apache -# License, Version 2.0 found in the LICENSE-APACHE file in the root directory -# of this source tree. - -def buck_genrule(visibility = ["PUBLIC"], **kwargs): - # @lint-ignore BUCKLINT: avoid "native is forbidden in fbcode" - native.genrule(visibility = visibility, **kwargs) - -def buck_command_alias(**_): - pass - -def buck_filegroup(visibility = ["PUBLIC"], **kwargs): - # @lint-ignore BUCKLINT: avoid "native is forbidden in fbcode" - native.filegroup(visibility = visibility, **kwargs) - -def alias(actual, visibility = ["PUBLIC"], **kwargs): - if actual.startswith("//buck2/"): - actual = "root//" + actual.removeprefix("//buck2/") - native.alias(actual = actual, visibility = visibility, **kwargs) - -def buck_sh_binary(visibility = ["PUBLIC"], **kwargs): - # @lint-ignore BUCKLINT: avoid "native is forbidden in fbcode" - native.sh_binary(visibility = visibility, **kwargs) diff --git a/shim/tools/build_defs/default_platform_defs.bzl b/shim/tools/build_defs/default_platform_defs.bzl deleted file mode 100644 index 3f860ae49df..00000000000 --- a/shim/tools/build_defs/default_platform_defs.bzl +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# -# This source code is licensed under both the MIT license found in the -# LICENSE-MIT file in the root directory of this source tree and the Apache -# License, Version 2.0 found in the LICENSE-APACHE file in the root directory -# of this source tree. - -DEVSERVER_PLATFORM_REGEX = "UNUSED" diff --git a/shim/tools/build_defs/fb_native_wrapper.bzl b/shim/tools/build_defs/fb_native_wrapper.bzl deleted file mode 100644 index d67b9384fe9..00000000000 --- a/shim/tools/build_defs/fb_native_wrapper.bzl +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under both the MIT license found in the -# LICENSE-MIT file in the root directory of this source tree and the Apache -# License, Version 2.0 found in the LICENSE-APACHE file in the root directory -# of this source tree. - -fb_native = struct( - config_setting = native.config_setting, -) diff --git a/shim/BUCK b/shim_et/BUCK similarity index 66% rename from shim/BUCK rename to shim_et/BUCK index 77f3742e2dc..a1a9bdaf65d 100644 --- a/shim/BUCK +++ b/shim_et/BUCK @@ -2,19 +2,11 @@ load("@prelude//platforms:defs.bzl", "execution_platform") load("@prelude//tests:test_toolchain.bzl", "noop_test_toolchain") load("@prelude//toolchains:cxx.bzl", "system_cxx_toolchain") load("@prelude//toolchains:genrule.bzl", "system_genrule_toolchain") -load("@prelude//toolchains:go.bzl", "system_go_toolchain") -load("@prelude//toolchains:haskell.bzl", "system_haskell_toolchain") -load("@prelude//toolchains:ocaml.bzl", "system_ocaml_toolchain") load("@prelude//toolchains:python.bzl", "system_python_bootstrap_toolchain", "system_python_toolchain") load("@prelude//toolchains:remote_test_execution.bzl", "remote_test_execution_toolchain") -load("@prelude//toolchains:rust.bzl", "system_rust_toolchain") - -# TODO: sync this directory with https://github.com/facebook/buck2-shims-meta. -# Internal context: -# https://fb.workplace.com/groups/222849770514616/posts/600883896044533/ - -oncall("executorch") +# Although the non-Android toolchains below are present in shim/BUCK, it appears that we +# have to duplicate them here or builds won't work. system_cxx_toolchain( name = "cxx", cxx_flags = ["-std=c++20"], @@ -26,21 +18,6 @@ system_genrule_toolchain( visibility = ["PUBLIC"], ) -system_go_toolchain( - name = "go", - visibility = ["PUBLIC"], -) - -system_haskell_toolchain( - name = "haskell", - visibility = ["PUBLIC"], -) - -system_ocaml_toolchain( - name = "ocaml", - visibility = ["PUBLIC"], -) - system_python_toolchain( name = "python", visibility = ["PUBLIC"], @@ -51,17 +28,6 @@ system_python_bootstrap_toolchain( visibility = ["PUBLIC"], ) -system_rust_toolchain( - name = "rust", - default_edition = "2021", - visibility = ["PUBLIC"], -) - -remote_test_execution_toolchain( - name = "remote_test_execution", - visibility = ["PUBLIC"], -) - execution_platform( name = "android-arm64", cpu_configuration = "prelude//cpu:arm64", @@ -84,3 +50,8 @@ noop_test_toolchain( name = "test", visibility = ["PUBLIC"], ) + +remote_test_execution_toolchain( + name = "remote_test_execution", + visibility = ["PUBLIC"], +) diff --git a/shim_et/README.md b/shim_et/README.md new file mode 100644 index 00000000000..a9e55273b0b --- /dev/null +++ b/shim_et/README.md @@ -0,0 +1,19 @@ +# buck2 shims + +The `shim_et/` subtree helps the Meta-internal buck2 build system also work in the +open-source repo. 
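+(The shared `shim/` tree is now a git submodule of Meta's buck2 shims;
+`shim_et/` carries the shims that are specific to ExecuTorch.)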
+
+Shims are how open-source buck2 supports a [line
+like](https://github.com/pytorch/executorch/blob/50aa517549d10324147534d91d04a923b76421d6/kernels/optimized/targets.bzl#L1):
+
+```
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+```
+
+In the open-source repo, `fbsource//xplat` (a Meta-internal root) doesn't exist.
+The `fbsource = shim_et` line in `../.buckconfig` tells buck2 to look in
+[`shim_et/xplat/executorch/build/runtime_wrapper.bzl`](https://github.com/pytorch/executorch/blob/main/shim_et/xplat/executorch/build/runtime_wrapper.bzl)
+instead.
+
+NOTE: `tools` is a symlink to `../shim/tools`: `fbsource//` must resolve to
+this directory, but the `fbsource//tools` targets in particular live in `shim/`.
diff --git a/shim_et/tools b/shim_et/tools
new file mode 120000
index 00000000000..0b2cfeed777
--- /dev/null
+++ b/shim_et/tools
@@ -0,0 +1 @@
+../shim/tools
\ No newline at end of file
diff --git a/shim/xplat/executorch/backends/backends.bzl b/shim_et/xplat/executorch/backends/backends.bzl
similarity index 100%
rename from shim/xplat/executorch/backends/backends.bzl
rename to shim_et/xplat/executorch/backends/backends.bzl
diff --git a/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl b/shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl
similarity index 100%
rename from shim/xplat/executorch/backends/qualcomm/qnn_version.bzl
rename to shim_et/xplat/executorch/backends/qualcomm/qnn_version.bzl
diff --git a/shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl b/shim_et/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl
similarity index 100%
rename from shim/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl
rename to shim_et/xplat/executorch/backends/xnnpack/third-party/third_party_libs.bzl
diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim_et/xplat/executorch/build/env_interface.bzl
similarity index 92%
rename from shim/xplat/executorch/build/env_interface.bzl
rename to shim_et/xplat/executorch/build/env_interface.bzl
index e83f9ca01b9..a953e29a89a 100644
--- a/shim/xplat/executorch/build/env_interface.bzl
+++ b/shim_et/xplat/executorch/build/env_interface.bzl
@@ -10,11 +10,15 @@ load(":type_defs.bzl", "is_list", "is_tuple")
 
 _ET_TARGET_PREFIX = "executorch"
 
+_TOKENIZER_TARGET_PREFIX = "pytorch/tokenizers"
+
 # Indicates that an external_dep entry should fall through to the underlying
 # buck rule.
 _EXTERNAL_DEP_FALLTHROUGH = ""
 
 _EXTERNAL_DEPS = {
+    # Abseil for tokenizers
+    "abseil-cpp": "//extension/llm/tokenizers/third-party:abseil",
     # ATen C++ library deps
     "aten-core": [],  # TODO(larryliu0820): Add support
     # ATen native_functions.yaml file deps
@@ -43,7 +47,7 @@ _EXTERNAL_DEPS = {
     "nlohmann_json": [],  # Intentionally not supporting OSS buck build HF tokenizer.
     "prettytable": "//third-party:prettytable",
     "pybind11": "//third-party:pybind11",
-    "re2": "//extension/llm/third-party:re2",
+    "re2": "//extension/llm/tokenizers/third-party:re2",
     "sentencepiece": [],  # Intentionally not supporting OSS buck build of sentencepiece.
     "sentencepiece-py": [],
     # Core C++ PyTorch functionality like Tensor and ScalarType.
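The target-patching helpers in this file are easiest to follow with concrete inputs. Below is a minimal sketch in plain Python (not Starlark); the names mirror `_start_with_et_targets` and `_patch_target_for_env` from `env_interface.bzl`, and the behavior shown is what the hunks above and below implement:

```python
# Plain-Python sketch of env_interface.bzl's target patching. In the OSS
# tree there is no "//executorch" or "//pytorch/tokenizers" cell, so
# targets under those prefixes are rewritten to real repo paths.
_ET_TARGET_PREFIX = "executorch"
_TOKENIZER_TARGET_PREFIX = "pytorch/tokenizers"

def target_needs_patch(target: str) -> bool:
    # Mirrors _start_with_et_targets(): match "//<prefix>/" or "//<prefix>:".
    for prefix in (_ET_TARGET_PREFIX, _TOKENIZER_TARGET_PREFIX):
        root = "//" + prefix
        if target.startswith(root + "/") or target.startswith(root + ":"):
            return True
    return target.startswith(":")  # relative targets are also patched

def patch_target_for_env(target: str) -> str:
    # Mirrors _patch_target_for_env(): re-root executorch targets, and
    # redirect tokenizer targets into the vendored extension/llm/tokenizers tree.
    if _ET_TARGET_PREFIX in target:
        return target.replace("//executorch/", "//", 1)
    elif _TOKENIZER_TARGET_PREFIX in target:
        return target.replace("//pytorch/tokenizers", "//extension/llm/tokenizers", 1)
    return target

assert patch_target_for_env("//executorch/runtime/core:core") == "//runtime/core:core"
assert (
    patch_target_for_env("//pytorch/tokenizers:headers")
    == "//extension/llm/tokenizers:headers"
)
```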
@@ -65,10 +69,11 @@ def _resolve_external_dep(name): return [res] def _start_with_et_targets(target): - prefix = "//" + _ET_TARGET_PREFIX - for suffix in ("/", ":"): - if target.startswith(prefix + suffix): - return True + for prefix in [_ET_TARGET_PREFIX, _TOKENIZER_TARGET_PREFIX]: + prefix = "//" + prefix + for suffix in ("/", ":"): + if target.startswith(prefix + suffix): + return True return False def _patch_platforms(kwargs): @@ -198,7 +203,11 @@ def _target_needs_patch(target): return _start_with_et_targets(target) or target.startswith(":") def _patch_target_for_env(target): - return target.replace("//executorch/", "//", 1) + if _ET_TARGET_PREFIX in target: + return target.replace("//executorch/", "//", 1) + elif _TOKENIZER_TARGET_PREFIX in target: + return target.replace("//pytorch/tokenizers", "//extension/llm/tokenizers", 1) + return target def _struct_to_json(object): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. @@ -225,6 +234,7 @@ env = struct( genrule = native.genrule, is_oss = True, is_xplat = lambda: False, + is_arvr_mode = lambda: False, patch_deps = _patch_deps, patch_cxx_compiler_flags = _patch_cxx_compiler_flags, patch_executorch_genrule_cmd = _patch_executorch_genrule_cmd, @@ -244,6 +254,7 @@ env = struct( remove_unsupported_kwargs = _remove_unsupported_kwargs, resolve_external_dep = _resolve_external_dep, struct_to_json = _struct_to_json, + supported_platforms = [], target_needs_patch = _target_needs_patch, EXTERNAL_DEP_FALLTHROUGH = _EXTERNAL_DEP_FALLTHROUGH, ) diff --git a/shim/xplat/executorch/build/runtime_wrapper.bzl b/shim_et/xplat/executorch/build/runtime_wrapper.bzl similarity index 96% rename from shim/xplat/executorch/build/runtime_wrapper.bzl rename to shim_et/xplat/executorch/build/runtime_wrapper.bzl index 03bca6623f9..3f9a1142b67 100644 --- a/shim/xplat/executorch/build/runtime_wrapper.bzl +++ b/shim_et/xplat/executorch/build/runtime_wrapper.bzl @@ -29,6 +29,9 @@ use TARGETS files normally. Same for xplat-only directories and BUCK files. load(":env_interface.bzl", "env") load(":selects.bzl", "selects") +def is_arvr_mode(): + return env.is_arvr_mode() + def is_xplat(): return env.is_xplat() @@ -38,6 +41,9 @@ def struct_to_json(x): def get_default_executorch_platforms(): return env.default_platforms +def get_executorch_supported_platforms(): + return env.supported_platforms + def _patch_executorch_references(targets, use_static_deps = False): """Patches up references to "//executorch/..." in lists of build targets. @@ -171,7 +177,7 @@ def _patch_kwargs_common(kwargs): # don't pick up unexpected clients while things are still in flux. if not kwargs.pop("_is_external_target", False): for target in kwargs.get("visibility", []): - if not (target.startswith("//executorch") or target.startswith("@")): + if not (target.startswith("//executorch") or target.startswith("//pytorch/tokenizers") or target.startswith("@")): fail("Please manage all external visibility using the " + "EXECUTORCH_CLIENTS list in " + "//executorch/build/fb/clients.bzl. " + @@ -187,10 +193,13 @@ def _patch_kwargs_common(kwargs): for dep_type in ("deps", "exported_deps"): env.patch_deps(kwargs, dep_type) + if "visibility" not in kwargs: + kwargs["visibility"] = ["//executorch/..."] + # Patch up references to "//executorch/..." in lists of build targets, # if necessary. 
use_static_deps = kwargs.pop("use_static_deps", False) - for dep_type in ("deps", "exported_deps", "visibility"): + for dep_type in ("deps", "exported_deps", "visibility", "preload_deps"): if kwargs.get(dep_type): # deps may contain select() elements, dicts that map names to lists # of targets. selects.apply() will run the provided function on all @@ -202,10 +211,6 @@ def _patch_kwargs_common(kwargs): function = native.partial(_patch_executorch_references, use_static_deps = use_static_deps), ) - # Make all targets private by default, like in xplat. - if "visibility" not in kwargs: - kwargs["visibility"] = [] - # If we see certain strings in the "visibility" list, expand them. if "@EXECUTORCH_CLIENTS" in kwargs["visibility"]: # See env.executorch_clients for this list. @@ -268,6 +273,7 @@ def _cxx_test(*args, **kwargs): kwargs["deps"].append("//executorch/test/utils:utils") _patch_kwargs_cxx(kwargs) + env.patch_headers(kwargs) _patch_build_mode_flags(kwargs) _patch_test_compiler_flags(kwargs) @@ -333,6 +339,9 @@ def get_oss_build_kwargs(): } return {} +def get_aten_mode_options(): + return (False,) if env.is_oss else (True, False) + # Names in this struct should match the standard Buck rule names if possible: # see the "Build Rules" section in the sidebar of # https://buck.build/concept/build_rule.html. diff --git a/shim/xplat/executorch/build/selects.bzl b/shim_et/xplat/executorch/build/selects.bzl similarity index 100% rename from shim/xplat/executorch/build/selects.bzl rename to shim_et/xplat/executorch/build/selects.bzl diff --git a/shim/xplat/executorch/build/type_defs.bzl b/shim_et/xplat/executorch/build/type_defs.bzl similarity index 100% rename from shim/xplat/executorch/build/type_defs.bzl rename to shim_et/xplat/executorch/build/type_defs.bzl diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl similarity index 100% rename from shim/xplat/executorch/codegen/codegen.bzl rename to shim_et/xplat/executorch/codegen/codegen.bzl diff --git a/shim/xplat/executorch/extension/pybindings/pybindings.bzl b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl similarity index 94% rename from shim/xplat/executorch/extension/pybindings/pybindings.bzl rename to shim_et/xplat/executorch/extension/pybindings/pybindings.bzl index 52191eb978a..61eeaf7c179 100644 --- a/shim/xplat/executorch/extension/pybindings/pybindings.bzl +++ b/shim_et/xplat/executorch/extension/pybindings/pybindings.bzl @@ -3,7 +3,7 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") # Aten ops with portable kernel MODELS_ATEN_OPS_LEAN_MODE_GENERATED_LIB = [ - "//executorch/kernels/portable:generated_lib", + "//executorch/configurations:optimized_native_cpu_ops", "//executorch/kernels/quantized:generated_lib", ] @@ -41,6 +41,7 @@ MODELS_ATEN_OPS_ATEN_MODE_GENERATED_LIB = [ def executorch_pybindings(python_module_name, srcs = [], cppdeps = [], visibility = ["//executorch/..."], types = [], compiler_flags = []): runtime.cxx_python_extension( + # @autodeps-skip name = python_module_name, srcs = [ "//executorch/extension/pybindings:pybindings.cpp", @@ -52,8 +53,8 @@ def executorch_pybindings(python_module_name, srcs = [], cppdeps = [], visibilit "-DEXECUTORCH_PYTHON_MODULE_NAME={}".format(python_module_name), ], deps = [ - "//executorch/exir:_warnings", "//executorch/runtime/core:core", + "//executorch/extension/threadpool:threadpool", ] + cppdeps, external_deps = [ "pybind11", diff --git a/shim/xplat/executorch/kernels/optimized/lib_defs.bzl 
b/shim_et/xplat/executorch/kernels/optimized/lib_defs.bzl
similarity index 100%
rename from shim/xplat/executorch/kernels/optimized/lib_defs.bzl
rename to shim_et/xplat/executorch/kernels/optimized/lib_defs.bzl
diff --git a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
similarity index 88%
rename from shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
rename to shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
index c70757e29b9..d48a22cee37 100644
--- a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -9,8 +9,13 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
+    "get_vec_deps",
     "get_vec_preprocessor_flags",
 )
+load(
+    "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
+    "get_compiler_optimization_flags",
+)
 
 def op_target(name, deps = [], compiler_flags = []):
     """Registers an optimized implementation for an operator overload group.
@@ -94,12 +99,17 @@ def define_op_library(name, compiler_flags, deps):
             "//executorch/kernels/test/...",
             "@EXECUTORCH_CLIENTS",
         ],
-        # kernels often have helpers with no prototypes just disabling the warning here as the headers
-        # are codegend and linked in later
-        compiler_flags = ["-Wno-missing-prototypes"],
+        compiler_flags = [
+            # Kernels often have helpers with no prototypes; disable the warning
+            # here, since the headers are codegen'd and linked in later.
+            "-Wno-missing-prototypes",
+            # pragma unroll fails with -Os; no need to warn about it and fail
+            # -Werror builds. See https://godbolt.org/z/zvf85vTsr
+            "-Wno-pass-failed",
+        ] + get_compiler_optimization_flags(),
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
-        ] + augmented_deps,
+        ] + augmented_deps + get_vec_deps(),
         preprocessor_flags = get_vec_preprocessor_flags(),
         # sleef needs to be added as a direct dependency of the operator target when building for Android,
         # or a linker error may occur.
Not sure why this happens; it seems that fbandroid_platform_deps of @@ -134,8 +144,3 @@ def define_op_target(name, compiler_flags, deps): compiler_flags = compiler_flags, deps = deps, ) - -def is_op_disabled(name): - # TODO (gjcomer) Enable ops with sleef dependency in OSS - disabled_ops = ["op_log_softmax"] - return name in disabled_ops diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl similarity index 98% rename from shim/xplat/executorch/kernels/portable/op_registration_util.bzl rename to shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index f5ddae06b6a..b56413b92f4 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "is_xplat", "runtime") load("@fbsource//xplat/executorch/build:selects.bzl", "selects") def get_compiler_optimization_flags(): @@ -170,7 +170,7 @@ def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _all # If this is a custom op, define a target that builds it with at::Tensor # so that it can be imported into a host PyTorch environment for authoring. - if not is_aten_op: + if not is_aten_op and True in get_aten_mode_options(): define_op_library( name = name, deps = _aten_mode_deps if _aten_mode_deps else deps, @@ -817,6 +817,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:kernel_ops_util", ], ), + op_target( + name = "op_max_pool2d_with_indices_backward", + deps = [ + "//executorch/kernels/portable/cpu/util:kernel_ops_util", + ], + ), op_target( name = "op_mean", deps = [ @@ -1223,6 +1229,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op_unfold_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), op_target( name = "op_unsqueeze_copy", deps = [ diff --git a/shim/xplat/executorch/kernels/test/util.bzl b/shim_et/xplat/executorch/kernels/test/util.bzl similarity index 100% rename from shim/xplat/executorch/kernels/test/util.bzl rename to shim_et/xplat/executorch/kernels/test/util.bzl diff --git a/third-party/glob_defs.bzl b/shim_et/xplat/executorch/third-party/glob_defs.bzl similarity index 100% rename from third-party/glob_defs.bzl rename to shim_et/xplat/executorch/third-party/glob_defs.bzl diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b651bd2dd93..3932f1097e1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) 
-include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 # Find prebuilt executorch library
 find_package(executorch CONFIG REQUIRED)
diff --git a/test/build_size_test.sh b/test/build_size_test.sh
index 823b399fe34..f7f9a0152d2 100644
--- a/test/build_size_test.sh
+++ b/test/build_size_test.sh
@@ -11,14 +11,13 @@ set -e
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
 
-# TODO(#8149): Remove -Wno-sign-compare
 # TODO(#8357): Remove -Wno-int-in-bool-context
-COMMON_CXXFLAGS="-fno-exceptions -fno-rtti -Wall -Werror -Wno-sign-compare -Wno-unknown-pragmas -Wno-int-in-bool-context"
+COMMON_CXXFLAGS="-fno-exceptions -fno-rtti -Wall -Werror -Wno-int-in-bool-context"
 
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a"
   clean_executorch_install_folders
-
+  update_tokenizers_git_submodule
   CXXFLAGS="$COMMON_CXXFLAGS" retry cmake -DBUCK2="$BUCK2" \
           -DCMAKE_CXX_STANDARD_REQUIRED=ON \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
diff --git a/test/models/export_program.py b/test/models/export_program.py
index ada80ff342f..ccf8a965eb2 100644
--- a/test/models/export_program.py
+++ b/test/models/export_program.py
@@ -276,6 +276,11 @@ def main() -> None:
             prog.write_to_file(fp)
             print(f"Exported {module_name} and wrote program data to {outfile}")
 
+        if args.external_constants:
+            # The current infra doesn't easily allow renaming this file, so just hackily do it here.
+            prog._tensor_data[f"{module_name}"] = prog._tensor_data.pop(
+                "_default_external_constant"
+            )
         prog.write_tensor_data_to_file(args.outdir)
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index bb04c6bc5fa..6d5b6753f3f 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -90,13 +90,21 @@ def define_common_targets():
         # case, and typically shouldn't be done.
         _is_external_target = True,
     )
+
+    # Class names of nn.Modules for :exported_program_and_data to export.
+    MODULES_AND_DATA_TO_EXPORT = [
+        "ModuleLinear",
+        "ModuleSimpleTrain",
+    ]
 
     runtime.genrule(
         name = "exported_program_and_data",
-        cmd = "$(exe :export_program) --modules ModuleLinear --external-constants --outdir $OUT",
+        cmd = "$(exe :export_program) --modules " + ",".join(MODULES_AND_DATA_TO_EXPORT) + " --external-constants --outdir $OUT",
         outs = {
             "ModuleLinear.pte": ["ModuleLinearProgram.pte"],
-            "ModuleLinear.ptd": ["_default_external_constant.ptd"],
+            "ModuleLinear.ptd": ["ModuleLinearProgram.ptd"],
+            "ModuleSimpleTrainProgram.pte": ["ModuleSimpleTrainProgram.pte"],
+            "ModuleSimpleTrain.ptd": ["ModuleSimpleTrainProgram.ptd"],
         },
         default_outs = ["."],
         visibility = [
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh
index df7955c4d41..ff2ed048257 100755
--- a/test/run_oss_cpp_tests.sh
+++ b/test/run_oss_cpp_tests.sh
@@ -32,11 +32,10 @@ build_executorch() {
   if [ -x "$(command -v glslc)" ]; then
     BUILD_VULKAN="ON"
   fi
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake . \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
diff --git a/test/size_test.cpp b/test/size_test.cpp
index 1fab1e914e0..8f67368f64e 100644
--- a/test/size_test.cpp
+++ b/test/size_test.cpp
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#include <c10/util/irange.h>
 #include
 #include
@@ -92,10 +93,10 @@ int main(int argc, char** argv) {
   ET_CHECK(status == Error::Ok);
 
   // It assumes the outputs are all tensors.
-  for (size_t i = 0; i < method->outputs_size(); i++) {
+  for (const auto i : c10::irange(method->outputs_size())) {
     auto output_tensor = output_list[i].toTensor();
     [[maybe_unused]] auto data_output = output_tensor.const_data_ptr();
-    for (size_t j = 0; j < output_list[i].toTensor().numel(); ++j) {
+    for (const auto j : c10::irange(output_tensor.numel())) {
       ET_LOG(Info, "%f", data_output[j]);
     }
   }
diff --git a/test/utils/CMakeLists.txt b/test/utils/CMakeLists.txt
index c8bc2b5702e..9729d7d31b2 100644
--- a/test/utils/CMakeLists.txt
+++ b/test/utils/CMakeLists.txt
@@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.19)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 
-include(${EXECUTORCH_ROOT}/build/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs alignment_test.cpp UnitTestMain.cpp)
 
diff --git a/test/utils/OSSTest.cmake.in b/test/utils/OSSTest.cmake.in
index 6487c0bc79d..33e902c655c 100644
--- a/test/utils/OSSTest.cmake.in
+++ b/test/utils/OSSTest.cmake.in
@@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.19)
 
 set(EXECUTORCH_ROOT ${{CMAKE_CURRENT_SOURCE_DIR}}/{path_to_root})
 
-include(${{EXECUTORCH_ROOT}}/build/Test.cmake)
+include(${{EXECUTORCH_ROOT}}/tools/cmake/Test.cmake)
 
 set(_test_srcs {test_srcs})
 
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index 6eff74eec86..be594f9d5f4 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -1,4 +1,16 @@
 {
   "tests": [
+    {
+      "directory": "extension/llm/custom_ops/spinquant/test",
+      "sources": [
+        "fast_hadamard_transform_test.cpp",
+        "fast_hadamard_transform_test_impl.cpp",
+        "op_fast_hadamard_transform_test.cpp"
+      ],
+      "additional_libs": [
+        "custom_ops",
+        "dumb_fht"
+      ]
+    },
     {
       "directory": "extension/data_loader/test",
       "sources": [
@@ -47,9 +59,20 @@
       "extension_tensor"
     ]
   },
+  {
+    "directory": "extension/threadpool/test",
+    "sources": [
+      "thread_parallel_test.cpp",
+      "threadpool_test.cpp"
+    ],
+    "additional_libs": [
+      "extension_threadpool"
+    ]
+  },
   {
     "directory": "kernels/portable/cpu/util/test",
     "sources": [
+      "broadcast_indexes_range_test.cpp",
      "broadcast_test.cpp",
       "reduce_test.cpp"
     ],
diff --git a/test/utils/targets.bzl b/test/utils/targets.bzl
index b16ce2bac25..249e7bdf2be 100644
--- a/test/utils/targets.bzl
+++ b/test/utils/targets.bzl
@@ -1,4 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
 
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
@@ -7,7 +7,7 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
""" - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" runtime.cxx_library( @@ -21,6 +21,7 @@ def define_common_targets(): ], visibility = [ "//executorch/...", + "//pytorch/tokenizers/...", "@EXECUTORCH_CLIENTS", ], deps = [ diff --git a/third-party/TARGETS b/third-party/TARGETS index 5f31ef13c55..0ec62c1536f 100644 --- a/third-party/TARGETS +++ b/third-party/TARGETS @@ -1,7 +1,7 @@ load("@fbsource//xplat/executorch/build/runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/third-party/glob_defs.bzl", "subdir_glob") load(":flatcc_defs.bzl", "define_flatcc_targets") load(":gflags.bzl", "define_gflags") -load(":glob_defs.bzl", "subdir_glob") load(":gtest_defs.bzl", "define_gtest_targets") load(":prebuilt_python_defs.bzl", "add_prebuilt_python_library_targets") load("@prelude//rules.bzl", "prebuilt_cxx_library") diff --git a/third-party/ao b/third-party/ao index 11333ba2cb5..83eb4903916 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 11333ba2cb5c4e792bc4f5c0d70c12991f972008 +Subproject commit 83eb4903916340900c140afd0fe35dfaddf23c23 diff --git a/third-party/gtest_defs.bzl b/third-party/gtest_defs.bzl index c1f4778b80c..ac8046e264d 100644 --- a/third-party/gtest_defs.bzl +++ b/third-party/gtest_defs.bzl @@ -1,3 +1,4 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options") # Copied from fbsource/third-party/googletest COMPILER_FLAGS = [ @@ -17,7 +18,7 @@ def define_gtest_targets(): visibility = ["PUBLIC"], ) - for aten_mode in (True, False): + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" # # Google Test diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/build/constraints/TARGETS b/tools/buck/constraints/TARGETS similarity index 100% rename from build/constraints/TARGETS rename to tools/buck/constraints/TARGETS diff --git a/build/Codegen.cmake b/tools/cmake/Codegen.cmake similarity index 98% rename from build/Codegen.cmake rename to tools/cmake/Codegen.cmake index f2da23baeaa..a229cbe5cdf 100644 --- a/build/Codegen.cmake +++ b/tools/cmake/Codegen.cmake @@ -9,6 +9,8 @@ # Selective build. See codegen/tools/gen_oplist.py for how to use these # arguments. +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + function(gen_selected_ops) set(arg_names LIB_NAME OPS_SCHEMA_YAML ROOT_OPS INCLUDE_ALL_OPS) cmake_parse_arguments(GEN "" "" "${arg_names}" ${ARGN}) @@ -145,18 +147,13 @@ function(gen_custom_ops_aot_lib) ${_out_dir}/RegisterCPUCustomOps.cpp ${_out_dir}/RegisterSchema.cpp ${_out_dir}/CustomOpsNativeFunctions.h "${GEN_KERNEL_SOURCES}" ) - # Find `Torch`. - if(NOT TARGET torch) - find_package(Torch REQUIRED) - endif() + find_package_torch() # This lib uses ATen lib, so we explicitly enable rtti and exceptions. 
target_compile_options(${GEN_LIB_NAME} PRIVATE -frtti -fexceptions) target_compile_definitions(${GEN_LIB_NAME} PRIVATE USE_ATEN_LIB=1) include_directories(${TORCH_INCLUDE_DIRS}) target_link_libraries(${GEN_LIB_NAME} PRIVATE torch) - include(${EXECUTORCH_ROOT}/build/Utils.cmake) - target_link_options_shared_lib(${GEN_LIB_NAME}) if(TARGET portable_lib) target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) diff --git a/build/Test.cmake b/tools/cmake/Test.cmake similarity index 88% rename from build/Test.cmake rename to tools/cmake/Test.cmake index 31e5aaf4d63..8538e7b7d28 100644 --- a/build/Test.cmake +++ b/tools/cmake/Test.cmake @@ -35,7 +35,14 @@ function(et_cxx_test target_name) set(multi_arg_names SOURCES EXTRA_LIBS) cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) - add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp) + add_executable( + ${target_name} + ${ET_CXX_TEST_SOURCES} + ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp + ) + if(NOT TARGET GTest::gtest) + find_package(GTest) + endif() # Includes gtest, gmock, executorch by default target_link_libraries( ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch diff --git a/build/Utils.cmake b/tools/cmake/Utils.cmake similarity index 81% rename from build/Utils.cmake rename to tools/cmake/Utils.cmake index 3bb62fdaf0f..8b52e115c6f 100644 --- a/build/Utils.cmake +++ b/tools/cmake/Utils.cmake @@ -210,9 +210,9 @@ function(extract_sources sources_file) if(ANDROID_ABI) if("${ANDROID_ABI}" STREQUAL "arm64-v8a") - set(target_platforms_arg "--target-platforms=shim//:android-arm64") + set(target_platforms_arg "--target-platforms=shim_et//:android-arm64") elseif("${ANDROID_ABI}" STREQUAL "x86_64") - set(target_platforms_arg "--target-platforms=shim//:android-x86_64") + set(target_platforms_arg "--target-platforms=shim_et//:android-x86_64") else() message( FATAL_ERROR @@ -222,8 +222,8 @@ function(extract_sources sources_file) endif() execute_process( COMMAND - ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py - --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file} + ${PYTHON_EXECUTABLE} ${executorch_root}/tools/cmake/extract_sources.py + --config=${executorch_root}/tools/cmake/cmake_deps.toml --out=${sources_file} --buck2=${BUCK2} ${target_platforms_arg} OUTPUT_VARIABLE gen_srcs_output ERROR_VARIABLE gen_srcs_error @@ -258,7 +258,7 @@ function(resolve_buck2) endif() set(resolve_buck2_command - ${PYTHON_EXECUTABLE} ${executorch_root}/build/resolve_buck.py + ${PYTHON_EXECUTABLE} ${executorch_root}/tools/cmake/resolve_buck.py --cache_dir=buck2-bin ) @@ -332,21 +332,64 @@ function(resolve_python_executable) endif() endfunction() -# find_package(Torch CONFIG REQUIRED) replacement for targets that -# have a header-only Torch dependency. Because find_package sets -# variables in the parent scope, we use a macro to preserve this -# rather than maintaining our own list of those variables. -macro(find_package_torch_headers) - # We cannot simply use CMAKE_FIND_ROOT_PATH_BOTH, because that does - # not propagate into TorchConfig.cmake. - foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE) - set(OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) - set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} BOTH) - endforeach() +# find_package(Torch CONFIG REQUIRED) replacement for targets that have a +# header-only Torch dependency. 
+# +# Unlike find_package(Torch ...), this will only set +# TORCH_INCLUDE_DIRS in the parent scope. In particular, it will NOT +# set any of the following: +# - TORCH_FOUND +# - TORCH_LIBRARY +# - TORCH_CXX_FLAGS +function(find_package_torch_headers) + # We implement this way rather than using find_package so that + # cross-compilation can still use the host's installed copy of + # torch, since the headers should be fine. + get_torch_base_path(TORCH_BASE_PATH) + set(TORCH_INCLUDE_DIRS "${TORCH_BASE_PATH}/include;${TORCH_BASE_PATH}/include/torch/csrc/api/include" PARENT_SCOPE) +endfunction() + +# Return the base path to the installed Torch Python library in +# outVar. +function(get_torch_base_path outVar) + if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() + endif() + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" -c + "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])" + OUTPUT_VARIABLE _tmp_torch_path + ERROR_VARIABLE _tmp_torch_path_error + RESULT_VARIABLE _tmp_torch_path_result COMMAND_ECHO STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT _tmp_torch_path_result EQUAL 0) + message("Error while adding torch to CMAKE_PREFIX_PATH. " + "Exit code: ${_tmp_torch_path_result}" + ) + message("Output:\n${_tmp_torch_path}") + message(FATAL_ERROR "Error:\n${_tmp_torch_path_error}") + endif() + set(${outVar} ${_tmp_torch_path} PARENT_SCOPE) +endfunction() + +# Add the Torch CMake configuration to CMAKE_PREFIX_PATH so that find_package +# can find Torch. +function(add_torch_to_cmake_prefix_path) + get_torch_base_path(_tmp_torch_path) + list(APPEND CMAKE_PREFIX_PATH "${_tmp_torch_path}") + set(CMAKE_PREFIX_PATH + "${CMAKE_PREFIX_PATH}" + PARENT_SCOPE + ) +endfunction() + +# Replacement for find_package(Torch CONFIG REQUIRED); sets up CMAKE_PREFIX_PATH +# first and only does the find once. If you have a header-only Torch dependency, +# use find_package_torch_headers instead! +macro(find_package_torch) if(NOT TARGET torch) + add_torch_to_cmake_prefix_path() find_package(Torch CONFIG REQUIRED) endif() - foreach(mode_kind IN ITEMS PACKAGE LIBRARY INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_${mode_kind} ${OLD_CMAKE_FIND_ROOT_PATH_MODE_${mode_kind}}) - endforeach() endmacro() diff --git a/tools/cmake/__init__.py b/tools/cmake/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/build/buck_util.py b/tools/cmake/buck_util.py similarity index 81% rename from build/buck_util.py rename to tools/cmake/buck_util.py index d01a6f42731..9b093820054 100644 --- a/build/buck_util.py +++ b/tools/cmake/buck_util.py @@ -8,12 +8,20 @@ import os import subprocess import sys +from functools import cache +from pathlib import Path from typing import Optional, Sequence -# Run buck2 from the same directory (and thus repo) as this script. 
-BUCK_CWD: str = os.path.dirname(os.path.realpath(__file__)) +@cache +def repo_root_dir() -> Path: + git_root = subprocess.check_output( + ["git", "rev-parse", "--show-toplevel"], + cwd=os.path.dirname(os.path.realpath(__file__)), + text=True, + ).strip() + return Path(git_root) class Buck2Runner: @@ -26,7 +34,7 @@ def run(self, args: Sequence[str]) -> list[str]: cp: subprocess.CompletedProcess = subprocess.run( [self._path] + args, # type: ignore[operator] capture_output=True, - cwd=BUCK_CWD, + cwd=repo_root_dir(), check=True, ) return [line.strip().decode("utf-8") for line in cp.stdout.splitlines()] diff --git a/build/cmake_deps.toml b/tools/cmake/cmake_deps.toml similarity index 92% rename from build/cmake_deps.toml rename to tools/cmake/cmake_deps.toml index c44fcf92ea6..a251891e622 100644 --- a/build/cmake_deps.toml +++ b/tools/cmake/cmake_deps.toml @@ -58,6 +58,21 @@ deps = [ "executorch_core", ] +# HACK: prevent reduce_util from also showing up in custom_ops. The +# actual medium-term fix is to stop using Buck to drive our CMake +# builds. +[targets.reduce_util] +buck_targets = [ + "//kernels/portable/cpu/util:reduce_util", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", +] + [targets.optimized_kernels] buck_targets = [ "//kernels/optimized:generated_lib", @@ -74,6 +89,7 @@ deps = [ "executorch", "executorch_core", "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] @@ -115,11 +131,12 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_threadpool", ] -[targets.optimized_native_cpu_ops_oss] +[targets.optimized_native_cpu_ops] buck_targets = [ - "//configurations:optimized_native_cpu_ops_oss", + "//configurations:optimized_native_cpu_ops", ] filters = [ ".cpp$", @@ -129,6 +146,8 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -347,6 +366,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", "xnnpack_backend", "portable_kernels", ] @@ -361,6 +381,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -386,10 +407,7 @@ buck_targets = [ "//extension/llm/custom_ops:custom_ops", ] filters = [ - # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO: - # remove filters and patch extract_sources.py's Buck query to fetch - # srcs; presumably filters is here to remove .h files. - "(.cpp$)|(fht.*\\.c$)", + ".cpp$", ] excludes = [ "^codegen", @@ -397,8 +415,10 @@ excludes = [ deps = [ "executorch", "executorch_core", + "optimized_cpublas", "optimized_kernels", "extension_threadpool", + "reduce_util", "xnnpack_backend", ] @@ -434,9 +454,11 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", + "extension_threadpool", + "optimized_cpublas", "portable_kernels", "quantized_kernels", "xnnpack_backend", - "optimized_native_cpu_ops_oss", + "optimized_native_cpu_ops", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/build/executorch-config.cmake b/tools/cmake/executorch-config.cmake similarity index 60% rename from build/executorch-config.cmake rename to tools/cmake/executorch-config.cmake index d14a1227cd9..13ec0b876ad 100644 --- a/build/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -1,4 +1,3 @@ - # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. 
# @@ -16,20 +15,23 @@ # # This will define the following variables: # -# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library -# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch -# EXECUTORCH_LIBRARIES -- Libraries to link against +# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch +# EXECUTORCH_LIBRARIES -- Libraries to link against # -# The actual values for these variables will be different from what executorch-config.cmake -# in executorch pip package gives, but we wanted to keep the contract of exposing these -# CMake variables. +# The actual values for these variables will be different from what +# executorch-config.cmake in executorch pip package gives, but we wanted to keep +# the contract of exposing these CMake variables. cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) +set(EXECUTORCH_INCLUDE_DIRS + ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib +) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) @@ -40,7 +42,12 @@ foreach(lib ${required_lib_list}) ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -65,6 +72,7 @@ set(lib_list neuron_backend qnn_executorch_backend portable_ops_lib + custom_ops extension_module extension_module_static extension_runner_util @@ -81,6 +89,7 @@ set(lib_list pthreadpool vulkan_backend optimized_kernels + optimized_portable_kernels cpublas eigen_blas optimized_ops_lib @@ -110,7 +119,42 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() + +# TODO: investigate use of install(EXPORT) to cleanly handle +# target_compile_options/target_compile_definitions for everything. 
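+# Until then, the blocks below hand-patch the interface link libraries and
+# compile definitions that install(EXPORT) would otherwise record.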
+if(TARGET cpublas)
+  set_target_properties(
+    cpublas PROPERTIES INTERFACE_LINK_LIBRARIES
+                       "extension_threadpool;eigen_blas"
+  )
+endif()
+if(TARGET optimized_kernels)
+  set_target_properties(
+    optimized_kernels PROPERTIES INTERFACE_LINK_LIBRARIES
+                                 "executorch_core;cpublas;extension_threadpool"
+  )
+endif()
+if(TARGET optimized_native_cpu_ops_lib)
+  if(TARGET optimized_portable_kernels)
+    set(_maybe_optimized_portable_kernels_lib optimized_portable_kernels)
+  else()
+    set(_maybe_optimized_portable_kernels_lib portable_kernels)
+  endif()
+  set_target_properties(
+    optimized_native_cpu_ops_lib
+    PROPERTIES INTERFACE_LINK_LIBRARIES
+               "optimized_kernels;${_maybe_optimized_portable_kernels_lib}"
+  )
+endif()
+if(TARGET extension_threadpool)
+  target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+endif()
diff --git a/build/executorch-wheel-config.cmake b/tools/cmake/executorch-wheel-config.cmake
similarity index 100%
rename from build/executorch-wheel-config.cmake
rename to tools/cmake/executorch-wheel-config.cmake
diff --git a/build/extract_sources.py b/tools/cmake/extract_sources.py
similarity index 100%
rename from build/extract_sources.py
rename to tools/cmake/extract_sources.py
diff --git a/build/resolve_buck.py b/tools/cmake/resolve_buck.py
similarity index 96%
rename from build/resolve_buck.py
rename to tools/cmake/resolve_buck.py
index 725a326ea67..6da0a81b6de 100644
--- a/build/resolve_buck.py
+++ b/tools/cmake/resolve_buck.py
@@ -37,10 +37,10 @@
 """
 
+
 # Path to the file containing BUCK2 version (build date) for ExecuTorch.
-# Note that this path is relative to this script file, not the working
-# directory.
-BUCK_VERSION_FILE = "../.ci/docker/ci_commit_pins/buck2.txt"
+def _buck_version_path() -> Path:
+    return buck_util.repo_root_dir() / ".ci/docker/ci_commit_pins/buck2.txt"
 
 
 @dataclass
@@ -125,9 +125,7 @@ def resolve_buck2(args: argparse.Namespace) -> Union[str, int]:
 
     # Read the target version (build date) from the CI pin file. Note that
     # this path is resolved relative to the directory containing this script.
-    script_dir = os.path.dirname(__file__)
-    version_file_path = Path(script_dir) / BUCK_VERSION_FILE
-    with open(version_file_path.absolute().as_posix()) as f:
+    with open(_buck_version_path().absolute().as_posix()) as f:
         target_buck_version = f.read().strip()
 
     # Determine the target buck2 version string according to the current
diff --git a/build/pip_data_bin_init.py.in b/tools/wheel/pip_data_bin_init.py.in
similarity index 100%
rename from build/pip_data_bin_init.py.in
rename to tools/wheel/pip_data_bin_init.py.in
diff --git a/util/collect_env.py b/util/collect_env.py
new file mode 100644
index 00000000000..7d35c0636ce
--- /dev/null
+++ b/util/collect_env.py
@@ -0,0 +1,749 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: allow-untyped-defs
+
+# Unlike the rest of PyTorch, this file must be Python 2 compliant.
+# This script outputs relevant system environment info +# Run it with `python util/collect_env.py` or `python -m util.collect_env` + +import datetime +import json +import locale +import os +import re +import subprocess +import sys +from collections import namedtuple + + +try: + import torch + + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple( + "SystemEnv", + [ + "torch_version", + "is_debug_build", + "cuda_compiled_version", + "gcc_version", + "clang_version", + "cmake_version", + "os", + "libc_version", + "python_version", + "python_platform", + "is_cuda_available", + "cuda_runtime_version", + "cuda_module_loading", + "nvidia_driver_version", + "nvidia_gpu_models", + "cudnn_version", + "pip_version", # 'pip' or 'pip3' + "pip_packages", + "conda_packages", + "hip_compiled_version", + "hip_runtime_version", + "miopen_runtime_version", + "caching_allocator_config", + "is_xnnpack_available", + "cpu_info", + ], +) + +COMMON_PATTERNS = [ + "torch", + "numpy", + "triton", + "optree", +] + +NVIDIA_PATTERNS = [ + "cuda-cudart", + "cuda-cupti", + "cuda-libraries", + "cuda-opencl", + "cuda-nvrtc", + "cuda-runtime", + "cublas", + "cudnn", + "cufft", + "curand", + "cusolver", + "cusparse", + "nccl", + "nvjitlink", + "nvtx", +] + +CONDA_PATTERNS = [ + "cudatoolkit", + "soumith", + "mkl", + "magma", +] + +PIP_PATTERNS = [ + "mypy", + "flake8", + "onnx", +] + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell + ) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == "win32": + enc = "oem" + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split("\n")[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = CONDA_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS + conda = os.environ.get("CONDA_EXE", "conda") + out = run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") and any(name in line for name in patterns) + ) + + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)") + + +def get_clang_version(run_lambda): + return run_and_parse_first_match( + run_lambda, "clang --version", r"clang version (.*)" + ) + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)") + + +def get_nvidia_driver_version(run_lambda): + if 
get_platform() == "darwin": + cmd = "kextstat | grep -i cuda" + return run_and_parse_first_match( + run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]" + ) + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ") + + +def get_gpu_info(run_lambda): + if get_platform() == "darwin" or ( + TORCH_AVAILABLE + and hasattr(torch.version, "hip") + and torch.version.hip is not None + ): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r" \(UUID: .+?\)") + rc, out, _ = run_lambda(smi + " -L") + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, "", out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)") + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == "win32": + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%") + where_cmd = os.path.join(system_root, "System32", "where") + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == "darwin": + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. 
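+        # The `ls` glob may match several versioned libraries; the caller
+        # below reports all candidates when more than one is found.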
+ cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*" + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + cudnn_lib = os.environ.get("CUDNN_LIBRARY") + if cudnn_lib is not None and os.path.isfile(cudnn_lib): + return os.path.realpath(cudnn_lib) + return None + files_set = set() + for fn in out.split("\n"): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = "\n".join(files) + return "Probably one of the following:\n{}".format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = "nvidia-smi" + if get_platform() == "win32": + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files") + legacy_path = os.path.join( + program_files_root, "NVIDIA Corporation", "NVSMI", smi + ) + new_path = os.path.join(system_root, "System32", smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not 
affected
+# Tsx async abort: Not affected
+# * win32
+# Architecture=9
+# CurrentClockSpeed=2900
+# DeviceID=CPU0
+# Family=179
+# L2CacheSize=40960
+# L2CacheSpeed=
+# Manufacturer=GenuineIntel
+# MaxClockSpeed=2900
+# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+# ProcessorType=3
+# Revision=27142
+#
+# Architecture=9
+# CurrentClockSpeed=2900
+# DeviceID=CPU1
+# Family=179
+# L2CacheSize=40960
+# L2CacheSpeed=
+# Manufacturer=GenuineIntel
+# MaxClockSpeed=2900
+# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+# ProcessorType=3
+# Revision=27142
+
+
+def get_cpu_info(run_lambda):
+    rc, out, err = 0, "", ""
+    if get_platform() == "linux":
+        rc, out, err = run_lambda("lscpu")
+    elif get_platform() == "win32":
+        rc, out, err = run_lambda(
+            'powershell.exe "gwmi -Class Win32_Processor | Select-Object -Property Name,Manufacturer,Family,\
+            Architecture,ProcessorType,DeviceID,CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision\
+            | ConvertTo-Json"'
+        )
+        if rc == 0:
+            lst = []
+            try:
+                obj = json.loads(out)
+                if type(obj) is list:
+                    for o in obj:
+                        lst.append("----------------------")
+                        lst.extend([f"{k}: {v}" for (k, v) in o.items()])
+                else:
+                    lst.extend([f"{k}: {v}" for (k, v) in obj.items()])
+            except ValueError as e:
+                lst.append(out)
+                lst.append(str(e))
+            out = "\n".join(lst)
+    elif get_platform() == "darwin":
+        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
+    cpu_info = "None"
+    if rc == 0:
+        cpu_info = out
+    else:
+        cpu_info = err
+    return cpu_info
+
+
+def get_platform():
+    if sys.platform.startswith("linux"):
+        return "linux"
+    elif sys.platform.startswith("win32"):
+        return "win32"
+    elif sys.platform.startswith("cygwin"):
+        return "cygwin"
+    elif sys.platform.startswith("darwin"):
+        return "darwin"
+    else:
+        return sys.platform
+
+
+def get_mac_version(run_lambda):
+    return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
+
+
+def get_windows_version(run_lambda):
+    ret = run_and_read_all(
+        run_lambda,
+        'powershell.exe "gwmi -Class Win32_OperatingSystem | Select-Object -Property Caption,\
+        OSArchitecture,Version | ConvertTo-Json"',
+    )
+    if ret is None:
+        # The powershell invocation failed; report "could not collect".
+        return ret
+    try:
+        obj = json.loads(ret)
+        ret = f'{obj["Caption"]} ({obj["Version"]} {obj["OSArchitecture"]})'
+    except ValueError as e:
+        ret += f"\n{str(e)}"
+    return ret
+
+
+def get_lsb_version(run_lambda):
+    return run_and_parse_first_match(
+        run_lambda, "lsb_release -a", r"Description:\t(.*)"
+    )
+
+
+def check_release_file(run_lambda):
+    return run_and_parse_first_match(
+        run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"'
+    )
+
+
+def get_os(run_lambda):
+    from platform import machine
+
+    platform = get_platform()
+
+    if platform == "win32" or platform == "cygwin":
+        return get_windows_version(run_lambda)
+
+    if platform == "darwin":
+        version = get_mac_version(run_lambda)
+        if version is None:
+            return None
+        return "macOS {} ({})".format(version, machine())
+
+    if platform == "linux":
+        # Ubuntu/Debian based
+        desc = get_lsb_version(run_lambda)
+        if desc is not None:
+            return "{} ({})".format(desc, machine())
+
+        # Try reading /etc/*-release
+        desc = check_release_file(run_lambda)
+        if desc is not None:
+            return "{} ({})".format(desc, machine())
+
+        return "{} ({})".format(platform, machine())
+
+    # Unknown platform
+    return platform
+
+
+def get_python_platform():
+    import platform
+
+    return platform.platform()
+
+
+def get_libc_version():
+    import platform
+
+    if get_platform() != "linux":
+        return "N/A"
+    return "-".join(platform.libc_ver())
+
+
+def get_pip_packages(run_lambda, patterns=None):
+    """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
+    if patterns is None:
+        patterns = PIP_PATTERNS + COMMON_PATTERNS + NVIDIA_PATTERNS
+
+    pip_version = "pip3" if sys.version[0] == "3" else "pip"
+
+    os.environ["PIP_DISABLE_PIP_VERSION_CHECK"] = "1"
+    # People generally have pip as `pip` or `pip3`
+    # But here it is invoked as `python -mpip`
+    out = run_and_read_all(
+        run_lambda, [sys.executable, "-mpip", "list", "--format=freeze"]
+    )
+    if out is None:
+        # `pip list` failed; report that the packages could not be collected.
+        return pip_version, out
+    filtered_out = "\n".join(
+        line for line in out.splitlines() if any(name in line for name in patterns)
+    )
+
+    return pip_version, filtered_out
+
+
+def get_cachingallocator_config():
+    ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    return ca_config
+
+
+def get_cuda_module_loading_config():
+    if TORCH_AVAILABLE and torch.cuda.is_available():
+        torch.cuda.init()
+        config = os.environ.get("CUDA_MODULE_LOADING", "")
+        return config
+    else:
+        return "N/A"
+
+
+def is_xnnpack_available():
+    if TORCH_AVAILABLE:
+        import torch.backends.xnnpack
+
+        return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
+    else:
+        return "N/A"
+
+
+def get_env_info():
+    """
+    Collects environment information to aid in debugging.
+
+    The returned environment information contains details on the torch version, whether
+    it is a debug build, the CUDA compiled version, gcc version, clang version, cmake
+    version, operating system, libc version, python version, python platform, CUDA
+    availability, CUDA runtime version, CUDA module loading config, GPU model and
+    configuration, Nvidia driver version, cuDNN version, pip version and versions of
+    relevant pip and conda packages, HIP runtime version, MIOpen runtime version,
+    caching allocator config, XNNPACK availability and CPU information.
+
+    Returns:
+        SystemEnv (namedtuple): A tuple containing various environment details
+        and system information.
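+
+    Example (illustrative; values vary per machine):
+        >>> env = get_env_info()  # doctest: +SKIP
+        >>> env.is_cuda_available  # doctest: +SKIP
+        'True'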
+ """ + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if ( + not hasattr(torch.version, "hip") or torch.version.hip is None + ): # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + else: # HIP version + + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else "N/A" + + cfg = torch._C._show_config().split("\n") + hip_runtime_version = get_version_or_na(cfg, "HIP Runtime") + miopen_runtime_version = get_version_or_na(cfg, "MIOpen") + cuda_version_str = "N/A" + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A" + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version="{} ({}-bit runtime)".format( + sys_version, sys.maxsize.bit_length() + 1 + ), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + cpu_info=get_cpu_info(run_lambda), + ) + + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + + +def pretty_str(envinfo): # noqa: C901 + def replace_nones(dct, replacement="Could not collect"): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true="Yes", false="No"): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return 
dct
+
+    def prepend(text, tag="[prepend]"):
+        lines = text.split("\n")
+        updated_lines = [tag + line for line in lines]
+        return "\n".join(updated_lines)
+
+    def replace_if_empty(text, replacement="No relevant packages"):
+        if text is not None and len(text) == 0:
+            return replacement
+        return text
+
+    def maybe_start_on_next_line(string):
+        # If `string` is multiline, prepend a \n to it.
+        if string is not None and len(string.split("\n")) > 1:
+            return "\n{}\n".format(string)
+        return string
+
+    mutable_dict = envinfo._asdict()
+
+    # If nvidia_gpu_models is multiline, start on the next line
+    mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(
+        envinfo.nvidia_gpu_models
+    )
+
+    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
+    dynamic_cuda_fields = [
+        "cuda_runtime_version",
+        "nvidia_gpu_models",
+        "nvidia_driver_version",
+    ]
+    all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
+    all_dynamic_cuda_fields_missing = all(
+        mutable_dict[field] is None for field in dynamic_cuda_fields
+    )
+    if (
+        TORCH_AVAILABLE
+        and not torch.cuda.is_available()
+        and all_dynamic_cuda_fields_missing
+    ):
+        for field in all_cuda_fields:
+            mutable_dict[field] = "No CUDA"
+        if envinfo.cuda_compiled_version is None:
+            mutable_dict["cuda_compiled_version"] = "None"
+
+    # Replace True with Yes, False with No
+    mutable_dict = replace_bools(mutable_dict)
+
+    # Replace all None objects with 'Could not collect'
+    mutable_dict = replace_nones(mutable_dict)
+
+    # If either of these are '', replace with 'No relevant packages'
+    mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
+    mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
+
+    # Tag conda and pip packages with a prefix.
+    # If they were previously None, they'll show up as e.g. '[conda] Could not collect'
+    if mutable_dict["pip_packages"]:
+        mutable_dict["pip_packages"] = prepend(
+            mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version)
+        )
+    if mutable_dict["conda_packages"]:
+        mutable_dict["conda_packages"] = prepend(
+            mutable_dict["conda_packages"], "[conda] "
+        )
+    mutable_dict["cpu_info"] = envinfo.cpu_info
+    return env_info_fmt.format(**mutable_dict)
+
+
+def get_pretty_env_info():
+    """
+    Returns a pretty string of environment information.
+
+    This function retrieves environment information by calling `get_env_info` and then
+    formats it into a human-readable string. The collected fields are described in the
+    docstring of `get_env_info`. This function is invoked by `python collect_env.py`,
+    which should be run when reporting a bug.
+
+    Returns:
+        str: A pretty string of the environment information.
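+
+    Example (abridged, hypothetical values; real output depends on the machine):
+        >>> print(get_pretty_env_info())  # doctest: +SKIP
+        PyTorch version: 2.5.0
+        Is debug build: False
+        ...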
+ """ + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if ( + TORCH_AVAILABLE + and hasattr(torch, "utils") + and hasattr(torch.utils, "_crash_handler") + ): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [ + os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir) + ] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime( + "%Y-%m-%d %H:%M:%S" + ) + msg = ( + "\n*** Detected a minidump at {} created on {}, ".format( + latest, creation_time + ) + + "if this is related to your bug please include it when you file a report ***" + ) + print(msg, file=sys.stderr) + + +if __name__ == "__main__": + main()