diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index b17dd3f8f95..a846b87c198 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-295f2ed4d103017f7e19a7b8263ece606cd629db
+7ae0ce6360b6e4f944906502d20da24c04debee5
diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index 3f22d7699de..802aee4b53c 100755
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -14,7 +14,7 @@ from typing import Any
 
 from examples.models import MODEL_NAME_TO_MODEL
-from examples.xnnpack import MODEL_NAME_TO_OPTIONS
+from examples.xnnpack import MODEL_NAME_TO_OPTIONS, QuantType
 
 DEFAULT_RUNNERS = {
     "linux": "linux.2xlarge",
@@ -154,7 +154,7 @@ def export_models_for_ci() -> dict[str, dict]:
         if backend == "xnnpack":
             if name not in MODEL_NAME_TO_OPTIONS:
                 continue
-            if MODEL_NAME_TO_OPTIONS[name].quantization:
+            if MODEL_NAME_TO_OPTIONS[name].quantization != QuantType.NONE:
                 backend += "-quantization"
 
             if MODEL_NAME_TO_OPTIONS[name].delegation:
diff --git a/.ci/scripts/unittest-linux.sh b/.ci/scripts/unittest-linux.sh
index f8ff9df773e..a05211d8e0e 100755
--- a/.ci/scripts/unittest-linux.sh
+++ b/.ci/scripts/unittest-linux.sh
@@ -21,8 +21,7 @@ if [[ "$BUILD_TOOL" == "cmake" ]]; then
   source .ci/scripts/setup-vulkan-linux-deps.sh
 
   PYTHON_EXECUTABLE=python \
-  EXECUTORCH_BUILD_PYBIND=ON \
-  CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
+  CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
   .ci/scripts/setup-linux.sh "$@"
 
   # Install llama3_2_vision dependencies.
diff --git a/.ci/scripts/unittest-macos.sh b/.ci/scripts/unittest-macos.sh
index d5ca97404aa..12c9d3f1508 100755
--- a/.ci/scripts/unittest-macos.sh
+++ b/.ci/scripts/unittest-macos.sh
@@ -21,8 +21,7 @@ trap 'rm -rfv ${TMP_DIR}' EXIT
 
 # Setup MacOS dependencies as there is no Docker support on MacOS atm
 PYTHON_EXECUTABLE=python \
-EXECUTORCH_BUILD_PYBIND=ON \
-CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
+CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
 ${CONDA_RUN} --no-capture-output \
 .ci/scripts/setup-macos.sh "$@"
 
diff --git a/.ci/scripts/wheel/envvar_base.sh b/.ci/scripts/wheel/envvar_base.sh
index 6379dee6b5a..15f590f0f68 100755
--- a/.ci/scripts/wheel/envvar_base.sh
+++ b/.ci/scripts/wheel/envvar_base.sh
@@ -8,13 +8,10 @@
 # should typically only contain shell variable assignments. Be sure to export
 # any variables so that subprocesses will see them.
 
-# Enable pybindings so that users can execute ExecuTorch programs from python.
-export EXECUTORCH_BUILD_PYBIND=1
-
 # Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty
 # if not defined.
 export CMAKE_ARGS="${CMAKE_ARGS:-}"
 
 # Link the XNNPACK backend into the pybindings runtime so that users can execute
 # ExecuTorch programs that delegate to it.
-CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON"
+CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON"
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 9a2221b3aac..c3eafc02c39 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -365,8 +365,7 @@ jobs:
         # build module for executorch.extension.pybindings.portable_lib
         BUILD_TOOL="cmake"
         PYTHON_EXECUTABLE=python \
-        EXECUTORCH_BUILD_XNNPACK=ON \
-        EXECUTORCH_BUILD_PYBIND=ON \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON" \
         bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
 
         # see if we can import the module successfully
@@ -504,7 +503,7 @@ jobs:
 
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
         PYTHON_EXECUTABLE=python \
-        EXECUTORCH_BUILD_PYBIND=ON \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" \
         EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
         .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
 
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 829d8fd88f4..6c4d7f8a58e 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -65,22 +65,29 @@ jobs:
       matrix:
         model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe]
         backend: [portable, xnnpack-quantization-delegation]
+        runner: [linux.arm64.2xlarge]
        include:
          - model: lstm
            backend: portable
+           runner: linux.arm64.2xlarge
          - model: mul
            backend: portable
+           runner: linux.arm64.2xlarge
          - model: softmax
            backend: portable
+           runner: linux.arm64.2xlarge
          - model: phi_4_mini
            backend: portable
+           runner: linux.arm64.m7g.4xlarge
          - model: qwen2_5
            backend: portable
+           runner: linux.arm64.2xlarge
          - model: llama3_2_vision_encoder
            backend: portable
+           runner: linux.arm64.2xlarge
       fail-fast: false
     with:
-      runner: linux.arm64.2xlarge
+      runner: ${{ matrix.runner }}
       docker-image: executorch-ubuntu-22.04-gcc11-aarch64
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -261,7 +268,7 @@ jobs:
 
         # build module for executorch.extension.pybindings.portable_lib
         BUILD_TOOL=${{ matrix.build-tool }}
-        EXECUTORCH_BUILD_PYBIND=ON PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}"
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}"
 
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
@@ -536,9 +543,8 @@ jobs:
           git clone https://github.com/huggingface/optimum-executorch
           cd optimum-executorch
           # There is no release yet, for CI stability, always test from the same commit on main
-          git checkout 6a7e83f3eee2976fa809335bfb78a45b1ea1cb25
-          pip install .
-          pip install accelerate sentencepiece
+          git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+          pip install .[tests]
          pip list
          echo "::endgroup::"
 
diff --git a/Package.swift b/Package.swift
index 94acfc4cd7b..1322b918c07 100644
--- a/Package.swift
+++ b/Package.swift
@@ -82,5 +82,24 @@ let package = Package(
          (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) }
        ),
      ]
-  }
+  } + [
+    .testTarget(
+      name: "tests",
+      dependencies: [
+        .target(name: "executorch_debug"),
+        .target(name: "kernels_portable"),
+      ],
+      path: "extension/apple/ExecuTorch/__tests__",
+      resources: [
+        .copy("resources/add.pte")
+      ],
+      linkerSettings: [
+        .linkedLibrary("c++"),
+        .unsafeFlags([
+          "-Xlinker", "-force_load",
+          "-Xlinker", "cmake-out/kernels_portable.xcframework/macos-arm64/libkernels_portable_macos.a",
+        ])
+      ]
+    )
+  ]
 )
diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py
index c7828888ee5..e9afd819d94 100644
--- a/backends/apple/coreml/compiler/coreml_preprocess.py
+++ b/backends/apple/coreml/compiler/coreml_preprocess.py
@@ -16,8 +16,8 @@
 import coremltools as ct
 import coremltools.optimize as cto
-import executorchcoreml
 
+from executorch.backends.apple.coreml import executorchcoreml
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
index 3e11999e939..3848f7c9b3c 100644
--- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
+++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
@@ -208,7 +208,7 @@ void set_outputs(std::vector& outputs,
                  const inmemoryfs::InMemoryFileSystem *inmemory_fs,
                  NSError * __autoreleasing *error) {
     NSError *local_error = nil;
-    if (![fm createDirectoryAtURL:dst_url withIntermediateDirectories:NO attributes:@{} error:error]) {
+    if (![fm createDirectoryAtURL:dst_url withIntermediateDirectories:YES attributes:@{} error:error]) {
         ETCoreMLLogUnderlyingErrorAndSetNSError(error,
                                                 ETCoreMLErrorModelSaveFailed,
                                                 local_error,
diff --git a/backends/apple/coreml/runtime/inmemoryfs/setup.py b/backends/apple/coreml/runtime/inmemoryfs/setup.py
deleted file mode 100644
index c93022ed341..00000000000
--- a/backends/apple/coreml/runtime/inmemoryfs/setup.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import os
-
-import pathlib
-import sys
-
-REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent.parent
-PYBIND11_DIR_PATH = REPO_ROOT / "third-party" / "pybind11"
-sys.path.append(str(PYBIND11_DIR_PATH.absolute()))
-
-from pybind11.setup_helpers import build_ext, Pybind11Extension
-from setuptools import setup
-
-__version__ = "0.0.1"
-
-cxx_std = int(os.environ.get("CMAKE_CXX_STANDARD", "17"))
-
-ext_modules = [
-    Pybind11Extension(
-        "executorchcoreml",
-        [
-            "../util/json_util.cpp",
-            "inmemory_filesystem.cpp",
-            "inmemory_filesystem_py.cpp",
-            "inmemory_filesystem_utils.cpp",
-            "memory_buffer.cpp",
-            "memory_stream.cpp",
-            "reversed_memory_stream.cpp",
-        ],
-        define_macros=[("VERSION_INFO", __version__)],
-        cxx_std=cxx_std,
-        extra_compile_args=["-mmacosx-version-min=10.15", "-g"],
-        include_dirs=[
-            "../../third-party/nlohmann_json/single_include",
-            ".",
-            "../util",
-        ],
-    ),
-]
-
-setup(
-    name="executorchcoreml",
-    version=__version__,
-    description="CoreML extension for executorch",
-    long_description="",
-    author="Apple Inc.",
-    ext_modules=ext_modules,
-    extras_require={"test": "pytest"},
-    cmdclass={"build_ext": build_ext},
-    include_package_data=True,
-    zip_safe=False,
-    python_requires=">=3.9",
-)
diff --git a/backends/apple/coreml/scripts/install_inmemoryfs.sh b/backends/apple/coreml/scripts/install_inmemoryfs.sh
deleted file mode 100644
index 1fb9dd1c4d5..00000000000
--- a/backends/apple/coreml/scripts/install_inmemoryfs.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright © 2023 Apple Inc. All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-SCRIPT_DIR_PATH="$(
-    cd -- "$(dirname "$0")" >/dev/null 2>&1
-    pwd -P
-)"
-
-EXECUTORCH_ROOT_PATH=$(realpath "$SCRIPT_DIR_PATH/../../../../")
-COREML_DIR_PATH="$EXECUTORCH_ROOT_PATH/backends/apple/coreml"
-
-red=`tput setaf 1`
-green=`tput setaf 2`
-
-echo "${green}ExecuTorch: Installing inmemoryfs extension."
-pip install "$COREML_DIR_PATH/runtime/inmemoryfs"
-STATUS=$?
-if [ $STATUS -ne 0 ]; then
-    echo "${red}ExecuTorch: Failed to install inmemoryfs extension."
-    exit 1
-fi
\ No newline at end of file
diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh
index 140ba09c702..5b3f4e3f31a 100755
--- a/backends/apple/coreml/scripts/install_requirements.sh
+++ b/backends/apple/coreml/scripts/install_requirements.sh
@@ -10,6 +10,10 @@ SCRIPT_DIR_PATH="$(
     pwd -P
 )"
 
+# TODO(jathu): remove the need to fetch coremltools to build deps for coreml_executor_runner.
+# Keep this version in sync with: pyproject.toml
+COREMLTOOLS_VERSION="8.1"
+
 red=`tput setaf 1`
 green=`tput setaf 2`
 
@@ -24,7 +28,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
 mkdir "$COREML_DIR_PATH/third-party"
 
 echo "${green}ExecuTorch: Cloning coremltools."
-git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
+git clone --depth 1 --branch "${COREMLTOOLS_VERSION}" "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
 cd $COREMLTOOLS_DIR_PATH
 
 STATUS=$?
@@ -43,16 +47,7 @@ fi
 
 mkdir "$COREMLTOOLS_DIR_PATH/build"
 cmake -S "$COREMLTOOLS_DIR_PATH" -B "$COREMLTOOLS_DIR_PATH/build"
-cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel
-
-echo "${green}ExecuTorch: Installing coremltools."
-pip install "$COREMLTOOLS_DIR_PATH"
-
-STATUS=$?
-if [ $STATUS -ne 0 ]; then
-  echo "${red}ExecuTorch: Failed to install coremltools."
-  exit 1
-fi
+cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel --target mlmodel
 
 echo "${green}ExecuTorch: Cloning nlohmann."
 git clone https://github.com/nlohmann/json.git "$COREML_DIR_PATH/third-party/nlohmann_json"
@@ -62,8 +57,6 @@ if [ $STATUS -ne 0 ]; then
   exit 1
 fi
 
-sh "$COREML_DIR_PATH/scripts/install_inmemoryfs.sh"
-
 echo "${green}ExecuTorch: Copying protobuf files."
 mkdir -p "$COREML_DIR_PATH/runtime/sdk/format/"
 cp -rf "$PROTOBUF_FILES_DIR_PATH" "$COREML_DIR_PATH/runtime/sdk/format/"
diff --git a/backends/apple/coreml/setup.md b/backends/apple/coreml/setup.md
index 6b7ffa4ded8..c6daae0d989 100644
--- a/backends/apple/coreml/setup.md
+++ b/backends/apple/coreml/setup.md
@@ -6,16 +6,8 @@ This is a tutorial for setting up the Core ML backend.
 
 1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment.
 
-2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend.
-```
-cd executorch
-
-./backends/apple/coreml/scripts/install_requirements.sh
-
-```
-
-3. Run the example script to validate that the **Core ML** backend is set up correctly.
+2. Run the example script to validate that the **Core ML** backend is set up correctly.
 
 ```
 cd executorch
@@ -26,7 +18,7 @@ python3 -m examples.apple.coreml.scripts.export --model_name add
 
 ```
 
-4. You can now integrate the **Core ML** backend in code.
+3. You can now integrate the **Core ML** backend in code.
 
 ```python
 # Delegate to Core ML backend
diff --git a/backends/arm/operator_support/convolution_support.py b/backends/arm/operator_support/convolution_support.py
index b07ae82f98f..9e13babe23a 100644
--- a/backends/arm/operator_support/convolution_support.py
+++ b/backends/arm/operator_support/convolution_support.py
@@ -55,7 +55,7 @@ def _is_node_supported_u55(self, node: fx.Node):
         C_in = shape_in[1]
         C_out = shape_out[1]
 
-        if (C_in == group) and (C_out % C_in) == 0:
+        if (C_in == group) and (C_out % C_in) == 0 and len(shape_in) <= 4:
             # Depthwise convolution
             for dim in shape_in[1:]:
                 if not 1 <= dim <= 65536:
@@ -74,6 +74,7 @@ def _is_node_supported_u55(self, node: fx.Node):
 
         kernel_w = kernel[2]
         kernel_h = kernel[3] if len(kernel) > 3 else 1
+        kernel_z = kernel[4] if len(kernel) > 4 else 1
 
         # Kernel condition misses constraint on sum of absolute weights
         if not 1 <= kernel_h <= 64 or not 1 <= kernel_w * kernel_h <= 4096:
             self.reporter.report_reject(
@@ -81,6 +82,11 @@ def _is_node_supported_u55(self, node: fx.Node):
                 f"Convolution needs to have kernel_y<=64, kernel_x*kernel_y<=4096, got kernel ({kernel_w}, {kernel_h})",
             )
             return False
+        if kernel_z != 1:
+            self.reporter.report_reject(
+                node, f"Convolution3d needs to have kernel_z==1, got {kernel_z}."
+            )
+            return False
 
         if not self._stride_condition(node):
             self.reporter.report_reject(
@@ -107,6 +113,14 @@ def _stride_condition(self, node: fx.Node) -> bool:
         if len(strides) == 1:
             strides = [strides[0]] * 2
 
+        if len(strides) > 2:
+            stride_z = strides[2]
+            if stride_z > 1:
+                self.reporter.report_reject(
+                    node, f"Convolution3d only supports stride_z<=1, got {stride_z}."
+                )
+                return False
+
         for stride, dilation in zip(strides, dilations):
             stride_condition = 1 <= stride <= 3
             dilation_condition = (not has_padding) and (dilation == 1)
diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py
index 973f62d2724..2656c12417d 100644
--- a/backends/arm/test/models/test_llama.py
+++ b/backends/arm/test/models/test_llama.py
@@ -52,7 +52,7 @@ def prepare_model(self):
                 params_file, str
             ), "invalid input for --llama_inputs"
         else:
-            logging.warning(
+            logger.warning(
                 "Skipping Llama test because of lack of input. To run use --llama_inputs <.pt> <.json>"
             )
             return None, None, None
 
@@ -61,6 +61,8 @@ def prepare_model(self):
             params_file
         ), "Invalid file paths"
 
+        logger.info("Running test_llama.py")
+
         # TODO: Enable key value cache
         args = [
             "--disable_dynamic_shape",
@@ -112,9 +114,11 @@ def test_llama_tosa_MI(self):
             )
             .export()
             .to_edge_transform_and_lower()
-            .check_count({"torch.ops.higher_order.executorch_call_delegate": 14})
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 26})
             .to_executorch()
             .run_method_and_compare_outputs(
-                inputs=llama_inputs, atol=1.8, rtol=0.01  # TODO: decrease tolerance
+                inputs=llama_inputs,
+                atol=4.3,
+                rtol=1.1,  # TODO: MLETORCH-825 decrease tolerance
             )
         )
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
index 8083b2ecf71..844eed97638 100644
--- a/backends/arm/test/ops/test_conv2d.py
+++ b/backends/arm/test/ops/test_conv2d.py
@@ -8,10 +8,10 @@
 import torch
 
 from executorch.backends.arm.test import common
-from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineBI,
     EthosU85PipelineBI,
+    OpNotSupportedPipeline,
     TosaPipelineBI,
     TosaPipelineMI,
 )
@@ -34,9 +34,9 @@ def __init__(
         in_channels: Union[List, int, None] = None,
         out_channels: Union[List, int, None] = None,
         kernel_size: Union[List, Tuple, None] = None,
-        stride: Union[List, Tuple, None] = None,
-        padding: Union[List, Tuple, None] = None,
-        dilation: Union[List, Tuple, None] = None,
+        stride: Union[List, Tuple, int, None] = None,
+        padding: Union[List, Tuple, int, None] = None,
+        dilation: Union[List, Tuple, int, None] = None,
         groups: Union[List, int, None] = None,
         bias: Union[List, bool, None] = None,
         padding_mode: Union[List, str, None] = None,
@@ -446,17 +446,9 @@ def test_convolution_2d_u85_BI_on_fvp(test_module):
 def test_reject_convolution_2d_u55_BI(
     module: Conv2d,
 ):
-    (
-        ArmTester(
-            module,
-            example_inputs=module.get_inputs(),
-            compile_spec=common.get_u55_compile_spec(),
-        )
-        .quantize()
-        .export()
-        .check_count({"torch.ops.aten.conv2d.default": 1})
-        .check(["torch.ops.quantized_decomposed"])
-        .to_edge_transform_and_lower()
-        .check(["executorch_exir_dialects_edge__ops_aten_convolution_default"])
-        .check_count({"torch.ops.higher_order.executorch_call_delegate": 0})
-    )
+    OpNotSupportedPipeline(
+        module,
+        module.get_inputs(),
+        "TOSA-0.80+BI+u55",
+        {"executorch_exir_dialects_edge__ops_aten_convolution_default": 1},
+    ).run()
diff --git a/backends/arm/test/ops/test_conv3d.py b/backends/arm/test/ops/test_conv3d.py
new file mode 100644
index 00000000000..22f7e9e7f54
--- /dev/null
+++ b/backends/arm/test/ops/test_conv3d.py
@@ -0,0 +1,399 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import List, Tuple, Union
+
+import pytest
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineBI,
+    EthosU85PipelineBI,
+    OpNotSupportedPipeline,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+aten_op = "torch.ops.aten.conv3d.default"
+exir_op = "executorch_exir_dialects_edge__ops_aten_convolution_default"
+
+
+class Conv3d(torch.nn.Module):
+    """
+    Creates one or many chained 3D-convolutions. For multiple convolutions, the
+    respective parameters are provided as lists.
+ """ + + def __init__( + self, + height=8, + width=8, + depth=8, + nbr_conv=1, # Number of chained convs + in_channels: Union[List, int, None] = None, + out_channels: Union[List, int, None] = None, + kernel_size: Union[List, Tuple, None] = None, + stride: Union[List, Tuple, int, None] = None, + padding: Union[List, Tuple, int, None] = None, + dilation: Union[List, Tuple, int, None] = None, + groups: Union[List, int, None] = None, + bias: Union[List, bool, None] = None, + padding_mode: Union[List, str, None] = None, + batches=1, + dtype=torch.float, + ): + super().__init__() + self.nbr_convs = nbr_conv + + # Handle default values + in_channels = [2] * nbr_conv if in_channels is None else in_channels + out_channels = [1 * nbr_conv] if out_channels is None else out_channels + kernel_size = [(3, 3, 1)] * nbr_conv if kernel_size is None else kernel_size + stride = [(2, 2, 1)] * nbr_conv if stride is None else stride + padding = [(1, 1, 1)] * nbr_conv if padding is None else padding + dilation = [(1, 1, 1)] * nbr_conv if dilation is None else dilation + groups = [1] * nbr_conv if groups is None else groups + bias = [True] * nbr_conv if bias is None else bias + padding_mode = ["zeros"] * nbr_conv if padding_mode is None else padding_mode + + # This allows the input parameters to be either a single value or a list + # as type hint implies + if not isinstance(in_channels, List): + in_channels = [in_channels] + if not isinstance(out_channels, List): + out_channels = [out_channels] + if not isinstance(kernel_size, List): + kernel_size = [kernel_size] + if not isinstance(stride, List): + stride = [stride] + if not isinstance(padding, List): + padding = [padding] + if not isinstance(dilation, List): + dilation = [dilation] + if not isinstance(groups, List): + groups = [groups] + if not isinstance(bias, List): + bias = [bias] + if not isinstance(padding_mode, List): + padding_mode = [padding_mode] + + self.batches = batches + self.in_channels = in_channels + self.height = height + self.width = width + self.depth = depth + self.dtype = dtype + + # Build chain of convs + for i in range(self.nbr_convs): + setattr( + self, + f"conv_{i}", + torch.nn.Conv3d( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=kernel_size[i], + stride=stride[i], + padding=padding[i], + dilation=dilation[i], + groups=groups[i], + bias=bias[i], + padding_mode=padding_mode[i], + ).to(dtype), + ) + + def get_inputs(self): + return ( + torch.randn( + self.batches, self.in_channels[0], self.height, self.width, self.depth + ).to(self.dtype), + ) + + def forward(self, x): + for i in range(self.nbr_convs): + conv = getattr(self, f"conv_{i}") + x = conv(x) + return x + + +conv3d_2x2_3x2x40x40_nobias = Conv3d( + in_channels=2, + out_channels=3, + kernel_size=(2, 2, 2), + stride=1, + bias=False, + padding=0, + width=40, + height=40, + batches=3, +) + +conv3d_3x3_1x3x256x256_st1 = Conv3d( + in_channels=3, + out_channels=10, + kernel_size=(3, 3, 3), + stride=1, + padding=0, + width=256, + height=256, + batches=1, +) + +conv3d_3x3_1x3x12x12_st2_pd1 = Conv3d( + in_channels=3, + out_channels=4, + kernel_size=(3, 3, 3), + stride=2, + padding=1, + width=12, + height=12, + batches=1, +) + +conv3d_1x1_1x2x128x128_st1 = Conv3d( + in_channels=2, + out_channels=1, + kernel_size=(1, 1, 1), + stride=1, + padding=0, + width=128, + height=128, + batches=1, +) + +conv3d_2x2_1x1x14x13_st2 = Conv3d( + in_channels=1, + out_channels=1, + kernel_size=(2, 2, 2), + stride=2, + padding=0, + width=14, + height=13, + batches=1, +) + 
+conv3d_5x5_3x2x128x128_st1 = Conv3d(
+    in_channels=2,
+    out_channels=3,
+    kernel_size=(5, 5, 5),
+    stride=1,
+    padding=0,
+    width=128,
+    height=128,
+    batches=3,
+)
+
+conv3d_3x3_1x3x224x224_st2_pd1 = Conv3d(
+    in_channels=3,
+    out_channels=16,
+    kernel_size=(3, 3, 3),
+    stride=2,
+    padding=1,
+    width=224,
+    height=224,
+    batches=1,
+)
+
+conv3d_5x5_1x3x14x15_st3_pd1 = Conv3d(
+    in_channels=3,
+    out_channels=16,
+    kernel_size=(5, 5, 5),
+    stride=3,
+    padding=1,
+    width=14,
+    height=15,
+    batches=1,
+)
+
+conv3d_7x7_1x3x16x16_st2_pd1_dl2 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7, 7),
+    stride=2,
+    padding=1,
+    dilation=2,
+    width=16,
+    height=16,
+    batches=1,
+)
+
+conv3d_7x7_1x3x15x15_st1_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7, 7),
+    stride=1,
+    padding=0,
+    dilation=1,
+    width=15,
+    height=15,
+    batches=1,
+)
+
+conv3d_5x5_1x3x14x14_st5_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=14,
+    height=14,
+    batches=1,
+)
+
+conv3d_5x5_1x3x9x9_st5_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=9,
+    height=9,
+    batches=1,
+)
+
+conv3d_3x3_1x3x8x9_st3_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=9,
+    height=8,
+    batches=1,
+)
+
+conv3d_3x3_1x3x9x8_st3_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=8,
+    height=9,
+    batches=1,
+)
+
+conv3d_3x4_1x3x7x7_st3_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 4, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
+
+conv3d_4x3_1x3x7x7_st3_pd0_dl1 = Conv3d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(4, 3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
+
+test_modules = {
+    "2x2_3x2x40x40_nobias": conv3d_2x2_3x2x40x40_nobias,
+    "3x3_1x3x256x256_st1": conv3d_3x3_1x3x256x256_st1,
+    "3x3_1x3x12x12_st2_pd1": conv3d_3x3_1x3x12x12_st2_pd1,
+    "1x1_1x2x128x128_st1": conv3d_1x1_1x2x128x128_st1,
+    "2x2_1x1x14x13_st2_needs_adjust_pass": conv3d_2x2_1x1x14x13_st2,
+    "5x5_1x3x14x15_st3_pd1_needs_adjust_pass": conv3d_5x5_1x3x14x15_st3_pd1,
+    "7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass": conv3d_7x7_1x3x16x16_st2_pd1_dl2,
+    "7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass": conv3d_7x7_1x3x15x15_st1_pd0_dl1,
+    "5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass": conv3d_5x5_1x3x14x14_st5_pd0_dl1,
+    "5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass": conv3d_5x5_1x3x9x9_st5_pd0_dl1,
+    "3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass": conv3d_3x3_1x3x9x8_st3_pd0_dl1,
+    "3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass": conv3d_3x3_1x3x8x9_st3_pd0_dl1,
+    "3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv3d_3x4_1x3x7x7_st3_pd0_dl1,
+    "4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass": conv3d_4x3_1x3x7x7_st3_pd0_dl1,
+    "5x5_3x2x128x128_st1": conv3d_5x5_3x2x128x128_st1,
+    "3x3_1x3x224x224_st2_pd1": conv3d_3x3_1x3x224x224_st2_pd1,
+}
+
+input_t = Tuple[torch.Tensor]
+
+
+@common.parametrize("test_module", test_modules)
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_convolution_3d_tosa_MI(test_module):
+    pipeline = TosaPipelineMI[input_t](
+        test_module, test_module.get_inputs(), aten_op, exir_op
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_convolution_3d_tosa_BI(test_module):
+    pipeline = TosaPipelineBI[input_t](
+        test_module, test_module.get_inputs(), aten_op, exir_op
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_convolution_3d_u55_BI(test_module):
+    pipeline = EthosU55PipelineBI[input_t](
+        test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+@pytest.mark.skip  # Not implemented, skip until it is.
+def test_convolution_3d_u85_BI(test_module):
+    pipeline = EthosU85PipelineBI[input_t](
+        test_module, test_module.get_inputs(), aten_op, exir_op, run_on_fvp=True
+    )
+    pipeline.run()
+
+
+reject_suite = {
+    "large_stride": Conv3d(
+        in_channels=1,
+        out_channels=1,
+        kernel_size=(2, 2, 1),
+        stride=(2, 4, 2),
+        padding=1,
+        width=10,
+        height=14,
+        batches=1,
+    ),
+    "large_kernel_z": Conv3d(
+        in_channels=1,
+        out_channels=1,
+        kernel_size=(2, 2, 2),
+        stride=1,
+        padding=0,
+        width=80,
+        height=80,
+        batches=1,
+    ),
+}
+
+
+@common.parametrize("module", reject_suite)
+def test_reject_convolution_3d_u55_BI(
+    module: Conv3d,
+):
+    OpNotSupportedPipeline(
+        module,
+        module.get_inputs(),
+        "TOSA-0.80+BI+u55",
+        {"executorch_exir_dialects_edge__ops_aten_convolution_default": 1},
+    ).run()
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index 4e91554e05a..7ed181711a1 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -77,47 +77,31 @@ def test_native_layer_norm_tosa_BI(test_data):
         test_data[0],
         "torch.ops.aten.sub.Tensor",  # Just check for sub op included in the layernorm decomposition
     )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
 @common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone300
 def test_native_layer_norm_u55_BI(test_data):
     pipeline = EthosU55PipelineBI[input_t](
         test_data[1],
         test_data[0],
         "torch.ops.aten.sub.Tensor",  # Just check for sub op included in the layernorm decomposition
+        run_on_fvp=True,
     )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
 @common.parametrize("test_data", test_data_suite)
+@common.XfailIfNoCorstone320
 def test_native_layer_norm_u85_BI(test_data):
-    pipeline = EthosU85PipelineBI[input_t](
-        test_data[1],
-        test_data[0],
-        "torch.ops.aten.sub.Tensor",  # Just check for sub op included in the layernorm decomposition
-    )
-    pipeline.run()
-
-
-@common.parametrize("test_data", test_data_suite)
-@common.SkipIfNoCorstone300
-def test_native_layer_norm_u55_BI_on_fvp(test_data):
-    pipeline = EthosU55PipelineBI[input_t](
-        test_data[1],
-        test_data[0],
-        "torch.ops.aten.sub.Tensor",  # Just check for sub op included in the layernorm decomposition
-    )
-    pipeline.run()
-
-
-@common.parametrize("test_data", test_data_suite)
-@common.SkipIfNoCorstone320
-def test_native_layer_norm_u85_BI_on_fvp(test_data):
     pipeline = EthosU85PipelineBI[input_t](
         test_data[1],
         test_data[0],
         "torch.ops.aten.sub.Tensor",  # Just check for sub op included in the layernorm decomposition
         run_on_fvp=True,
     )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py
index 3e4cc1c0faa..7068ee77e01 100644
--- a/backends/arm/test/ops/test_logsoftmax.py
+++ b/backends/arm/test/ops/test_logsoftmax.py
@@ -47,6 +47,7 @@ def test_log_softmax_tosa_MI(test_data):
     pipeline.add_stage_after(
         "to_edge_transform_and_lower", pipeline.tester.check_not, [exir_op]
     )
+
     pipeline.run()
 
 
@@ -55,22 +56,7 @@ def test_log_softmax_tosa_BI(test_data):
     data, dim = test_data
     pipeline = TosaPipelineBI[input_t1](LogSoftmax(dim), data, [])
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
-    pipeline.run()
-
-
-@common.parametrize("test_data", LogSoftmax.test_data)
-def test_log_softmax_u55_BI(test_data):
-    data, dim = test_data
-    pipeline = EthosU55PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=False)
-    pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
-    pipeline.run()
-
-
-@common.parametrize("test_data", LogSoftmax.test_data)
-def test_log_softmax_u85_BI(test_data):
-    data, dim = test_data
-    pipeline = EthosU85PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=False)
-    pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
@@ -81,11 +67,12 @@ def test_log_softmax_u85_BI(test_data):
         "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP"
     },
 )
-@common.SkipIfNoCorstone300()
-def test_log_softmax_u55_BI_on_fvp(test_data):
+@common.XfailIfNoCorstone300()
+def test_log_softmax_u55_BI(test_data):
     data, dim = test_data
     pipeline = EthosU55PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True)
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
@@ -96,9 +83,10 @@ def test_log_softmax_u55_BI_on_fvp(test_data):
         "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP"
     },
 )
-@common.SkipIfNoCorstone320
-def test_log_softmax_u85_BI_on_fvp(test_data):
+@common.XfailIfNoCorstone320
+def test_log_softmax_u85_BI(test_data):
     data, dim = test_data
     pipeline = EthosU85PipelineBI[input_t1](LogSoftmax(dim), data, [], run_on_fvp=True)
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py
index 9d67030cc4f..2351b0f9e9c 100644
--- a/backends/arm/test/ops/test_mean_dim.py
+++ b/backends/arm/test/ops/test_mean_dim.py
@@ -139,48 +139,36 @@ def test_mean_tosa_MI(test_data):
 
 @common.parametrize("test_data", MeanDim.test_data_suite)
 def test_mean_tosa_BI(test_data):
-    TosaPipelineBI[input_t](
-        MeanDim(test_data[1], test_data[2]),
-        (test_data[0],),
-        "torch.ops.aten.sum.dim_IntList",  # Just check for sum op included in the mean decomposition
-    ).run()
-
-
-@common.parametrize("test_data", MeanDim.test_data_suite)
-def test_mean_u55(test_data):
-    EthosU55PipelineBI[input_t](
-        MeanDim(test_data[1], test_data[2]),
-        (test_data[0],),
-        "torch.ops.aten.sum.dim_IntList",  # Just check for sum op included in the mean decomposition
-    ).run()
-
-
-@common.parametrize("test_data", MeanDim.test_data_suite)
-def test_mean_u85(test_data):
-    EthosU85PipelineBI[input_t](
+    pipeline = TosaPipelineBI[input_t](
         MeanDim(test_data[1], test_data[2]),
         (test_data[0],),
         "torch.ops.aten.sum.dim_IntList",  # Just check for sum op included in the mean decomposition
-    ).run()
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.run()
 
 
 @common.parametrize("test_data", MeanDim.test_data_suite)
-@common.SkipIfNoCorstone300
-def test_mean_u55_on_fvp(test_data):
-    EthosU55PipelineBI[input_t](
+@common.XfailIfNoCorstone300
+def test_mean_u55_BI(test_data):
+    pipeline = EthosU55PipelineBI[input_t](
         MeanDim(test_data[1], test_data[2]),
         (test_data[0],),
         "torch.ops.aten.sum.dim_IntList",  # Just check for sum op included in the mean decomposition
         run_on_fvp=True,
-    ).run()
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.run()
 
 
 @common.parametrize("test_data", MeanDim.test_data_suite)
-@common.SkipIfNoCorstone320
-def test_mean_u85_on_fvp(test_data):
-    EthosU85PipelineBI[input_t](
+@common.XfailIfNoCorstone320
+def test_mean_u85_BI(test_data):
+    pipeline = EthosU85PipelineBI[input_t](
         MeanDim(test_data[1], test_data[2]),
         (test_data[0],),
         "torch.ops.aten.sum.dim_IntList",  # Just check for sum op included in the mean decomposition
         run_on_fvp=True,
-    ).run()
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.run()
diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py
index 17dcd6f1d27..97af070120b 100644
--- a/backends/arm/test/ops/test_scalars.py
+++ b/backends/arm/test/ops/test_scalars.py
@@ -220,7 +220,9 @@ def _test_passes_tosa_BI_pipeline(module: torch.nn.Module, test_data: tuple):
 }
 
 
-@common.parametrize("tensor_scalar_tests", tensor_scalar_tests, passes_xfails)
+@common.parametrize(
+    "tensor_scalar_tests", tensor_scalar_tests, passes_xfails, strict=False
+)
 def test_passes_BI(tensor_scalar_tests: list):
     op, x, y = tensor_scalar_tests
     _test_passes_tosa_BI_pipeline(op, (x, y))
diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py
index d51f20040b0..dcee5d038f2 100644
--- a/backends/arm/test/ops/test_softmax.py
+++ b/backends/arm/test/ops/test_softmax.py
@@ -57,22 +57,7 @@ def test_softmax_tosa_BI(test_data):
     data, dim = test_data
     pipeline = TosaPipelineBI[input_t1](Softmax(dim), data, [])
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
-    pipeline.run()
-
-
-@common.parametrize("test_data", Softmax.test_data)
-def test_softmax_u55_BI(test_data):
-    data, dim = test_data
-    pipeline = EthosU55PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=False)
-    pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
-    pipeline.run()
-
-
-@common.parametrize("test_data", Softmax.test_data)
-def test_softmax_u85_BI(test_data):
-    data, dim = test_data
-    pipeline = EthosU85PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=False)
-    pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
@@ -83,11 +68,12 @@ def test_softmax_u85_BI(test_data):
         "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP"
     },
 )
-@common.SkipIfNoCorstone300
-def test_softmax_u55_BI_on_fvp(test_data):
+@common.XfailIfNoCorstone300
+def test_softmax_u55_BI(test_data):
     data, dim = test_data
     pipeline = EthosU55PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True)
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
 
 
@@ -98,9 +84,10 @@ def test_softmax_u55_BI_on_fvp(test_data):
         "randn_mult_batches": "MLETORCH-433: Multiple batches not supported on FVP"
     },
 )
-@common.SkipIfNoCorstone320
-def test_softmax_u85_BI_on_fvp(test_data):
+@common.XfailIfNoCorstone320
+def test_softmax_u85_BI(test_data):
     data, dim = test_data
     pipeline = EthosU85PipelineBI[input_t1](Softmax(dim), data, [], run_on_fvp=True)
     pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op])
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
     pipeline.run()
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
index fb65e6b5f75..fb1f985edb9 100644
--- a/backends/qualcomm/_passes/__init__.py
+++ b/backends/qualcomm/_passes/__init__.py
@@ -1,10 +1,18 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 from .annotate_decomposed import AnnotateDecomposed
 from .annotate_quant_attrs import AnnotateQuantAttrs
 from .constant_i64_to_i32 import ConstantI64toI32
 from .convert_bmm_to_matmul import ConvertBmmToMatmul
+from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
 from .convert_to_linear import ConvertToLinear
 from .decompose_any import DecomposeAny
 from .decompose_einsum import DecomposeEinsum
+from .decompose_expm1 import DecomposeExpM1
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_silu import DecomposeSilu
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
@@ -19,8 +27,9 @@ from .recompose_rms_norm import RecomposeRmsNorm
 from .reduce_dynamic_range import ReduceDynamicRange
 from .remove_redundancy import RemoveRedundancy
+from .replace_arange_args import ReplaceArangeArgs
 from .replace_index_put_input import ReplaceIndexPutInput
-from .replace_inf_buffer import ReplaceInfBuffer
+from .replace_inf_values import ReplaceInfValues
 from .tensor_i64_to_i32 import TensorI64toI32
 
 
@@ -29,10 +38,12 @@
     AnnotateQuantAttrs,
     ConstantI64toI32,
     ConvertBmmToMatmul,
+    ConvertConv1dToConv2d,
     RecomposePReLU,
     ConvertToLinear,
     DecomposeAny,
     DecomposeEinsum,
+    DecomposeExpM1,
     DecomposeLinalgVectorNorm,
     DecomposeSilu,
     ExpandBroadcastTensorShape,
@@ -46,7 +57,8 @@
     RecomposeRmsNorm,
     ReduceDynamicRange,
     RemoveRedundancy,
+    ReplaceArangeArgs,
     ReplaceIndexPutInput,
-    ReplaceInfBuffer,
+    ReplaceInfValues,
     TensorI64toI32,
 ]
diff --git a/backends/qualcomm/_passes/annotate_decomposed.py b/backends/qualcomm/_passes/annotate_decomposed.py
index a8a757ce9bf..918b705e5e9 100644
--- a/backends/qualcomm/_passes/annotate_decomposed.py
+++ b/backends/qualcomm/_passes/annotate_decomposed.py
@@ -17,6 +17,8 @@ class AnnotateDecomposed(ExportPass):
     generated after quantization process.
     """
 
+    decomp_ops = [torch.ops.aten.stack.default, torch.ops.aten.unbind.int]
+
     def __init__(self, edge_program: torch.export.ExportedProgram):
         super(AnnotateDecomposed, self).__init__()
         self.edge_program = edge_program
@@ -32,7 +34,7 @@ def _annotate_unbind(self, graph_module: torch.fx.GraphModule):
                     n.meta[QCOM_QUANT_ATTRS] = quant_attrs.copy()
 
     def _annotate_stack(self, graph_module: torch.fx.GraphModule):
-        partitions = get_source_partitions(graph_module.graph, [torch.stack])
+        partitions = get_source_partitions(graph_module.graph, [torch.stack, "stack"])
         for _, src_partitions in partitions.items():
             for src_partition in src_partitions:
                 output = src_partition.output_nodes[0]
diff --git a/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py
new file mode 100644
index 00000000000..947b631dbbf
--- /dev/null
+++ b/backends/qualcomm/_passes/convert_conv1d_to_conv2d.py
@@ -0,0 +1,99 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from executorch.backends.qualcomm.builders.utils import get_parameter, set_parameter
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class ConvertConv1dToConv2d(ExportPass):
+    """
+    Conv1d is not supported by QNN.
+    Change it to input -> unsqueeze -> conv2d -> squeeze -> output
+    """
+
+    def __init__(self, edge_program: torch.export.ExportedProgram):
+        super(ConvertConv1dToConv2d, self).__init__()
+        self.edge_program = edge_program
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        conv_op = exir_ops.edge.aten.convolution.default
+        for node in graph.nodes:
+            if node.target == conv_op and node.meta["val"].dim() == 3:
+
+                input_node = node.args[0]
+                with graph_module.graph.inserting_after(input_node):
+                    unsqueeze_op = exir_ops.edge.aten.unsqueeze_copy.default
+                    unsqueeze_node = graph.create_node(
+                        "call_function",
+                        unsqueeze_op,
+                        (
+                            input_node,
+                            2,
+                        ),
+                    )
+                    unsqueeze_node.meta = copy_meta(
+                        input_node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)}
+                    )
+                with graph_module.graph.inserting_after(unsqueeze_node):
+
+                    filter_node = node.args[1]
+                    filter_node.meta["val"] = (
+                        filter_node.meta["val"].unsqueeze(2).contiguous()
+                    )
+                    filter_tensor = get_parameter(filter_node, self.edge_program)
+                    # Ensure tensor is nn.Parameter type, so program does not fail during edge_program._validate()
+                    filter_tensor = nn.Parameter(filter_tensor.unsqueeze(2))
+                    set_parameter(filter_tensor, filter_node, self.edge_program)
+
+                    bias_node = node.args[2]
+                    stride = [1] + node.args[3]
+                    padding = [0] + node.args[4]
+                    dilation = [1] + node.args[5]
+                    transpose = node.args[6]
+                    output_padding = [0] + node.args[7]
+                    groups = node.args[8]
+
+                    conv2d_node = graph.create_node(
+                        "call_function",
+                        conv_op,
+                        (
+                            unsqueeze_node,
+                            filter_node,
+                            bias_node,
+                            stride,
+                            padding,
+                            dilation,
+                            transpose,
+                            output_padding,
+                            groups,
+                        ),
+                    )
+                    conv2d_node.meta = copy_meta(
+                        node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)}
+                    )
+
+                with graph_module.graph.inserting_after(conv2d_node):
+                    squeeze_op = exir_ops.edge.aten.squeeze_copy.dims
+                    squeeze_node = graph.create_node(
+                        "call_function",
+                        squeeze_op,
+                        (
+                            conv2d_node,
+                            [2],
+                        ),
+                    )
+                    squeeze_node.meta = copy_meta(node.meta)
+                for user in node.users.copy():
+                    user.replace_input_with(node, squeeze_node)
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/decompose_expm1.py b/backends/qualcomm/_passes/decompose_expm1.py
new file mode 100644
index 00000000000..8fe6ebdec5b
--- /dev/null
+++ b/backends/qualcomm/_passes/decompose_expm1.py
@@ -0,0 +1,46 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class DecomposeExpM1(ExportPass):
+    """
+    Decompose expm1 into exponential and minus 1.
+ """ + + def __init__(self, quantization_capture=False) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target == torch.ops.aten.special_expm1.default: + input_node = node.args[0] + with graph_module.graph.inserting_after(input_node): + exp_op = torch.ops.aten.exp.default + exp_node = graph.create_node("call_function", exp_op, (input_node,)) + exp_node.meta = copy_meta(node.meta) + with graph_module.graph.inserting_after(exp_node): + sub_op = torch.ops.aten.sub.Tensor + sub_node = graph.create_node( + "call_function", + sub_op, + ( + exp_node, + 1, + ), + ) + sub_node.meta = copy_meta(node.meta) + for user in node.users.copy(): + user.replace_input_with(node, sub_node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_silu.py b/backends/qualcomm/_passes/decompose_silu.py index 96c48920419..c3ac45a8d9d 100644 --- a/backends/qualcomm/_passes/decompose_silu.py +++ b/backends/qualcomm/_passes/decompose_silu.py @@ -3,22 +3,17 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Dict import torch from executorch.exir.pass_base import ExportPass, PassResult +from .utils import copy_meta + class DecomposeSilu(ExportPass): def __init__(self): super(DecomposeSilu, self).__init__() - def _copy_meta(self, meta: Dict): - copied = {} - for k, v in meta.items(): - copied[k] = v - return copied - def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph for node in graph.nodes: @@ -34,14 +29,14 @@ def call(self, graph_module: torch.fx.GraphModule): torch.ops.aten.sigmoid.default, (silu_node_input,), ) - sigmoid_node.meta = self._copy_meta(silu_node.meta) + sigmoid_node.meta = copy_meta(silu_node.meta) with graph_module.graph.inserting_after(sigmoid_node): mul_node = graph.create_node( "call_function", torch.ops.aten.mul.Tensor, (silu_node_input, sigmoid_node), ) - mul_node.meta = self._copy_meta(silu_node.meta) + mul_node.meta = copy_meta(silu_node.meta) for user in silu_node.users.copy(): user.replace_input_with(silu_node, mul_node) diff --git a/backends/qualcomm/_passes/layout_transform.py b/backends/qualcomm/_passes/layout_transform.py index 31bb936f3c4..64fdcb2bb88 100644 --- a/backends/qualcomm/_passes/layout_transform.py +++ b/backends/qualcomm/_passes/layout_transform.py @@ -49,12 +49,15 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bmm.default, + exir_ops.edge.aten.bitwise_and.Tensor, exir_ops.edge.aten.cat.default, exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, exir_ops.edge.aten.constant_pad_nd.default, exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten.elu.default, exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.exp.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, exir_ops.edge.aten.ge.Tensor, @@ -87,10 +90,13 @@ class LayoutTransform(ExportPass): exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.sum.dim_IntList, + exir_ops.edge.aten.stack.default, exir_ops.edge.aten.topk.default, exir_ops.edge.aten._to_copy.default, + exir_ops.edge.aten.unbind.int, exir_ops.edge.aten.where.self, _operator.getitem, + torch.ops.aten.scalar_tensor.default, } layout_type = { diff --git 
index 749d30f3564..cef28988520 100644
--- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py
+++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py
@@ -28,24 +28,27 @@ class TensorConstant:
 class TensorOpInfo:
     target: torch._ops.OpOverload
     use_schema_args: bool
+    use_self_dtype: bool
 
 
 SCALAR_OPS = {
-    aten.eq.Scalar: TensorOpInfo(aten.eq.Tensor, False),
-    aten.ge.Scalar: TensorOpInfo(aten.ge.Tensor, False),
-    aten.gt.Scalar: TensorOpInfo(aten.gt.Tensor, False),
-    aten.le.Scalar: TensorOpInfo(aten.le.Tensor, False),
-    aten.lt.Scalar: TensorOpInfo(aten.lt.Tensor, False),
-    aten.ne.Scalar: TensorOpInfo(aten.ne.Tensor, False),
-    aten.add.Scalar: TensorOpInfo(aten.add.Tensor, False),
-    aten.add_.Scalar: TensorOpInfo(aten.add_.Tensor, False),
-    aten.div.Scalar: TensorOpInfo(aten.div.Tensor, False),
-    aten.mul.Scalar: TensorOpInfo(aten.mul.Tensor, False),
-    aten.rsub.Scalar: TensorOpInfo(aten.rsub.Tensor, False),
-    aten.sub.Scalar: TensorOpInfo(aten.sub.Tensor, False),
-    aten.pow.Tensor_Scalar: TensorOpInfo(aten.pow.Tensor_Tensor, False),
+    aten.eq.Scalar: TensorOpInfo(aten.eq.Tensor, False, False),
+    aten.ge.Scalar: TensorOpInfo(aten.ge.Tensor, False, False),
+    aten.gt.Scalar: TensorOpInfo(aten.gt.Tensor, False, False),
+    aten.le.Scalar: TensorOpInfo(aten.le.Tensor, False, False),
+    aten.lt.Scalar: TensorOpInfo(aten.lt.Tensor, False, False),
+    aten.ne.Scalar: TensorOpInfo(aten.ne.Tensor, False, False),
+    aten.add.Scalar: TensorOpInfo(aten.add.Tensor, False, False),
+    aten.add_.Scalar: TensorOpInfo(aten.add_.Tensor, False, False),
+    aten.div.Scalar: TensorOpInfo(aten.div.Tensor, False, False),
+    aten.mul.Scalar: TensorOpInfo(aten.mul.Tensor, False, False),
+    aten.rsub.Scalar: TensorOpInfo(aten.rsub.Tensor, False, False),
+    aten.sub.Scalar: TensorOpInfo(aten.sub.Tensor, False, False),
+    aten.pow.Tensor_Scalar: TensorOpInfo(aten.pow.Tensor_Tensor, False, False),
     # The scalar number arg[1] is missing when using default. Result in a corner case to deal
-    aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True),
+    aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True, False),
+    aten.where.ScalarOther: TensorOpInfo(aten.where.self, False, True),
+    aten.where.Scalar: TensorOpInfo(aten.where.self, False, True),
 }
 
 
@@ -63,11 +66,14 @@ def __init__(self):
     def _build_tensor_constant(
         self, gm: torch.fx.GraphModule, node: fx.Node, const_val
     ) -> TensorConstant:
+        # For dtype, in some cases, we cannot use node.args[0] as the scalar dtype.
+        # Ex: where op args[0] can be bool; however, we probably want args[1] and args[2] to have the same dtype as node.meta["val"] instead of bool.
         tensor = torch.tensor(
             [const_val],
             dtype=(
                 node.args[0].meta["val"].dtype
                 if not is_float_tensor(node)
+                and not SCALAR_OPS.get(node.target).use_self_dtype
                 else node.meta["val"].dtype
             ),
             device=node.meta["val"].device,
diff --git a/backends/qualcomm/_passes/replace_arange_args.py b/backends/qualcomm/_passes/replace_arange_args.py
new file mode 100644
index 00000000000..19ebc60227f
--- /dev/null
+++ b/backends/qualcomm/_passes/replace_arange_args.py
@@ -0,0 +1,48 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class ReplaceArangeArgs(ExportPass):
+    """
+    During annotation, kwargs for arange will be removed due to restrictions by quantizer.
+    This causes arange to have no dtype, which means FP nodes might be inferred as INT nodes during calibration.
+    This can cause calibration to fail since QDQ can only be applied on FP nodes but not INT nodes.
+    To hint the dtype, we provide step size as 1.0 instead of 1, which makes the node a FP node.
+    """
+
+    def __init__(self, quantization_capture=False) -> None:
+        super().__init__()
+        self.quantization_capture = quantization_capture
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.arange.default:
+                if torch.is_floating_point(node.meta["val"]) and len(node.args) == 1:
+                    with graph_module.graph.inserting_after(node):
+                        step_arange_op = torch.ops.aten.arange.start_step
+                        step_arange_node = graph.create_node(
+                            "call_function",
+                            step_arange_op,
+                            (
+                                0,
+                                node.args[0],
+                                1.0,
+                            ),
+                        )
+                        step_arange_node.meta = copy_meta(node.meta)
+
+                        for user in node.users.copy():
+                            user.replace_input_with(node, step_arange_node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/replace_inf_buffer.py b/backends/qualcomm/_passes/replace_inf_values.py
similarity index 58%
rename from backends/qualcomm/_passes/replace_inf_buffer.py
rename to backends/qualcomm/_passes/replace_inf_values.py
index 776bc9beeba..5f7fb9bd768 100644
--- a/backends/qualcomm/_passes/replace_inf_buffer.py
+++ b/backends/qualcomm/_passes/replace_inf_values.py
@@ -7,20 +7,30 @@
 from executorch.exir.pass_base import ExportPass, PassResult
 
 
-class ReplaceInfBuffer(ExportPass):
+class ReplaceInfValues(ExportPass):
     """
     Due to limitation in Qnn, we need to change inf or -inf to arbitrary value in quantization.
""" def __init__(self): - super(ReplaceInfBuffer, self).__init__() + super(ReplaceInfValues, self).__init__() def call(self, graph_module: torch.fx.GraphModule): for buf_name, tensor in graph_module.named_buffers(): if tensor.is_floating_point(): + # 255 here is mainly for attention_mask in Llama for reasonable quant scale tensor[tensor == float("inf")] = 255 tensor[tensor == float("-inf")] = -255 setattr(graph_module, buf_name, tensor) + for node in graph_module.graph.nodes: + arg_list = list(node.args) + for index, arg in enumerate(arg_list): + if arg == float("-inf"): + arg_list[index] = torch.finfo(torch.float32).min + elif arg == float("inf"): + arg_list[index] = torch.finfo(torch.float32).max + node.args = tuple(arg_list) + graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/tensor_i64_to_i32.py b/backends/qualcomm/_passes/tensor_i64_to_i32.py index b590e30884c..baddd747f99 100644 --- a/backends/qualcomm/_passes/tensor_i64_to_i32.py +++ b/backends/qualcomm/_passes/tensor_i64_to_i32.py @@ -24,6 +24,9 @@ class TensorI64toI32(ExportPass): cast_ops = { torch.ops.aten.argmin.default, + torch.ops.aten.arange.start_step, + torch.ops.aten.full.default, + torch.ops.aten.scalar_tensor.default, } def __init__(self, edge_program): @@ -61,7 +64,13 @@ def _cast_to_int32(self, core_ep: ExirExportedProgram): cast_node.args = args for user in users: - user.replace_input_with(n, cast_node) + # _assert_tensor_metadata is used to check dtype, which will cause lowering to fail since we are changing int64 to int32 + # We also skip if the next op is already a cast op, which prevents redundant casting. + if user.target not in { + torch.ops.aten._assert_tensor_metadata.default, + torch.ops.aten._to_copy.default, + }: + user.replace_input_with(n, cast_node) core_ep.exported_program._graph_signature = _get_updated_graph_signature( core_ep.exported_program._graph_signature, diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index 23dfb569a8f..0c838e9a676 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Dict
+
 import torch
 from executorch.backends.qualcomm.builders.utils import get_parameter
 from executorch.backends.qualcomm.utils.constants import QCOM_DTYPE, QCOM_ENCODING
@@ -24,6 +26,15 @@
 }
 
 
+def copy_meta(meta: Dict, callback=None):
+    copied = {}
+    for k, v in meta.items():
+        copied[k] = v
+    if callback:
+        copied = callback(copied)
+    return copied
+
+
 def get_quant_attrs(
     edge_program: torch.export.ExportedProgram, quant_node: torch.fx.Node
 ):
@@ -66,6 +77,7 @@ def get_passes_dependency_for_capture_program():
         AnnotateQuantAttrs,
         ConstantI64toI32,
         ConvertBmmToMatmul,
+        ConvertConv1dToConv2d,
         ConvertToLinear,
         DecomposeAny,
         DecomposeLinalgVectorNorm,
@@ -91,6 +103,7 @@ def get_passes_dependency_for_capture_program():
         ],
         ConstantI64toI32: [RemoveRedundancy],
         ConvertBmmToMatmul: [ConvertToLinear],
+        ConvertConv1dToConv2d: [FoldQDQ],
         ConvertToLinear: [RecomposePixelUnshuffle],
         DecomposeAny: [RemoveRedundancy],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
@@ -98,6 +111,7 @@ def get_passes_dependency_for_capture_program():
         FoldQDQ: [AnnotateQuantAttrs, AnnotateDecomposed],
         LayoutTransform: [
             AnnotateQuantAttrs,
+            ConvertConv1dToConv2d,
             ExpandBroadcastTensorShape,
         ],
         RecomposePixelUnshuffle: [RemoveRedundancy],
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
index c5352a7fbee..cc85333f26b 100644
--- a/backends/qualcomm/builders/__init__.py
+++ b/backends/qualcomm/builders/__init__.py
@@ -9,6 +9,7 @@
     op_abs,
     op_adaptive_avg_pool2d,
     op_add,
+    op_and,
    op_arange,
    op_argmin,
    op_avg_pool2d,
@@ -22,8 +23,10 @@
    op_depth_to_space,
    op_dequantize,
    op_div,
+    op_elu,
    op_embedding,
    op_eq,
+    op_exp,
    op_expand,
    op_full,
    op_full_like,
@@ -62,6 +65,7 @@
    op_reshape,
    op_rms_norm,
    op_rsqrt,
+    op_scalar_tensor,
    op_select_copy,
    op_sigmoid,
    op_sin,
@@ -72,12 +76,14 @@
    op_split_with_sizes,
    op_sqrt,
    op_squeeze,
+    op_stack,
    op_sub,
    op_sum_int_list,
    op_tanh,
    op_to,
    op_topk,
    op_transpose,
+    op_unbind,
    op_unsqueeze,
    op_upsample_bilinear2d,
    op_upsample_nearest2d,
@@ -89,6 +95,7 @@
    op_abs,
    op_adaptive_avg_pool2d,
    op_add,
+    op_and,
    op_arange,
    op_argmin,
    op_avg_pool2d,
@@ -102,8 +109,10 @@
    op_depth_to_space,
    op_dequantize,
    op_div,
+    op_elu,
    op_embedding,
    op_eq,
+    op_exp,
    op_expand,
    op_full,
    op_full_like,
@@ -142,6 +151,7 @@
    op_reshape,
    op_rms_norm,
    op_rsqrt,
+    op_scalar_tensor,
    op_select_copy,
    op_sigmoid,
    op_sin,
@@ -152,12 +162,14 @@
    op_split_with_sizes,
    op_squeeze,
    op_sqrt,
+    op_stack,
    op_sub,
    op_sum_int_list,
    op_tanh,
    op_topk,
    op_to,
    op_transpose,
+    op_unbind,
    op_unsqueeze,
    op_upsample_bilinear2d,
    op_upsample_nearest2d,
diff --git a/backends/qualcomm/builders/op_and.py b/backends/qualcomm/builders/op_and.py
new file mode 100644
index 00000000000..44e6f2893f5
--- /dev/null
+++ b/backends/qualcomm/builders/op_and.py
@@ -0,0 +1,59 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpElementWiseAnd, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class OpAnd(NodeVisitor): + target = ["aten.bitwise_and.Tensor"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + out_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + and_output_tensors = [output_tensor_wrapper] + + and_input_tensors = [] + for index in range(2): + input_node = node.args[index] + input_tensor = self.get_tensor(input_node, node) + tensor_type = PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE + + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + tensor_type, + nodes_to_wrappers, + ) + and_input_tensors.append(input_tensor_wrapper) + and_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpElementWiseAnd.op_name, + ) + and_op.AddInputTensors(and_input_tensors) + and_op.AddOutputTensors(and_output_tensors) + return and_op diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index a6051636d3e..c019a835223 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import warnings from typing import cast, Dict, List import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper @@ -17,8 +16,6 @@ from .qnn_constants import ( OpConv2d, OpDepthWiseConv2d, - OpExpandDims, - OpReshape, OpTransposeConv2d, QNN_OP_PACKAGE_NAME_QTI_AISW, ) @@ -102,176 +99,16 @@ def _add_conv_op_parameter( return conv_op - def _define_conv1d( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - ) -> PyQnnWrapper.PyQnnOpWrapper: - """ - Conv1D is a special case for convolutional operation. QNN does not support Conv1D, therefore, - we need to cast from input -> Conv1d -> output to input -> unsqueeze -> Conv2d -> squeeze -> output. 
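The OpAnd visitor earlier in this hunk forwards both operands to QNN's ElementWiseAnd with no extra parameters; on boolean tensors the op it targets is a plain elementwise AND, as a quick standalone check shows:

import torch

x = torch.tensor([True, False, True, False])
y = torch.tensor([True, True, False, False])
# aten.bitwise_and.Tensor on bool inputs is elementwise logical AND,
# which is what the two input wrappers feed into ElementWiseAnd.
assert torch.bitwise_and(x, y).tolist() == [True, False, False, False]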
- """ - transpose_conv = cast(bool, node.args[6]) - if transpose_conv: - print("ConvTranspose1d is not yet supported") - return - - op_wrapper_list = [] # op_wrapper to return - unsqueeze_input_node = node.args[0] - input_quant_encoding, input_quant_configs = self.get_quant_encoding_conf( - unsqueeze_input_node, node - ) - - unsqueeze_input_tensor = self.get_tensor(unsqueeze_input_node, node) - unsqueeze_input_tensor_wrapper = self.define_tensor( - unsqueeze_input_node, - node, - unsqueeze_input_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - nodes_to_wrappers, - ) - unsqueeze_output_tensor = unsqueeze_input_tensor.unsqueeze(1).contiguous() - dtype = self.get_data_type(unsqueeze_output_tensor, input_quant_configs) - unsqueeze_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_unsqueeze", - tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=dtype, - quant_encoding=input_quant_encoding, - quant_configs=input_quant_configs, - dims=unsqueeze_output_tensor.size(), - tensor=unsqueeze_output_tensor, - is_fake_tensor=True, - nodes_to_wrappers=nodes_to_wrappers, - ) - unsqueeze_op = PyQnnWrapper.PyQnnOpWrapper( - node.name + "_unsqueeze", - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpExpandDims.op_name, - ) - unsqueeze_op.AddInputTensors([unsqueeze_input_tensor_wrapper]) - unsqueeze_op.AddOutputTensors([unsqueeze_output_tensor_wrapper]) - unsqueeze_op.AddScalarParam( - OpExpandDims.param_axis, - PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - {QCOM_DATA: np.uint32(1)}, - ) - op_wrapper_list.append(unsqueeze_op) - - filter_node = node.args[1] - filter_tensor = ( - get_parameter(filter_node, self.edge_program).unsqueeze(2).contiguous() - ) - filter_axis_order = (2, 3, 1, 0) - filter_tensor = filter_tensor.permute(dims=filter_axis_order).contiguous() - filter_tensor_wrapper = self.define_tensor( - filter_node, - node, - filter_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - ) - conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - if node.args[2] is not None: - bias_node = node.args[2] - bias_tensor = get_parameter(bias_node, self.edge_program) - bias_tensor_wrapper = self.define_tensor( - bias_node, - node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - ) - conv_input_tensors.append(bias_tensor_wrapper) - - stride = [1] + cast(List[int], node.args[3]) - padding = [0] + cast(List[int], node.args[4]) - dilation = [1] + cast(List[int], node.args[5]) - groups = cast(int, node.args[8]) - - # args[6] = transposed - if cast(bool, node.args[6]): - warnings.warn( - "[QNN Delegate Op Builder]: Currently, No support for transposed convolution.", - stacklevel=1, - ) - return - - # args[7] = output padding - if not all(out_pad == 0 for out_pad in cast(List[int], node.args[7])): - warnings.warn( - "[QNN Delegate Op Builder]: QNN does not support output padding.", - stacklevel=1, - ) - return - - stride_shape = [len(stride)] - padding_shape = [2, 2] - dilation_shape = [len(dilation)] - - conv_op = PyQnnWrapper.PyQnnOpWrapper( - node.name + "_squeeze", - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpConv2d.op_name, - ) - conv_output_tensor = self.get_tensor(node, node) - conv_output_tensor = conv_output_tensor.unsqueeze(1).contiguous() - dtype = self.get_data_type(conv_output_tensor, input_quant_configs) - conv_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_squeeze", - 
tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=dtype, - quant_encoding=input_quant_encoding, - quant_configs=input_quant_configs, - dims=conv_output_tensor.size(), - tensor=conv_output_tensor, - is_fake_tensor=True, - nodes_to_wrappers=nodes_to_wrappers, - ) - conv_op = self._add_conv_op_parameter( - OpConv2d, - conv_op, - conv_input_tensors, - [conv_output_tensor_wrapper], - stride, - stride_shape, - padding, - padding_shape, - dilation, - dilation_shape, - groups=groups, - ) - op_wrapper_list.append(conv_op) - - squeeze_op = PyQnnWrapper.PyQnnOpWrapper( - node.name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpReshape.op_name, - ) - squeeze_output_tensor = self.get_tensor(node, node) - squeeze_output_tensor_wrapper = self.define_tensor( - node, - node, - squeeze_output_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - nodes_to_wrappers, - node_name=node.name, - ) - squeeze_op.AddInputTensors([conv_output_tensor_wrapper]) - squeeze_op.AddOutputTensors([squeeze_output_tensor_wrapper]) - op_wrapper_list.append(squeeze_op) - - return op_wrapper_list - def define_node( self, node: torch.fx.Node, nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], ) -> PyQnnWrapper.PyQnnOpWrapper: - if get_parameter(node.args[1], self.edge_program).dim() == 3: - return self._define_conv1d(node, nodes_to_wrappers) - input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) + assert ( + input_tensor.dim() == 4 + ), "All Conv should be converted to Conv2D in ConvertConv1dToConv2d" input_tensor_wrapper = self.define_tensor( input_node, node, diff --git a/backends/qualcomm/builders/op_elu.py b/backends/qualcomm/builders/op_elu.py new file mode 100644 index 00000000000..f9cc089c7bb --- /dev/null +++ b/backends/qualcomm/builders/op_elu.py @@ -0,0 +1,68 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
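The deleted _define_conv1d emulated Conv1d as unsqueeze -> Conv2d -> squeeze; that rewrite now happens ahead of time in the ConvertConv1dToConv2d pass, which is why the builder can simply assert a 4-D input. The underlying equivalence is easy to verify in plain PyTorch (standalone sketch, NCHW layout):

import torch

x = torch.randn(1, 3, 16)                       # (N, C, W)
conv1d = torch.nn.Conv1d(3, 8, kernel_size=3)

# Same weights viewed as a (1 x kW) Conv2d kernel over a dummy height.
conv2d = torch.nn.Conv2d(3, 8, kernel_size=(1, 3))
with torch.no_grad():
    conv2d.weight.copy_(conv1d.weight.unsqueeze(2))  # (O, I, 1, kW)
    conv2d.bias.copy_(conv1d.bias)

# input -> unsqueeze(H=1) -> Conv2d -> squeeze(H) -> output
y = conv2d(x.unsqueeze(2)).squeeze(2)
assert torch.allclose(conv1d(x), y, atol=1e-5)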
+from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpElu, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Elu(NodeVisitor): + target = ["aten.elu.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # tensor input + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + elu_input_tensors = [input_tensor_wrapper] + + out_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + elu_output_tensors = [output_tensor_wrapper] + + elu_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpElu.op_name, + ) + elu_op.AddInputTensors(elu_input_tensors) + elu_op.AddOutputTensors(elu_output_tensors) + + if len(node.args) == 2: + elu_op.AddScalarParam( + OpElu.param_alpha, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(node.args[1])}, + ) + + return elu_op diff --git a/backends/qualcomm/builders/op_exp.py b/backends/qualcomm/builders/op_exp.py new file mode 100644 index 00000000000..8c4794c9725 --- /dev/null +++ b/backends/qualcomm/builders/op_exp.py @@ -0,0 +1,59 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
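For reference, QNN's Elu and aten.elu share the single alpha parameter handled above (alpha is omitted when absent, falling back to the backend default); the definition is easy to spot-check in PyTorch:

import torch
import torch.nn.functional as F

x, alpha = torch.randn(2, 5, 1, 3), 0.5
# elu(x) = x                 for x > 0
#        = alpha * (e^x - 1) otherwise
manual = torch.where(x > 0, x, alpha * (torch.exp(x) - 1))
assert torch.allclose(F.elu(x, alpha=alpha), manual, atol=1e-6)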
+from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpElementWiseExp, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Exp(NodeVisitor): + target = ["aten.exp.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # tensor input + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + exp_input_tensors = [input_tensor_wrapper] + + out_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + exp_output_tensors = [output_tensor_wrapper] + + exp_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpElementWiseExp.op_name, + ) + exp_op.AddInputTensors(exp_input_tensors) + exp_op.AddOutputTensors(exp_output_tensors) + + return exp_op diff --git a/backends/qualcomm/builders/op_pad.py b/backends/qualcomm/builders/op_pad.py index 10948859be9..5ec34065f8b 100644 --- a/backends/qualcomm/builders/op_pad.py +++ b/backends/qualcomm/builders/op_pad.py @@ -53,14 +53,14 @@ def define_node( pad_amount = np.reshape(cast(List[int], node.args[1]), (-1, 2))[::-1].astype( np.uint32 ) - # fullfill the pad amount for each idex of tensor + # fulfill the pad amount for each index of the tensor if zero_amounts := pad_amount_shape[0] - pad_amount.shape[0]: pad_amount = np.concatenate( (np.array([(0, 0)] * zero_amounts), pad_amount) ).astype(np.uint32) if QCOM_AXIS_ORDER in node.meta: - pad_amount = np.transpose(pad_amount, node.meta[QCOM_AXIS_ORDER]) + pad_amount = pad_amount[list(node.meta[QCOM_AXIS_ORDER])] pad_amount_val = node.args[2] pad_op = PyQnnWrapper.PyQnnOpWrapper( diff --git a/backends/qualcomm/builders/op_scalar_tensor.py b/backends/qualcomm/builders/op_scalar_tensor.py new file mode 100644 index 00000000000..d236f6674df --- /dev/null +++ b/backends/qualcomm/builders/op_scalar_tensor.py @@ -0,0 +1,50 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import torch + +from .node_visitor import NodeVisitor, register_node_visitor + + +@register_node_visitor +class ScalarTensor(NodeVisitor): + target = ["scalar_tensor.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + val = node.args[0] + out_tensor = torch.tensor([val], dtype=node.meta["val"].dtype) + + # The following clamping will only occur in FP mode. Clamping for quantized mode will happen in the pass ReplaceInfValues.
+ # negative infinity + if torch.isinf(out_tensor)[0] and (out_tensor < 0): + out_tensor = torch.tensor( + [torch.finfo(torch.float32).min], dtype=node.meta["val"].dtype + ) + # positive infinity + elif torch.isinf(out_tensor)[0] and (out_tensor > 0): + out_tensor = torch.tensor( + [torch.finfo(torch.float32).max], dtype=node.meta["val"].dtype + ) + # since we can derive the constant value of the current op at AoT stage, + # we only build a static tensor here for consumers of the current node + # to reference the data correctly + self.define_tensor( + node, + node, + out_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + ) diff --git a/backends/qualcomm/builders/op_sqrt.py b/backends/qualcomm/builders/op_sqrt.py index dc6691460ca..030e6c3e10a 100644 --- a/backends/qualcomm/builders/op_sqrt.py +++ b/backends/qualcomm/builders/op_sqrt.py @@ -10,7 +10,7 @@ import torch from .node_visitor import NodeVisitor, register_node_visitor -from .qnn_constants import OpSqrt, QNN_OP_PACKAGE_NAME_QTI_AISW +from .qnn_constants import OpElementWiseSqrt, QNN_OP_PACKAGE_NAME_QTI_AISW @register_node_visitor @@ -51,7 +51,7 @@ def define_node( sqrt_op = PyQnnWrapper.PyQnnOpWrapper( node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, - OpSqrt.op_name, + OpElementWiseSqrt.op_name, ) sqrt_op.AddInputTensors(sqrt_input_tensors) sqrt_op.AddOutputTensors(sqrt_output_tensors) diff --git a/backends/qualcomm/builders/op_stack.py b/backends/qualcomm/builders/op_stack.py new file mode 100644 index 00000000000..616d0ee0ccc --- /dev/null +++ b/backends/qualcomm/builders/op_stack.py @@ -0,0 +1,71 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
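Condensing the scalar_tensor logic above: the builder folds the op into a static QNN tensor at AoT time, and in FP mode any +/-inf value is clamped to the finite float32 extremes (fold_scalar is an illustrative helper, not backend code):

import torch

def fold_scalar(val, dtype):
    t = torch.tensor([val], dtype=dtype)
    if torch.isinf(t)[0]:
        # +/-inf cannot live in a static tensor fed to the quant pipeline
        bound = torch.finfo(torch.float32)
        t = torch.tensor([bound.min if t[0] < 0 else bound.max], dtype=dtype)
    return t

assert fold_scalar(float("-inf"), torch.float32).item() == torch.finfo(torch.float32).min
assert fold_scalar(2.5, torch.float32).item() == 2.5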
+from typing import cast, Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpPack, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Stack(NodeVisitor): + target = ["aten.stack.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node_list = node.args[0] + stack_input_tensors = [] + for input_node in input_node_list: + input_tensor = self.get_tensor(input_node, node) + stack_inp_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + stack_input_tensors.append(stack_inp_tensor_wrapper) + output_tensor = self.get_tensor(node, node) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + stack_output_tensors = [output_tensor_wrapper] + + dim = 0 if len(node.args) == 1 else cast(int, node.args[1]) + if dim < 0: + dim = dim % len(input_tensor.shape) + if QCOM_AXIS_ORDER in node.meta: + dim = node.meta[QCOM_AXIS_ORDER].index(dim) + stack_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpPack.op_name, + ) + stack_op.AddInputTensors(stack_input_tensors) + stack_op.AddOutputTensors(stack_output_tensors) + + stack_op.AddScalarParam( + OpPack.param_axis, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: np.uint32(dim)}, + ) + + return stack_op diff --git a/backends/qualcomm/builders/op_unbind.py b/backends/qualcomm/builders/op_unbind.py new file mode 100644 index 00000000000..8ca62e2a07b --- /dev/null +++ b/backends/qualcomm/builders/op_unbind.py @@ -0,0 +1,73 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
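Two details of the stack builder above recur in several visitors: negative dims are normalized modulo the input rank, and when a tensor was permuted into QNN's preferred layout the axis must be looked up through the recorded axis order. A standalone sketch (the NCHW-to-NHWC order is just an example value for QCOM_AXIS_ORDER):

rank = 4
dim = -1 % rank             # negative dim -> 3
assert dim == 3

# axis_order[i] records which original axis now sits at position i,
# so the new position of `dim` is its index in that tuple.
axis_order = (0, 2, 3, 1)   # NCHW tensor stored as NHWC
assert axis_order.index(dim) == 2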
+from typing import cast, Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper + +import numpy as np +import torch +from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpUnpack, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class Unbind(NodeVisitor): + target = ["aten.unbind.int"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + ) + unbind_input_tensors = [input_tensor_wrapper] + + unbind_output_tensors = [] + for i in range(len(node.meta["val"])): + output_tensor = self.get_tensor(node, node, i) + output_tensor_wrapper = self.define_tensor( + node, + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + wrapper_idx=i, + ) + unbind_output_tensors.append(output_tensor_wrapper) + + dim = 0 if len(node.args) == 1 else cast(int, node.args[1]) + if dim < 0: + dim = dim % len(input_tensor.shape) + if QCOM_AXIS_ORDER in node.meta: + dim = node.meta[QCOM_AXIS_ORDER].index(dim) + unbind_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpUnpack.op_name, + ) + unbind_op.AddInputTensors(unbind_input_tensors) + unbind_op.AddOutputTensors(unbind_output_tensors) + + unbind_op.AddScalarParam( + OpUnpack.param_axis, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: np.uint32(dim)}, + ) + + return unbind_op diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 1d55d56de0f..9613c755c7c 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -85,6 +85,11 @@ class OpElementWiseAdd: op_name: str = "ElementWiseAdd" +@dataclass(init=False, frozen=True) +class OpElementWiseAnd: + op_name: str = "ElementWiseAnd" + + @dataclass(init=False, frozen=True) class OpElementWiseCeil: op_name = "ElementWiseCeil" @@ -100,6 +105,11 @@ class OpElementWiseDivide: op_name: str = "ElementWiseDivide" +@dataclass(init=False, frozen=True) +class OpElementWiseExp: + op_name: str = "ElementWiseExp" + + @dataclass(init=False, frozen=True) class OpElementWiseEqual: op_name: str = "ElementWiseEqual" @@ -193,11 +203,22 @@ class OpElementWiseSelect: op_name = "ElementWiseSelect" +@dataclass(init=False, frozen=True) +class OpElementWiseSqrt: + op_name = "ElementWiseSquareRoot" + + @dataclass(init=False, frozen=True) class OpElementWiseSubtract: op_name = "ElementWiseSubtract" +@dataclass(init=False, frozen=True) +class OpElu: + op_name: str = "Elu" + param_alpha: str = "alpha" + + @dataclass(init=False, frozen=True) class OpExpandDims: op_name: str = "ExpandDims" @@ -423,11 +444,6 @@ class OpSplit: param_split_index: str = "split_index" -@dataclass(init=False, frozen=True) -class OpSqrt: - op_name: str = "ElementWiseSquareRoot" - - @dataclass(init=False, frozen=True) class OpSqueeze: op_name: str = "Squeeze" @@ -474,3 +490,9 @@ class OpTransposeConv2d: param_pad_amount: str = "pad_amount" param_group: str = "group" param_output_padding: str = "output_padding" + + 
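Unlike most visitors, the unbind builder above registers one output wrapper per result (via wrapper_idx), one for each tensor in node.meta["val"]; this mirrors how aten.unbind.int fans out:

import torch

x = torch.arange(6).reshape(2, 3)
a, b = torch.unbind(x, dim=0)     # one tensor per slice along dim 0
assert a.tolist() == [0, 1, 2] and b.tolist() == [3, 4, 5]
# The unbound dim disappears from each output, so QNN's UnPack emits
# len(node.meta["val"]) tensors of rank input_rank - 1.
assert a.shape == torch.Size([3])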
+@dataclass(init=False, frozen=True) +class OpUnpack: + op_name: str = "UnPack" + param_axis: str = "axis" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 8254bb64db0..b427c59ce07 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import _operator +import torch + from executorch.exir.dialects._ops import ops as exir_ops not_supported_operator = [ @@ -20,6 +22,7 @@ exir_ops.edge.aten.arange.start_step, exir_ops.edge.aten.full.default, exir_ops.edge.aten.full_like.default, + torch.ops.aten.scalar_tensor.default, ] allow_list_operator = [ diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index c1e1aa25b08..93af5e86c97 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -378,6 +378,20 @@ def annotate_sin(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.scalar_tensor.default]) +def annotate_scalar_tensor(node: Node, quantization_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + if _is_float_tensor(node): + # workaround: nodes with kwargs could not be correctly annotated + node.kwargs = {} + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map={}, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) + + @register_annotator([torch.ops.aten.tanh.default]) def annotate_tanh(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -680,6 +694,11 @@ def annotate_sigmoid(node: Node, quantization_config: QuantizationConfig) -> Non ) +@register_annotator([torch.ops.aten.__and__.Tensor]) +def annotate_and(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_binary(node, quantization_config) + + @register_annotator([torch.ops.aten.bitwise_or.Tensor, torch.ops.aten.__or__.Tensor]) def annotate_bitwise_or(node: Node, quantization_config: QuantizationConfig) -> None: annotate_binary(node, quantization_config) @@ -717,6 +736,11 @@ def annotate_transpose(node: Node, quantization_config: QuantizationConfig) -> N annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.elu.default]) +def annotate_elu(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_single_in_single_out(node, quantization_config) + + @register_annotator([torch.ops.aten.embedding.default]) def annotate_embedding(node: Node, quantization_config: QuantizationConfig) -> None: weight = node.args[0] @@ -763,6 +787,11 @@ def annotate_index_put(node: Node, quantization_config: QuantizationConfig) -> N ) +@register_annotator([torch.ops.aten.exp.default]) +def annotate_exp(node: Node, quantization_config: QuantizationConfig) -> None: + annotate_single_in_single_out(node, quantization_config) + + @register_annotator([torch.ops.aten.expand.default, torch.ops.aten.expand_as.default]) def annotate_expand(node: Node, quantization_config: QuantizationConfig) -> None: annotate_in_out_obs_sharing_op(node, quantization_config) @@ -812,18 +841,28 @@ def annotate_flatten(node: Node, quantization_config: QuantizationConfig) -> Non @register_annotator([torch.ops.aten.stack.default]) def annotate_stack(node: Node, quantization_config: QuantizationConfig) -> None:
+ input_nodes = node.args[0] + if _is_annotated([node]) or not _is_float_tensor(node): + return + + assert isinstance(input_nodes, Sequence) + + first_input_node = input_nodes[0] input_qspec_map = {} - for input_act in node.args[0]: - assert isinstance(input_act, Node) - input_qspec_map[input_act] = quantization_config.input_activation + assert isinstance(first_input_node, Node) + input_qspec_map[first_input_node] = quantization_config.input_activation + share_qparams_with_input_act0_qspec = SharedQuantizationSpec( + (first_input_node, node) + ) - node_tensor = node.meta.get("val") - if torch.is_tensor(node_tensor) and node_tensor.dtype == torch.int64: - continue + for input_node in input_nodes[1:]: + if input_node not in input_qspec_map: + assert isinstance(input_node, Node) + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, + output_qspec=share_qparams_with_input_act0_qspec, _annotated=True, ) @@ -894,6 +933,7 @@ def annotate_bmm(node: Node, quantization_config: QuantizationConfig) -> None: torch.ops.aten.conv2d.default, torch.ops.aten.conv1d.default, torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv_transpose1d.default, ] ) def annotate_conv2d(node: Node, quantization_config: QuantizationConfig) -> None: @@ -1059,7 +1099,7 @@ def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> @register_annotator([torch.ops.aten.cat.default, torch.ops.aten.concat.default]) def annotate_cat(node: Node, quantization_config: QuantizationConfig) -> None: input_nodes = node.args[0] - if _is_annotated([node]): + if _is_annotated([node]) or not _is_float_tensor(node): return assert isinstance(input_nodes, Sequence) @@ -1087,23 +1127,28 @@ def annotate_cat(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator([torch.ops.aten.unbind.int]) def annotate_unbind(node: Node, quantization_config: QuantizationConfig) -> None: - if _is_annotated([node]): + # unbind.int can be either float or int; only quantize when the input is float.
+ if _is_annotated([node]) or not _is_float_tensor(node.args[0]): return input_qspec_map = {} input_act = node.args[0] assert isinstance(input_act, Node) + share_qparams_with_out_node0_qspec = SharedQuantizationSpec((node.args[0], node)) input_qspec_map[input_act] = quantization_config.input_activation - node_tensor = node.meta.get("val") - if torch.is_tensor(node_tensor) and node_tensor.dtype == torch.int64: - return - node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_out_node0_qspec, _annotated=True, ) + for user in node.users: + user.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + output_qspec=share_qparams_with_out_node0_qspec, + _annotated=True, + ) + @register_annotator([torch.ops.aten.split.Tensor, torch.ops.aten.chunk.default]) def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None: @@ -1129,22 +1174,33 @@ def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator([torch.ops.aten.where.self]) def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None: - true_input_act = node.args[1] - false_input_act = node.args[2] if _is_annotated([node]): return - _annotate_input_qspec_map( - node, - true_input_act, - quantization_config.input_activation, - ) + input_qspec_map = {} + for input_node in node.args: + assert isinstance(input_node, Node) + if _is_float_tensor(input_node): + input_qspec_map[input_node] = quantization_config.input_activation - _annotate_input_qspec_map( - node, - false_input_act, - quantization_config.input_activation, + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=( + quantization_config.output_activation if _is_float_tensor(node) else None ), + _annotated=True, ) - _annotate_output_qspec(node, quantization_config.output_activation) - _mark_nodes_as_annotated([node]) + +@register_annotator([torch.ops.aten.zeros.default]) +def annotate_zeros(node: Node, quantization_config: QuantizationConfig) -> None: + if _is_annotated([node]) or not _is_float_tensor(node): + return + + # workaround: nodes with kwargs could not be correctly annotated + node.kwargs = {} + node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map={}, + output_qspec=quantization_config.output_activation, + _annotated=True, + ) diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 38570835bea..028ffb69f1d 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -10,12 +10,14 @@ import torch from executorch.backends.qualcomm._passes import ( DecomposeEinsum, + DecomposeExpM1, DecomposeLinalgVectorNorm, DecomposeSilu, LiftConstantScalarOperands, RecomposePixelUnshuffle, ReduceDynamicRange, - ReplaceInfBuffer, + ReplaceArangeArgs, + ReplaceInfValues, ) from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) @@ -273,11 +275,13 @@ def set_per_channel_linear_quant(self, enable: bool) -> None: def transform_for_annotation(self, model: GraphModule) -> GraphModule: model = ReduceDynamicRange()(model).graph_module model = RecomposePixelUnshuffle(quantization_capture=True)(model).graph_module + model = ReplaceArangeArgs()(model).graph_module model = DecomposeScaledDotProductAttention()(model).graph_module model = DecomposeSilu()(model).graph_module model = DecomposeEinsum()(model).graph_module + model =
DecomposeExpM1()(model).graph_module model = DecomposeLinalgVectorNorm(aten_dialect_capture=True)(model).graph_module - model = ReplaceInfBuffer()(model).graph_module + model = ReplaceInfValues()(model).graph_module model = LiftConstantScalarOperands()(model).graph_module return model diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index e5a9be8e75b..c3c439261d2 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -8,6 +8,19 @@ # module with related operator only + + +class And(torch.nn.Module): + def __init__(self, pos, neg): + super().__init__() + self.pos = pos + self.neg = neg + + def forward(self, x, y): + bitwise_and = torch.bitwise_and(x, y).bool() + return torch.where(bitwise_and, self.pos, self.neg) + + class Abs(torch.nn.Module): def __init__(self): super().__init__() @@ -462,6 +475,17 @@ def forward(self, x): return self.conv(x) +class ConvTranspose1dSingle(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose1d( + in_channels=1, out_channels=3, kernel_size=3, stride=2, padding=1, bias=bias + ) + + def forward(self, x): + return self.conv_transpose(x) + + class ConvTranspose2dSingle(torch.nn.Module): def __init__(self, bias=True): super().__init__() @@ -601,6 +625,15 @@ def forward(self, i, j): return torch.relu(torch.einsum("i,j->ij", i, j)) +class Elu(torch.nn.Module): + def __init__(self): + super().__init__() + self.elu = torch.nn.ELU(alpha=0.5) + + def forward(self, i): + return self.elu(i) + + class Embedding(torch.nn.Module): def __init__(self): super().__init__() @@ -645,6 +678,14 @@ def forward(self, x): return y.expand_as(x) +class ExpM1(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.special.expm1(x) + + class Full(torch.nn.Module): def __init__(self, fill, shape): super().__init__() @@ -1383,8 +1424,8 @@ class Stack(torch.nn.Module): def __init__(self): super().__init__() - def forward(self, x, y): - return torch.stack((x, y)) + def forward(self, x, y, z): + return torch.stack((x, y, z)) class Sub(torch.nn.Module): @@ -1493,3 +1534,29 @@ def __init__(self, pos, neg): def forward(self, x): return torch.where(x >= torch.zeros(x.shape), self.pos, self.neg) + + +class WhereConstantOther(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.where(x >= 0, torch.ones(x.shape), 0) + + +class WhereConstantAll(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.where(x >= 0, 1, 0) + + +class WhereConstantInf(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.nn.functional.softmax( + torch.where(x >= 0, 0.1, float("-inf")), dim=-1 + ) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 936b9c3efe4..05e368f372e 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -195,6 +195,16 @@ def test_qnn_backend_conv2d_channel_last(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose1d(self): + modules = [ + ConvTranspose1dSingle(), # noqa: F405 + ConvTranspose1dSingle(bias=False), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def 
test_qnn_backend_conv_transpose2d(self): modules = [ ConvTranspose2dSingle(), # noqa: F405 @@ -255,6 +265,14 @@ def test_qnn_backend_element_wise_add(self): self.lower_module_and_test_output(module, sample_input) index += 1 + def test_qnn_backend_element_wise_and(self): + module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 + sample_input = ( + torch.tensor([1, 0, 1, 0], dtype=torch.bool), + torch.tensor([1, 1, 0, 0], dtype=torch.bool), + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_element_wise_ceil(self): module = Ceil() # noqa: F405 sample_input = (torch.randn([2, 5, 1, 3]),) @@ -369,6 +387,12 @@ def test_qnn_backend_element_wise_sub(self): self.lower_module_and_test_output(module, sample_input) index += 1 + @unittest.expectedFailure + def test_qnn_backend_elu(self): + module = Elu() # noqa: F405 + sample_input = (torch.randn(2, 5, 1, 3),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_embedding(self): module = Embedding() # noqa: F405 sample_input = (torch.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]]).to(torch.int32),) @@ -398,6 +422,11 @@ def test_qnn_backend_expand(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_expm1(self): + sample_input = (torch.randn(3, 4, 5),) + module = ExpM1() # noqa: F405 + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_full(self): shape = (1, 2, 3, 4) module = Full(0.5, shape) # noqa: F405 @@ -758,7 +787,11 @@ def test_qnn_backend_slice_copy(self): def test_qnn_backend_stack(self): module = Stack() # noqa: F405 - sample_input = (torch.randn([1, 2, 3, 4]), torch.randn([1, 2, 3, 4])) + sample_input = ( + torch.randn([1, 2, 3, 4]), + torch.randn([1, 2, 3, 4]), + torch.randn([1, 2, 3, 4]), + ) self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_softmax(self): @@ -800,10 +833,16 @@ def test_qnn_backend_where(self): modules = [ Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 + WhereConstantOther(), # noqa: F405 + # WhereConstantAll(), # noqa: F405 TODO: constant dtype does not propagate when doing const i64->i32, causing where to fail since where does not support int64 output + WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), + (torch.randn(3, 2),), + # (torch.randn(3, 2),), + (torch.randn(30, 20),), ] for i, module in enumerate(modules): self.lower_module_and_test_output(module, sample_inputs[i]) @@ -1206,6 +1245,17 @@ def test_qnn_backend_conv2d_channel_last(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose1d(self): + modules = [ + ConvTranspose1dSingle(), # noqa: F405 + ConvTranspose1dSingle(bias=False), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose2d(self): modules = [ ConvTranspose2dSingle(), # noqa: F405 @@ -1271,6 +1321,15 @@ def test_qnn_backend_element_wise_add(self): self.lower_module_and_test_output(module, sample_input) index += 1 + def test_qnn_backend_element_wise_and(self): + module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 + sample_input = ( + torch.tensor([1, 0, 1, 0], dtype=torch.bool), +
torch.tensor([1, 1, 0, 0], dtype=torch.bool), + ) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_element_wise_ceil(self): module = Ceil() # noqa: F405 sample_input = (torch.randn([2, 5, 1, 3]),) @@ -1391,6 +1450,12 @@ def test_qnn_backend_element_wise_sub(self): self.lower_module_and_test_output(module, sample_input) index += 1 + def test_qnn_backend_elu(self): + module = Elu() # noqa: F405 + sample_input = (torch.randn(2, 5, 1, 3),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_embedding(self): module = Embedding() # noqa: F405 sample_input = (torch.Tensor([[1, 2, 4, 5], [4, 3, 2, 9]]).to(torch.int32),) @@ -1423,6 +1488,12 @@ def test_qnn_backend_expand(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_expm1(self): + sample_input = (torch.randn(3, 4, 5),) + module = ExpM1() # noqa: F405 + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_full(self): shape = (1, 2, 3, 4) module = Full(0.5, shape) # noqa: F405 @@ -1856,6 +1927,7 @@ def test_qnn_backend_stack(self): sample_input = ( torch.randn([1, 2, 3, 4]), torch.randn([1, 2, 3, 4]), + torch.randn([1, 2, 3, 4]), ) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) @@ -1894,10 +1966,16 @@ def test_qnn_backend_where(self): modules = [ Where(), # noqa: F405 WhereConstant(torch.randn(3, 2), torch.randn(3, 2)), # noqa: F405 + WhereConstantOther(), # noqa: F405 + # WhereConstantAll(), # noqa: F405, TODO: constant dtype does not propagate when doing const i64->i32, causing where to fail since where does not support int64 output + WhereConstantInf(), # noqa: F405 ] sample_inputs = [ (torch.randn(3, 2), torch.randn(3, 2), torch.randn(3, 2)), (torch.randn(3, 2),), + (torch.randn(3, 2),), + # (torch.randn(3, 2),), + (torch.randn(30, 20),), ] for i, module in enumerate(modules): module = self.get_qdq_module(module, sample_inputs[i]) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 8045e9e6443..7033f30997a 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -21,8 +21,10 @@ AnnotateQuantAttrs, ConstantI64toI32, ConvertBmmToMatmul, + ConvertConv1dToConv2d, ConvertToLinear, DecomposeAny, + DecomposeExpM1, DecomposeLinalgVectorNorm, ExpandBroadcastTensorShape, FoldQDQ, @@ -321,11 +323,12 @@ def canonicalize_program(obj): update_spill_fill_size(obj) -def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: +def get_decomp_table(passes_job) -> Dict[torch._ops.OperatorBase, Callable]: source_decompositions = core_aten_decompositions() # The below super ops are supported by QNN skip_decompositions = [ torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.elu.default, torch.ops.aten.instance_norm.default, torch.ops.aten.pixel_shuffle.default, torch.ops.aten.pixel_unshuffle.default, @@ -334,8 +337,17 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: torch.ops.pt2e_quant.quantize_affine.default, torch.ops.pt2e_quant.dequantize_affine.default, torch.ops.aten._safe_softmax.default, + torch.ops.aten.stack.default, + torch.ops.aten.unbind.int, ] + # If we want to annotate the decomposed ops, the ops must actually be decomposed, so remove them from the skip list.
+ if passes_job and passes_job.get(AnnotateDecomposed, False): + skip_decompositions = [ + skip_decomp_op + for skip_decomp_op in skip_decompositions + if skip_decomp_op not in AnnotateDecomposed.decomp_ops + ] remove_decompositions(source_decompositions, skip_decompositions) return source_decompositions @@ -353,10 +365,11 @@ def get_capture_program_passes(): # The second value in each tuple in `default_passes_and_setting` indicates whether the corresponding pass is activated by default. # If a pass is activated, it will be executed by default. default_passes_and_setting = [ - (AnnotateDecomposed, True), + (AnnotateDecomposed, False), (AnnotateQuantAttrs, True), (ConstantI64toI32, True), (ConvertBmmToMatmul, True), + (ConvertConv1dToConv2d, True), (ConvertToLinear, True), (DecomposeAny, True), (DecomposeLinalgVectorNorm, True), @@ -448,6 +461,7 @@ def _preprocess_module(module: torch.nn.Module, inputs: Tuple[torch.Tensor]): module = torch.export.export(module, inputs, strict=True).module() module = DecomposeScaledDotProductAttention()(module).graph_module module = DecomposeLinalgVectorNorm(True)(module).graph_module + module = DecomposeExpM1()(module).graph_module module = LiftConstantScalarOperands()(module).graph_module return module @@ -460,7 +474,8 @@ def capture_program( ) -> exir.ExirExportedProgram: module = _preprocess_module(module, inputs) ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes, strict=True) - decomposed_ep = ep.run_decompositions(get_decomp_table()) + # TODO: Handle the stack op. To run the annotate_decomposed pass for stack, the op must actually be decomposed, which means finding a way to remove it from the skip_decomposition table + decomposed_ep = ep.run_decompositions(get_decomp_table(passes_job)) core_ep = ExirExportedProgram(decomposed_ep, False) core_ep.transform(TensorI64toI32(edge_program=core_ep)) edge_ep = core_ep.to_edge(qnn_edge_config()) diff --git a/backends/xnnpack/test/models/deeplab_v3.py b/backends/xnnpack/test/models/deeplab_v3.py index 9913296521d..c47832b63d1 100644 --- a/backends/xnnpack/test/models/deeplab_v3.py +++ b/backends/xnnpack/test/models/deeplab_v3.py @@ -23,6 +23,9 @@ def forward(self, *args): class TestDeepLabV3(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + dl3 = DL3Wrapper() dl3 = dl3.eval() model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/edsr.py b/backends/xnnpack/test/models/edsr.py index 34b5898cf41..138ea62ddf5 100644 --- a/backends/xnnpack/test/models/edsr.py +++ b/backends/xnnpack/test/models/edsr.py @@ -14,6 +14,9 @@ class TestEDSR(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + edsr = edsr_r16f64(2, False).eval() # noqa model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/emformer_rnnt.py b/backends/xnnpack/test/models/emformer_rnnt.py index 5cf4337307c..d5125361def 100644 --- a/backends/xnnpack/test/models/emformer_rnnt.py +++ b/backends/xnnpack/test/models/emformer_rnnt.py @@ -13,6 +13,9 @@ class TestEmformerModel(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class EmformerRnnt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/models/inception_v3.py b/backends/xnnpack/test/models/inception_v3.py index 59fd56d6af7..d5d6d086206 100644 --- a/backends/xnnpack/test/models/inception_v3.py +++ b/backends/xnnpack/test/models/inception_v3.py @@ -13,6 +13,9 @@ class TestInceptionV3(unittest.TestCase): + def setUp(self): +
torch._dynamo.reset() + ic3 = models.inception_v3(weights="IMAGENET1K_V1").eval() # noqa model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/inception_v4.py b/backends/xnnpack/test/models/inception_v4.py index e8a785116a3..c5239bb2dd3 100644 --- a/backends/xnnpack/test/models/inception_v4.py +++ b/backends/xnnpack/test/models/inception_v4.py @@ -12,6 +12,9 @@ class TestInceptionV4(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + ic4 = inception_v4(pretrained=False).eval() model_inputs = (torch.randn(3, 299, 299).unsqueeze(0),) diff --git a/backends/xnnpack/test/models/llama2_et_example.py b/backends/xnnpack/test/models/llama2_et_example.py index f1dce43c3c9..378f9dd3d48 100644 --- a/backends/xnnpack/test/models/llama2_et_example.py +++ b/backends/xnnpack/test/models/llama2_et_example.py @@ -13,6 +13,9 @@ class TestLlama2ETExample(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + def test_f32(self): self._test() diff --git a/backends/xnnpack/test/models/mobilebert.py b/backends/xnnpack/test/models/mobilebert.py index ca18e6c265d..57c099e87d1 100644 --- a/backends/xnnpack/test/models/mobilebert.py +++ b/backends/xnnpack/test/models/mobilebert.py @@ -12,6 +12,9 @@ class TestMobilebert(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + # pyre-ignore mobilebert = MobileBertModel(MobileBertConfig()).eval() example_inputs = (torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]),) diff --git a/backends/xnnpack/test/models/mobilenet_v2.py b/backends/xnnpack/test/models/mobilenet_v2.py index 4ee28af6b95..2ff93303d50 100644 --- a/backends/xnnpack/test/models/mobilenet_v2.py +++ b/backends/xnnpack/test/models/mobilenet_v2.py @@ -14,6 +14,9 @@ class TestMobileNetV2(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights) mv2 = mv2.eval() model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/mobilenet_v3.py b/backends/xnnpack/test/models/mobilenet_v3.py index cacd8b5cc87..f64b7352b7f 100644 --- a/backends/xnnpack/test/models/mobilenet_v3.py +++ b/backends/xnnpack/test/models/mobilenet_v3.py @@ -13,6 +13,9 @@ class TestMobileNetV3(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + mv3 = models.mobilenetv3.mobilenet_v3_small(pretrained=True) mv3 = mv3.eval() model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/resnet.py b/backends/xnnpack/test/models/resnet.py index 4ad6a7d5f47..9f4989e1724 100644 --- a/backends/xnnpack/test/models/resnet.py +++ b/backends/xnnpack/test/models/resnet.py @@ -13,6 +13,9 @@ class TestResNet18(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + inputs = (torch.randn(1, 3, 224, 224),) dynamic_shapes = ( { diff --git a/backends/xnnpack/test/models/torchvision_vit.py b/backends/xnnpack/test/models/torchvision_vit.py index 6bebd284e53..f9153032cd8 100644 --- a/backends/xnnpack/test/models/torchvision_vit.py +++ b/backends/xnnpack/test/models/torchvision_vit.py @@ -12,6 +12,9 @@ class TestViT(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + vit = models.vision_transformer.vit_b_16(weights="IMAGENET1K_V1") vit = vit.eval() model_inputs = (torch.randn(1, 3, 224, 224),) diff --git a/backends/xnnpack/test/models/very_big_model.py b/backends/xnnpack/test/models/very_big_model.py index 3545287c628..f4f10f1611c 100644 --- a/backends/xnnpack/test/models/very_big_model.py +++ 
b/backends/xnnpack/test/models/very_big_model.py @@ -11,6 +11,9 @@ class TestVeryBigModel(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class BigModel(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/models/w2l.py b/backends/xnnpack/test/models/w2l.py index 07b3bf56b32..216fb7a89f0 100644 --- a/backends/xnnpack/test/models/w2l.py +++ b/backends/xnnpack/test/models/w2l.py @@ -12,6 +12,9 @@ class TestW2L(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + batch_size = 10 input_frames = 700 vocab_size = 4096 diff --git a/backends/xnnpack/test/ops/test_abs.py b/backends/xnnpack/test/ops/test_abs.py index a41bee47470..48feaafb0e1 100644 --- a/backends/xnnpack/test/ops/test_abs.py +++ b/backends/xnnpack/test/ops/test_abs.py @@ -11,6 +11,9 @@ class TestAbs(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Abs(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_add.py b/backends/xnnpack/test/ops/test_add.py index 29a87df1303..2416879f5ce 100644 --- a/backends/xnnpack/test/ops/test_add.py +++ b/backends/xnnpack/test/ops/test_add.py @@ -11,6 +11,9 @@ class TestAdd(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Add(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_avgpool2d.py b/backends/xnnpack/test/ops/test_avgpool2d.py index b471fd914c2..c1f149e5a93 100644 --- a/backends/xnnpack/test/ops/test_avgpool2d.py +++ b/backends/xnnpack/test/ops/test_avgpool2d.py @@ -11,6 +11,9 @@ class TestAvgPool2d(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class AvgPool2d(torch.nn.Module): def __init__( self, count_include_pad=False, ceil_mode=False, divisor_override=None diff --git a/backends/xnnpack/test/ops/test_bilinear2d.py b/backends/xnnpack/test/ops/test_bilinear2d.py index 24c990d6bb1..1fd3c147328 100644 --- a/backends/xnnpack/test/ops/test_bilinear2d.py +++ b/backends/xnnpack/test/ops/test_bilinear2d.py @@ -14,6 +14,9 @@ class TestUpsampleBilinear2d(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class StaticResizeBilinear2dModule(torch.nn.Module): def forward(self, x): a = torch.nn.functional.interpolate( diff --git a/backends/xnnpack/test/ops/test_bmm.py b/backends/xnnpack/test/ops/test_bmm.py index 1c6235e5f7e..a029738e771 100644 --- a/backends/xnnpack/test/ops/test_bmm.py +++ b/backends/xnnpack/test/ops/test_bmm.py @@ -11,6 +11,9 @@ class TestBMM(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class BMM(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_cat.py b/backends/xnnpack/test/ops/test_cat.py index dd551ea3fa7..11e246f541a 100644 --- a/backends/xnnpack/test/ops/test_cat.py +++ b/backends/xnnpack/test/ops/test_cat.py @@ -13,6 +13,9 @@ class TestCat(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Cat(torch.nn.Module): def __init__(self, dim=0): super().__init__() diff --git a/backends/xnnpack/test/ops/test_ceil.py b/backends/xnnpack/test/ops/test_ceil.py index 9caee15ad5b..717df6f47e6 100644 --- a/backends/xnnpack/test/ops/test_ceil.py +++ b/backends/xnnpack/test/ops/test_ceil.py @@ -11,6 +11,9 @@ class TestCeil(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Ceil(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_check_quant_params.py 
b/backends/xnnpack/test/ops/test_check_quant_params.py index cd18568afba..b76935a9f72 100644 --- a/backends/xnnpack/test/ops/test_check_quant_params.py +++ b/backends/xnnpack/test/ops/test_check_quant_params.py @@ -14,6 +14,9 @@ class TestCheckQuantParams(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + def create_invalid_value_injector( self, invalid_value, is_per_channel=False, is_zp=False ): @@ -46,6 +49,7 @@ def inject_invalid_scale_in_per_tensor(aten): return inject_invalid_scale_in_per_tensor def _test_check_quant_message(self, ep_modifier, expected_message): + torch._dynamo.reset() mod = torch.nn.Linear(10, 10) quantizer = XNNPACKQuantizer() captured = export_for_training(mod, (torch.randn(1, 10),)).module() diff --git a/backends/xnnpack/test/ops/test_clamp.py b/backends/xnnpack/test/ops/test_clamp.py index 9fb8935553e..671d9372e18 100644 --- a/backends/xnnpack/test/ops/test_clamp.py +++ b/backends/xnnpack/test/ops/test_clamp.py @@ -11,6 +11,9 @@ class TestClamp(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Clamp(torch.nn.Module): def __init__(self, min_val=None, max_val=None): super().__init__() diff --git a/backends/xnnpack/test/ops/test_conv1d.py b/backends/xnnpack/test/ops/test_conv1d.py index b4c8c414929..036500b29d5 100644 --- a/backends/xnnpack/test/ops/test_conv1d.py +++ b/backends/xnnpack/test/ops/test_conv1d.py @@ -19,6 +19,9 @@ class TestConv1d(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Conv1d(torch.nn.Module): def __init__(self, dtype: torch.dtype = torch.float): groups = 1 diff --git a/backends/xnnpack/test/ops/test_conv2d.py b/backends/xnnpack/test/ops/test_conv2d.py index d3e5db8df2d..80b731bd18e 100644 --- a/backends/xnnpack/test/ops/test_conv2d.py +++ b/backends/xnnpack/test/ops/test_conv2d.py @@ -170,6 +170,9 @@ def get_inputs(self): class TestConv2d(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + def _test( self, m: torch.nn.Module, diff --git a/backends/xnnpack/test/ops/test_div.py b/backends/xnnpack/test/ops/test_div.py index 9bca5feed48..b53c59df8e1 100644 --- a/backends/xnnpack/test/ops/test_div.py +++ b/backends/xnnpack/test/ops/test_div.py @@ -11,6 +11,9 @@ class TestDiv(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Div(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_elu.py b/backends/xnnpack/test/ops/test_elu.py index f976c29d799..68a0c687779 100644 --- a/backends/xnnpack/test/ops/test_elu.py +++ b/backends/xnnpack/test/ops/test_elu.py @@ -11,6 +11,9 @@ class TestElu(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class ELU(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_floor.py b/backends/xnnpack/test/ops/test_floor.py index dfbe7fb18c0..5c543fd0534 100644 --- a/backends/xnnpack/test/ops/test_floor.py +++ b/backends/xnnpack/test/ops/test_floor.py @@ -11,6 +11,9 @@ class TestFloor(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Floor(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_hardswish.py b/backends/xnnpack/test/ops/test_hardswish.py index 899a119ed44..561551fc433 100644 --- a/backends/xnnpack/test/ops/test_hardswish.py +++ b/backends/xnnpack/test/ops/test_hardswish.py @@ -11,6 +11,9 @@ class TestHardswish(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Hardswish(torch.nn.Module): def __init__(self): super().__init__() diff 
--git a/backends/xnnpack/test/ops/test_hardtanh.py b/backends/xnnpack/test/ops/test_hardtanh.py index e35e840e3c3..6f2914010c7 100644 --- a/backends/xnnpack/test/ops/test_hardtanh.py +++ b/backends/xnnpack/test/ops/test_hardtanh.py @@ -11,6 +11,9 @@ class TestHardTanh(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class HardTanh(torch.nn.Module): def __init__(self, min_val=-1.0, max_val=1.0): super().__init__() diff --git a/backends/xnnpack/test/ops/test_leaky_relu.py b/backends/xnnpack/test/ops/test_leaky_relu.py index 32f73486977..c0921ddf3ad 100644 --- a/backends/xnnpack/test/ops/test_leaky_relu.py +++ b/backends/xnnpack/test/ops/test_leaky_relu.py @@ -11,6 +11,9 @@ class TestLeakyRelu(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class LeakyReLU(torch.nn.Module): def __init__(self, **kwargs): super().__init__() diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index cf9473180bb..849a1b237e8 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -219,6 +219,9 @@ class TestLinear(unittest.TestCase): should produce strictly better results compared to Per-Tensor Quantization """ + def setUp(self): + torch._dynamo.reset() + @staticmethod def _get_4b_dqconfig() -> QuantizationConfig: # Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. diff --git a/backends/xnnpack/test/ops/test_lstm.py b/backends/xnnpack/test/ops/test_lstm.py index 6c174b16f33..db4deb9aae4 100644 --- a/backends/xnnpack/test/ops/test_lstm.py +++ b/backends/xnnpack/test/ops/test_lstm.py @@ -14,6 +14,9 @@ class TestLSTM(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class LSTMLinear(torch.nn.Module): def __init__(self, input_size, hidden_size, out_size): super().__init__() diff --git a/backends/xnnpack/test/ops/test_max_dim.py b/backends/xnnpack/test/ops/test_max_dim.py index c660a5a6d26..f209845372e 100644 --- a/backends/xnnpack/test/ops/test_max_dim.py +++ b/backends/xnnpack/test/ops/test_max_dim.py @@ -11,6 +11,9 @@ class TestMaxDim(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Max(torch.nn.Module): def forward(self, x): max_values_1, max_indices_1 = torch.max(x, dim=2, keepdim=True) diff --git a/backends/xnnpack/test/ops/test_maximum.py b/backends/xnnpack/test/ops/test_maximum.py index 30dfa5503a9..c594452631c 100644 --- a/backends/xnnpack/test/ops/test_maximum.py +++ b/backends/xnnpack/test/ops/test_maximum.py @@ -11,6 +11,9 @@ class TestMaximum(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Maximum(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_maxpool2d.py b/backends/xnnpack/test/ops/test_maxpool2d.py index 521235232a2..f82b27b09ec 100644 --- a/backends/xnnpack/test/ops/test_maxpool2d.py +++ b/backends/xnnpack/test/ops/test_maxpool2d.py @@ -15,6 +15,9 @@ class TestMaxPool2d(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class MaxPool2d(torch.nn.Module): def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1): super().__init__() diff --git a/backends/xnnpack/test/ops/test_mean_dim.py b/backends/xnnpack/test/ops/test_mean_dim.py index 3bac5f3239c..81a93c3e97e 100644 --- a/backends/xnnpack/test/ops/test_mean_dim.py +++ b/backends/xnnpack/test/ops/test_mean_dim.py @@ -11,6 +11,9 @@ class TestMeanDim(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class MeanDim(torch.nn.Module): def __init__(self, dims): 
super().__init__() diff --git a/backends/xnnpack/test/ops/test_minimum.py b/backends/xnnpack/test/ops/test_minimum.py index 406ac8485e5..fe1af3de5ab 100644 --- a/backends/xnnpack/test/ops/test_minimum.py +++ b/backends/xnnpack/test/ops/test_minimum.py @@ -11,6 +11,9 @@ class TestMinimum(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Minimum(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_multiply.py b/backends/xnnpack/test/ops/test_multiply.py index db50bc5dd44..3315200005d 100644 --- a/backends/xnnpack/test/ops/test_multiply.py +++ b/backends/xnnpack/test/ops/test_multiply.py @@ -11,6 +11,9 @@ class TestMul(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Mul(torch.nn.Module): def forward(self, x, y): z = x * y diff --git a/backends/xnnpack/test/ops/test_negate.py b/backends/xnnpack/test/ops/test_negate.py index 4d158612e97..5022255e484 100644 --- a/backends/xnnpack/test/ops/test_negate.py +++ b/backends/xnnpack/test/ops/test_negate.py @@ -11,6 +11,9 @@ class TestNegate(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Negate(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_permute.py b/backends/xnnpack/test/ops/test_permute.py index b348fc8af6d..2991ba1773d 100644 --- a/backends/xnnpack/test/ops/test_permute.py +++ b/backends/xnnpack/test/ops/test_permute.py @@ -11,6 +11,9 @@ class TestPermute(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Permute(torch.nn.Module): def __init__(self, dims): self.dims = dims diff --git a/backends/xnnpack/test/ops/test_pow.py b/backends/xnnpack/test/ops/test_pow.py index ac902ae44be..2accb010210 100644 --- a/backends/xnnpack/test/ops/test_pow.py +++ b/backends/xnnpack/test/ops/test_pow.py @@ -11,6 +11,9 @@ class TestPow(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Pow(torch.nn.Module): def __init__(self, exp): super().__init__() diff --git a/backends/xnnpack/test/ops/test_prelu.py b/backends/xnnpack/test/ops/test_prelu.py index f73648dfa25..47b2851278c 100644 --- a/backends/xnnpack/test/ops/test_prelu.py +++ b/backends/xnnpack/test/ops/test_prelu.py @@ -11,6 +11,9 @@ class TestPrelu(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class PReLU(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_quantize_per_tensor.py b/backends/xnnpack/test/ops/test_quantize_per_tensor.py index c2117987536..9e876c09671 100644 --- a/backends/xnnpack/test/ops/test_quantize_per_tensor.py +++ b/backends/xnnpack/test/ops/test_quantize_per_tensor.py @@ -13,6 +13,9 @@ class TestQuantizePerTensor(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + def test_qs8_quantize_per_tensor(self): class Quant(torch.nn.Module): def forward(self, x): diff --git a/backends/xnnpack/test/ops/test_relu.py b/backends/xnnpack/test/ops/test_relu.py index 8672b1d3e4e..508c1ceeffa 100644 --- a/backends/xnnpack/test/ops/test_relu.py +++ b/backends/xnnpack/test/ops/test_relu.py @@ -11,6 +11,9 @@ class TestRelu(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Relu(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_rsqrt.py b/backends/xnnpack/test/ops/test_rsqrt.py index e5d704a0467..5405e966359 100644 --- a/backends/xnnpack/test/ops/test_rsqrt.py +++ b/backends/xnnpack/test/ops/test_rsqrt.py @@ -11,6 +11,9 @@ class 
TestRsqrt(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_sdpa.py b/backends/xnnpack/test/ops/test_sdpa.py index de5c7174ab5..205b6d4ab36 100644 --- a/backends/xnnpack/test/ops/test_sdpa.py +++ b/backends/xnnpack/test/ops/test_sdpa.py @@ -15,6 +15,9 @@ class TestSDPA(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class SDPA(torch.nn.Module): def __init__(self, scale: Optional[float] = None): super().__init__() diff --git a/backends/xnnpack/test/ops/test_sigmoid.py b/backends/xnnpack/test/ops/test_sigmoid.py index a9acd4df6db..fe55f0f1ef5 100644 --- a/backends/xnnpack/test/ops/test_sigmoid.py +++ b/backends/xnnpack/test/ops/test_sigmoid.py @@ -11,6 +11,9 @@ class TestSigmoid(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Sigmoid(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_slice_copy.py b/backends/xnnpack/test/ops/test_slice_copy.py index 8ff37368578..ea65571b1e8 100644 --- a/backends/xnnpack/test/ops/test_slice_copy.py +++ b/backends/xnnpack/test/ops/test_slice_copy.py @@ -11,6 +11,9 @@ class TestSliceCopy(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + def _test_slice_copy(self, module, inputs, copy_count=1, edge_copy_count=1): ( Tester(module, inputs) diff --git a/backends/xnnpack/test/ops/test_softmax.py b/backends/xnnpack/test/ops/test_softmax.py index f909e8ce5f2..bf078860ba5 100644 --- a/backends/xnnpack/test/ops/test_softmax.py +++ b/backends/xnnpack/test/ops/test_softmax.py @@ -11,6 +11,9 @@ class TestSoftmax(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Softmax(torch.nn.Module): def __init__(self, dim): super().__init__() diff --git a/backends/xnnpack/test/ops/test_sqrt.py b/backends/xnnpack/test/ops/test_sqrt.py index eaeb3b9f700..ee800c62568 100644 --- a/backends/xnnpack/test/ops/test_sqrt.py +++ b/backends/xnnpack/test/ops/test_sqrt.py @@ -11,6 +11,9 @@ class TestSqrt(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Sqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_square.py b/backends/xnnpack/test/ops/test_square.py index 32a19639343..c7a567239bb 100644 --- a/backends/xnnpack/test/ops/test_square.py +++ b/backends/xnnpack/test/ops/test_square.py @@ -11,6 +11,9 @@ class TestSquare(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Square(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_static_constant_pad.py b/backends/xnnpack/test/ops/test_static_constant_pad.py index b1b41afe8cf..c5d103f596a 100644 --- a/backends/xnnpack/test/ops/test_static_constant_pad.py +++ b/backends/xnnpack/test/ops/test_static_constant_pad.py @@ -11,6 +11,9 @@ class TestStaticConstantPad(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class StaticConstantPadFunctional(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/ops/test_sub.py b/backends/xnnpack/test/ops/test_sub.py index fb3d3d3f948..06219730ddb 100644 --- a/backends/xnnpack/test/ops/test_sub.py +++ b/backends/xnnpack/test/ops/test_sub.py @@ -11,6 +11,9 @@ class TestSub(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + class Sub(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/xnnpack/test/passes/test_activation_fusion.py 
b/backends/xnnpack/test/passes/test_activation_fusion.py index 5f340f61b2e..6a1182dc7fb 100644 --- a/backends/xnnpack/test/passes/test_activation_fusion.py +++ b/backends/xnnpack/test/passes/test_activation_fusion.py @@ -16,6 +16,9 @@ class TestActivationFusion(unittest.TestCase): PassStage = RunPasses([ConvertToLinearPass, FuseActivationPass]) + def setUp(self): + torch._dynamo.reset() + def check_node_has_tag(self, graph_module, node_target, tag): for n in graph_module.graph.nodes: if n.op == "call_function" and n.target == node_target: diff --git a/backends/xnnpack/test/passes/test_batch_norm_fusion.py b/backends/xnnpack/test/passes/test_batch_norm_fusion.py index 59d0e0a2072..70c93c3751b 100644 --- a/backends/xnnpack/test/passes/test_batch_norm_fusion.py +++ b/backends/xnnpack/test/passes/test_batch_norm_fusion.py @@ -18,6 +18,9 @@ class TestBatchNormFusion(unittest.TestCase): PassStage = RunPasses([FuseBatchNormWithConvPass]) bn_name = "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default" + def setUp(self): + torch._dynamo.reset() + class ModelConvBN(torch.nn.Module): def __init__( self, diff --git a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py index c1438b29213..6d60f9d76b5 100644 --- a/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py +++ b/backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py @@ -17,6 +17,9 @@ class TestChannelsLastTaggedReshapePass(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + PassStage = RunPasses([ChannelsLastTaggedReshapePass]) # Dictionary mapping modules to expected number of reshapes modules = { diff --git a/backends/xnnpack/test/passes/test_convert_to_linear.py b/backends/xnnpack/test/passes/test_convert_to_linear.py index a07f8cf61ce..0e7bc7d01c4 100644 --- a/backends/xnnpack/test/passes/test_convert_to_linear.py +++ b/backends/xnnpack/test/passes/test_convert_to_linear.py @@ -14,6 +14,9 @@ class TestConvertToLinear(unittest.TestCase): PassStage = RunPasses([ConvertToLinearPass]) + def setUp(self): + torch._dynamo.reset() + def test_fp32_convert_to_linear(self): in_sizes = [1, 4, 4] input_sizes = [4, 37, 17] @@ -21,6 +24,7 @@ def test_fp32_convert_to_linear(self): bias_vals = [True, True, False] for i, _ in enumerate(in_sizes): + torch._dynamo.reset() in_size = int(in_sizes[i]) input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) diff --git a/backends/xnnpack/test/passes/test_decompose_cat_pass.py b/backends/xnnpack/test/passes/test_decompose_cat_pass.py index beb1761aec8..38537a99c4d 100644 --- a/backends/xnnpack/test/passes/test_decompose_cat_pass.py +++ b/backends/xnnpack/test/passes/test_decompose_cat_pass.py @@ -16,6 +16,9 @@ class TestDecomposeCatPass(unittest.TestCase): PassStage = RunPasses([DecomposeConcatenate]) cat_name = "executorch_exir_dialects_edge__ops_aten_cat_default" + def setUp(self): + torch._dynamo.reset() + class Cat(torch.nn.Module): def forward(self, *args): xs = [*args] diff --git a/backends/xnnpack/test/passes/test_remove_get_item_pass.py b/backends/xnnpack/test/passes/test_remove_get_item_pass.py index 2365c9bba0c..4d71d61afd7 100644 --- a/backends/xnnpack/test/passes/test_remove_get_item_pass.py +++ b/backends/xnnpack/test/passes/test_remove_get_item_pass.py @@ -16,6 +16,9 @@ class TestRemoveGetItemPass(unittest.TestCase): max_pool2d_name = "executorch_exir_dialects_edge__ops_aten_max_pool2d_default" amax_name = 
"executorch_exir_dialects_edge__ops_aten_amax_default" + def setUp(self): + torch._dynamo.reset() + class MaxPool2dModule(torch.nn.Module): def __init__( self, diff --git a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py index 05d1ac9e8b6..6fec7726835 100644 --- a/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/test/passes/test_tag_implicit_q_dq_pass.py @@ -20,6 +20,9 @@ class TestTagImplicitQDq(unittest.TestCase): PassStage = RunPasses([DuplicateDequantNodePass, TagImplicitQDqPass]) + def setUp(self): + torch._dynamo.reset() + class QDqModule(torch.nn.Module): def __init__(self): super().__init__() diff --git a/docs/source/backends-coreml.md b/docs/source/backends-coreml.md index a06820b2d08..126727735ae 100644 --- a/docs/source/backends-coreml.md +++ b/docs/source/backends-coreml.md @@ -28,12 +28,6 @@ Before starting, make sure you install the Xcode Command Line Tools: xcode-select --install ``` -Finally you must install the CoreML backend by running the following script: -```bash -sh ./backends/apple/coreml/scripts/install_requirements.sh -``` - - ---- ## Using the CoreML Backend diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index 2d2b017aca1..7a2f749e185 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -43,10 +43,17 @@ The version is documented in QNN SDK. ### Hardware: You will need an Android smartphone with adb-connected running on one of below Qualcomm SoCs: + - SA8295 - SM8450 (Snapdragon 8 Gen 1) - SM8475 (Snapdragon 8 Gen 1+) - SM8550 (Snapdragon 8 Gen 2) - SM8650 (Snapdragon 8 Gen 3) + - SM8750 (Snapdragon 8 Elite) + - SSG2115P + - SSG2125P + - SXR1230P + - SXR2230P + - SXR2330P This example is verified with SM8550 and SM8450. diff --git a/docs/source/demo-apps-android.md b/docs/source/demo-apps-android.md deleted file mode 100644 index 5d6dccf5734..00000000000 --- a/docs/source/demo-apps-android.md +++ /dev/null @@ -1,2 +0,0 @@ -```{include} ../../examples/demo-apps/android/ExecuTorchDemo/README.md -``` diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md index 4d8e3f0189d..741454fed27 100644 --- a/docs/source/getting-started.md +++ b/docs/source/getting-started.md @@ -14,7 +14,7 @@ The following are required to install the ExecuTorch host libraries, needed to e - Windows is supported via WSL. ## Installation -To use ExecuTorch, you will need to install both the Python package and the appropriate platform-specific runtime libraries. Pip is the recommended way to install the ExecuTorch python package. +To use ExecuTorch, you will need to install both the Python package and the appropriate platform-specific runtime libraries. Pip is the recommended way to install the ExecuTorch python package. This package includes the dependencies needed to export a PyTorch model, as well as Python runtime bindings for model testing and evaluation. Consider installing ExecuTorch within a virtual environment, such as one provided by [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html#creating-environments) or [venv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-and-use-virtual-environments). @@ -72,7 +72,7 @@ Quantization can also be done at this stage to reduce model size and runtime. 
### Testing the Model -After successfully generating a .pte file, it is common to use the Python runtime APIs to validate the model on the development platform. This can be used to evaluate model accuracy before running on-device. +After successfully generating a .pte file, it is common to use the Python runtime APIs to validate the model on the development platform. This can be used to evaluate model accuracy before running on-device. For the MobileNet V2 model from torchvision used in this example, image inputs are expected as a normalized, float32 tensor with dimensions of (batch, channels, height, width). See [torchvision.models.mobilenet_v2](https://pytorch.org/vision/main/models/generated/torchvision.models.mobilenet_v2.html) for more information on the input and output tensor format for this model. @@ -103,20 +103,13 @@ Quick Links: ### Android #### Installation -ExecuTorch provides Java bindings for Android usage, which can be consumed from both Java and Kotlin. -To add the library to your app, download the AAR, and add it to the gradle build rule. +ExecuTorch provides Java bindings for Android usage, which can be consumed from both Java and Kotlin. +To add the library to your app, add the following dependency to your Gradle build file. -``` -mkdir -p app/libs -curl https://ossci-android.s3.amazonaws.com/executorch/release/v0.5.0-rc3/executorch.aar -o app/libs/executorch.aar -``` -And in gradle, ``` # app/build.gradle.kts dependencies { - implementation(files("libs/executorch.aar")) - implementation("com.facebook.soloader:soloader:0.10.5") - implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("org.pytorch:executorch-android:0.5.1") } ``` @@ -137,7 +130,7 @@ EValue[] output = model.forward(input_evalue); float[] scores = output[0].toTensor().getDataAsFloatArray(); -For a full example of running a model on Android, see the [ExecuTorch Android Demo App](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/ClassificationActivity.java). For more information on Android development, including building from source, a full description of the Java APIs, and information on using ExecuTorch from Android native code, see [Using ExecuTorch on Android](using-executorch-android.md). +For a full example of running a model on Android, see the [DeepLabV3AndroidDemo](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo). For more information on Android development, including building from source, a full description of the Java APIs, and information on using ExecuTorch from Android native code, see [Using ExecuTorch on Android](using-executorch-android.md). ### iOS diff --git a/docs/source/index.rst index b27d53f51c7..187a5300c58 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,7 +31,7 @@ Welcome to the ExecuTorch Documentation The ExecuTorch source is hosted on GitHub at -https://github.com/pytorch/executorch. +https://github.com/pytorch/executorch. Join us on `Discord `__ if you have questions about ExecuTorch or would like to become a contributor! @@ -103,7 +103,7 @@ Topics in this section will help you get started with ExecuTorch. :caption: Examples :hidden: - demo-apps-android.md + Building an ExecuTorch Android Demo App demo-apps-ios.md ..
toctree:: diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md index 62d1f3ee75a..99b68008dc6 100644 --- a/docs/source/using-executorch-android.md +++ b/docs/source/using-executorch-android.md @@ -176,7 +176,7 @@ public class MainActivity extends Activity { ``` This example loads an ExecuTorch module, prepares input data, runs inference, and processes the output data. -Please use [ExecuTorchDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/ExecuTorchDemo) +Please use [DeepLabV3AndroidDemo](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) and [LlamaDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) for the code examples using ExecuTorch AAR package. diff --git a/docs/source/using-executorch-ios.md b/docs/source/using-executorch-ios.md index e975cb9ef22..70c2b366fa8 100644 --- a/docs/source/using-executorch-ios.md +++ b/docs/source/using-executorch-ios.md @@ -103,19 +103,18 @@ git clone https://github.com/pytorch/executorch.git --depth 1 --recurse-submodul 3. Set up [Python](https://www.python.org/downloads/macos/) 3.10+ and activate a virtual environment: ```bash -python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip +python3 -m venv .venv && source .venv/bin/activate && ./install_requirements.sh ``` -4. Install the required dependencies, including those needed for the backends like [Core ML](backends-coreml.md) or [MPS](backends-mps.md), if you plan to build them as well: +4. Install the required dependencies, including those needed for the backends like [Core ML](backends-coreml.md) or [MPS](backends-mps.md). Choose one: ```bash -./install_executorch.sh --pybind coreml mps xnnpack +# ExecuTorch with xnnpack and CoreML backend +./install_executorch.sh --pybind xnnpack -# Optional dependencies for Core ML backend. -./backends/apple/coreml/scripts/install_requirements.sh - -# And MPS backend. +# Optional: ExecuTorch with xnnpack, CoreML, and MPS backend ./backends/apple/mps/install_requirements.sh +./install_executorch.sh --pybind xnnpack mps ``` 5. Install [CMake](https://cmake.org): diff --git a/examples/apple/coreml/README.md b/examples/apple/coreml/README.md index f4270956b2c..4dba5031358 100644 --- a/examples/apple/coreml/README.md +++ b/examples/apple/coreml/README.md @@ -18,16 +18,8 @@ We will walk through an example model to generate a Core ML delegated binary fil 1. Following the setup guide in [Setting Up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) you should be able to get the basic development environment for ExecuTorch working. -2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend. -```bash -cd executorch - -./backends/apple/coreml/scripts/install_requirements.sh - -``` - -3. Run the export script to generate a Core ML delegated binary file. +2. Run the export script to generate a Core ML delegated binary file. ```bash cd executorch @@ -39,11 +31,14 @@ python3 -m examples.portable.scripts.export -h python3 -m examples.apple.coreml.scripts.export --model_name add ``` -4. Run the binary file using the `coreml_executor_runner`. +3. Run the binary file using the `coreml_executor_runner`. ```bash cd executorch +# Install requirements needed to run the example runner +./backends/apple/coreml/scripts/install_requirements.sh + # Builds the Core ML executor runner. Generates ./coreml_executor_runner if successful. 
./examples/apple/coreml/scripts/build_executor_runner.sh diff --git a/examples/apple/coreml/scripts/extract_coreml_models.py b/examples/apple/coreml/scripts/extract_coreml_models.py index d2812d37ab0..b3778a22625 100644 --- a/examples/apple/coreml/scripts/extract_coreml_models.py +++ b/examples/apple/coreml/scripts/extract_coreml_models.py @@ -10,12 +10,9 @@ from typing import List, Optional -import executorchcoreml - +from executorch.backends.apple.coreml import executorchcoreml from executorch.backends.apple.coreml.compiler import CoreMLBackend - from executorch.exir._serialize._program import deserialize_pte_binary - from executorch.exir.schema import ( BackendDelegate, BackendDelegateDataReference, diff --git a/examples/demo-apps/android/ExecuTorchDemo/.gitignore b/examples/demo-apps/android/ExecuTorchDemo/.gitignore deleted file mode 100644 index e7bee2e2b1c..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -*.iml -.gradle -/local.properties -.idea -.DS_Store -/build -/captures -.externalNativeBuild -.cxx -local.properties -*.so diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md deleted file mode 100644 index c6ee756458f..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# Building an ExecuTorch Android Demo App - -This is forked from [PyTorch Android Demo App](https://github.com/pytorch/android-demo-app). - -This guide explains how to setup ExecuTorch for Android using a demo app. The app employs a [DeepLab v3](https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/) model for image segmentation tasks. Models are exported to ExecuTorch using [XNNPACK FP32 backend](tutorial-xnnpack-delegate-lowering.md). - -::::{grid} 2 -:::{grid-item-card} What you will learn -:class-card: card-prerequisites -* How to set up a build target for Android arm64-v8a -* How to build the required ExecuTorch runtime with JNI wrapper for Android -* How to build the app with required JNI library and model file -::: - -:::{grid-item-card} Prerequisites -:class-card: card-prerequisites -* Refer to [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. -* Download and install [Android Studio and SDK](https://developer.android.com/studio). -* Supported Host OS: CentOS, macOS Ventura (M1/x86_64). See below for Qualcomm HTP specific requirements. -* *Qualcomm HTP Only[^1]:* To build and run on Qualcomm's AI Engine Direct, please follow [Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend](backends-qualcomm.md) for hardware and software pre-requisites. The version we use for this tutorial is 2.19. The chip we use for this tutorial is SM8450. -::: -:::: - -[^1]: This section applies only if Qualcomm HTP Backend is needed in the app. Same applies to sections with title`Qualcomm Hexagon NPU`. - -```{note} -This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis). -``` - - -## Build - -### Ahead-Of-Time - -We generate the model file for the ExecuTorch runtime in Android Demo App. 
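The `Testing the Model` hunk above recommends validating an exported `.pte` with the Python runtime APIs before running on-device. A hedged sketch of that flow, assuming the pybindings are built into the installed package; the `mv2_xnnpack.pte` filename is an illustrative placeholder, not a file produced by this patch:

```python
import torch

from executorch.extension.pybindings.portable_lib import _load_for_executorch

# Load the exported program with the host-side pybindings runtime.
program = _load_for_executorch("mv2_xnnpack.pte")

# MobileNet V2 expects a normalized float32 tensor with dimensions
# (batch, channels, height, width).
example_input = torch.randn(1, 3, 224, 224)

# forward takes a list of inputs and returns a list of outputs.
outputs = program.forward([example_input])
print(outputs[0].shape)
```

The same load-and-run check applies to any delegated `.pte`, including the XNNPACK- and QNN-delegated models exported in the demo-app tutorial being removed below.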
- -#### XNNPACK Delegation - -For delegating DeepLab v3 to XNNPACK backend, please do the following to export the model: - -```bash -cd executorch # go to executorch root -python3 -m examples.xnnpack.aot_compiler --model_name="dl3" --delegate -``` - -Then push the pte file to Android device: - -```bash -adb push dl3_xnnpack_fp32.pte /data/local/tmp/dl3_xnnpack_fp32.pte -``` - -For more detailed tutorial of lowering to XNNPACK, please see [XNNPACK backend](backends-xnnpack.md). - -#### Qualcomm Hexagon NPU - -For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](backends-qualcomm.md). - -```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s -``` - -Then push the pte file to Android device: - -```bash -adb push deeplab_v3/dlv3_qnn.pte /data/local/tmp/dlv3_qnn.pte -``` - -### Runtime - -We build the required ExecuTorch runtime library (AAR) to run the model. - -#### XNNPACK - -```bash -# go to ExecuTorch repo root -export ANDROID_NDK= -export ANDROID_ABIS=arm64-v8a - -# Run the following lines from the `executorch/` folder -./install_executorch.sh --clean - -# Create a new directory `app/libs` for the AAR to live -pushd examples/demo-apps/android/ExecuTorchDemo -mkdir -p app/libs -popd - -# Build the AAR. It will include XNNPACK backend by default. -export BUILD_AAR_DIR=$(realpath examples/demo-apps/android/ExecuTorchDemo/app/libs) -sh scripts/build_android_library.sh -``` - -#### Qualcomm Hexagon NPU - -```bash -# go to ExecuTorch repo root -export ANDROID_NDK= -export ANDROID_ABIS=arm64-v8a -export QNN_SDK_ROOT= - -# Run the following lines from the `executorch/` folder -./install_executorch.sh --clean - -# Create a new directory `app/libs` for the AAR to live -pushd examples/demo-apps/android/ExecuTorchDemo -mkdir -p app/libs -popd - -# Build the AAR. It will include XNNPACK backend by default. -export BUILD_AAR_DIR=$(realpath examples/demo-apps/android/ExecuTorchDemo/app/libs) -sh scripts/build_android_library.sh -``` - -This is very similar to XNNPACK setup, but users now needs to define `QNN_SDK_ROOT` so that -QNN backend is built into the AAR. - -## Running the App - -1. Open the project `examples/demo-apps/android/ExecuTorchDemo` with Android Studio. - -2. [Run](https://developer.android.com/studio/run) the app (^R). - -Android Studio View
- -On the phone or emulator, you can try running the model: -Android Demo
- -## Takeaways -Through this tutorial we've learnt how to build the ExecuTorch runtime library with XNNPACK (or Qualcomm HTP) backend, and expose it to JNI layer to build the Android app running segmentation model. - -## Reporting Issues - -If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/.gitignore b/examples/demo-apps/android/ExecuTorchDemo/app/.gitignore deleted file mode 100644 index 796b96d1c40..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/build diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts b/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts deleted file mode 100644 index ca06671f328..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/build.gradle.kts +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") -} - -android { - namespace = "com.example.executorchdemo" - compileSdk = 34 - - defaultConfig { - applicationId = "com.example.executorchdemo" - minSdk = 24 - targetSdk = 33 - versionCode = 1 - versionName = "1.0" - - testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" - vectorDrawables { useSupportLibrary = true } - externalNativeBuild { cmake { cppFlags += "" } } - } - - buildTypes { - release { - isMinifyEnabled = false - proguardFiles(getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro") - } - } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - kotlinOptions { jvmTarget = "1.8" } - buildFeatures { compose = true } - composeOptions { kotlinCompilerExtensionVersion = "1.4.3" } - packaging { resources { excludes += "/META-INF/{AL2.0,LGPL2.1}" } } -} - -dependencies { - implementation("androidx.core:core-ktx:1.9.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.1") - implementation("androidx.activity:activity-compose:1.7.0") - implementation(platform("androidx.compose:compose-bom:2023.03.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation("androidx.appcompat:appcompat:1.6.1") - implementation("androidx.camera:camera-core:1.3.0-rc02") - implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") - implementation("com.facebook.soloader:soloader:0.10.5") - implementation("com.facebook.fbjni:fbjni:0.5.1") - implementation(files("libs/executorch.aar")) - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.03.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") -} - -tasks.register("setup") { - doFirst { - exec { - commandLine("sh", "setup.sh") - workingDir("../") - } - } -} diff --git 
a/examples/demo-apps/android/ExecuTorchDemo/app/proguard-rules.pro b/examples/demo-apps/android/ExecuTorchDemo/app/proguard-rules.pro deleted file mode 100644 index 481bb434814..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/proguard-rules.pro +++ /dev/null @@ -1,21 +0,0 @@ -# Add project specific ProGuard rules here. -# You can control the set of applied configuration files using the -# proguardFiles setting in build.gradle. -# -# For more details, see -# http://developer.android.com/guide/developing/tools/proguard.html - -# If your project uses WebView with JS, uncomment the following -# and specify the fully qualified class name to the JavaScript interface -# class: -#-keepclassmembers class fqcn.of.javascript.interface.for.webview { -# public *; -#} - -# Uncomment this to preserve the line number information for -# debugging stack traces. -#-keepattributes SourceFile,LineNumberTable - -# If you keep the line number information, uncomment this to -# hide the original source file name. -#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml deleted file mode 100644 index 8d71b156398..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/AndroidManifest.xml +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK deleted file mode 100644 index 371c991ce88..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK +++ /dev/null @@ -1,65 +0,0 @@ -load("@fbsource//tools/build_defs:manifold.bzl", "manifold_get") -load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") -load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") -load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") - -manifold_get( - name = "dl3_xnnpack_fp32", - out = "dl3_xnnpack_fp32.pte", - api_key = "executorch-key", - artifact_path = "tree/models/benchmarking/executorch/dl3_xnnpack_fp32.pte", - bucket_name = "executorch", - sha1 = "3e7af1d8f5ec4acb6de156d361715e16e5f53783", - timeout_msec = 120000, -) - -fb_android_resource( - name = "app_res", - assets = "assets", - package = "com.example.executorchdemo", - res = "res", -) - -fb_android_resource( - name = "model_res", - assets = {"dl3_xnnpack_fp32.pte": ":dl3_xnnpack_fp32"}, - package = "com.example.executorchdemo", - res = "res", -) - -fb_android_library( - name = "app_lib", - srcs = [ - "java/com/example/executorchdemo/MainActivity.java", - "java/com/example/executorchdemo/TensorImageUtils.java", - ], - autoglob = False, - language = "JAVA", - deps = [ - ":app_res", - "//xplat/executorch/extension/android:executorch", - ], -) - -fb_android_binary( - name = "ExecuTorchDemo", - keystore = "//fbandroid/keystores:debug", - manifest = "AndroidManifest.xml", - manifest_entries = { - "min_sdk_version": 19, # Android supports 19 for minimum - "target_sdk_version": 34, - "version_code": "1", - "version_name": "1.0", - }, - package_type = "release", - skip_proguard = True, - deps = [ - ":app_lib", - ":app_res", - ":model_res", - "//third-party/java/androidx/appcompat/appcompat:appcompat", - "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", - 
"//xplat/executorch/extension/android:executorch", - "//xplat/executorch/extension/android/jni:executorch_jni_full", - ], -) diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/TARGETS b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/TARGETS deleted file mode 100644 index 5c4f482b5ea..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/TARGETS +++ /dev/null @@ -1 +0,0 @@ -# This file needs to exist to avoid build system breakage, see https://fburl.com/workplace/jtdlgdmd diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/corgi.jpeg b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/corgi.jpeg deleted file mode 100644 index b7cdd3bf73d..00000000000 Binary files a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/corgi.jpeg and /dev/null differ diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/deeplab.jpg b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/deeplab.jpg deleted file mode 100644 index e840b670002..00000000000 Binary files a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/deeplab.jpg and /dev/null differ diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/dog.jpg b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/dog.jpg deleted file mode 100644 index e20f0ccbc48..00000000000 Binary files a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/dog.jpg and /dev/null differ diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java deleted file mode 100644 index 9ac800b49a3..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchdemo; - -import android.app.Activity; -import android.content.Context; -import android.graphics.Bitmap; -import android.graphics.BitmapFactory; -import android.os.Bundle; -import android.os.SystemClock; -import android.system.ErrnoException; -import android.system.Os; -import android.util.Log; -import android.view.View; -import android.widget.Button; -import android.widget.ImageView; -import android.widget.ProgressBar; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Objects; -import org.pytorch.executorch.EValue; -import org.pytorch.executorch.Module; -import org.pytorch.executorch.Tensor; - -public class MainActivity extends Activity implements Runnable { - private ImageView mImageView; - private Button mButtonXnnpack; - private Button mButtonHtp; - private ProgressBar mProgressBar; - private Bitmap mBitmap = null; - private Module mModule = null; - private String mImagename = "corgi.jpeg"; - - // see http://host.robots.ox.ac.uk:8080/pascal/VOC/voc2007/segexamples/index.html for the list of - // classes with indexes - private static final int CLASSNUM = 21; - private static final int DOG = 12; - private static final int PERSON = 15; - private static final int SHEEP = 17; - - public static String assetFilePath(Context context, String assetName) throws IOException { - File file = new File(context.getFilesDir(), assetName); - if (file.exists() && file.length() > 0) { - return file.getAbsolutePath(); - } - - try (InputStream is = context.getAssets().open(assetName)) { - try (OutputStream os = new FileOutputStream(file)) { - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - return file.getAbsolutePath(); - } - } - - private void populateImage() { - try { - mBitmap = BitmapFactory.decodeStream(getAssets().open(mImagename)); - mBitmap = Bitmap.createScaledBitmap(mBitmap, 224, 224, true); - mImageView.setImageBitmap(mBitmap); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error reading assets", e); - finish(); - } - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_main); - - try { - Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - } catch (ErrnoException e) { - Log.e("ExecuTorchDemo", "Cannot set ADSP_LIBRARY_PATH", e); - finish(); - } - - try { - mBitmap = BitmapFactory.decodeStream(getAssets().open(mImagename), null, null); - mBitmap = Bitmap.createScaledBitmap(mBitmap, 224, 224, true); - } catch (IOException e) { - Log.e("ImageSegmentation", "Error reading assets", e); - finish(); - } - - mModule = Module.load("/data/local/tmp/dl3_xnnpack_fp32.pte"); - - mImageView = findViewById(R.id.imageView); - mImageView.setImageBitmap(mBitmap); - - final Button buttonNext = findViewById(R.id.nextButton); - buttonNext.setOnClickListener( - new View.OnClickListener() { - public void onClick(View v) { - if (Objects.equals(mImagename, "corgi.jpeg")) { - mImagename = "dog.jpg"; - } else if (Objects.equals(mImagename, "dog.jpg")) { - mImagename = "deeplab.jpg"; - } else { - mImagename = "corgi.jpeg"; - } - populateImage(); - } - }); - - mButtonXnnpack = findViewById(R.id.xnnpackButton); - mButtonHtp = findViewById(R.id.htpButton); - mProgressBar = (ProgressBar) findViewById(R.id.progressBar); - mButtonXnnpack.setOnClickListener( - new 
View.OnClickListener() { - public void onClick(View v) { - mModule.destroy(); - mModule = Module.load("/data/local/tmp/dl3_xnnpack_fp32.pte"); - mButtonXnnpack.setEnabled(false); - mProgressBar.setVisibility(ProgressBar.VISIBLE); - mButtonXnnpack.setText(getString(R.string.run_model)); - - Thread thread = new Thread(MainActivity.this); - thread.start(); - } - }); - - mButtonHtp.setOnClickListener( - new View.OnClickListener() { - public void onClick(View v) { - mModule.destroy(); - mModule = Module.load("/data/local/tmp/dlv3_qnn.pte"); - mButtonHtp.setEnabled(false); - mProgressBar.setVisibility(ProgressBar.VISIBLE); - mButtonHtp.setText(getString(R.string.run_model)); - - Thread thread = new Thread(MainActivity.this); - thread.start(); - } - }); - - final Button resetImage = findViewById(R.id.resetImage); - resetImage.setOnClickListener( - new View.OnClickListener() { - public void onClick(View v) { - populateImage(); - } - }); - } - - @Override - public void run() { - final Tensor inputTensor = - TensorImageUtils.bitmapToFloat32Tensor( - mBitmap, - TensorImageUtils.TORCHVISION_NORM_MEAN_RGB, - TensorImageUtils.TORCHVISION_NORM_STD_RGB); - final float[] inputs = inputTensor.getDataAsFloatArray(); - - final long startTime = SystemClock.elapsedRealtime(); - Tensor outputTensor = mModule.forward(EValue.from(inputTensor))[0].toTensor(); - final long inferenceTime = SystemClock.elapsedRealtime() - startTime; - Log.d("ImageSegmentation", "inference time (ms): " + inferenceTime); - - final float[] scores = outputTensor.getDataAsFloatArray(); - int width = mBitmap.getWidth(); - int height = mBitmap.getHeight(); - - int[] intValues = new int[width * height]; - for (int j = 0; j < height; j++) { - for (int k = 0; k < width; k++) { - int maxi = 0, maxj = 0, maxk = 0; - double maxnum = -Double.MAX_VALUE; - for (int i = 0; i < CLASSNUM; i++) { - float score = scores[i * (width * height) + j * width + k]; - if (score > maxnum) { - maxnum = score; - maxi = i; - maxj = j; - maxk = k; - } - } - if (maxi == PERSON) intValues[maxj * width + maxk] = 0xFFFF0000; // R - else if (maxi == DOG) intValues[maxj * width + maxk] = 0xFF00FF00; // G - else if (maxi == SHEEP) intValues[maxj * width + maxk] = 0xFF0000FF; // B - else intValues[maxj * width + maxk] = 0xFF000000; - } - } - - Bitmap bmpSegmentation = Bitmap.createScaledBitmap(mBitmap, width, height, true); - Bitmap outputBitmap = bmpSegmentation.copy(bmpSegmentation.getConfig(), true); - outputBitmap.setPixels( - intValues, - 0, - outputBitmap.getWidth(), - 0, - 0, - outputBitmap.getWidth(), - outputBitmap.getHeight()); - final Bitmap transferredBitmap = - Bitmap.createScaledBitmap(outputBitmap, mBitmap.getWidth(), mBitmap.getHeight(), true); - - runOnUiThread( - new Runnable() { - @Override - public void run() { - mImageView.setImageBitmap(transferredBitmap); - mButtonXnnpack.setEnabled(true); - mButtonXnnpack.setText(R.string.run_xnnpack); - mButtonHtp.setEnabled(true); - mButtonHtp.setText(R.string.run_htp); - mProgressBar.setVisibility(ProgressBar.INVISIBLE); - } - }); - } -} diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/TensorImageUtils.java b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/TensorImageUtils.java deleted file mode 100644 index a5c7699df9f..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/TensorImageUtils.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. 
and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchdemo; - -import android.graphics.Bitmap; -import android.util.Log; -import java.nio.FloatBuffer; -import org.pytorch.executorch.Tensor; - -/** - * Contains utility functions for {@link Tensor} creation from {@link android.graphics.Bitmap} or - * {@link android.media.Image} source. - */ -public final class TensorImageUtils { - - public static float[] TORCHVISION_NORM_MEAN_RGB = new float[] {0.485f, 0.456f, 0.406f}; - public static float[] TORCHVISION_NORM_STD_RGB = new float[] {0.229f, 0.224f, 0.225f}; - - /** - * Creates new {@link Tensor} from full {@link android.graphics.Bitmap}, normalized with specified - * in parameters mean and std. - * - * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order - * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB - * order - */ - public static Tensor bitmapToFloat32Tensor( - final Bitmap bitmap, final float[] normMeanRGB, final float normStdRGB[]) { - checkNormMeanArg(normMeanRGB); - checkNormStdArg(normStdRGB); - - return bitmapToFloat32Tensor( - bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), normMeanRGB, normStdRGB); - } - - /** - * Writes tensor content from specified {@link android.graphics.Bitmap}, normalized with specified - * in parameters mean and std to specified {@link java.nio.FloatBuffer} with specified offset. - * - * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data - * @param x - x coordinate of top left corner of bitmap's area - * @param y - y coordinate of top left corner of bitmap's area - * @param width - width of bitmap's area - * @param height - height of bitmap's area - * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order - * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB - * order - */ - public static void bitmapToFloatBuffer( - final Bitmap bitmap, - final int x, - final int y, - final int width, - final int height, - final float[] normMeanRGB, - final float[] normStdRGB, - final FloatBuffer outBuffer, - final int outBufferOffset) { - checkOutBufferCapacity(outBuffer, outBufferOffset, width, height); - checkNormMeanArg(normMeanRGB); - checkNormStdArg(normStdRGB); - final int pixelsCount = height * width; - final int[] pixels = new int[pixelsCount]; - bitmap.getPixels(pixels, 0, width, x, y, width, height); - final int offset_g = pixelsCount; - final int offset_b = 2 * pixelsCount; - for (int i = 0; i < 100; i++) { - final int c = pixels[i]; - Log.i("Image", ": " + i + " " + ((c >> 16) & 0xff)); - } - for (int i = 0; i < pixelsCount; i++) { - final int c = pixels[i]; - float r = ((c >> 16) & 0xff) / 255.0f; - float g = ((c >> 8) & 0xff) / 255.0f; - float b = ((c) & 0xff) / 255.0f; - outBuffer.put(outBufferOffset + i, (r - normMeanRGB[0]) / normStdRGB[0]); - outBuffer.put(outBufferOffset + offset_g + i, (g - normMeanRGB[1]) / normStdRGB[1]); - outBuffer.put(outBufferOffset + offset_b + i, (b - normMeanRGB[2]) / normStdRGB[2]); - } - } - - /** - * Creates new {@link Tensor} from specified area of {@link android.graphics.Bitmap}, normalized - * with specified in parameters mean and std. 
- * - * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data - * @param x - x coordinate of top left corner of bitmap's area - * @param y - y coordinate of top left corner of bitmap's area - * @param width - width of bitmap's area - * @param height - height of bitmap's area - * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order - * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB - * order - */ - public static Tensor bitmapToFloat32Tensor( - final Bitmap bitmap, - int x, - int y, - int width, - int height, - float[] normMeanRGB, - float[] normStdRGB) { - checkNormMeanArg(normMeanRGB); - checkNormStdArg(normStdRGB); - - final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * width * height); - bitmapToFloatBuffer(bitmap, x, y, width, height, normMeanRGB, normStdRGB, floatBuffer, 0); - return Tensor.fromBlob(floatBuffer, new long[] {1, 3, height, width}); - } - - private static void checkOutBufferCapacity( - FloatBuffer outBuffer, int outBufferOffset, int tensorWidth, int tensorHeight) { - if (outBufferOffset + 3 * tensorWidth * tensorHeight > outBuffer.capacity()) { - throw new IllegalStateException("Buffer underflow"); - } - } - - private static void checkTensorSize(int tensorWidth, int tensorHeight) { - if (tensorHeight <= 0 || tensorWidth <= 0) { - throw new IllegalArgumentException("tensorHeight and tensorWidth must be positive"); - } - } - - private static void checkRotateCWDegrees(int rotateCWDegrees) { - if (rotateCWDegrees != 0 - && rotateCWDegrees != 90 - && rotateCWDegrees != 180 - && rotateCWDegrees != 270) { - throw new IllegalArgumentException("rotateCWDegrees must be one of 0, 90, 180, 270"); - } - } - - private static void checkNormStdArg(float[] normStdRGB) { - if (normStdRGB.length != 3) { - throw new IllegalArgumentException("normStdRGB length must be 3"); - } - } - - private static void checkNormMeanArg(float[] normMeanRGB) { - if (normMeanRGB.length != 3) { - throw new IllegalArgumentException("normMeanRGB length must be 3"); - } - } -} diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_background.xml b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_background.xml deleted file mode 100644 index 07d5da9cbf1..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_background.xml +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_foreground.xml deleted file mode 100644 index 7706ab9e6d4..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/drawable/ic_launcher_foreground.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/layout/activity_classification.xml b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/layout/activity_classification.xml deleted file mode 100644 index d896adb54d4..00000000000 --- a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/res/layout/activity_classification.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - - - - - -
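The deleted `TensorImageUtils.java` above packed an RGB bitmap into a float32 `(1, 3, height, width)` tensor by scaling each channel to `[0, 1]` and applying the torchvision mean/std. A rough Python equivalent of that normalization, for reference; the helper name and the random input are illustrative:

```python
import torch

# Constants mirrored from the deleted TensorImageUtils.java.
TORCHVISION_NORM_MEAN_RGB = torch.tensor([0.485, 0.456, 0.406])
TORCHVISION_NORM_STD_RGB = torch.tensor([0.229, 0.224, 0.225])


def rgb_to_normalized_tensor(pixels: torch.Tensor) -> torch.Tensor:
    """pixels: an (H, W, 3) uint8 image in RGB order."""
    # HWC uint8 -> CHW float in [0, 1], matching bitmapToFloatBuffer's
    # planar layout (all R values, then all G, then all B).
    chw = pixels.permute(2, 0, 1).float() / 255.0
    mean = TORCHVISION_NORM_MEAN_RGB.view(3, 1, 1)
    std = TORCHVISION_NORM_STD_RGB.view(3, 1, 1)
    return ((chw - mean) / std).unsqueeze(0)  # add batch dim -> (1, 3, H, W)


image = torch.randint(0, 256, (224, 224, 3), dtype=torch.uint8)
print(rgb_to_normalized_tensor(image).shape)  # torch.Size([1, 3, 224, 224])
```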