From 97bcbe7177dc255c013fc5d95f21dd909ab6da45 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <vivektrivedi@meta.com>
Date: Tue, 2 Dec 2025 06:34:17 -0800
Subject: [PATCH] Using buffer for weight tensors for quantized mat mul op.
 (#15990)

Summary:

This change affects the performance and memory usage of the quantized matrix multiplication operation in the Executorch Vulkan backend. By using a buffer for weight tensors, the operation may become more efficient and use less memory, especially for large matrices.

Reviewed By: yipjustin

Differential Revision: D87911255
---
 backends/vulkan/runtime/graph/ops/impl/Staging.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 40de9b59e81..db7c5a7e88b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -285,7 +285,7 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
   const int64_t N = qmat2_orig_sizes.at(ndim - 2);
   const int64_t N_div2 = N / int64_t(2);
 
-  utils::StorageType storage_type = utils::kTexture2D;
+  utils::StorageType storage_type = utils::kBuffer;
   uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
   if (N_div2 > max_extent * 4 || K > max_extent) {
     storage_type = utils::kBuffer;