
Commit 5d2cd16

ssjia (SS-JIA) authored and committed
[ET-VK] Introduce BufferMetadata GLSL struct to abstract tensor layout
Pull Request resolved: #13595

As title; introduce a consolidated metadata UBO for buffer storage that can be used to abstract tensor indexing operations for buffer-backed tensors. This new metadata UBO can represent tensors of up to 8 dimensions. The upper limit is hardcoded, but it can be increased later without updating callsites, since everything is abstracted by the BufferMetadata struct.

Update the following ops to use the new metadata UBO:
* staging shaders (nchw_to_buffer and buffer_to_nchw)
* binary op

@imported-using-ghimport

Differential Revision: [D80800082](https://our.internmc.facebook.com/intern/diff/D80800082/)

ghstack-source-id: 305143836
1 parent 4d095f9 commit 5d2cd16
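For orientation, here is a rough sketch of the kind of shader-side definitions the updated shaders rely on via `#include "indexing.glslh"` (`BufferMetadata`, `TensorIndex`, and helpers such as `numel`, `linear_idx_to_tensor_idx`, `tensor_idx_to_linear_idx`, `clamp_tensor_idx`, and `are_equal`). The actual header is not among the files shown below; the `uvec4` packing, padded-dimension conventions, and function bodies here are assumptions, not the committed implementation.

```glsl
// Hypothetical sketch of shader-side structs/helpers; the real definitions
// live in indexing.glslh, which is not shown on this commit page.

#define MAX_TENSOR_DIMS 8

struct BufferMetadata {
  // 8 uint values packed as two uvec4s to keep a 16-byte array stride.
  uvec4 sizes[2];
  uvec4 dim_order[2];
  uvec4 strides[2];
  uint ndim;
  uint numel;
};

// Per-dimension tensor index for up to 8 dims.
struct TensorIndex {
  uvec4 data[2];
};

uint numel(const BufferMetadata meta) {
  return meta.numel;
}

uint size_at(const BufferMetadata meta, const uint dim) {
  return meta.sizes[dim / 4][dim % 4];
}

uint stride_at(const BufferMetadata meta, const uint dim) {
  return meta.strides[dim / 4][dim % 4];
}

// True if two tensors have identical sizes, i.e. no broadcasting is needed.
bool are_equal(const BufferMetadata a, const BufferMetadata b) {
  return a.sizes[0] == b.sizes[0] && a.sizes[1] == b.sizes[1];
}

// Recover a per-dimension index from a flat buffer index by dividing by the
// strides from the largest-stride dim to the smallest (given by dim_order).
// Assumes padded dims have size 1 and a stride >= numel so they resolve to 0.
void linear_idx_to_tensor_idx(
    const BufferMetadata meta, const uint bufi, out TensorIndex tidx) {
  uint remaining = bufi;
  for (uint i = 0; i < MAX_TENSOR_DIMS; ++i) {
    const uint dim = meta.dim_order[i / 4][i % 4];
    const uint stride = stride_at(meta, dim);
    tidx.data[dim / 4][dim % 4] = remaining / stride;
    remaining = remaining % stride;
  }
}

// Collapse a per-dimension index back to a flat buffer index via strides.
uint tensor_idx_to_linear_idx(
    const BufferMetadata meta, const TensorIndex tidx) {
  uint bufi = 0;
  for (uint dim = 0; dim < MAX_TENSOR_DIMS; ++dim) {
    bufi += tidx.data[dim / 4][dim % 4] * stride_at(meta, dim);
  }
  return bufi;
}

// Clamp each coordinate to [0, size - 1]; used to broadcast size-1 dims.
void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) {
  for (uint dim = 0; dim < MAX_TENSOR_DIMS; ++dim) {
    tidx.data[dim / 4][dim % 4] =
        min(tidx.data[dim / 4][dim % 4], size_at(meta, dim) - 1);
  }
}
```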

File tree

13 files changed: +374, -102 lines

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 50 additions & 0 deletions
@@ -567,6 +567,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
           context,
@@ -611,6 +612,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -634,6 +636,7 @@ vTensor::vTensor(vTensor& other)
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
@@ -659,6 +662,7 @@ vTensor::vTensor(
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -711,6 +715,38 @@ uint32_t vTensor::UniformData::write_attribute(
   return 0;
 }
 
+vTensor::BufferMetadata::BufferMetadata(
+    std::vector<int64_t>& src_sizes,
+    std::vector<int64_t>& src_dim_order,
+    std::vector<int64_t>& src_strides,
+    size_t src_numel) {
+  update(src_sizes, src_dim_order, src_strides, src_numel);
+}
+
+void vTensor::BufferMetadata::update(
+    std::vector<int64_t>& src_sizes,
+    std::vector<int64_t>& src_dim_order,
+    std::vector<int64_t>& src_strides,
+    size_t src_numel) {
+  int32_t fixed_ndim = utils::safe_downcast<int32_t>(kTensorDimLimit);
+
+  std::vector<uint32_t> fu_sizes = flip_and_unsqueeze<uint32_t>(
+      src_sizes, kTensorSizes, src_numel, fixed_ndim);
+  std::vector<uint32_t> fu_dim_order = flip_and_unsqueeze<uint32_t>(
+      src_dim_order, kTensorDimOrder, src_numel, fixed_ndim);
+  std::vector<uint32_t> fu_strides = flip_and_unsqueeze<uint32_t>(
+      src_strides, kTensorStrides, src_numel, fixed_ndim);
+
+  for (int i = 0; i < fixed_ndim; ++i) {
+    sizes[i] = fu_sizes.at(i);
+    dim_order[i] = fu_dim_order.at(i);
+    strides[i] = fu_strides.at(i);
+  }
+
+  ndim = utils::safe_downcast<uint32_t>(src_sizes.size());
+  numel = utils::safe_downcast<uint32_t>(src_numel);
+}
+
 vkapi::VulkanImage& vTensor::image(
     vkapi::PipelineBarrier& pipeline_barrier,
     const vkapi::PipelineStageFlags stage) & {
@@ -799,6 +835,15 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() {
   return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
 }
 
+const vkapi::BufferBindInfo vTensor::buffer_meta_ubo() {
+  size_t ubo_nbytes = sizeof(BufferMetadata);
+  if (!buffer_meta_.buffer()) {
+    BufferMetadata data(sizes_, dim_order_, strides_, numel_);
+    buffer_meta_ = ParamsBuffer(storage_->context_, data);
+  }
+  return vkapi::BufferBindInfo(buffer_meta_.buffer(), 0, ubo_nbytes);
+}
+
 VkMemoryRequirements vTensor::get_memory_requirements() const {
   switch (storage_type()) {
     case utils::kBuffer:
@@ -875,6 +920,11 @@ void vTensor::update_metadata() {
     uniforms_.update(
         uniform_data_->logical_limits.limits, logical_limits_uniform_offset_);
   }
+
+  if (buffer_meta_.buffer()) {
+    BufferMetadata data(sizes_, dim_order_, strides_, numel_);
+    buffer_meta_.update(data);
+  }
 }
 
 void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 29 additions & 0 deletions
@@ -19,6 +19,8 @@
 namespace vkcompute {
 namespace api {
 
+static constexpr size_t kTensorDimLimit = 8;
+
 /*
  * Given a GPUMemoryLayout value, produce a dim order vector that matches the
  * given memory layout. The produced dim order vector will be in the NCHW
@@ -262,6 +264,26 @@
         const Attribute attr);
   };
 
+  struct BufferMetadata {
+    uint32_t sizes[kTensorDimLimit];
+    uint32_t dim_order[kTensorDimLimit];
+    uint32_t strides[kTensorDimLimit];
+    uint32_t ndim;
+    uint32_t numel;
+
+    BufferMetadata(
+        std::vector<int64_t>& sizes,
+        std::vector<int64_t>& dim_order,
+        std::vector<int64_t>& strides,
+        size_t numel);
+
+    void update(
+        std::vector<int64_t>& sizes,
+        std::vector<int64_t>& dim_order,
+        std::vector<int64_t>& strides,
+        size_t numel);
+  };
+
  private:
   /*
    * "Core" tensor metadata. They are the minimum amount of information required
@@ -332,6 +354,11 @@
    */
   ParamsBuffer uniforms_;
 
+  /*
+   * Used to store data for BufferMetadata to pass to shaders as buffer_meta_ubo
+   */
+  ParamsBuffer buffer_meta_;
+
   uint32_t uniforms_size_ = 0u;
   uint32_t sizes_uniform_offset_ = kUniformOffsetUnset;
   uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset;
@@ -557,6 +584,8 @@ class vTensor final {
 
   const vkapi::BufferBindInfo numel_ubo();
 
+  const vkapi::BufferBindInfo buffer_meta_ubo();
+
  public:
   inline size_t staging_buffer_numel() const {
     return storage_->buffer_len();

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 8 additions & 0 deletions
@@ -357,6 +357,10 @@
     return values_.at(idx).toConstTensor().has_buffer_storage();
   }
 
+  inline bool is_texture_storage(const ValueRef idx) const {
+    return !is_buffer_storage(idx);
+  }
+
   /*
    * Checks that the following is true:
    * 1. The value at `idx` is a tensor
@@ -411,6 +415,10 @@
     return values_.at(idx).toTensor().sizes_ubo();
   }
 
+  inline vkapi::BufferBindInfo buffer_meta_ubo(const ValueRef idx) {
+    return values_.at(idx).toTensor().buffer_meta_ubo();
+  }
+
   inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) {
     return values_.at(idx).toTensor().strides_ubo();
   }
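As a usage note (not code from this commit): dispatch code that assembles param UBO lists for buffer-backed tensors can now bind one consolidated `buffer_meta_ubo` per tensor instead of separate sizes/strides/numel entries. A minimal sketch, assuming a hypothetical helper function; the real binary-op node wiring lives elsewhere in the op library:

```cpp
// Hypothetical helper, for illustration only. It shows how the new
// ComputeGraph::buffer_meta_ubo accessor could be used to assemble the
// parameter UBOs expected by the updated binary_op.glsl (outp, inp, other).
std::vector<vkapi::BufferBindInfo> binary_op_param_ubos(
    ComputeGraph& graph,
    const ValueRef out,
    const ValueRef in,
    const ValueRef other) {
  std::vector<vkapi::BufferBindInfo> param_ubos;
  if (graph.is_buffer_storage(out)) {
    // One BufferMetadata UBO per buffer-backed tensor replaces the previous
    // per-field sizes/strides/numel push constants.
    param_ubos.push_back(graph.buffer_meta_ubo(out));
    param_ubos.push_back(graph.buffer_meta_ubo(in));
    param_ubos.push_back(graph.buffer_meta_ubo(other));
  }
  return param_ubos;
}
```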

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 20 additions & 15 deletions
@@ -34,6 +34,8 @@ $if IS_COMPARISON_OP:
 
 layout(std430) buffer;
 
+#include "indexing.glslh"
+
 $if IS_COMPARISON_OP:
   ${layout_declare_tensor(B, "w", "t_out", "uint8", STORAGE)}
 $else:
@@ -43,13 +45,11 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
 
 $if STORAGE == "buffer":
+  ${layout_declare_ubo(B, "BufferMetadata", "outp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "inp")}
+  ${layout_declare_ubo(B, "BufferMetadata", "other")}
+
   layout(push_constant) uniform restrict Block {
-    ivec4 in_sizes;
-    ivec4 other_sizes;
-    ivec4 out_strides;
-    ivec4 in_strides;
-    ivec4 other_strides;
-    int out_numel;
     float alpha;
   };
 $else:
@@ -83,25 +83,30 @@ $else:
 #ifdef USING_BUFFER
 
 void main() {
-  const int out_bufi = ivec3(gl_GlobalInvocationID).x;
-  if (out_bufi >= out_numel) {
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_bufi >= numel(outp)) {
     return;
   }
 
   // Simple case; no broadcasting
-  if (in_sizes == other_sizes) {
+  if (are_equal(inp, other)) {
     t_out[out_bufi] = T(op(t_in[out_bufi], t_other[out_bufi], T(alpha)));
     return;
   }
 
-  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
-  const ivec4 in_tidx = min(out_tidx, in_sizes - 1);
-  const ivec4 other_tidx = min(out_tidx, other_sizes - 1);
+  TensorIndex outp_tidx;
+  linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx);
+
+  TensorIndex inp_tidx = outp_tidx;
+  clamp_tensor_idx(inp, inp_tidx);
+
+  TensorIndex other_tidx = outp_tidx;
+  clamp_tensor_idx(other, other_tidx);
 
-  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
-  const int other_bufi = tidx_to_bufi(other_tidx, other_strides);
+  uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx);
+  uint other_bufi = tensor_idx_to_linear_idx(other, other_tidx);
 
-  t_out[out_bufi] = T(op(t_in[in_bufi], t_other[other_bufi], T(alpha)));
+  t_out[out_bufi] = T(op(t_in[inp_bufi], t_other[other_bufi], T(alpha)));
 }
 
 #else // USING_TEXTURE

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl

Lines changed: 13 additions & 20 deletions
@@ -4,40 +4,33 @@
 
 #define T ${buffer_scalar_type(DTYPE)}
 
-#include "indexing_utils.h"
-
 ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "nchw_buf", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
 
-$if USE_PUSH_CONST:
-  layout(push_constant) uniform restrict Block {
-    ivec4 in_sizes;
-    ivec4 in_strides;
-    int numel;
-  };
-$else:
-  ${layout_declare_ubo(2, "ivec4", "in_sizes")}
-  ${layout_declare_ubo(3, "ivec4", "in_strides")}
-  ${layout_declare_ubo(4, "int", "numel")}
+${layout_declare_ubo(B, "BufferMetadata", "inp")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 // This constant is unused in this shader but is kept so that the signature is
 // consistent with image_to_nchw.
-layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM;
+${layout_declare_spec_const(C, "int", "unused", "0")}
 
 void main() {
-  int nchwi = int(gl_GlobalInvocationID.x);
-  if (nchwi >= numel) {
+  uint inp_bufi = gl_GlobalInvocationID.x;
+  if (inp_bufi >= numel(inp)) {
     return;
   }
 
-  ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes);
-  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
+  TensorIndex inp_tidx;
+  linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx);
+
+  uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx);
 
-  nchw_buf[nchwi] = t_in[in_bufi];
+  nchw_buf[nchwi] = t_inp[inp_bufi];
 }

backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml

Lines changed: 0 additions & 2 deletions
@@ -19,5 +19,3 @@ buffer_to_nchw:
       - VALUE: int32
   shader_variants:
     - NAME: buffer_to_nchw
-    - NAME: buffer_to_nchw_no_pc
-      USE_PUSH_CONST: False
