csrc/apis/attention.hpp (4 changes: 2 additions & 2 deletions)

@@ -219,13 +219,13 @@ static torch::Tensor fp8_paged_mqa_logits(const torch::Tensor& q,
         fused_kv_cache.data_ptr(),
         {num_kv_blocks, block_kv, head_dim},
         {kv_cache_stride_bytes, head_dim, 1},
-        torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(fused_kv_cache.device())
+        torch::TensorOptions().dtype(torch::kFloat8_e4m3fn)
     );
     const auto& kv_cache_scales = torch::from_blob(
         fused_kv_cache.data_ptr<uint8_t>() + block_kv * head_dim,
         {num_kv_blocks, block_kv},
         {kv_cache_stride_bytes / static_cast<int>(sizeof(float)), 1},
-        torch::TensorOptions().dtype(torch::kFloat32).device(fused_kv_cache.device())
+        torch::TensorOptions().dtype(torch::kFloat32)
     );

     // Allocate output
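
For context, the two from_blob calls above alias a single fused KV-cache buffer: each paged block stores block_kv * head_dim FP8 values followed by block_kv FP32 scales, and kv_cache_stride_bytes is the byte distance between consecutive blocks. A minimal CPU-side sketch of that aliasing pattern, using hypothetical sizes rather than the kernel's real layout constants (requires PyTorch >= 2.1 for the FP8 dtype):

#include <torch/torch.h>
#include <cstdint>
#include <iostream>

int main() {
    // Hypothetical layout: per paged block, block_kv * head_dim one-byte FP8 values,
    // then block_kv four-byte FP32 scales; no extra padding in this sketch.
    const int64_t num_kv_blocks = 4, block_kv = 64, head_dim = 128;
    const int64_t kv_cache_stride_bytes = block_kv * head_dim + block_kv * static_cast<int64_t>(sizeof(float));

    // Backing storage for the fused cache (must stay alive while the views exist).
    const auto fused_kv_cache = torch::zeros({num_kv_blocks, kv_cache_stride_bytes}, torch::kUInt8);

    // FP8 value view: strides are in elements, and FP8 elements are one byte wide,
    // so the per-block stride equals the byte stride.
    const auto kv_cache = torch::from_blob(
        fused_kv_cache.data_ptr(),
        {num_kv_blocks, block_kv, head_dim},
        {kv_cache_stride_bytes, head_dim, 1},
        torch::TensorOptions().dtype(torch::kFloat8_e4m3fn));

    // FP32 scale view: skip the value region, then stride in float elements.
    const auto kv_cache_scales = torch::from_blob(
        fused_kv_cache.data_ptr<uint8_t>() + block_kv * head_dim,
        {num_kv_blocks, block_kv},
        {kv_cache_stride_bytes / static_cast<int64_t>(sizeof(float)), 1},
        torch::TensorOptions().dtype(torch::kFloat32));

    std::cout << kv_cache.sizes() << " " << kv_cache_scales.sizes() << std::endl;
    return 0;
}

Both views share storage with fused_kv_cache, so no data is copied; they only reinterpret the same bytes with different dtypes and strides.
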
csrc/apis/layout.hpp (2 changes: 1 addition & 1 deletion)

@@ -35,7 +35,7 @@ static torch::Tensor transform_sf_into_required_layout(const torch::Tensor& sf,
     // (FP32, 128, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major
     if (sf.scalar_type() == torch::kFloat and gran_mn == 128 and gran_k == 128 and arch_major == 10) {
         DG_HOST_ASSERT(not disable_ue8m0_cast);
-        const auto& broadcasted = sf.index_select(-2, torch::arange(mn, at::TensorOptions().dtype(torch::kInt).device(sf.device())).floor_divide_(128));
+        const auto& broadcasted = sf.index_select(-2, torch::arange(mn, at::TensorOptions().device(sf.device())).floor_divide_(128));
         return get_mn_major_tma_aligned_packed_ue8m0_tensor(broadcasted);
     }
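
The index_select line above expands per-128-row scale factors to per-row granularity: output row i reads SF row floor(i / 128). A standalone sketch of that broadcast with hypothetical shapes (the UE8M0 packing helper get_mn_major_tma_aligned_packed_ue8m0_tensor is not reproduced here):

#include <torch/torch.h>
#include <iostream>

int main() {
    // Hypothetical scale-factor tensor: one FP32 scale per (128 x 128) tile.
    const int64_t mn = 300, k = 512;
    const auto sf = torch::rand({(mn + 127) / 128, k / 128});  // shape [3, 4]

    // arange over output rows, floor-divided by 128, picks the owning tile row for each row.
    // Without an explicit dtype, arange over an integral bound yields an integer index tensor.
    const auto indices = torch::arange(mn, at::TensorOptions().device(sf.device())).floor_divide_(128);
    const auto broadcasted = sf.index_select(-2, indices);

    std::cout << broadcasted.sizes() << std::endl;  // [300, 4]
    return 0;
}
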
csrc/jit/device_runtime.hpp (3 changes: 2 additions & 1 deletion)

@@ -3,6 +3,7 @@
 #define PADDLE_WITH_CUDA // make sure gpuStream_t declaration

 #include <cublasLt.h>
+#include <torch/version.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAStream.h>

@@ -19,7 +20,7 @@ class DeviceRuntime {
     static constexpr size_t kCublasLtWorkspaceSize = 32 * 1024 * 1024;

 public:
-#if false
+#if TORCH_VERSION_MAJOR > 2 or (TORCH_VERSION_MAJOR == 2 and TORCH_VERSION_MINOR >= 3)
     // For PyTorch 2.3+, share the PyTorch cuBLASLt handle
     DeviceRuntime() = default;
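
The #if false placeholder becomes a real feature test: the defaulted constructor, which relies on sharing PyTorch's cuBLASLt handle, is compiled only when the detected PyTorch version is 2.3 or newer, and older builds fall back to managing a handle of their own. A compile-time sketch of that gating pattern; the fallback branch below is illustrative, not the repository's exact code:

#include <torch/version.h>
#include <cublasLt.h>

class DeviceRuntimeSketch {
public:
#if TORCH_VERSION_MAJOR > 2 or (TORCH_VERSION_MAJOR == 2 and TORCH_VERSION_MINOR >= 3)
    // PyTorch 2.3+: nothing to own here, the framework-managed handle is reused.
    DeviceRuntimeSketch() = default;
    ~DeviceRuntimeSketch() = default;
#else
    // Older PyTorch: create and destroy a private cuBLASLt handle.
    DeviceRuntimeSketch() { cublasLtCreate(&handle_); }
    ~DeviceRuntimeSketch() { cublasLtDestroy(handle_); }

private:
    cublasLtHandle_t handle_ = nullptr;
#endif
};
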
csrc/utils/compatibility.hpp (5 changes: 3 additions & 2 deletions)

@@ -1,9 +1,10 @@
 #pragma once

+#include <torch/version.h>
 #include <cuda.h>

 // `torch::kFloat8_e4m3fn` is supported since PyTorch 2.1
-#define DG_FP8_COMPATIBLE true
+#define DG_FP8_COMPATIBLE (TORCH_VERSION_MAJOR > 2 or (TORCH_VERSION_MAJOR == 2 and TORCH_VERSION_MINOR >= 1))

 // `cuTensorMapEncodeTiled` is supported since CUDA Driver API 12.1
-#define DG_TENSORMAP_COMPATIBLE true
+#define DG_TENSORMAP_COMPATIBLE (CUDA_VERSION >= 12010)
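
With this change both compatibility flags are genuine feature tests instead of hardcoded true: DG_FP8_COMPATIBLE checks the PyTorch version for the FP8 dtype, and DG_TENSORMAP_COMPATIBLE checks the CUDA toolkit version for cuTensorMapEncodeTiled. How the rest of the code base consumes these macros is repository-specific; a generic consumption pattern, using a hypothetical make_fp8_buffer helper, could look like this:

#include <torch/torch.h>
#include <torch/version.h>

// Same version test as in the patch (copied here so the snippet is self-contained).
#define DG_FP8_COMPATIBLE (TORCH_VERSION_MAJOR > 2 or (TORCH_VERSION_MAJOR == 2 and TORCH_VERSION_MINOR >= 1))

// Compile the FP8 path only when the dtype exists; otherwise fail with a clear message
// instead of referencing torch::kFloat8_e4m3fn on an old toolchain.
// DG_TENSORMAP_COMPATIBLE would guard cuTensorMapEncodeTiled calls in the same way.
torch::Tensor make_fp8_buffer(int64_t n) {
#if DG_FP8_COMPATIBLE
    return torch::empty({n}, torch::TensorOptions().dtype(torch::kFloat8_e4m3fn));
#else
    TORCH_CHECK(false, "An FP8 buffer of size ", n, " requires PyTorch >= 2.1");
    return {};
#endif
}
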