diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 18a95e810bee4..8ec1cd1be531b 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -8,7 +8,7 @@ use rustc_codegen_ssa::common::TypeKind; use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue}; use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods}; use rustc_middle::bug; -use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize}; +use rustc_middle::ty::offload_meta::{DynamicSize, MappingFlags, OffloadMetadata, OffloadSize}; use crate::builder::Builder; use crate::common::CodegenCx; @@ -448,14 +448,18 @@ pub(crate) fn gen_define_handling<'ll>( transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect(); let transfer_from: Vec = transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect(); + let valid_kernel_mappings = MappingFlags::LITERAL | MappingFlags::IMPLICIT; // FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary - let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()]; + let transfer_kernel: Vec = transfer + .iter() + .map(|m| (m.intersection(valid_kernel_mappings) | MappingFlags::TARGET_PARAM).bits()) + .collect(); let actual_sizes = sizes .iter() .map(|s| match s { OffloadSize::Static(sz) => *sz, - OffloadSize::Dynamic => 0, + OffloadSize::Dynamic(_) => 0, }) .collect::>(); let offload_sizes = @@ -542,12 +546,20 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 { } fn get_runtime_size<'ll, 'tcx>( - _cx: &CodegenCx<'ll, 'tcx>, - _val: &'ll Value, - _meta: &OffloadMetadata, + builder: &mut Builder<'_, 'll, 'tcx>, + args: &[&'ll Value], + index: usize, + meta: &OffloadMetadata, ) -> &'ll Value { - // FIXME(Sa4dUs): handle dynamic-size data (e.g. 
slices) - bug!("offload does not support dynamic sizes yet"); + match meta.payload_size { + OffloadSize::Dynamic(DynamicSize::Slice { element_size }) => { + let length_idx = index + 1; + let length = args[length_idx]; + let length_i64 = builder.intcast(length, builder.cx.type_i64(), false); + builder.mul(length_i64, builder.cx.get_const_i64(element_size)) + } + OffloadSize::Static(_) => bug!("expected dynamic size"), + } } // For each kernel *call*, we now use some of our previous declared globals to move data to and from @@ -588,7 +600,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>( let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } = offload_dims; - let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic)); + let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic(_))); let tgt_decl = offload_globals.launcher_fn; let tgt_target_kernel_ty = offload_globals.launcher_ty; @@ -683,9 +695,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>( let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]); builder.store(geps[i as usize], gep2, Align::EIGHT); - if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) { + if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic(_)) { let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]); - let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]); + let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]); builder.store(size_val, gep3, Align::EIGHT); } } diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index cf088ed509092..15c63f83c2745 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1437,9 +1437,20 @@ fn codegen_offload<'ll, 'tcx>( let sig = tcx.instantiate_bound_regions_with_erased(sig); let inputs = sig.inputs(); - let metadata = 
inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::>(); + let fn_abi = cx.fn_abi_of_instance(fn_target, ty::List::empty()); - let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::>(); + let mut metadata = Vec::new(); + let mut types = Vec::new(); + + for (i, arg_abi) in fn_abi.args.iter().enumerate() { + let ty = inputs[i]; + let decomposed = OffloadMetadata::handle_abi(cx, tcx, ty, arg_abi); + + for (meta, entry_ty) in decomposed { + metadata.push(meta); + types.push(bx.cx.layout_of(entry_ty).llvm_type(bx.cx)); + } + } let offload_globals_ref = cx.offload_globals.borrow(); let offload_globals = match offload_globals_ref.as_ref() { diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs index 849670d76d464..25243376c2d8a 100644 --- a/compiler/rustc_middle/src/ty/offload_meta.rs +++ b/compiler/rustc_middle/src/ty/offload_meta.rs @@ -1,7 +1,10 @@ use bitflags::bitflags; +use rustc_abi::{BackendRepr, TyAbiInterface}; +use rustc_target::callconv::ArgAbi; use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv}; +#[derive(Debug, Copy, Clone)] pub struct OffloadMetadata { pub payload_size: OffloadSize, pub mode: MappingFlags, @@ -9,13 +12,18 @@ pub struct OffloadMetadata { #[derive(Debug, Copy, Clone)] pub enum OffloadSize { - Dynamic, Static(u64), + Dynamic(DynamicSize), +} + +#[derive(Debug, Copy, Clone)] +pub enum DynamicSize { + Slice { element_size: u64 }, } bitflags! { /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP. - #[derive(Debug, Copy, Clone)] + #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[repr(transparent)] pub struct MappingFlags: u64 { /// No flags. 
@@ -62,11 +70,38 @@ impl OffloadMetadata { mode: MappingFlags::from_ty(tcx, ty), } } + + pub fn handle_abi<'tcx, C>( + cx: &C, + tcx: TyCtxt<'tcx>, + ty: Ty<'tcx>, + arg_abi: &ArgAbi<'tcx, Ty<'tcx>>, + ) -> Vec<(Self, Ty<'tcx>)> + where + Ty<'tcx>: TyAbiInterface<'tcx, C>, + { + match arg_abi.layout.backend_repr { + BackendRepr::ScalarPair(_, _) => (0..2) + .map(|i| { + let ty = arg_abi.layout.field(cx, i).ty; + (OffloadMetadata::from_ty(tcx, ty), ty) + }) + .collect(), + _ => vec![(OffloadMetadata::from_ty(tcx, ty), ty)], + } + } } // FIXME(Sa4dUs): implement a solid logic to determine the payload size fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize { match ty.kind() { + ty::Slice(elem_ty) => { + let layout = tcx.layout_of(PseudoCanonicalInput { + typing_env: TypingEnv::fully_monomorphized(), + value: *elem_ty, + }); + OffloadSize::Dynamic(DynamicSize::Slice { element_size: layout.unwrap().size.bytes() }) + } ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner), _ => OffloadSize::Static( tcx.layout_of(PseudoCanonicalInput { diff --git a/tests/codegen-llvm/gpu_offload/slice_device.rs b/tests/codegen-llvm/gpu_offload/slice_device.rs new file mode 100644 index 0000000000000..1abe04f8cc429 --- /dev/null +++ b/tests/codegen-llvm/gpu_offload/slice_device.rs @@ -0,0 +1,27 @@ +//@ add-minicore +//@ revisions: amdgpu nvptx +//@[nvptx] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target nvptx64-nvidia-cuda --crate-type=rlib +//@[nvptx] needs-llvm-components: nvptx +//@[amdgpu] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 --crate-type=rlib +//@[amdgpu] needs-llvm-components: amdgpu +//@ no-prefer-dynamic +//@ needs-offload + +#![feature(abi_gpu_kernel, rustc_attrs, no_core)] +#![no_core] + +extern crate minicore; + +// CHECK: ; Function Attrs +// nvptx-NEXT: define ptx_kernel void @foo +// amdgpu-NEXT: define amdgpu_kernel void @foo +// 
CHECK-SAME: ptr readnone captures(none) %dyn_ptr +// nvptx-SAME: [2 x i64] %0 +// amdgpu-SAME: ptr noalias {{.*}} %0, i64 {{.*}} %1 +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// CHECK-NEXT: } + +#[unsafe(no_mangle)] +#[rustc_offload_kernel] +pub unsafe extern "gpu-kernel" fn foo(x: &[f32]) {} diff --git a/tests/codegen-llvm/gpu_offload/slice_host.rs b/tests/codegen-llvm/gpu_offload/slice_host.rs new file mode 100644 index 0000000000000..62f12da079d82 --- /dev/null +++ b/tests/codegen-llvm/gpu_offload/slice_host.rs @@ -0,0 +1,35 @@ +//@ compile-flags: -Zoffload=Test -Zunstable-options -C opt-level=1 -Clto=fat +//@ no-prefer-dynamic +//@ needs-offload + +// This test verifies that offload handles slices properly when passing them to the device + +#![feature(abi_gpu_kernel)] +#![feature(rustc_attrs)] +#![feature(core_intrinsics)] +#![no_main] + +// CHECK: @anon.[[ID:.*]].0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 + +// CHECK-DAG: @.offload_sizes.[[K:[^ ]*foo]] = private unnamed_addr constant [2 x i64] [i64 0, i64 8] +// CHECK-DAG: @.offload_maptypes.[[K]].begin = private unnamed_addr constant [2 x i64] [i64 1, i64 768] +// CHECK-DAG: @.offload_maptypes.[[K]].kernel = private unnamed_addr constant [2 x i64] [i64 32, i64 800] +// CHECK-DAG: @.offload_maptypes.[[K]].end = private unnamed_addr constant [2 x i64] [i64 2, i64 0] + +// CHECK: define{{( dso_local)?}} void @main() +// CHECK: %.offload_sizes = alloca [2 x i64], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}} %.offload_sizes, ptr {{.*}} @.offload_sizes.foo, i64 16, i1 false) +// CHECK: store i64 16, ptr %.offload_sizes, align 8 +// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null) +// CHECK: %11 = call i32 @__tgt_target_kernel(ptr nonnull @anon.[[ID]].1, 
i64 -1, i32 1, i32 1, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args) +// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null) + +#[unsafe(no_mangle)] +fn main() { + let mut x = [0.0, 0.0, 0.0, 0.0]; + core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], ((&mut x) as &mut [f32],)); +} + +unsafe extern "C" { + pub fn foo(x: &mut [f32]); +}