
Commit c6dee2c

FractalFir authored and LegNeato committed

Add support for standard Rust atomic compare_exchange intrinsic.
1 parent ac2674f · commit c6dee2c

File tree

4 files changed (+228, -10 lines changed)


crates/rustc_codegen_nvvm/src/builder.rs

Lines changed: 152 additions & 9 deletions
@@ -1134,16 +1134,64 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release) | (AtomicOrdering::Release, AtomicOrdering::Acquire) | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _) | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - `failure_order` is stronger than `order`, so we abort.
+                self.abort();
+                return (self.const_undef(self.val_ty(cmp)), self.const_undef(self.type_i1()));
+            }
+        };
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics.
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // The local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // we select either the current value (if the comparison fails) or the new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        // Unpack the result.
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }
     fn atomic_rmw(
         &mut self,
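The thread-local fallback in the hunk above boils down to a plain load / compare / select / unconditional-store sequence. A minimal host-side Rust sketch of that emulation, written against a `u32` element type for illustration (`emulated_cmpxchg` is a stand-in name, not part of the commit):

    /// Single-threaded emulation of compare-exchange, mirroring the
    /// load / icmp / select / unconditional-store sequence emitted for
    /// thread-local memory above.
    fn emulated_cmpxchg(dst: &mut u32, cmp: u32, src: u32) -> (u32, bool) {
        let load = *dst;                              // LLVMBuildLoad
        let success = load == cmp;                    // icmp eq
        let value = if success { src } else { load }; // select (branchless in the emitted IR)
        *dst = value;                                 // unconditional store
        (load, success)                               // (old value, success flag)
    }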
@@ -1609,3 +1657,98 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (in `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (with `emulate_local`, if `dst` points to the thread-local address space).
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in what address space they operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space.
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread address space (which should only be accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform an atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is not in a supported address space, and is not thread-local, then we bail and trap.
+
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread space. If so, we can emulate it.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB, so we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to the merge_bb.
+        self.switch_to_block(merge_bb);
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}
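For orientation, the branch structure that `atomic_op` emits can be summarised as ordinary Rust. This is a sketch under assumptions: `AddressSpace`, `atomic_op_sketch`, and the closure parameters below are illustrative stand-ins for the `llvm.nvvm.isspacep.*` checks and the basic-block plumbing, not APIs from the commit.

    /// Illustrative summary of the dispatch `atomic_op` builds at the IR level.
    enum AddressSpace { Global, Shared, Local, Const }

    fn atomic_op_sketch<T>(
        space: AddressSpace,
        atomic_supported: impl FnOnce() -> T,
        emulate_local: impl FnOnce() -> T,
        trap: impl FnOnce() -> T,
    ) -> T {
        match space {
            // Supported spaces: emit a real atomic instruction.
            AddressSpace::Global | AddressSpace::Shared => atomic_supported(),
            // Thread-local space: only this thread can observe the memory,
            // so a non-atomic load/store emulation is used instead.
            AddressSpace::Local => emulate_local(),
            // Unsupported space: almost certainly UB, so abort + unreachable.
            AddressSpace::Const => trap(),
        }
    }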

crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs

Lines changed: 6 additions & 0 deletions
@@ -449,5 +449,11 @@ impl<'ll> CodegenCx<'ll, '_> {
             "__nv_ynf",
             fn(t_i32, t_f32) -> t_f32
         );
+        // Address space checks
+        ifn!(map, "llvm.nvvm.isspacep.const", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.global", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.local", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.shared", fn(i8p) -> i1);
+
     }
 }

crates/rustc_codegen_nvvm/src/llvm.rs

Lines changed: 47 additions & 1 deletion
@@ -16,7 +16,7 @@
 // but likely will use in the future, so we ignore any unused functions
 // in case we need them in the future for things like debug info or LTO.
 #![allow(dead_code)]
-
+use rustc_codegen_ssa::common::AtomicRmwBinOp;
 use libc::{c_char, c_uint, c_void, size_t};
 use libc::{c_int, c_ulonglong};
 use std::ffi::{CStr, CString};
@@ -1947,4 +1947,50 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustAddDereferenceableOrNullAttr(Fn: &Value, index: c_uint, bytes: u64);

     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
+    // Atomics
+    pub fn LLVMRustBuildAtomicCmpXchg<'a>(
+        B: &Builder<'a>,
+        LHS: &Value,
+        CMP: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        FailureOrder: AtomicOrdering,
+        Weak: Bool,
+    ) -> &'a Value;
+
+    pub fn LLVMBuildAtomicRMW<'a>(
+        B: &Builder<'a>,
+        Op: AtomicRmwBinOp,
+        LHS: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        SingleThreaded: Bool,
+    ) -> &'a Value;
 }
+/// LLVMAtomicOrdering
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) enum AtomicOrdering {
+    #[allow(dead_code)]
+    NotAtomic = 0,
+    #[allow(dead_code)]
+    Unordered = 1,
+    Monotonic = 2,
+    // Consume = 3, // Not specified yet.
+    Acquire = 4,
+    Release = 5,
+    AcquireRelease = 6,
+    SequentiallyConsistent = 7,
+}
+impl AtomicOrdering {
+    pub(crate) fn from_generic(ao: rustc_middle::ty::AtomicOrdering) -> Self {
+        use rustc_middle::ty::AtomicOrdering as Common;
+        match ao {
+            Common::Relaxed => Self::Monotonic,
+            Common::Acquire => Self::Acquire,
+            Common::Release => Self::Release,
+            Common::AcqRel => Self::AcquireRelease,
+            Common::SeqCst => Self::SequentiallyConsistent,
+        }
+    }
+}
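The discriminants above mirror LLVM's `LLVMAtomicOrdering` values (hence the `/// LLVMAtomicOrdering` doc comment and `#[repr(C)]`). A standalone sketch of the mapping, where `RustOrdering` and `to_llvm_ordering` are stand-ins for `rustc_middle::ty::AtomicOrdering` and `from_generic`, which are only available inside the compiler:

    #[derive(Copy, Clone)]
    enum RustOrdering { Relaxed, Acquire, Release, AcqRel, SeqCst }

    /// Rust's five orderings collapse onto LLVM's numeric values; NotAtomic (0),
    /// Unordered (1), and the reserved Consume (3) slot are never produced.
    fn to_llvm_ordering(ao: RustOrdering) -> u32 {
        match ao {
            RustOrdering::Relaxed => 2, // Monotonic
            RustOrdering::Acquire => 4,
            RustOrdering::Release => 5,
            RustOrdering::AcqRel => 6,  // AcquireRelease
            RustOrdering::SeqCst => 7,  // SequentiallyConsistent
        }
    }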
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+// Test that CUDA atomic operations compile correctly.
+// build-pass
+// compile-flags: -Z verify-llvm-ir
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use cuda_std::atomic::{
+    AtomicF32, AtomicF64, BlockAtomicF32, BlockAtomicF64, SystemAtomicF32, SystemAtomicF64,
+};
+use cuda_std::kernel;
+static GLOBAL: AtomicUsize = AtomicUsize::new(0);
+#[kernel]
+pub unsafe fn test_cuda_atomic_floats() {
+    let local = AtomicUsize::new(0);
+    // This `compare_exchange` should succeed.
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // This `compare_exchange` should fail (the value is now 1).
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // This `compare_exchange` should succeed.
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // This `compare_exchange` should fail (the value is now 1).
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+
+}
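The succeed/fail pattern the test relies on is just ordinary `compare_exchange` semantics. A host-side illustration (not part of the commit) of what the comments above assert:

    use core::sync::atomic::{AtomicUsize, Ordering};

    fn main() {
        let local = AtomicUsize::new(0);
        // The first CAS sees 0, swaps in 1, and succeeds, returning the old value.
        assert_eq!(local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed), Ok(0));
        // The second CAS still expects 0, but the value is now 1, so it fails
        // and returns the current value.
        assert_eq!(local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed), Err(1));
    }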
