Commit b0e3c10

FractalFir authored and LegNeato committed
Added support for most atomic operations
1 parent c6dee2c commit b0e3c10

File tree

3 files changed: +101 −50 lines

crates/rustc_codegen_nvvm/src/builder.rs

Lines changed: 69 additions & 45 deletions
@@ -6,7 +6,7 @@ use libc::{c_char, c_uint};
 use rustc_abi as abi;
 use rustc_abi::{AddressSpace, Align, HasDataLayout, Size, TargetDataLayout, WrappingRange};
 use rustc_codegen_ssa::MemFlags;
-use rustc_codegen_ssa::common::{IntPredicate, RealPredicate, TypeKind};
+use rustc_codegen_ssa::common::{IntPredicate, RealPredicate, TypeKind, AtomicRmwBinOp};
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::*;
@@ -546,30 +546,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 
     fn atomic_load(
         &mut self,
-        _ty: &'ll Type,
+        ty: &'ll Type,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
-        _size: Size,
+        order: AtomicOrdering,
+        size: Size,
     ) -> &'ll Value {
-        // core seems to think that nvptx has atomic loads, which is not true for NVVM IR,
-        // therefore our only option is to print that this is not supported then trap.
-        // i have heard of cursed things such as emulating this with __threadfence and volatile loads
-        // but that needs to be experimented with in terms of safety and behavior.
-        // NVVM has explicit intrinsics for adding and subtracting floats which we expose elsewhere
-
-        // TODO(RDambrosio016): is there a way we can just generate a panic with a message instead
-        // of doing this ourselves? since all panics will be aborts, it should be equivalent
-        // let message = "Atomic Loads are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-
-        let (ty, f) = self.get_intrinsic("llvm.trap");
-        self.call(ty, None, None, f, &[], None, None);
-        unsafe { llvm::LLVMBuildLoad(self.llbuilder, ptr, unnamed()) }
+        // Since `A | 0 == A` for any `A`, and performing atomics on constant memory is already UB in Rust, we can abuse `or` to perform an atomic read.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicOr, ptr, self.const_int(ty, 0), order)
     }
 
     fn load_operand(&mut self, place: PlaceRef<'tcx, &'ll Value>) -> OperandRef<'tcx, &'ll Value> {
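The identity trick above can be sketched in ordinary Rust (an illustration only, not part of this commit): `fetch_or(0)` returns the current value and, since `x | 0 == x`, never modifies it.

    use core::sync::atomic::{AtomicUsize, Ordering};

    // Illustrative sketch: an atomic load expressed as a read-modify-write
    // with an identity operation, mirroring the lowering above.
    fn load_via_rmw(a: &AtomicUsize, order: Ordering) -> usize {
        // `fetch_or(0)` returns the previous value and leaves the cell unchanged.
        a.fetch_or(0, order)
    }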
@@ -796,24 +779,13 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 
     fn atomic_store(
         &mut self,
-        _val: &'ll Value,
+        val: &'ll Value,
         ptr: &'ll Value,
-        _order: AtomicOrdering,
-        _size: Size,
+        order: AtomicOrdering,
+        size: Size,
     ) {
-        // see comment in atomic_load
-
-        // let message = "Atomic Stores are not supported in CUDA.\0";
-
-        // let vprintf = self.get_intrinsic("vprintf");
-        // let formatlist = self.const_str(Symbol::intern(message)).0;
-        // let valist = self.const_null(self.type_void());
-
-        // self.call(vprintf, &[formatlist, valist], None);
-        self.abort();
-        unsafe {
-            llvm::LLVMBuildLoad(self.llbuilder, ptr, UNNAMED);
-        }
+        // An atomic store is an exchange of `*ptr` with `val` whose result is discarded.
+        self.atomic_rmw(AtomicRmwBinOp::AtomicXchg, ptr, val, order);
     }
 
     fn gep(&mut self, ty: &'ll Type, ptr: &'ll Value, indices: &[&'ll Value]) -> &'ll Value {
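The store lowering admits the same kind of sketch (illustrative only, not part of this commit): a store is a swap whose previous value is ignored.

    use core::sync::atomic::{AtomicUsize, Ordering};

    // Illustrative sketch: an atomic store expressed as an exchange with the
    // result discarded, mirroring the lowering above.
    fn store_via_rmw(a: &AtomicUsize, val: usize, order: Ordering) {
        let _previous = a.swap(val, order);
    }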
@@ -1195,13 +1167,65 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     }
     fn atomic_rmw(
         &mut self,
-        _op: rustc_codegen_ssa::common::AtomicRmwBinOp,
-        _dst: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
+        op: AtomicRmwBinOp,
+        dst: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
     ) -> &'ll Value {
-        // see cmpxchg comment
-        self.fatal("atomic rmw is not supported")
+        if matches!(op, AtomicRmwBinOp::AtomicNand) {
+            self.fatal("Atomic NAND not supported yet!")
+        }
+        self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics.
+                unsafe {
+                    llvm::LLVMBuildAtomicRMW(
+                        builder.llbuilder,
+                        op,
+                        dst,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        0,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread, so there are no
+                // synchronization issues and we can emulate the RMW with a simple
+                // load / compute / store sequence.
+                let load: &'ll Value =
+                    unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let next_val = match op {
+                    AtomicRmwBinOp::AtomicXchg => src,
+                    AtomicRmwBinOp::AtomicAdd => builder.add(load, src),
+                    AtomicRmwBinOp::AtomicSub => builder.sub(load, src),
+                    AtomicRmwBinOp::AtomicAnd => builder.and(load, src),
+                    AtomicRmwBinOp::AtomicNand => {
+                        let and = builder.and(load, src);
+                        builder.not(and)
+                    }
+                    AtomicRmwBinOp::AtomicOr => builder.or(load, src),
+                    AtomicRmwBinOp::AtomicXor => builder.xor(load, src),
+                    AtomicRmwBinOp::AtomicMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntSGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntSLT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMax => {
+                        let is_src_bigger = builder.icmp(IntPredicate::IntUGT, src, load);
+                        builder.select(is_src_bigger, src, load)
+                    }
+                    AtomicRmwBinOp::AtomicUMin => {
+                        let is_src_smaller = builder.icmp(IntPredicate::IntULT, src, load);
+                        builder.select(is_src_smaller, src, load)
+                    }
+                };
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, next_val, dst) };
+                load
+            },
+        )
     }
 
     fn atomic_fence(
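The local-space fallback above is the classic single-threaded emulation of an atomic RMW: read the old value, compute the next one, write it back, and return the old value. A minimal sketch in plain Rust (illustrative; the commit builds the equivalent LLVM instructions instead):

    // With memory only the current thread can reach, an atomic RMW
    // degenerates to a plain load / compute / store sequence.
    fn emulate_rmw(slot: &mut u32, next: impl FnOnce(u32) -> u32) -> u32 {
        let old = *slot;   // load
        *slot = next(old); // compute the new value and store it
        old                // an RMW returns the previous value
    }

    // For example, a fetch_add: emulate_rmw(&mut x, |v| v.wrapping_add(1))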

crates/rustc_codegen_nvvm/src/nvvm.rs

Lines changed: 2 additions & 1 deletion
@@ -125,9 +125,10 @@ pub fn codegen_bitcode_modules(
     let res = match prog.compile(&args.nvvm_options) {
         Ok(b) => b,
         Err(error) => {
+            let log = prog.compiler_log().unwrap().unwrap_or_default();
             // this should never happen; if it does, something went really bad or it's a bug on libnvvm's end
             panic!(
-                "libnvvm returned an error that was not previously caught by the verifier: {error:?}"
+                "libnvvm returned an error that was not previously caught by the verifier: {error:?} {log:?}"
             );
         }
     };
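The double unwrap on the added line suggests (an assumption inferred from the call site, not something the commit confirms) that `compiler_log` returns a `Result<Option<String>, _>`: the outer `unwrap` panics if fetching the log itself fails, while `unwrap_or_default` substitutes an empty string when libnvvm produced no log.

    // Illustrative only: the Result<Option<String>, E> shape is assumed.
    fn log_or_empty<E: std::fmt::Debug>(log: Result<Option<String>, E>) -> String {
        log.unwrap().unwrap_or_default()
    }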
Lines changed: 30 additions & 4 deletions
@@ -1,6 +1,6 @@
 // Test CUDA atomic operations compile correctly
 // build-pass
-// compile-flags: -Z verify-llvm-ir
+// compile-flags: -Z verify-llvm-ir
 use core::sync::atomic::{AtomicUsize,Ordering};
 
 use cuda_std::atomic::{
@@ -10,14 +10,40 @@ use cuda_std::kernel;
 static GLOBAL:AtomicUsize = AtomicUsize::new(0);
 #[kernel]
 pub unsafe fn test_cuda_atomic_floats() {
-    let local = AtomicUsize::new(0);
+    let local = AtomicUsize::new(0);
     // `compare_exchange` should succeed
     local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
     // `compare_exchange` should fail
     local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
-    // `compare_exchange` should succeed
+    // `compare_exchange` should succeed
     GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
     // `compare_exchange` should fail
     GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
-
+    // RMW ops
+    local.swap(1, Ordering::Relaxed);
+    GLOBAL.swap(1, Ordering::Relaxed);
+    local.fetch_add(1, Ordering::Relaxed);
+    GLOBAL.fetch_add(1, Ordering::Relaxed);
+    local.fetch_sub(1, Ordering::Relaxed);
+    GLOBAL.fetch_sub(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_and(1, Ordering::Relaxed);
+    GLOBAL.fetch_and(1, Ordering::Relaxed);
+    local.fetch_or(1, Ordering::Relaxed);
+    GLOBAL.fetch_or(1, Ordering::Relaxed);
+    local.fetch_xor(1, Ordering::Relaxed);
+    GLOBAL.fetch_xor(1, Ordering::Relaxed);
+    local.fetch_max(1, Ordering::Relaxed);
+    GLOBAL.fetch_max(1, Ordering::Relaxed);
+    local.fetch_min(1, Ordering::Relaxed);
+    GLOBAL.fetch_min(1, Ordering::Relaxed);
+    // Loads and stores
+    local.load(Ordering::Relaxed);
+    GLOBAL.load(Ordering::Relaxed);
+    local.store(1, Ordering::Relaxed);
+    GLOBAL.store(1, Ordering::Relaxed);
+    // Atomic NAND is not supported quite yet
+    //local.fetch_nand(1, Ordering::Relaxed);
+    //GLOBAL.fetch_nand(1, Ordering::Relaxed);
 }
