@@ -1134,16 +1134,64 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
    // Atomic Operations
    fn atomic_cmpxchg(
        &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
    ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where the `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release) | (AtomicOrdering::Release, AtomicOrdering::Acquire) | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _) | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg: the `failure_order` is stronger than `order`, so we abort.
+                self.abort();
+                return (self.const_undef(self.val_ty(cmp)), self.const_undef(self.type_i1()));
+            }
+        };
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics.
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // The local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it with a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // we select either the current value (if the comparison fails) or the new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        // Unpack the result.
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
    }

    fn atomic_rmw(
        &mut self,
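The thread-local fallback in the closure above lowers the cmpxchg to a plain load, compare, select, and unconditional store. As a rough sketch of the intended semantics only (not code from this commit; `emulated_cmpxchg_local` and the `u32` element type are made up for illustration), it behaves like this:

```rust
// Illustrative sketch only - assumes the pointed-to memory really is private to the calling
// thread, which is the same assumption the local-space path relies on.
unsafe fn emulated_cmpxchg_local(dst: *mut u32, cmp: u32, src: u32) -> (u32, bool) {
    let load = *dst;                              // plain, non-atomic load
    let success = load == cmp;                    // icmp eq
    let value = if success { src } else { load }; // mirrors the branchless `select`
    *dst = value;                                 // unconditional store back to local memory
    (load, success)                               // (old value, success flag), like cmpxchg
}

fn main() {
    let mut slot = 5u32;
    // Matching expected value: the swap happens and success is reported.
    let (old, ok) = unsafe { emulated_cmpxchg_local(&mut slot, 5, 9) };
    assert!(ok && old == 5 && slot == 9);
    // Mismatched expected value: memory is left unchanged and failure is reported.
    let (old, ok) = unsafe { emulated_cmpxchg_local(&mut slot, 5, 1) };
    assert!(!ok && old == 9 && slot == 9);
}
```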
@@ -1609,3 +1657,98 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
        }
    }
}
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (in `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (with `emulate_local`, if `dst` points to the thread-local address space).
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in which address spaces they can operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread-local address space (which should only be accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform an atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is neither in a supported address space nor thread-local, we bail and trap.
+
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in a supported address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread-local space. If so, we can emulate the atomic.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread(local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB. So, we trap here.
+        // TODO: should we print some kind of message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to merge_bb.
+        self.switch_to_block(merge_bb);
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}
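For reference, the block structure `atomic_op` builds corresponds roughly to the dispatch below, written as ordinary Rust purely for illustration. The `is_global`/`is_shared`/`is_local` closures stand in for the `llvm.nvvm.isspacep.*` intrinsic calls and the `panic!` for the abort-and-unreachable path; `atomic_op_shape` is a made-up name, not part of the codebase.

```rust
// Illustrative sketch only: the branching that atomic_op encodes with basic blocks and a phi.
fn atomic_op_shape<T>(
    dst: *mut T,
    is_global: impl Fn(*mut T) -> bool, // stands in for llvm.nvvm.isspacep.global
    is_shared: impl Fn(*mut T) -> bool, // stands in for llvm.nvvm.isspacep.shared
    is_local: impl Fn(*mut T) -> bool,  // stands in for llvm.nvvm.isspacep.local
    atomic_supported: impl FnOnce(*mut T) -> T,
    emulate_local: impl FnOnce(*mut T) -> T,
) -> T {
    if is_global(dst) || is_shared(dst) {
        // "atomic_space_supported": a real atomic instruction is sound here.
        atomic_supported(dst)
    } else if is_local(dst) {
        // "atomic_local_space": thread-private memory, plain loads/stores suffice.
        emulate_local(dst)
    } else {
        // "atomic_space_ub": const or unknown space - very likely UB, so we trap.
        panic!("atomic on unsupported address space")
    }
    // "atomic_op_done": in the generated IR, a phi merges the two results.
}

fn main() {
    let mut x = 0u32;
    // Pretend `dst` is in the global space, so the "atomic" path runs.
    let res = atomic_op_shape(
        &mut x as *mut u32,
        |_| true,
        |_| false,
        |_| false,
        |p| unsafe { *p = 1; 1 },
        |p| unsafe { *p = 2; 2 },
    );
    assert_eq!(res, 1);
}
```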