Progress on upgrading to the latest rust-cuda; FitsIntoDeviceRegister is still broken

juntyr · juntyr · commit 73ab8e974c96 · 2023-07-04T11:53:24.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/necsim/core/Cargo.toml b/necsim/core/Cargo.toml
@@ -20,7 +20,7 @@ contracts = "0.6.3"
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive"], optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive", "host"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive", "host"], optional = true }
diff --git a/necsim/impls/cuda/Cargo.toml b/necsim/impls/cuda/Cargo.toml
@@ -15,7 +15,7 @@ contracts = "0.6.3"
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive"] }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive", "host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive", "host"] }
diff --git a/necsim/impls/cuda/src/event_buffer.rs b/necsim/impls/cuda/src/event_buffer.rs
@@ -271,27 +271,32 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
 
         let shared_buffer_len = <Self as EventType>::SharedBuffer::<()>::len();
 
-        let block_dim = rust_cuda::device::utils::block_dim();
+        let thread = rust_cuda::device::thread::Thread::this();
+        let thread_idx = thread.idx();
+        let thread_block = thread.block();
+        let block_dim = thread_block.dim();
+        let block_idx = thread_block.idx();
+        let block_grid = thread_block.grid();
+        let grid_dim = block_grid.dim();
 
         if shared_buffer_len != (block_dim.size() * 2) {
             core::arch::nvptx::trap();
         }
 
-        let block_idx =
-            rust_cuda::device::utils::block_idx().as_id(&rust_cuda::device::utils::grid_dim());
-        let thread_idx = rust_cuda::device::utils::thread_idx().as_id(&block_dim);
+        let block_idx = block_idx.as_id(&grid_dim);
+        let thread_idx = thread_idx.as_id(&block_dim);
 
         let idx = block_idx * shared_buffer_len + thread_idx;
 
-        let shared_mask: rust_cuda::device::ThreadBlockShared<
+        let shared_mask: rust_cuda::utils::shared::r#static::ThreadBlockShared<
             <Self as EventType>::SharedBuffer<bool>,
-        > = rust_cuda::device::ThreadBlockShared::new_uninit();
-        let shared_mask_array: *mut bool = shared_mask.get().as_mut_ptr();
-        let shared_buffer: rust_cuda::device::ThreadBlockShared<
+        > = rust_cuda::utils::shared::r#static::ThreadBlockShared::new_uninit();
+        let shared_mask_array: *mut bool = shared_mask.as_mut_ptr().cast();
+        let shared_buffer: rust_cuda::utils::shared::r#static::ThreadBlockShared<
             <Self as EventType>::SharedBuffer<MaybeSome<<Self as EventType>::Event>>,
-        > = rust_cuda::device::ThreadBlockShared::new_uninit();
+        > = rust_cuda::utils::shared::r#static::ThreadBlockShared::new_uninit();
         let shared_buffer_array: *mut MaybeSome<<Self as EventType>::Event> =
-            shared_buffer.get().as_mut_ptr();
+            shared_buffer.as_mut_ptr().cast();
 
         *shared_mask_array.add(thread_idx) = match self.event_mask.alias_unchecked().get(idx) {
             None => false,
@@ -476,27 +481,32 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
 
         let shared_buffer_len = <Self as EventType>::SharedBuffer::<()>::len();
 
-        let block_dim = rust_cuda::device::utils::block_dim();
+        let thread = rust_cuda::device::thread::Thread::this();
+        let thread_idx = thread.idx();
+        let thread_block = thread.block();
+        let block_dim = thread_block.dim();
+        let block_idx = thread_block.idx();
+        let block_grid = thread_block.grid();
+        let grid_dim = block_grid.dim();
 
         if shared_buffer_len != (block_dim.size() * 2) {
             core::arch::nvptx::trap();
         }
 
-        let block_idx =
-            rust_cuda::device::utils::block_idx().as_id(&rust_cuda::device::utils::grid_dim());
-        let thread_idx = rust_cuda::device::utils::thread_idx().as_id(&block_dim);
+        let block_idx = block_idx.as_id(&grid_dim);
+        let thread_idx = thread_idx.as_id(&block_dim);
 
         let idx = block_idx * shared_buffer_len + thread_idx;
 
-        let shared_mask: rust_cuda::device::ThreadBlockShared<
+        let shared_mask: rust_cuda::utils::shared::r#static::ThreadBlockShared<
             <Self as EventType>::SharedBuffer<bool>,
-        > = rust_cuda::device::ThreadBlockShared::new_uninit();
-        let shared_mask_array: *mut bool = shared_mask.get().cast();
-        let shared_buffer: rust_cuda::device::ThreadBlockShared<
+        > = rust_cuda::utils::shared::r#static::ThreadBlockShared::new_uninit();
+        let shared_mask_array: *mut bool = shared_mask.as_mut_ptr().cast();
+        let shared_buffer: rust_cuda::utils::shared::r#static::ThreadBlockShared<
             <Self as EventType>::SharedBuffer<MaybeSome<<Self as EventType>::Event>>,
-        > = rust_cuda::device::ThreadBlockShared::new_uninit();
+        > = rust_cuda::utils::shared::r#static::ThreadBlockShared::new_uninit();
         let shared_buffer_array: *mut MaybeSome<<Self as EventType>::Event> =
-            shared_buffer.get().cast();
+            shared_buffer.as_mut_ptr().cast();
 
         *shared_mask_array.add(thread_idx) = match self.event_mask.alias_unchecked().get(idx) {
             None => false,
@@ -618,7 +628,7 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
     pub unsafe fn bitonic_sort_events_step(&mut self, size: usize, stride: usize) {
         use core::cmp::Ordering;
 
-        let idx = rust_cuda::device::utils::index();
+        let idx = rust_cuda::device::thread::Thread::this().index();
 
         let pos = idx & ((self.event_mask.alias_unchecked().len().next_power_of_two() / 2) - 1);
 
@@ -719,7 +729,7 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
     pub unsafe fn odd_even_sort_events_step(&mut self, size: usize, stride: usize) {
         use core::cmp::Ordering;
 
-        let idx = rust_cuda::device::utils::index();
+        let idx = rust_cuda::device::thread::Thread::this().index();
 
         let pos = 2 * idx - (idx & (stride - 1));
 
diff --git a/necsim/impls/no-std/Cargo.toml b/necsim/impls/no-std/Cargo.toml
@@ -31,7 +31,7 @@ rand_core = "0.6"
 rand_distr = { version = "0.4", default-features = false, features = [] }
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive"], optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive", "host"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive", "host"], optional = true }
diff --git a/rustcoalescence/algorithms/cuda/Cargo.toml b/rustcoalescence/algorithms/cuda/Cargo.toml
@@ -23,4 +23,4 @@ thiserror = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_state = "0.4"
 serde_derive_state = "0.4"
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["host"] }
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml b/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml
@@ -14,4 +14,4 @@ necsim-impls-no-std = { path = "../../../../necsim/impls/no-std", features = ["c
 necsim-impls-cuda = { path = "../../../../necsim/impls/cuda" }
 rustcoalescence-algorithms-cuda-gpu-kernel = { path = "../gpu-kernel" }
 
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["host"] }
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs b/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs
@@ -1,8 +1,10 @@
 use rustcoalescence_algorithms_cuda_gpu_kernel::{
-    BitonicGlobalSortStepKernelArgs, BitonicGlobalSortSteppableKernel,
-    BitonicSharedSortPrepKernelArgs, BitonicSharedSortPreparableKernel,
-    BitonicSharedSortStepKernelArgs, BitonicSharedSortSteppableKernel, EvenOddSortKernelArgs,
-    EvenOddSortableKernel, SimulatableKernel, SimulationKernelArgs,
+    BitonicGlobalSortStepKernelArgs, BitonicGlobalSortStepKernelPtx,
+    BitonicGlobalSortSteppableKernel, BitonicSharedSortPrepKernelArgs,
+    BitonicSharedSortPrepKernelPtx, BitonicSharedSortPreparableKernel,
+    BitonicSharedSortStepKernelArgs, BitonicSharedSortStepKernelPtx,
+    BitonicSharedSortSteppableKernel, EvenOddSortKernelArgs, EvenOddSortKernelPtx,
+    EvenOddSortableKernel, SimulatableKernel, SimulationKernelArgs, SimulationKernelPtx,
 };
 
 use crate::{
diff --git a/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml b/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml
@@ -16,4 +16,4 @@ necsim-core-bond = { path = "../../../../necsim/core/bond" }
 necsim-impls-no-std = { path = "../../../../necsim/impls/no-std", features = ["cuda"] }
 necsim-impls-cuda = { path = "../../../../necsim/impls/cuda" }
 
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "7a41652", features = ["derive"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "626cd48", features = ["derive"] }
diff --git a/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs b/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs
@@ -31,26 +31,28 @@ use necsim_impls_no_std::cogs::{
     event_sampler::tracking::{MinSpeciationTrackingEventSampler, SpeciationSample},
 };
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::{common::RustToCuda, safety::NoAliasing};
 
 #[rust_cuda::common::kernel(
-    pub use link_kernel! as impl SimulatableKernel<SimulationKernelArgs> for SimulationKernel
+    pub use link_kernel! as impl SimulatableKernel<
+        SimulationKernelArgs, SimulationKernelPtx,
+    > for SimulationKernel
 )]
 #[allow(clippy::too_many_arguments)]
 #[allow(clippy::type_complexity)]
 pub fn simulate<
     M: MathsCore,
-    H: Habitat<M> + RustToCuda,
-    G: Rng<M, Generator: PrimeableRng> + RustToCuda,
-    S: LineageStore<M, H> + RustToCuda,
-    X: EmigrationExit<M, H, G, S> + RustToCuda,
-    D: DispersalSampler<M, H, G> + RustToCuda,
-    C: CoalescenceSampler<M, H, S> + RustToCuda,
-    T: TurnoverRate<M, H> + RustToCuda,
-    N: SpeciationProbability<M, H> + RustToCuda,
-    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-    I: ImmigrationEntry<M> + RustToCuda,
-    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
+    H: Habitat<M> + RustToCuda + NoAliasing,
+    G: Rng<M, Generator: PrimeableRng> + RustToCuda + NoAliasing,
+    S: LineageStore<M, H> + RustToCuda + NoAliasing,
+    X: EmigrationExit<M, H, G, S> + RustToCuda + NoAliasing,
+    D: DispersalSampler<M, H, G> + RustToCuda + NoAliasing,
+    C: CoalescenceSampler<M, H, S> + RustToCuda + NoAliasing,
+    T: TurnoverRate<M, H> + RustToCuda + NoAliasing,
+    N: SpeciationProbability<M, H> + RustToCuda + NoAliasing,
+    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + NoAliasing,
+    I: ImmigrationEntry<M> + RustToCuda + NoAliasing,
+    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + NoAliasing,
     ReportSpeciation: Boolean,
     ReportDispersal: Boolean,
 >(
@@ -135,13 +137,17 @@ pub fn simulate<
 }
 
 // #[rust_cuda::common::kernel(
-//     pub use link_sort_kernel! as impl SortableKernel<SortKernelArgs> for
-// SortKernel )]
+//     pub use link_sort_kernel! as impl SortableKernel<
+//         SortKernelArgs, SortKernelPtx,
+//     > for SortKernel
+// )]
 // pub fn sort_events_step<ReportSpeciation: Boolean, ReportDispersal: Boolean>(
-//     #[kernel(pass = LendRustToCuda, jit)] event_buffer_reporter: &mut
-// ShallowCopy<
-//         necsim_impls_cuda::event_buffer::EventBuffer<ReportSpeciation,
-// ReportDispersal>,     >,
+//     #[kernel(pass = LendRustToCuda, jit)]
+//     event_buffer_reporter: &mut ShallowCopy<
+//         necsim_impls_cuda::event_buffer::EventBuffer<
+//             ReportSpeciation, ReportDispersal,
+//         >,
+//     >,
 //     #[kernel(pass = SafeDeviceCopy)] size: usize,
 //     #[kernel(pass = SafeDeviceCopy)] stride: usize,
 // ) {
@@ -152,7 +158,9 @@ pub fn simulate<
 // }
 
 #[rust_cuda::common::kernel(
-    pub use link_even_odd_sort_kernel! as impl EvenOddSortableKernel<EvenOddSortKernelArgs> for EvenOddSortKernel
+    pub use link_even_odd_sort_kernel! as impl EvenOddSortableKernel<
+        EvenOddSortKernelArgs, EvenOddSortKernelPtx,
+    > for EvenOddSortKernel
 )]
 pub fn even_odd_sort_events_step<ReportSpeciation: Boolean, ReportDispersal: Boolean>(
     #[kernel(pass = LendRustToCuda, jit)] event_buffer_reporter: &mut ShallowCopy<
@@ -168,7 +176,9 @@ pub fn even_odd_sort_events_step<ReportSpeciation: Boolean, ReportDispersal: Boo
 }
 
 #[rust_cuda::common::kernel(
-    pub use link_bitonic_global_sort_step_kernel! as impl BitonicGlobalSortSteppableKernel<BitonicGlobalSortStepKernelArgs> for BitonicGlobalSortStepKernel
+    pub use link_bitonic_global_sort_step_kernel! as impl BitonicGlobalSortSteppableKernel<
+        BitonicGlobalSortStepKernelArgs, BitonicGlobalSortStepKernelPtx,
+    > for BitonicGlobalSortStepKernel
 )]
 pub fn bitonic_global_sort_events_step<ReportSpeciation: Boolean, ReportDispersal: Boolean>(
     #[kernel(pass = LendRustToCuda, jit)] event_buffer_reporter: &mut ShallowCopy<
@@ -184,7 +194,9 @@ pub fn bitonic_global_sort_events_step<ReportSpeciation: Boolean, ReportDispersa
 }
 
 #[rust_cuda::common::kernel(
-    pub use link_bitonic_shared_sort_step_kernel! as impl BitonicSharedSortSteppableKernel<BitonicSharedSortStepKernelArgs> for BitonicSharedSortStepKernel
+    pub use link_bitonic_shared_sort_step_kernel! as impl BitonicSharedSortSteppableKernel<
+        BitonicSharedSortStepKernelArgs, BitonicSharedSortStepKernelPtx,
+    > for BitonicSharedSortStepKernel
 )]
 pub fn bitonic_shared_sort_events_step<ReportSpeciation: Boolean, ReportDispersal: Boolean>(
     #[kernel(pass = LendRustToCuda, jit)] event_buffer_reporter: &mut ShallowCopy<
@@ -199,7 +211,9 @@ pub fn bitonic_shared_sort_events_step<ReportSpeciation: Boolean, ReportDispersa
 }
 
 #[rust_cuda::common::kernel(
-    pub use link_bitonic_shared_sort_prep_kernel! as impl BitonicSharedSortPreparableKernel<BitonicSharedSortPrepKernelArgs> for BitonicSharedSortPrepKernel
+    pub use link_bitonic_shared_sort_prep_kernel! as impl BitonicSharedSortPreparableKernel<
+        BitonicSharedSortPrepKernelArgs, BitonicSharedSortPrepKernelPtx,
+    > for BitonicSharedSortPrepKernel
 )]
 pub fn bitonic_shared_sort_events_prep<ReportSpeciation: Boolean, ReportDispersal: Boolean>(
     #[kernel(pass = LendRustToCuda, jit)] event_buffer_reporter: &mut ShallowCopy<
@@ -215,10 +229,10 @@ pub fn bitonic_shared_sort_events_prep<ReportSpeciation: Boolean, ReportDispersa
 mod cuda_prelude {
     use core::arch::nvptx;
 
-    use rust_cuda::device::utils;
+    use rust_cuda::device::alloc::PTXAllocator;
 
     #[global_allocator]
-    static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator;
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
 
     #[cfg(not(debug_assertions))]
     #[panic_handler]