@@ -59,35 +59,48 @@ pub trait AlignedToU64: sealed::AlignedToU64 {}
 impl<T: sealed::AlignedToU64> AlignedToU64 for T {}
 
 pub trait EventType {
-    type Event: ~const rust_cuda::const_type_layout::TypeGraphLayout
+    type Event: 'static
+        + ~const rust_cuda::const_type_layout::TypeGraphLayout
         + rust_cuda::safety::StackOnly
         + Into<TypedEvent>
         + Into<PackedEvent>
         + Ord
         + Clone
         + AlignedToU64;
+
+    const SHARED_LIMIT: usize;
 }
 
 impl<ReportSpeciation: Boolean, ReportDispersal: Boolean> EventType
     for EventBuffer<ReportSpeciation, ReportDispersal>
 {
     default type Event = PackedEvent;
+
+    default const SHARED_LIMIT: usize = 0;
 }
 
 impl EventType for EventBuffer<False, False> {
     type Event = PackedEvent;
+
+    const SHARED_LIMIT: usize = ((48 * 1024 / core::mem::size_of::<Self::Event>()) / 32) * 32;
 }
 
 impl EventType for EventBuffer<False, True> {
     type Event = PackedEvent;
+
+    const SHARED_LIMIT: usize = ((48 * 1024 / core::mem::size_of::<Self::Event>()) / 32) * 32;
 }
 
 impl EventType for EventBuffer<True, False> {
     type Event = SpeciationEvent;
+
+    const SHARED_LIMIT: usize = ((48 * 1024 / core::mem::size_of::<Self::Event>()) / 32) * 32;
 }
 
 impl EventType for EventBuffer<True, True> {
     type Event = PackedEvent;
+
+    const SHARED_LIMIT: usize = ((48 * 1024 / core::mem::size_of::<Self::Event>()) / 32) * 32;
 }
 
 impl<ReportSpeciation: Boolean, ReportDispersal: Boolean> fmt::Debug
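The new `SHARED_LIMIT` constants budget the default 48 KiB of per-block CUDA shared memory as one event per slot, rounded down to a multiple of 32 so each warp owns a full slice of slots. A minimal sketch of that arithmetic, assuming an illustrative 48-byte event size that is not taken from this diff:

```rust
// Sketch of the SHARED_LIMIT arithmetic; the 48-byte event size is only an
// illustrative assumption, not a value from this crate.
const SHARED_MEMORY_BYTES: usize = 48 * 1024;

const fn shared_limit(event_size: usize) -> usize {
    // events that fit into the shared memory budget, rounded down to a
    // multiple of 32 (one warp's worth of slots)
    ((SHARED_MEMORY_BYTES / event_size) / 32) * 32
}

// with a hypothetical 48-byte packed event, exactly 1024 slots fit
const _: () = assert!(shared_limit(48) == 1024);
```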
@@ -212,19 +225,238 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
 impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
     EventBuffer<ReportSpeciation, ReportDispersal>
 {
+    /// Bitonic sort combined merge step for shared memory, based on
+    /// <https://github.com/NVIDIA/cuda-samples/blob/81992093d2b8c33cab22dbf6852c070c330f1715/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu#L179-L220>
+    ///
+    /// # Safety
+    ///
+    /// All CUDA threads must call this method with the same size argument.
+    /// Only one call per kernel launch is safe without further synchronisation.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the thread block size does not equal half of `<Self as EventType>::SHARED_LIMIT`.
+    pub unsafe fn bitonic_sort_events_shared_step(&mut self, size: usize) where [(); <Self as EventType>::SHARED_LIMIT]: {
+        use core::cmp::Ordering;
+
+        let block_dim = rust_cuda::device::utils::block_dim();
+
+        rust_cuda::assert_eq!(block_dim.size() * 2, <Self as EventType>::SHARED_LIMIT);
+
+        let block_idx = rust_cuda::device::utils::block_idx().as_id(&rust_cuda::device::utils::grid_dim());
+        let thread_idx = rust_cuda::device::utils::thread_idx().as_id(&block_dim);
+
+        let idx = block_idx * <Self as EventType>::SHARED_LIMIT + thread_idx;
+
+        let shared_mask: rust_cuda::device::ThreadBlockShared<
+            [bool; <Self as EventType>::SHARED_LIMIT]
+        > = rust_cuda::device::ThreadBlockShared::new_uninit();
+        let shared_mask_array: *mut bool = shared_mask.get().cast();
+        let shared_buffer: rust_cuda::device::ThreadBlockShared<
+            [MaybeSome<<Self as EventType>::Event>; <Self as EventType>::SHARED_LIMIT]
+        > = rust_cuda::device::ThreadBlockShared::new_uninit();
+        let shared_buffer_array: *mut MaybeSome<<Self as EventType>::Event> = shared_buffer.get().cast();
+
+        *shared_mask_array.add(thread_idx) = match self.event_mask.alias_unchecked().get(idx) {
+            None => false,
+            Some(mask) => *mask.read(),
+        };
+        *shared_buffer_array.add(thread_idx) = match self.event_buffer.alias_unchecked().get(idx) {
+            None => MaybeSome::None,
+            Some(event) => event.as_uninit().assume_init_read(),
+        };
+        *shared_mask_array.add(thread_idx + (<Self as EventType>::SHARED_LIMIT / 2)) = match self.event_mask.alias_unchecked().get(idx + (<Self as EventType>::SHARED_LIMIT / 2)) {
+            None => false,
+            Some(mask) => *mask.read(),
+        };
+        *shared_buffer_array.add(thread_idx + (<Self as EventType>::SHARED_LIMIT / 2)) = match self.event_buffer.alias_unchecked().get(idx + (<Self as EventType>::SHARED_LIMIT / 2)) {
+            None => MaybeSome::None,
+            Some(event) => event.as_uninit().assume_init_read(),
+        };
+
+        let pos = (block_idx * block_dim.size() + thread_idx) & ((self.event_mask.alias_unchecked().len().next_power_of_two() / 2) - 1);
+        let dir = if (pos & (size / 2)) == 0 {
+            Ordering::Greater
+        } else {
+            Ordering::Less
+        };
+
+        let mut stride = <Self as EventType>::SHARED_LIMIT >> 1;
+
+        while stride > 0 {
+            ::core::arch::nvptx::_syncthreads();
+
+            let pos_a = 2 * thread_idx - (thread_idx & (stride - 1));
+            let pos_b = pos_a + stride;
+
+            let mask_a: bool = *shared_mask_array.add(pos_a);
+            let mask_b: bool = *shared_mask_array.add(pos_b);
+
+            let cmp = match (mask_a, mask_b) {
+                (false, false) => Ordering::Equal,
+                (false, true) => Ordering::Greater,
+                (true, false) => Ordering::Less,
+                (true, true) => {
+                    // Safety: both masks indicate that the two events exist
+                    let event_a: &<Self as EventType>::Event = unsafe {
+                        (*shared_buffer_array.add(pos_a)).assume_some_ref()
+                    };
+                    let event_b: &<Self as EventType>::Event = unsafe {
+                        (*shared_buffer_array.add(pos_b)).assume_some_ref()
+                    };
+
+                    event_a.cmp(event_b)
+                },
+            };
+
+            if cmp == dir {
+                *shared_mask_array.add(pos_a) = mask_b;
+                *shared_mask_array.add(pos_b) = mask_a;
+
+                let ptr_a: *mut u64 = shared_buffer_array.add(pos_a).cast();
+                let ptr_b: *mut u64 = shared_buffer_array.add(pos_b).cast();
+
+                // Manual swap implementation that can be unrolled without local memory
+                // Safety: AlignedToU64 guarantees that both events are aligned to u64
+                // and can be copied as multiples of u64
+                for i in 0..(core::mem::size_of::<<Self as EventType>::Event>() / 8) {
+                    let swap = *ptr_a.add(i);
+                    *ptr_a.add(i) = *ptr_b.add(i);
+                    *ptr_b.add(i) = swap;
+                }
+            }
+
+            stride >>= 1;
+        }
+
+        ::core::arch::nvptx::_syncthreads();
+
+        if let Some(mask) = self.event_mask.alias_mut_unchecked().get_mut(idx) {
+            mask.write(*shared_mask_array.add(thread_idx));
+        }
+        if let Some(event) = self.event_buffer.alias_mut_unchecked().get_mut(idx) {
+            event.write(core::ptr::read(shared_buffer_array.add(thread_idx)));
+        }
+        if let Some(mask) = self.event_mask.alias_mut_unchecked().get_mut(idx + (<Self as EventType>::SHARED_LIMIT / 2)) {
+            mask.write(*shared_mask_array.add(thread_idx + (<Self as EventType>::SHARED_LIMIT / 2)));
+        }
+        if let Some(event) = self.event_buffer.alias_mut_unchecked().get_mut(idx + (<Self as EventType>::SHARED_LIMIT / 2)) {
+            event.write(core::ptr::read(shared_buffer_array.add(thread_idx + (<Self as EventType>::SHARED_LIMIT / 2))));
+        }
+    }
+
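The `pos_a`/`pos_b` mapping in the stride loop above assigns one comparator per thread so that every shared-memory slot is touched exactly once per stride. A small host-side sketch of that mapping, for illustration only and not code from this crate:

```rust
/// Comparator index mapping used by the bitonic merge steps: for a given
/// stride, thread `t` compares and possibly swaps the elements at `pos_a`
/// and `pos_a + stride` (illustration only).
fn comparator(t: usize, stride: usize) -> (usize, usize) {
    let pos_a = 2 * t - (t & (stride - 1));
    (pos_a, pos_a + stride)
}

fn main() {
    // with stride 2, four threads cover the eight slots 0..8 exactly once
    assert_eq!(comparator(0, 2), (0, 2));
    assert_eq!(comparator(1, 2), (1, 3));
    assert_eq!(comparator(2, 2), (4, 6));
    assert_eq!(comparator(3, 2), (5, 7));
}
```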
+    /// Bitonic sort single merge step for global memory, based on
+    /// <https://github.com/NVIDIA/cuda-samples/blob/81992093d2b8c33cab22dbf6852c070c330f1715/Samples/2_Concepts_and_Techniques/sortingNetworks/bitonicSort.cu#L154-L177>
+    ///
+    /// # Safety
+    ///
+    /// All CUDA threads must call this method with the same size and stride arguments.
+    /// Only one call per kernel launch is safe without further synchronisation.
+    pub unsafe fn bitonic_sort_events_step(&mut self, size: usize, stride: usize) {
+        use core::cmp::Ordering;
+
+        let idx = rust_cuda::device::utils::index();
+
+        let pos = idx & ((self.event_mask.alias_unchecked().len().next_power_of_two() / 2) - 1);
+
+        let dir = if (pos & (size / 2)) == 0 {
+            Ordering::Greater
+        } else {
+            Ordering::Less
+        };
+
+        let pos_a = 2 * idx - (idx & (stride - 1));
+        let pos_b = pos_a + stride;
+
+        if (pos_a < self.event_mask.alias_unchecked().len())
+            && (pos_b < self.event_mask.alias_unchecked().len())
+        {
+            let mask_a: bool = *self
+                .event_mask
+                .alias_unchecked()
+                .get_unchecked(pos_a)
+                .read();
+            let mask_b: bool = *self
+                .event_mask
+                .alias_unchecked()
+                .get_unchecked(pos_b)
+                .read();
+
+            let cmp = match (mask_a, mask_b) {
+                (false, false) => Ordering::Equal,
+                (false, true) => Ordering::Greater,
+                (true, false) => Ordering::Less,
+                (true, true) => {
+                    // Safety: both masks indicate that the two events exist
+                    let event_a: &<Self as EventType>::Event = unsafe {
+                        self.event_buffer
+                            .alias_unchecked()
+                            .get_unchecked(pos_a)
+                            .as_uninit()
+                            .assume_init_ref()
+                            .assume_some_ref()
+                    };
+                    let event_b: &<Self as EventType>::Event = unsafe {
+                        self.event_buffer
+                            .alias_unchecked()
+                            .get_unchecked(pos_b)
+                            .as_uninit()
+                            .assume_init_ref()
+                            .assume_some_ref()
+                    };
+
+                    event_a.cmp(event_b)
+                },
+            };
+
+            if cmp == dir {
+                self.event_mask
+                    .alias_mut_unchecked()
+                    .get_unchecked_mut(pos_a)
+                    .write(mask_b);
+                self.event_mask
+                    .alias_mut_unchecked()
+                    .get_unchecked_mut(pos_b)
+                    .write(mask_a);
+
+                let ptr_a: *mut u64 = self
+                    .event_buffer
+                    .alias_mut_unchecked()
+                    .as_mut_ptr()
+                    .add(pos_a)
+                    .cast();
+                let ptr_b: *mut u64 = self
+                    .event_buffer
+                    .alias_mut_unchecked()
+                    .as_mut_ptr()
+                    .add(pos_b)
+                    .cast();
+
+                // Manual swap implementation that can be unrolled without local memory
+                // Safety: AlignedToU64 guarantees that both events are aligned to u64
+                // and can be copied as multiples of u64
+                for i in 0..(core::mem::size_of::<<Self as EventType>::Event>() / 8) {
+                    let swap = *ptr_a.add(i);
+                    *ptr_a.add(i) = *ptr_b.add(i);
+                    *ptr_b.add(i) = swap;
+                }
+            }
+        }
+    }
+
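For reference, the compare-exchange that `bitonic_sort_events_step` performs on global memory can be modelled on the host as a plain loop over comparators, with a missing event (mask `false`) treated as greater than any present event so that empty slots sink to the end of ascending runs. A sketch of that model, for illustration only, where `Option<T>` stands in for the mask/`MaybeSome` pair:

```rust
use core::cmp::Ordering;

/// Host-side model of one global-memory bitonic merge step over an array of
/// optional events (illustration only, not the device code).
fn bitonic_merge_step<T: Ord>(data: &mut [Option<T>], size: usize, stride: usize) {
    let half = data.len().next_power_of_two() / 2;

    for idx in 0..half {
        // ascending for the first half of each `size`-element bitonic block,
        // descending for the second half
        let dir = if (idx & (size / 2)) == 0 {
            Ordering::Greater
        } else {
            Ordering::Less
        };

        // comparator index mapping: each iteration owns one pair of slots
        let pos_a = 2 * idx - (idx & (stride - 1));
        let pos_b = pos_a + stride;

        if pos_b >= data.len() {
            continue;
        }

        // `None` (no event) compares as greater than any event
        let cmp = match (&data[pos_a], &data[pos_b]) {
            (None, None) => Ordering::Equal,
            (None, Some(_)) => Ordering::Greater,
            (Some(_), None) => Ordering::Less,
            (Some(a), Some(b)) => a.cmp(b),
        };

        if cmp == dir {
            data.swap(pos_a, pos_b);
        }
    }
}
```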
     #[allow(clippy::too_many_lines)]
+    /// Odd-Even sort single merge step for global memory, based on
+    /// <https://github.com/NVIDIA/cuda-samples/blob/81992093d2b8c33cab22dbf6852c070c330f1715/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu#L95-L137>
+    ///
     /// # Safety
     ///
-    /// All CUDA threads must call this method with the same size, stride, and
-    /// direction arguments. Only one call per kernel launch is safe without
-    /// further synchronisation.
-    pub unsafe fn sort_events_step(&mut self, size: usize, stride: usize) {
+    /// All CUDA threads must call this method with the same size and stride arguments.
+    /// Only one call per kernel launch is safe without further synchronisation.
+    pub unsafe fn odd_even_sort_events_step(&mut self, size: usize, stride: usize) {
         use core::cmp::Ordering;
 
         let idx = rust_cuda::device::utils::index();
 
-        // Odd-Even merge based on
-        // https://github.com/NVIDIA/cuda-samples/blob/81992093d2b8c33cab22dbf6852c070c330f1715/Samples/2_Concepts_and_Techniques/sortingNetworks/oddEvenMergeSort.cu#L95-L137
         let pos = 2 * idx - (idx & (stride - 1));
 
         let (pos_a, pos_b) = if stride < (size / 2) {
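The merge steps above are building blocks of a sort driven from the host across kernel launches. Following the structure of the NVIDIA sortingNetworks sample referenced in the doc comments, a host-side driver might sequence the two bitonic kernels as in the sketch below. This is an assumption about the host side, not code from this crate: it assumes `len` is a power of two no smaller than `SHARED_LIMIT` and that `SHARED_LIMIT`-sized chunks have already been pre-sorted into alternating ascending/descending runs by a separate shared-memory sort that is not part of this diff; the closures stand in for the corresponding kernel launches. The odd-even variant, `odd_even_sort_events_step`, would be driven analogously but with the different (size, stride) schedule of the odd-even merge network from the referenced sample.

```rust
/// Host-side driver sketch for the bitonic merge kernels (assumptions stated
/// in the lead-in: power-of-two `len`, `len >= shared_limit`, pre-sorted
/// `shared_limit`-sized chunks). Illustration only.
fn drive_bitonic_merges(
    len: usize,
    shared_limit: usize,
    mut launch_global_step: impl FnMut(usize, usize),
    mut launch_shared_step: impl FnMut(usize),
) {
    let mut size = 2 * shared_limit;
    while size <= len {
        let mut stride = size / 2;
        while stride > 0 {
            if stride >= shared_limit {
                // one global-memory compare-exchange pass per kernel launch
                launch_global_step(size, stride);
                stride /= 2;
            } else {
                // a single launch of the combined shared-memory step finishes
                // all remaining strides (shared_limit / 2 down to 1) for this size
                launch_shared_step(size);
                break;
            }
        }
        size *= 2;
    }
}
```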