(ml5717) Worked on event processing and simple launching

juntyr · juntyr · commit 8c5ab9645ae0 · 2020-11-10T22:11:24.000Z
diff --git a/necsim-core/src/event.rs b/necsim-core/src/event.rs
@@ -65,7 +65,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for Event<H, R> {
             time: self.time,
             lineage_reference: self.lineage_reference.clone(),
             r#type: self.r#type.clone(),
-            marker: self.marker.clone(),
+            marker: self.marker,
         }
     }
 }
@@ -83,7 +83,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for EventType<H, R> {
                 origin: origin.clone(),
                 target: target.clone(),
                 coalescence: coalescence.clone(),
-                marker: marker.clone(),
+                marker: *marker,
             },
         }
     }
diff --git a/necsim-cuda/kernel/src/lib.rs b/necsim-cuda/kernel/src/lib.rs
@@ -102,16 +102,17 @@ unsafe fn simulate_generic<
 ) {
     Simulation::with_borrow_from_rust_mut(simulation_ptr, |simulation| {
         EventBufferDevice::with_borrow_from_rust_mut(event_buffer_ptr, |event_buffer_reporter| {
-            let (time, steps) = simulation.simulate_incremental(max_steps, event_buffer_reporter);
+            /*let (time, steps) =*/
+            simulation.simulate_incremental(max_steps, event_buffer_reporter);
 
-            if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
+            /*if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
                 println!(
                     "index = {}, time = {:?}, steps = {}",
                     utils::index(),
                     F64(time),
                     steps
                 );
-            }
+            }*/
         })
     })
 }
diff --git a/necsim-cuda/src/lib.rs b/necsim-cuda/src/lib.rs
@@ -13,6 +13,7 @@ use array2d::Array2D;
 
 use rustacuda::context::Context as CudaContext;
 use rustacuda::function::Function;
+use rustacuda::module::Symbol;
 use rustacuda::prelude::*;
 
 use rust_cuda::host::LendToCuda;
@@ -177,15 +178,23 @@ impl CudaSimulation {
             .active_lineage_sampler(active_lineage_sampler)
             .build();
 
+        // TODO: Need a way to tune these based on the available CUDA device or cmd args
         let cuda_block_size = rustacuda::function::BlockSize::xy(16, 16);
-        let cuda_grid_size = rustacuda::function::GridSize::x({
+        let cuda_grid_size = rustacuda::function::GridSize::xy(16, 16);
+
+        #[allow(clippy::cast_possible_truncation)]
+        let cuda_grid_amount = {
             #[allow(clippy::cast_possible_truncation)]
-            let total_individuals = simulation.lineage_store().get_number_total_lineages() as u32;
-            let cuda_block_length = cuda_block_size.x * cuda_block_size.y * cuda_block_size.z;
+            let total_individuals = simulation.lineage_store().get_number_total_lineages();
+
+            let cuda_block_size =
+                (cuda_block_size.x * cuda_block_size.y * cuda_block_size.z) as usize;
+            let cuda_grid_size = (cuda_grid_size.x * cuda_grid_size.y * cuda_grid_size.z) as usize;
 
-            (total_individuals / cuda_block_length)
-                + (total_individuals % cuda_block_length > 0) as u32
-        });
+            let cuda_task_size = cuda_block_size * cuda_grid_size;
+
+            (total_individuals / cuda_task_size) + (total_individuals % cuda_task_size > 0) as usize
+        } as u32;
 
         let module_data = CString::new(include_str!(env!("KERNEL_PTX_PATH"))).unwrap();
 
@@ -199,6 +208,9 @@ impl CudaSimulation {
         with_cuda!(CudaContext::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)? => |context: CudaContext| {
         // Load the module containing the kernel function
         with_cuda!(Module::load_from_string(&module_data)? => |module: Module| {
+        // Load and initialise the grid_id symbol from the module
+        let mut grid_id_symbol: Symbol<u32>  = module.get_global(&CString::new("grid_id").unwrap())?;
+        grid_id_symbol.copy_from(&0_u32)?;
         // Load the kernel function from the module
         let simulate_kernel = module.get_function(&CString::new("simulate").unwrap())?;
         // Create a stream to submit work to
@@ -214,30 +226,54 @@ impl CudaSimulation {
             let mut event_buffer: EventBufferHost<InMemoryHabitat, InMemoryLineageReference> =
                 EventBufferHost::new(&cuda_block_size, &cuda_grid_size, SIMULATION_STEP_SLICE)?;
 
+            let mut remaining_individuals = simulation.lineage_store().get_number_total_lineages();
+
+            // TODO: We should use async launches and callbacks to rotate between simulation, event analysis etc.
             if let Err(err) = simulation.lend_to_cuda_mut(|simulation_mut_ptr| {
-                let block_index_range = 0..(cuda_grid_size.x * cuda_grid_size.y * cuda_grid_size.z);
-
-                // Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
-                // as a foreign-function call. In this case, it is - this kernel is written in CUDA C.
-                unsafe {
-                    launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
-                        simulation_mut_ptr,
-                        event_buffer.get_mut_cuda_ptr(),
-                        SIMULATION_STEP_SLICE
-                    ))?;
-                }
+                let mut time_slice = 0;
+
+                while remaining_individuals > 0 {
+                    println!("Starting time slice {} with {} remaining individuals ...", time_slice + 1, remaining_individuals);
+
+                    for grid_id in 0..cuda_grid_amount {
+                        grid_id_symbol.copy_from(&grid_id)?;
+
+                        let cuda_grid_size = cuda_grid_size.clone();
+                        let cuda_block_size = cuda_block_size.clone();
+
+                        println!("Launching grid {}/{} of time slice {} ...", grid_id + 1, cuda_grid_amount, time_slice + 1);
+
+                        // Launching kernels is unsafe since Rust cannot enforce safety across
+                        // the foreign function CUDA-C language barrier
+                        unsafe {
+                            launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
+                                simulation_mut_ptr,
+                                event_buffer.get_mut_cuda_ptr(),
+                                SIMULATION_STEP_SLICE
+                            ))?;
+                        }
+
+                        println!("Synchronising ...");
+
+                        stream.synchronize()?;
+
+                        println!("Analysing events ...");
 
-                stream.synchronize()?;
+                        event_buffer.with_fetched_events(|events| {
+                            events.inspect(|event| {
+                                if let necsim_core::event::EventType::Speciation = event.r#type() {
+                                    remaining_individuals -= 1;
+                                }
+                            }).for_each(|event| reporter.report_event(&event))
+                        })?
+                    }
 
-                for block_index in block_index_range {
-                    event_buffer.with_fetched_events_for_block(block_index as usize, |events| {
-                        events.iter().for_each(|event| reporter.report_event(event))
-                    })?
+                    time_slice += 1;
                 }
 
                 Ok(())
             }) {
-                eprintln!("Running kernel failed with {:#?}!", err);
+                eprintln!("\nRunning kernel failed with {:#?}!\n", err);
             }
 
         });});});
diff --git a/necsim-impls-cuda/src/event_buffer/device.rs b/necsim-impls-cuda/src/event_buffer/device.rs
@@ -46,8 +46,8 @@ impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferDe
         let raw_slice: &mut [Option<Event<H, R>>] =
             core::slice::from_raw_parts_mut(cuda_repr_ref.device_buffer.as_raw_mut(), buffer_len);
 
-        let (_before_raw_slice, rest_raw_slice) =
-            raw_slice.split_at_mut(rust_cuda::device::utils::index() * cuda_repr_ref.max_events);
+        let (_before_raw_slice, rest_raw_slice) = raw_slice
+            .split_at_mut(rust_cuda::device::utils::index_no_offset() * cuda_repr_ref.max_events);
         let (individual_raw_slice, _after_raw_slice) =
             rest_raw_slice.split_at_mut(cuda_repr_ref.max_events);
 
diff --git a/necsim-impls-cuda/src/event_buffer/host.rs b/necsim-impls-cuda/src/event_buffer/host.rs
@@ -1,4 +1,4 @@
-use alloc::vec::Vec;
+use core::ops::DerefMut;
 
 use rustacuda::error::CudaResult;
 use rustacuda::function::{BlockSize, GridSize};
@@ -15,14 +15,18 @@ use necsim_core::event::Event;
 
 #[allow(clippy::module_name_repetitions)]
 pub struct EventBufferHost<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> {
-    block_size: usize,
-    grid_size: usize,
-    max_events: usize,
     host_buffer: CudaDropWrapper<LockedBuffer<Option<Event<H, R>>>>,
     device_buffer: CudaDropWrapper<DeviceBuffer<Option<Event<H, R>>>>,
     cuda_repr_box: CudaDropWrapper<DeviceBox<super::common::EventBufferCudaRepresentation<H, R>>>,
 }
 
+pub type EventIterator<'e, H, R> = core::iter::FilterMap<
+    core::slice::IterMut<'e, Option<necsim_core::event::Event<H, R>>>,
+    for<'r> fn(
+        &'r mut Option<necsim_core::event::Event<H, R>>,
+    ) -> Option<necsim_core::event::Event<H, R>>,
+>;
+
 impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferHost<H, R> {
     /// # Errors
     /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
@@ -49,44 +53,24 @@ impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferHo
         let cuda_repr_box = CudaDropWrapper::from(DeviceBox::new(&cuda_repr)?);
 
         Ok(Self {
-            block_size,
-            grid_size,
-            max_events,
             host_buffer,
             device_buffer,
             cuda_repr_box,
         })
     }
 
-    #[debug_requires(block_index < self.grid_size, "block_index is in range")]
-    pub fn with_fetched_events_for_block<O, F: FnOnce(Vec<Event<H, R>>) -> O>(
+    /// # Errors
+    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    pub fn with_fetched_events<O, F: FnOnce(EventIterator<'_, H, R>) -> O>(
         &mut self,
-        block_index: usize,
         inner: F,
     ) -> CudaResult<O> {
-        let full_host_buffer = self.host_buffer.as_mut_slice();
-        let (_before_host_buffer, rest_host_buffer) =
-            full_host_buffer.split_at_mut(block_index * self.block_size * self.max_events);
-        let (block_host_buffer, _after_host_buffer) =
-            rest_host_buffer.split_at_mut(self.block_size * self.max_events);
-
-        let full_device_buffer = &mut self.device_buffer;
-        let (_before_device_buffer, rest_device_buffer) =
-            full_device_buffer.split_at_mut(block_index * self.block_size * self.max_events);
-        let (block_device_buffer, _after_device_buffer) =
-            rest_device_buffer.split_at_mut(self.block_size * self.max_events);
-
-        block_device_buffer.copy_to(block_host_buffer)?;
+        self.device_buffer.copy_to(self.host_buffer.deref_mut())?;
 
         // Collect the events and reset the buffer slice to all None's
-        let result = inner(
-            block_host_buffer
-                .iter_mut()
-                .filter_map(Option::take)
-                .collect::<Vec<Event<H, R>>>(),
-        );
-
-        block_device_buffer.copy_from(block_host_buffer)?;
+        let result = inner(self.host_buffer.iter_mut().filter_map(Option::take));
+
+        self.device_buffer.copy_from(self.host_buffer.deref_mut())?;
 
         Ok(result)
     }
diff --git a/necsim-impls-cuda/src/lib.rs b/necsim-impls-cuda/src/lib.rs
@@ -4,7 +4,7 @@
 
 extern crate alloc;
 
-#[macro_use]
+#[cfg_attr(target_os = "cuda", macro_use)]
 extern crate contracts;
 
 #[macro_use]
diff --git a/necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs b/necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs
@@ -26,6 +26,6 @@ impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {
     #[cfg(target_os = "cuda")]
     fn from_cuda_thread_idx() -> Self {
         #[allow(clippy::cast_sign_loss)]
-        Self::from(rust_cuda::device::utils::index() as usize)
+        Self::from(rust_cuda::device::utils::index())
     }
 }
diff --git a/rust-cuda/src/device/nvptx.rs b/rust-cuda/src/device/nvptx.rs
@@ -45,6 +45,17 @@ extern "C" {
     fn thread_idx_z() -> u32;
 }
 
+extern "C" {
+    #[no_mangle]
+    static grid_id: u32;
+}
+
+#[must_use]
+#[inline]
+pub unsafe fn _grid_id() -> u32 {
+    grid_id
+}
+
 /// Calculate the base e logarithm of the input argument x.
 #[must_use]
 #[inline]
diff --git a/rust-cuda/src/device/utils.rs b/rust-cuda/src/device/utils.rs
@@ -203,9 +203,19 @@ impl Idx3 {
 }
 
 #[must_use]
-pub fn index() -> usize {
+pub fn grid_id() -> u32 {
+    unsafe { nvptx::_grid_id() }
+}
+
+#[must_use]
+pub fn index_no_offset() -> usize {
     let block_id = block_idx().as_id(&grid_dim());
     let thread_id = thread_idx().as_id(&block_dim());
 
     (block_id * block_dim().size() + thread_id) as usize
 }
+
+#[must_use]
+pub fn index() -> usize {
+    (grid_id() * grid_dim().size() * block_dim().size()) as usize + index_no_offset()
+}

Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for Event<H, R> {`
`65`	`65`	`time: self.time,`
`66`	`66`	`lineage_reference: self.lineage_reference.clone(),`
`67`	`67`	`r#type: self.r#type.clone(),`
`68`		`- marker: self.marker.clone(),`
	`68`	`+ marker: self.marker,`
`69`	`69`	`}`
`70`	`70`	`}`
`71`	`71`	`}`
`@@ -83,7 +83,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for EventType<H, R> {`
`83`	`83`	`origin: origin.clone(),`
`84`	`84`	`target: target.clone(),`
`85`	`85`	`coalescence: coalescence.clone(),`
`86`		`- marker: marker.clone(),`
	`86`	`+ marker: *marker,`
`87`	`87`	`},`
`88`	`88`	`}`
`89`	`89`	`}`
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,6 @@ impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {`
`26`	`26`	`#[cfg(target_os = "cuda")]`
`27`	`27`	`fn from_cuda_thread_idx() -> Self {`
`28`	`28`	`#[allow(clippy::cast_sign_loss)]`
`29`		`- Self::from(rust_cuda::device::utils::index() as usize)`
	`29`	`+ Self::from(rust_cuda::device::utils::index())`
`30`	`30`	`}`
`31`	`31`	`}`