Skip to content

Commit 8c5ab96

Browse files
committed
(ml5717) Worked on event processing and simple launching
1 parent d11b121 commit 8c5ab96

File tree

9 files changed

+106
-64
lines changed

9 files changed

+106
-64
lines changed

necsim-core/src/event.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for Event<H, R> {
6565
time: self.time,
6666
lineage_reference: self.lineage_reference.clone(),
6767
r#type: self.r#type.clone(),
68-
marker: self.marker.clone(),
68+
marker: self.marker,
6969
}
7070
}
7171
}
@@ -83,7 +83,7 @@ impl<H: Habitat, R: LineageReference<H>> Clone for EventType<H, R> {
8383
origin: origin.clone(),
8484
target: target.clone(),
8585
coalescence: coalescence.clone(),
86-
marker: marker.clone(),
86+
marker: *marker,
8787
},
8888
}
8989
}

necsim-cuda/kernel/src/lib.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,16 +102,17 @@ unsafe fn simulate_generic<
102102
) {
103103
Simulation::with_borrow_from_rust_mut(simulation_ptr, |simulation| {
104104
EventBufferDevice::with_borrow_from_rust_mut(event_buffer_ptr, |event_buffer_reporter| {
105-
let (time, steps) = simulation.simulate_incremental(max_steps, event_buffer_reporter);
105+
/*let (time, steps) =*/
106+
simulation.simulate_incremental(max_steps, event_buffer_reporter);
106107

107-
if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
108+
/*if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
108109
println!(
109110
"index = {}, time = {:?}, steps = {}",
110111
utils::index(),
111112
F64(time),
112113
steps
113114
);
114-
}
115+
}*/
115116
})
116117
})
117118
}

necsim-cuda/src/lib.rs

Lines changed: 59 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use array2d::Array2D;
1313

1414
use rustacuda::context::Context as CudaContext;
1515
use rustacuda::function::Function;
16+
use rustacuda::module::Symbol;
1617
use rustacuda::prelude::*;
1718

1819
use rust_cuda::host::LendToCuda;
@@ -177,15 +178,23 @@ impl CudaSimulation {
177178
.active_lineage_sampler(active_lineage_sampler)
178179
.build();
179180

181+
// TODO: Need a way to tune these based on the available CUDA device or cmd args
180182
let cuda_block_size = rustacuda::function::BlockSize::xy(16, 16);
181-
let cuda_grid_size = rustacuda::function::GridSize::x({
183+
let cuda_grid_size = rustacuda::function::GridSize::xy(16, 16);
184+
185+
#[allow(clippy::cast_possible_truncation)]
186+
let cuda_grid_amount = {
182187
#[allow(clippy::cast_possible_truncation)]
183-
let total_individuals = simulation.lineage_store().get_number_total_lineages() as u32;
184-
let cuda_block_length = cuda_block_size.x * cuda_block_size.y * cuda_block_size.z;
188+
let total_individuals = simulation.lineage_store().get_number_total_lineages();
189+
190+
let cuda_block_size =
191+
(cuda_block_size.x * cuda_block_size.y * cuda_block_size.z) as usize;
192+
let cuda_grid_size = (cuda_grid_size.x * cuda_grid_size.y * cuda_grid_size.z) as usize;
185193

186-
(total_individuals / cuda_block_length)
187-
+ (total_individuals % cuda_block_length > 0) as u32
188-
});
194+
let cuda_task_size = cuda_block_size * cuda_grid_size;
195+
196+
(total_individuals / cuda_task_size) + (total_individuals % cuda_task_size > 0) as usize
197+
} as u32;
189198

190199
let module_data = CString::new(include_str!(env!("KERNEL_PTX_PATH"))).unwrap();
191200

@@ -199,6 +208,9 @@ impl CudaSimulation {
199208
with_cuda!(CudaContext::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)? => |context: CudaContext| {
200209
// Load the module containing the kernel function
201210
with_cuda!(Module::load_from_string(&module_data)? => |module: Module| {
211+
// Load and initialise the grid_id symbol from the module
212+
let mut grid_id_symbol: Symbol<u32> = module.get_global(&CString::new("grid_id").unwrap())?;
213+
grid_id_symbol.copy_from(&0_u32)?;
202214
// Load the kernel function from the module
203215
let simulate_kernel = module.get_function(&CString::new("simulate").unwrap())?;
204216
// Create a stream to submit work to
@@ -214,30 +226,54 @@ impl CudaSimulation {
214226
let mut event_buffer: EventBufferHost<InMemoryHabitat, InMemoryLineageReference> =
215227
EventBufferHost::new(&cuda_block_size, &cuda_grid_size, SIMULATION_STEP_SLICE)?;
216228

229+
let mut remaining_individuals = simulation.lineage_store().get_number_total_lineages();
230+
231+
// TODO: We should use async launches and callbacks to rotate between simulation, event analysis etc.
217232
if let Err(err) = simulation.lend_to_cuda_mut(|simulation_mut_ptr| {
218-
let block_index_range = 0..(cuda_grid_size.x * cuda_grid_size.y * cuda_grid_size.z);
219-
220-
// Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
221-
// as a foreign-function call. In this case, it is - this kernel is written in CUDA C.
222-
unsafe {
223-
launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
224-
simulation_mut_ptr,
225-
event_buffer.get_mut_cuda_ptr(),
226-
SIMULATION_STEP_SLICE
227-
))?;
228-
}
233+
let mut time_slice = 0;
234+
235+
while remaining_individuals > 0 {
236+
println!("Starting time slice {} with {} remaining individuals ...", time_slice + 1, remaining_individuals);
237+
238+
for grid_id in 0..cuda_grid_amount {
239+
grid_id_symbol.copy_from(&grid_id)?;
240+
241+
let cuda_grid_size = cuda_grid_size.clone();
242+
let cuda_block_size = cuda_block_size.clone();
243+
244+
println!("Launching grid {}/{} of time slice {} ...", grid_id + 1, cuda_grid_amount, time_slice + 1);
245+
246+
// Launching kernels is unsafe since Rust cannot enforce safety across
247+
// the foreign function CUDA-C language barrier
248+
unsafe {
249+
launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
250+
simulation_mut_ptr,
251+
event_buffer.get_mut_cuda_ptr(),
252+
SIMULATION_STEP_SLICE
253+
))?;
254+
}
255+
256+
println!("Synchronising ...");
257+
258+
stream.synchronize()?;
259+
260+
println!("Analysing events ...");
229261

230-
stream.synchronize()?;
262+
event_buffer.with_fetched_events(|events| {
263+
events.inspect(|event| {
264+
if let necsim_core::event::EventType::Speciation = event.r#type() {
265+
remaining_individuals -= 1;
266+
}
267+
}).for_each(|event| reporter.report_event(&event))
268+
})?
269+
}
231270

232-
for block_index in block_index_range {
233-
event_buffer.with_fetched_events_for_block(block_index as usize, |events| {
234-
events.iter().for_each(|event| reporter.report_event(event))
235-
})?
271+
time_slice += 1;
236272
}
237273

238274
Ok(())
239275
}) {
240-
eprintln!("Running kernel failed with {:#?}!", err);
276+
eprintln!("\nRunning kernel failed with {:#?}!\n", err);
241277
}
242278

243279
});});});

necsim-impls-cuda/src/event_buffer/device.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferDe
4646
let raw_slice: &mut [Option<Event<H, R>>] =
4747
core::slice::from_raw_parts_mut(cuda_repr_ref.device_buffer.as_raw_mut(), buffer_len);
4848

49-
let (_before_raw_slice, rest_raw_slice) =
50-
raw_slice.split_at_mut(rust_cuda::device::utils::index() * cuda_repr_ref.max_events);
49+
let (_before_raw_slice, rest_raw_slice) = raw_slice
50+
.split_at_mut(rust_cuda::device::utils::index_no_offset() * cuda_repr_ref.max_events);
5151
let (individual_raw_slice, _after_raw_slice) =
5252
rest_raw_slice.split_at_mut(cuda_repr_ref.max_events);
5353

necsim-impls-cuda/src/event_buffer/host.rs

Lines changed: 15 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use alloc::vec::Vec;
1+
use core::ops::DerefMut;
22

33
use rustacuda::error::CudaResult;
44
use rustacuda::function::{BlockSize, GridSize};
@@ -15,14 +15,18 @@ use necsim_core::event::Event;
1515

1616
#[allow(clippy::module_name_repetitions)]
1717
pub struct EventBufferHost<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> {
18-
block_size: usize,
19-
grid_size: usize,
20-
max_events: usize,
2118
host_buffer: CudaDropWrapper<LockedBuffer<Option<Event<H, R>>>>,
2219
device_buffer: CudaDropWrapper<DeviceBuffer<Option<Event<H, R>>>>,
2320
cuda_repr_box: CudaDropWrapper<DeviceBox<super::common::EventBufferCudaRepresentation<H, R>>>,
2421
}
2522

23+
pub type EventIterator<'e, H, R> = core::iter::FilterMap<
24+
core::slice::IterMut<'e, Option<necsim_core::event::Event<H, R>>>,
25+
for<'r> fn(
26+
&'r mut Option<necsim_core::event::Event<H, R>>,
27+
) -> Option<necsim_core::event::Event<H, R>>,
28+
>;
29+
2630
impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferHost<H, R> {
2731
/// # Errors
2832
/// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
@@ -49,44 +53,24 @@ impl<H: Habitat + RustToCuda, R: LineageReference<H> + DeviceCopy> EventBufferHo
4953
let cuda_repr_box = CudaDropWrapper::from(DeviceBox::new(&cuda_repr)?);
5054

5155
Ok(Self {
52-
block_size,
53-
grid_size,
54-
max_events,
5556
host_buffer,
5657
device_buffer,
5758
cuda_repr_box,
5859
})
5960
}
6061

61-
#[debug_requires(block_index < self.grid_size, "block_index is in range")]
62-
pub fn with_fetched_events_for_block<O, F: FnOnce(Vec<Event<H, R>>) -> O>(
62+
/// # Errors
63+
/// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
64+
pub fn with_fetched_events<O, F: FnOnce(EventIterator<'_, H, R>) -> O>(
6365
&mut self,
64-
block_index: usize,
6566
inner: F,
6667
) -> CudaResult<O> {
67-
let full_host_buffer = self.host_buffer.as_mut_slice();
68-
let (_before_host_buffer, rest_host_buffer) =
69-
full_host_buffer.split_at_mut(block_index * self.block_size * self.max_events);
70-
let (block_host_buffer, _after_host_buffer) =
71-
rest_host_buffer.split_at_mut(self.block_size * self.max_events);
72-
73-
let full_device_buffer = &mut self.device_buffer;
74-
let (_before_device_buffer, rest_device_buffer) =
75-
full_device_buffer.split_at_mut(block_index * self.block_size * self.max_events);
76-
let (block_device_buffer, _after_device_buffer) =
77-
rest_device_buffer.split_at_mut(self.block_size * self.max_events);
78-
79-
block_device_buffer.copy_to(block_host_buffer)?;
68+
self.device_buffer.copy_to(self.host_buffer.deref_mut())?;
8069

8170
// Collect the events and reset the buffer slice to all None's
82-
let result = inner(
83-
block_host_buffer
84-
.iter_mut()
85-
.filter_map(Option::take)
86-
.collect::<Vec<Event<H, R>>>(),
87-
);
88-
89-
block_device_buffer.copy_from(block_host_buffer)?;
71+
let result = inner(self.host_buffer.iter_mut().filter_map(Option::take));
72+
73+
self.device_buffer.copy_from(self.host_buffer.deref_mut())?;
9074

9175
Ok(result)
9276
}

necsim-impls-cuda/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
extern crate alloc;
66

7-
#[macro_use]
7+
#[cfg_attr(target_os = "cuda", macro_use)]
88
extern crate contracts;
99

1010
#[macro_use]

necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@ impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {
2626
#[cfg(target_os = "cuda")]
2727
fn from_cuda_thread_idx() -> Self {
2828
#[allow(clippy::cast_sign_loss)]
29-
Self::from(rust_cuda::device::utils::index() as usize)
29+
Self::from(rust_cuda::device::utils::index())
3030
}
3131
}

rust-cuda/src/device/nvptx.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,17 @@ extern "C" {
4545
fn thread_idx_z() -> u32;
4646
}
4747

48+
extern "C" {
49+
#[no_mangle]
50+
static grid_id: u32;
51+
}
52+
53+
#[must_use]
54+
#[inline]
55+
pub unsafe fn _grid_id() -> u32 {
56+
grid_id
57+
}
58+
4859
/// Calculate the base e logarithm of the input argument x.
4960
#[must_use]
5061
#[inline]

rust-cuda/src/device/utils.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,19 @@ impl Idx3 {
203203
}
204204

205205
#[must_use]
206-
pub fn index() -> usize {
206+
pub fn grid_id() -> u32 {
207+
unsafe { nvptx::_grid_id() }
208+
}
209+
210+
#[must_use]
211+
pub fn index_no_offset() -> usize {
207212
let block_id = block_idx().as_id(&grid_dim());
208213
let thread_id = thread_idx().as_id(&block_dim());
209214

210215
(block_id * block_dim().size() + thread_id) as usize
211216
}
217+
218+
#[must_use]
219+
pub fn index() -> usize {
220+
(grid_id() * grid_dim().size() * block_dim().size()) as usize + index_no_offset()
221+
}

0 commit comments

Comments
 (0)