@@ -13,6 +13,7 @@ use array2d::Array2D;
1313
1414use rustacuda:: context:: Context as CudaContext ;
1515use rustacuda:: function:: Function ;
16+ use rustacuda:: module:: Symbol ;
1617use rustacuda:: prelude:: * ;
1718
1819use rust_cuda:: host:: LendToCuda ;
@@ -177,15 +178,23 @@ impl CudaSimulation {
177178 . active_lineage_sampler ( active_lineage_sampler)
178179 . build ( ) ;
179180
181+ // TODO: Need a way to tune these based on the available CUDA device or cmd args
180182 let cuda_block_size = rustacuda:: function:: BlockSize :: xy ( 16 , 16 ) ;
181- let cuda_grid_size = rustacuda:: function:: GridSize :: x ( {
183+ let cuda_grid_size = rustacuda:: function:: GridSize :: xy ( 16 , 16 ) ;
184+
185+ #[ allow( clippy:: cast_possible_truncation) ]
186+ let cuda_grid_amount = {
182187 #[ allow( clippy:: cast_possible_truncation) ]
183- let total_individuals = simulation. lineage_store ( ) . get_number_total_lineages ( ) as u32 ;
184- let cuda_block_length = cuda_block_size. x * cuda_block_size. y * cuda_block_size. z ;
188+ let total_individuals = simulation. lineage_store ( ) . get_number_total_lineages ( ) ;
189+
190+ let cuda_block_size =
191+ ( cuda_block_size. x * cuda_block_size. y * cuda_block_size. z ) as usize ;
192+ let cuda_grid_size = ( cuda_grid_size. x * cuda_grid_size. y * cuda_grid_size. z ) as usize ;
185193
186- ( total_individuals / cuda_block_length)
187- + ( total_individuals % cuda_block_length > 0 ) as u32
188- } ) ;
194+ let cuda_task_size = cuda_block_size * cuda_grid_size;
195+
196+ ( total_individuals / cuda_task_size) + ( total_individuals % cuda_task_size > 0 ) as usize
197+ } as u32 ;
189198
190199 let module_data = CString :: new ( include_str ! ( env!( "KERNEL_PTX_PATH" ) ) ) . unwrap ( ) ;
191200
@@ -199,6 +208,9 @@ impl CudaSimulation {
199208 with_cuda ! ( CudaContext :: create_and_push( ContextFlags :: MAP_HOST | ContextFlags :: SCHED_AUTO , device) ? => |context: CudaContext | {
200209 // Load the module containing the kernel function
201210 with_cuda!( Module :: load_from_string( & module_data) ? => |module: Module | {
211+ // Load and initialise the grid_id symbol from the module
212+ let mut grid_id_symbol: Symbol <u32 > = module. get_global( & CString :: new( "grid_id" ) . unwrap( ) ) ?;
213+ grid_id_symbol. copy_from( & 0_u32 ) ?;
202214 // Load the kernel function from the module
203215 let simulate_kernel = module. get_function( & CString :: new( "simulate" ) . unwrap( ) ) ?;
204216 // Create a stream to submit work to
@@ -214,30 +226,54 @@ impl CudaSimulation {
214226 let mut event_buffer: EventBufferHost <InMemoryHabitat , InMemoryLineageReference > =
215227 EventBufferHost :: new( & cuda_block_size, & cuda_grid_size, SIMULATION_STEP_SLICE ) ?;
216228
229+ let mut remaining_individuals = simulation. lineage_store( ) . get_number_total_lineages( ) ;
230+
231+ // TODO: We should use async launches and callbacks to rotate between simulation, event analysis etc.
217232 if let Err ( err) = simulation. lend_to_cuda_mut( |simulation_mut_ptr| {
218- let block_index_range = 0 ..( cuda_grid_size. x * cuda_grid_size. y * cuda_grid_size. z) ;
219-
220- // Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
221- // as a foreign-function call. In this case, it is - this kernel is written in CUDA C.
222- unsafe {
223- launch!( simulate_kernel<<<cuda_grid_size, cuda_block_size, 0 , stream>>>(
224- simulation_mut_ptr,
225- event_buffer. get_mut_cuda_ptr( ) ,
226- SIMULATION_STEP_SLICE
227- ) ) ?;
228- }
233+ let mut time_slice = 0 ;
234+
235+ while remaining_individuals > 0 {
236+ println!( "Starting time slice {} with {} remaining individuals ..." , time_slice + 1 , remaining_individuals) ;
237+
238+ for grid_id in 0 ..cuda_grid_amount {
239+ grid_id_symbol. copy_from( & grid_id) ?;
240+
241+ let cuda_grid_size = cuda_grid_size. clone( ) ;
242+ let cuda_block_size = cuda_block_size. clone( ) ;
243+
244+ println!( "Launching grid {}/{} of time slice {} ..." , grid_id + 1 , cuda_grid_amount, time_slice + 1 ) ;
245+
246+ // Launching kernels is unsafe since Rust cannot enforce safety across
247+ // the foreign function CUDA-C language barrier
248+ unsafe {
249+ launch!( simulate_kernel<<<cuda_grid_size, cuda_block_size, 0 , stream>>>(
250+ simulation_mut_ptr,
251+ event_buffer. get_mut_cuda_ptr( ) ,
252+ SIMULATION_STEP_SLICE
253+ ) ) ?;
254+ }
255+
256+ println!( "Synchronising ..." ) ;
257+
258+ stream. synchronize( ) ?;
259+
260+ println!( "Analysing events ..." ) ;
229261
230- stream. synchronize( ) ?;
262+ event_buffer. with_fetched_events( |events| {
263+ events. inspect( |event| {
264+ if let necsim_core:: event:: EventType :: Speciation = event. r#type( ) {
265+ remaining_individuals -= 1 ;
266+ }
267+ } ) . for_each( |event| reporter. report_event( & event) )
268+ } ) ?
269+ }
231270
232- for block_index in block_index_range {
233- event_buffer. with_fetched_events_for_block( block_index as usize , |events| {
234- events. iter( ) . for_each( |event| reporter. report_event( event) )
235- } ) ?
271+ time_slice += 1 ;
236272 }
237273
238274 Ok ( ( ) )
239275 } ) {
240- eprintln!( "Running kernel failed with {:#?}!" , err) ;
276+ eprintln!( "\n Running kernel failed with {:#?}!\n " , err) ;
241277 }
242278
243279 } ) ; } ) ; } ) ;
0 commit comments