@@ -12,6 +12,7 @@ use anyhow::Result;
1212use array2d:: Array2D ;
1313
1414use rustacuda:: context:: Context as CudaContext ;
15+ use rustacuda:: function:: Function ;
1516use rustacuda:: prelude:: * ;
1617
1718use rust_cuda:: host:: LendToCuda ;
@@ -54,6 +55,80 @@ macro_rules! with_cuda {
5455 } ;
5556}
5657
58+ fn print_context_resource_limits ( ) {
59+ use rustacuda:: context:: { CurrentContext , ResourceLimit } ;
60+
61+ println ! ( "{:=^80}" , " Context Resource Limits " ) ;
62+
63+ println ! (
64+ "StackSize: {:?}" ,
65+ CurrentContext :: get_resource_limit( ResourceLimit :: StackSize )
66+ ) ;
67+ println ! (
68+ "PrintfFifoSize: {:?}" ,
69+ CurrentContext :: get_resource_limit( ResourceLimit :: PrintfFifoSize )
70+ ) ;
71+ println ! (
72+ "MallocHeapSize: {:?}" ,
73+ CurrentContext :: get_resource_limit( ResourceLimit :: MallocHeapSize )
74+ ) ;
75+ println ! (
76+ "DeviceRuntimeSynchronizeDepth: {:?}" ,
77+ CurrentContext :: get_resource_limit( ResourceLimit :: DeviceRuntimeSynchronizeDepth )
78+ ) ;
79+ println ! (
80+ "DeviceRuntimePendingLaunchCount: {:?}" ,
81+ CurrentContext :: get_resource_limit( ResourceLimit :: DeviceRuntimePendingLaunchCount )
82+ ) ;
83+ println ! (
84+ "MaxL2FetchGranularity: {:?}" ,
85+ CurrentContext :: get_resource_limit( ResourceLimit :: MaxL2FetchGranularity )
86+ ) ;
87+
88+ println ! ( "{:=^80}" , "" ) ;
89+ }
90+
91+ fn print_kernel_function_attributes ( kernel : & Function ) {
92+ use rustacuda:: function:: FunctionAttribute ;
93+
94+ println ! ( "{:=^80}" , " Kernel Function Attributes " ) ;
95+
96+ println ! (
97+ "MaxThreadsPerBlock: {:?}" ,
98+ kernel. get_attribute( FunctionAttribute :: MaxThreadsPerBlock )
99+ ) ;
100+ println ! (
101+ "SharedMemorySizeBytes: {:?}" ,
102+ kernel. get_attribute( FunctionAttribute :: SharedMemorySizeBytes )
103+ ) ;
104+ println ! (
105+ "ConstSizeBytes: {:?}" ,
106+ kernel. get_attribute( FunctionAttribute :: ConstSizeBytes )
107+ ) ;
108+ println ! (
109+ "LocalSizeBytes: {:?}" ,
110+ kernel. get_attribute( FunctionAttribute :: LocalSizeBytes )
111+ ) ;
112+ println ! (
113+ "NumRegisters: {:?}" ,
114+ kernel. get_attribute( FunctionAttribute :: NumRegisters )
115+ ) ;
116+ println ! (
117+ "PtxVersion: {:?}" ,
118+ kernel. get_attribute( FunctionAttribute :: PtxVersion )
119+ ) ;
120+ println ! (
121+ "BinaryVersion: {:?}" ,
122+ kernel. get_attribute( FunctionAttribute :: BinaryVersion )
123+ ) ;
124+ println ! (
125+ "CacheModeCa: {:?}" ,
126+ kernel. get_attribute( FunctionAttribute :: CacheModeCa )
127+ ) ;
128+
129+ println ! ( "{:=^80}" , "" ) ;
130+ }
131+
/// Field-less unit struct; its functionality is defined in the
/// `impl CudaSimulation` block that follows.
57132pub struct CudaSimulation ;
58133
59134impl CudaSimulation {
@@ -88,10 +163,7 @@ impl CudaSimulation {
88163 let lineage_store = IncoherentInMemoryLineageStore :: new ( sample_percentage, & habitat) ;
89164 let coalescence_sampler = IndependentCoalescenceSampler :: default ( ) ;
90165 let event_sampler = IndependentEventSampler :: default ( ) ;
91- let active_lineage_sampler = IndependentActiveLineageSampler :: new (
92- InMemoryLineageReference :: from ( 9780_usize ) ,
93- & lineage_store,
94- ) ; // TODO
166+ let active_lineage_sampler = IndependentActiveLineageSampler :: default ( ) ;
95167
96168 // TODO: Should we copy the heap contents back over?
97169 let mut simulation = Simulation :: builder ( )
@@ -107,6 +179,16 @@ impl CudaSimulation {
107179
108180 let mut cuda_rng = CudaRng :: from_cloned ( rng) ;
109181
182+ let cuda_block_size = rustacuda:: function:: BlockSize :: xy ( 16 , 16 ) ;
183+ let cuda_grid_size = rustacuda:: function:: GridSize :: x ( {
184+ #[ allow( clippy:: cast_possible_truncation) ]
185+ let total_individuals = simulation. lineage_store ( ) . get_number_total_lineages ( ) as u32 ;
186+ let cuda_block_length = cuda_block_size. x * cuda_block_size. y * cuda_block_size. z ;
187+
188+ ( total_individuals / cuda_block_length)
189+ + ( total_individuals % cuda_block_length > 0 ) as u32
190+ } ) ;
191+
110192 //let (time, steps) = simulation.simulate(rng, reporter);
111193
112194 let module_data = CString :: new ( include_str ! ( env!( "KERNEL_PTX_PATH" ) ) ) . unwrap ( ) ;
@@ -121,28 +203,26 @@ impl CudaSimulation {
121203
122204 // Create a context associated to this device
123205 with_cuda ! ( CudaContext :: create_and_push( ContextFlags :: MAP_HOST | ContextFlags :: SCHED_AUTO , device) ? => |context: CudaContext | {
124- // Load the module containing the function we want to call
206+ // Load the module containing the kernel function
125207 with_cuda!( Module :: load_from_string( & module_data) ? => |module: Module | {
208+ // Load the kernel function from the module
209+ let simulate_kernel = module. get_function( & CString :: new( "simulate" ) . unwrap( ) ) ?;
126210 // Create a stream to submit work to
127211 with_cuda!( Stream :: new( StreamFlags :: NON_BLOCKING , None ) ? => |stream: Stream | {
128212
129213 use rustacuda:: context:: { CurrentContext , ResourceLimit } ;
130214
131215 CurrentContext :: set_resource_limit( ResourceLimit :: StackSize , 4096 ) ?;
132216
133- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::StackSize));
134- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::PrintfFifoSize));
135- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MallocHeapSize));
136- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimeSynchronizeDepth));
137- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimePendingLaunchCount));
138- //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MaxL2FetchGranularity));
217+ print_context_resource_limits( ) ;
218+ print_kernel_function_attributes( & simulate_kernel) ;
139219
140220 if let Err ( err) = simulation. lend_to_cuda_mut( |simulation_mut_ptr| {
141221 cuda_rng. lend_to_cuda_mut( |cuda_rng_mut_ptr| {
142222 // Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
143223 // as a foreign-function call. In this case, it is - this kernel is written in CUDA C.
144224 unsafe {
145- launch!( module . simulate <<<1 , 1 , 0 , stream>>>(
225+ launch!( simulate_kernel <<<cuda_grid_size , cuda_block_size , 0 , stream>>>(
146226 simulation_mut_ptr,
147227 cuda_rng_mut_ptr,
148228 1_000_usize // max steps on GPU
0 commit comments