juntyr
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 3 additions & 6 deletions
diff --git a/necsim-core/src/intrinsics.rs b/necsim-core/src/intrinsics.rs
Lines changed: 13 additions & 0 deletions
diff --git a/necsim-cuda/kernel/src/lib.rs b/necsim-cuda/kernel/src/lib.rs
Lines changed: 8 additions & 3 deletions
diff --git a/necsim-cuda/src/lib.rs b/necsim-cuda/src/lib.rs
Lines changed: 92 additions & 12 deletions
diff --git a/necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/mod.rs b/necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/mod.rs
Lines changed: 18 additions & 2 deletions
diff --git a/necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/sampler.rs b/necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/sampler.rs
Lines changed: 53 additions & 9 deletions
diff --git a/necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs b/necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs
Lines changed: 9 additions & 0 deletions
diff --git a/rust-cuda-test/Cargo.toml b/rust-cuda-test/Cargo.toml
Lines changed: 0 additions & 19 deletions
diff --git a/rust-cuda-test/README b/rust-cuda-test/README
Lines changed: 0 additions & 5 deletions
diff --git a/rust-cuda-test/build.rs b/rust-cuda-test/build.rs
Lines changed: 0 additions & 6 deletions
@@ -19,14 +19,11 @@ members = [
     "necsim-cuda/kernel",
 
     "rustcoalescence",
-
-    "rust-cuda-test",
-    "rust-cuda-test/test-kernel",
 ]
 
-#[profile.dev]
-#opt-level = 3
-#lto = "fat"
+[profile.dev]
+opt-level = 3
+lto = "fat"
 
 [profile.release]
 opt-level = 3
 
@@ -16,3 +16,16 @@ pub fn ln(val: f64) -> f64 {
         rust_cuda::device::nvptx::_log(val)
     }
 }
+
+/// Computes the exponential function `e^val` for an `f64`.
+///
+/// The implementation is selected at compile time by target: non-CUDA targets
+/// call `core::intrinsics::expf64`, while CUDA device code calls
+/// `rust_cuda::device::nvptx::_exp` (presumably the equivalent device math
+/// routine - confirm against `rust_cuda`).
+#[must_use]
+#[inline]
+pub fn exp(val: f64) -> f64 {
+    // Non-CUDA targets: use the compiler intrinsic.
+    #[cfg(not(target_os = "cuda"))]
+    unsafe {
+        core::intrinsics::expf64(val)
+    }
+    // CUDA target: use the NVPTX device function instead.
+    #[cfg(target_os = "cuda")]
+    unsafe {
+        rust_cuda::device::nvptx::_exp(val)
+    }
+}
@@ -102,11 +102,16 @@ unsafe fn simulate_generic<
         CudaRng::with_borrow_from_rust_mut(cuda_rng_ptr, |cuda_rng| {
             let mut reporter = NullReporter;
 
-            //println!("{:#?}", simulation);
-
             let (time, steps) = simulation.simulate_incremental(max_steps, cuda_rng, &mut reporter);
 
-            println!("time = {:?}, steps = {}", F64(time), steps);
+            if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
+                println!(
+                    "index = {}, time = {:?}, steps = {}",
+                    utils::index(),
+                    F64(time),
+                    steps
+                );
+            }
         })
     })
 }
@@ -12,6 +12,7 @@ use anyhow::Result;
 use array2d::Array2D;
 
 use rustacuda::context::Context as CudaContext;
+use rustacuda::function::Function;
 use rustacuda::prelude::*;
 
 use rust_cuda::host::LendToCuda;
@@ -54,6 +55,80 @@ macro_rules! with_cuda {
     };
 }
 
+/// Prints the resource limits of the current CUDA context to stdout.
+///
+/// The output is framed by two 80-column banner lines padded with `=`. Each
+/// limit is queried through `CurrentContext::get_resource_limit` and printed
+/// with `{:?}`, so the call's return value is shown verbatim rather than
+/// unwrapped (presumably a `Result` - confirm against `rustacuda`).
+fn print_context_resource_limits() {
+    use rustacuda::context::{CurrentContext, ResourceLimit};
+
+    // Opening banner: the title centred within 80 columns of `=`.
+    println!("{:=^80}", " Context Resource Limits ");
+
+    println!(
+        "StackSize: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::StackSize)
+    );
+    println!(
+        "PrintfFifoSize: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::PrintfFifoSize)
+    );
+    println!(
+        "MallocHeapSize: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::MallocHeapSize)
+    );
+    println!(
+        "DeviceRuntimeSynchronizeDepth: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimeSynchronizeDepth)
+    );
+    println!(
+        "DeviceRuntimePendingLaunchCount: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimePendingLaunchCount)
+    );
+    println!(
+        "MaxL2FetchGranularity: {:?}",
+        CurrentContext::get_resource_limit(ResourceLimit::MaxL2FetchGranularity)
+    );
+
+    // Closing banner: a full line of 80 `=` characters.
+    println!("{:=^80}", "");
+}
+
+/// Prints the attributes of a loaded CUDA kernel function to stdout.
+///
+/// `kernel` is the function whose attributes are queried. The output is framed
+/// by two 80-column banner lines padded with `=`. Each attribute is read with
+/// `Function::get_attribute` and printed with `{:?}`, so the call's return
+/// value is shown verbatim rather than unwrapped (presumably a `Result` -
+/// confirm against `rustacuda`).
+fn print_kernel_function_attributes(kernel: &Function) {
+    use rustacuda::function::FunctionAttribute;
+
+    // Opening banner: the title centred within 80 columns of `=`.
+    println!("{:=^80}", " Kernel Function Attributes ");
+
+    println!(
+        "MaxThreadsPerBlock: {:?}",
+        kernel.get_attribute(FunctionAttribute::MaxThreadsPerBlock)
+    );
+    println!(
+        "SharedMemorySizeBytes: {:?}",
+        kernel.get_attribute(FunctionAttribute::SharedMemorySizeBytes)
+    );
+    println!(
+        "ConstSizeBytes: {:?}",
+        kernel.get_attribute(FunctionAttribute::ConstSizeBytes)
+    );
+    println!(
+        "LocalSizeBytes: {:?}",
+        kernel.get_attribute(FunctionAttribute::LocalSizeBytes)
+    );
+    println!(
+        "NumRegisters: {:?}",
+        kernel.get_attribute(FunctionAttribute::NumRegisters)
+    );
+    println!(
+        "PtxVersion: {:?}",
+        kernel.get_attribute(FunctionAttribute::PtxVersion)
+    );
+    println!(
+        "BinaryVersion: {:?}",
+        kernel.get_attribute(FunctionAttribute::BinaryVersion)
+    );
+    println!(
+        "CacheModeCa: {:?}",
+        kernel.get_attribute(FunctionAttribute::CacheModeCa)
+    );
+
+    // Closing banner: a full line of 80 `=` characters.
+    println!("{:=^80}", "");
+}
+
 pub struct CudaSimulation;
 
 impl CudaSimulation {
@@ -88,10 +163,7 @@ impl CudaSimulation {
         let lineage_store = IncoherentInMemoryLineageStore::new(sample_percentage, &habitat);
         let coalescence_sampler = IndependentCoalescenceSampler::default();
         let event_sampler = IndependentEventSampler::default();
-        let active_lineage_sampler = IndependentActiveLineageSampler::new(
-            InMemoryLineageReference::from(9780_usize),
-            &lineage_store,
-        ); // TODO
+        let active_lineage_sampler = IndependentActiveLineageSampler::default();
 
         // TODO: Should we copy the heap contents back over?
         let mut simulation = Simulation::builder()
@@ -107,6 +179,16 @@ impl CudaSimulation {
 
         let mut cuda_rng = CudaRng::from_cloned(rng);
 
+        let cuda_block_size = rustacuda::function::BlockSize::xy(16, 16);
+        let cuda_grid_size = rustacuda::function::GridSize::x({
+            #[allow(clippy::cast_possible_truncation)]
+            let total_individuals = simulation.lineage_store().get_number_total_lineages() as u32;
+            let cuda_block_length = cuda_block_size.x * cuda_block_size.y * cuda_block_size.z;
+
+            (total_individuals / cuda_block_length)
+                + (total_individuals % cuda_block_length > 0) as u32
+        });
+
         //let (time, steps) = simulation.simulate(rng, reporter);
 
         let module_data = CString::new(include_str!(env!("KERNEL_PTX_PATH"))).unwrap();
@@ -121,28 +203,26 @@ impl CudaSimulation {
 
         // Create a context associated to this device
         with_cuda!(CudaContext::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)? => |context: CudaContext| {
-        // Load the module containing the function we want to call
+        // Load the module containing the kernel function
         with_cuda!(Module::load_from_string(&module_data)? => |module: Module| {
+        // Load the kernel function from the module
+        let simulate_kernel = module.get_function(&CString::new("simulate").unwrap())?;
         // Create a stream to submit work to
         with_cuda!(Stream::new(StreamFlags::NON_BLOCKING, None)? => |stream: Stream| {
 
             use rustacuda::context::{CurrentContext, ResourceLimit};
 
             CurrentContext::set_resource_limit(ResourceLimit::StackSize, 4096)?;
 
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::StackSize));
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::PrintfFifoSize));
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MallocHeapSize));
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimeSynchronizeDepth));
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimePendingLaunchCount));
-            //println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MaxL2FetchGranularity));
+            print_context_resource_limits();
+            print_kernel_function_attributes(&simulate_kernel);
 
             if let Err(err) = simulation.lend_to_cuda_mut(|simulation_mut_ptr| {
                 cuda_rng.lend_to_cuda_mut(|cuda_rng_mut_ptr| {
                     // Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
                     // as a foreign-function call, even though this kernel is itself written in Rust and loaded as PTX.
                     unsafe {
-                        launch!(module.simulate<<<1, 1, 0, stream>>>(
+                        launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
                             simulation_mut_ptr,
                             cuda_rng_mut_ptr,
                             1_000_usize // max steps on GPU
 
@@ -8,7 +8,7 @@ mod sampler;
 #[cfg_attr(feature = "cuda", derive(RustToCuda))]
 #[cfg_attr(feature = "cuda", r2cBound(H: rust_cuda::common::RustToCuda))]
 #[cfg_attr(feature = "cuda", r2cBound(D: rust_cuda::common::RustToCuda))]
-#[cfg_attr(feature = "cuda", r2cBound(R: rustacuda_core::DeviceCopy))]
+#[cfg_attr(feature = "cuda", r2cBound(R: rustacuda_core::DeviceCopy + rust_cuda::common::FromCudaThreadIdx))]
 #[cfg_attr(feature = "cuda", r2cBound(S: rust_cuda::common::RustToCuda))]
 #[derive(Debug)]
 pub struct IndependentActiveLineageSampler<
@@ -17,7 +17,7 @@ pub struct IndependentActiveLineageSampler<
     R: LineageReference<H>,
     S: IncoherentLineageStore<H, R>,
 > {
-    // TODO: This reference needs to somehow be initialised by the thread index in CUDA whilst allowing for generalisation
+    #[cfg_attr(feature = "cuda", r2cEval(Some(R::from_cuda_thread_idx())))]
     active_lineage_reference: Option<R>,
     marker: PhantomData<(H, D, S)>,
 }
@@ -39,3 +39,19 @@ impl<
         }
     }
 }
+
+/// `Default` construction for the independent active lineage sampler.
+impl<
+        H: Habitat,
+        D: DispersalSampler<H>,
+        R: LineageReference<H>,
+        S: IncoherentLineageStore<H, R>,
+    > Default for IndependentActiveLineageSampler<H, D, R, S>
+{
+    /// Creates a sampler whose `active_lineage_reference` is `None`.
+    ///
+    /// NOTE(review): with the `cuda` feature, the field's `r2cEval` attribute
+    /// presumably replaces this `None` with a reference derived from the CUDA
+    /// thread index on the device - confirm against the `RustToCuda` derive.
+    #[must_use]
+    fn default() -> Self {
+        Self {
+            active_lineage_reference: None,
+            marker: PhantomData::<(H, D, S)>,
+        }
+    }
+}
@@ -3,6 +3,7 @@ use float_next_after::NextAfter;
 use necsim_core::cogs::{
     ActiveLineageSampler, DispersalSampler, Habitat, IncoherentLineageStore, LineageReference,
 };
+use necsim_core::intrinsics::{exp, floor};
 use necsim_core::landscape::Location;
 use necsim_core::rng::Rng;
 use necsim_core::simulation::partial::active_lineager_sampler::PartialSimulation;
@@ -53,19 +54,62 @@ impl<
             None => return None,
         };
 
+        #[allow(clippy::question_mark)]
+        if simulation
+            .lineage_store
+            .get(chosen_lineage_reference.clone())
+            .is_none()
+        {
+            // Check for extraneously simulated lineages
+            return None;
+        }
+
         let lineage_location = simulation
             .lineage_store
             .extract_lineage_from_its_location(chosen_lineage_reference.clone());
 
-        // TODO: As we are only doing geometric sampling for now, need to immediately increment discrete time step
-        // TODO: How do we choose the time step for now?
-        // TODO: Need to prime incoherent RNG here with location, discrete time step and substep 0
-
-        // TODO: Need to get time to next event in while loop with exponential (simplest option)
-
-        let event_time = time + rng.sample_exponential(0.5_f64);
-
-        // TODO: Need to prime incoherent RNG here with location, discrete time step and substep 0
+        let delta_t = 0.1_f64;
+        let lambda = 0.5_f64;
+
+        let p = 1.0_f64 - exp(-lambda * delta_t);
+
+        #[allow(clippy::cast_possible_truncation)]
+        #[allow(clippy::cast_sign_loss)]
+        let mut time_step = floor(time / delta_t) as u64 + 1;
+
+        loop {
+            /*let location_x_bytes = lineage_location.x().to_le_bytes();
+            let location_y_bytes = lineage_location.y().to_le_bytes();
+            let time_step_bytes = time_step.to_le_bytes();
+
+            rng.prime_with([
+                location_x_bytes[0],
+                location_x_bytes[1],
+                location_x_bytes[2],
+                location_x_bytes[3],
+                location_y_bytes[0],
+                location_y_bytes[1],
+                location_y_bytes[2],
+                location_y_bytes[3],
+                time_step_bytes[0],
+                time_step_bytes[1],
+                time_step_bytes[2],
+                time_step_bytes[3],
+                time_step_bytes[4],
+                time_step_bytes[5],
+                time_step_bytes[6],
+                time_step_bytes[7],
+            ]);*/
+
+            if rng.sample_event(p) {
+                break;
+            }
+
+            time_step += 1;
+        }
+
+        #[allow(clippy::cast_precision_loss)]
+        let event_time = (time_step as f64) * delta_t;
 
         let unique_event_time: f64 = if event_time > time {
             event_time
 
@@ -20,3 +20,12 @@ impl Into<usize> for InMemoryLineageReference {
         self.0
     }
 }
+
+/// Lets an `InMemoryLineageReference` be derived from the CUDA thread index.
+/// Only compiled when the `cuda` feature is enabled.
+#[cfg(feature = "cuda")]
+impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {
+    /// Wraps `rust_cuda::device::utils::index()`, cast to `usize`, in a reference.
+    ///
+    /// NOTE(review): this method only exists when targeting CUDA; presumably the
+    /// trait declares it under the same `cfg` - confirm, otherwise host builds
+    /// with the `cuda` feature would be missing a required method.
+    #[cfg(target_os = "cuda")]
+    fn from_cuda_thread_idx() -> Self {
+        // The lint allowance suggests `index()` returns a signed integer;
+        // presumably it is never negative - TODO confirm.
+        #[allow(clippy::cast_sign_loss)]
+        Self::from(rust_cuda::device::utils::index() as usize)
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -20,3 +20,12 @@ impl Into<usize> for InMemoryLineageReference {`
`20`	`20`	`self.0`
`21`	`21`	`}`
`22`	`22`	`}`
	`23`	`+`
	`24`	`+#[cfg(feature = "cuda")]`
	`25`	`+impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {`
	`26`	`+ #[cfg(target_os = "cuda")]`
	`27`	`+ fn from_cuda_thread_idx() -> Self {`
	`28`	`+ #[allow(clippy::cast_sign_loss)]`
	`29`	`+ Self::from(rust_cuda::device::utils::index() as usize)`
	`30`	`+ }`
	`31`	`+}`