Skip to content

Commit 590dea6

Browse files
committed
(ml5717) Added primitive CUDA launch, Simulation RNG dependency TODO
1 parent c67ca04 commit 590dea6

File tree

24 files changed

+233
-678
lines changed

24 files changed

+233
-678
lines changed

Cargo.toml

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,11 @@ members = [
1919
"necsim-cuda/kernel",
2020

2121
"rustcoalescence",
22-
23-
"rust-cuda-test",
24-
"rust-cuda-test/test-kernel",
2522
]
2623

27-
#[profile.dev]
28-
#opt-level = 3
29-
#lto = "fat"
24+
[profile.dev]
25+
opt-level = 3
26+
lto = "fat"
3027

3128
[profile.release]
3229
opt-level = 3

necsim-core/src/intrinsics.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,16 @@ pub fn ln(val: f64) -> f64 {
1616
rust_cuda::device::nvptx::_log(val)
1717
}
1818
}
19+
20+
#[must_use]
21+
#[inline]
22+
pub fn exp(val: f64) -> f64 {
23+
#[cfg(not(target_os = "cuda"))]
24+
unsafe {
25+
core::intrinsics::expf64(val)
26+
}
27+
#[cfg(target_os = "cuda")]
28+
unsafe {
29+
rust_cuda::device::nvptx::_exp(val)
30+
}
31+
}

necsim-cuda/kernel/src/lib.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,16 @@ unsafe fn simulate_generic<
102102
CudaRng::with_borrow_from_rust_mut(cuda_rng_ptr, |cuda_rng| {
103103
let mut reporter = NullReporter;
104104

105-
//println!("{:#?}", simulation);
106-
107105
let (time, steps) = simulation.simulate_incremental(max_steps, cuda_rng, &mut reporter);
108106

109-
println!("time = {:?}, steps = {}", F64(time), steps);
107+
if utils::thread_idx().as_id(&utils::block_dim()) == 0 {
108+
println!(
109+
"index = {}, time = {:?}, steps = {}",
110+
utils::index(),
111+
F64(time),
112+
steps
113+
);
114+
}
110115
})
111116
})
112117
}

necsim-cuda/src/lib.rs

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use anyhow::Result;
1212
use array2d::Array2D;
1313

1414
use rustacuda::context::Context as CudaContext;
15+
use rustacuda::function::Function;
1516
use rustacuda::prelude::*;
1617

1718
use rust_cuda::host::LendToCuda;
@@ -54,6 +55,80 @@ macro_rules! with_cuda {
5455
};
5556
}
5657

58+
fn print_context_resource_limits() {
59+
use rustacuda::context::{CurrentContext, ResourceLimit};
60+
61+
println!("{:=^80}", " Context Resource Limits ");
62+
63+
println!(
64+
"StackSize: {:?}",
65+
CurrentContext::get_resource_limit(ResourceLimit::StackSize)
66+
);
67+
println!(
68+
"PrintfFifoSize: {:?}",
69+
CurrentContext::get_resource_limit(ResourceLimit::PrintfFifoSize)
70+
);
71+
println!(
72+
"MallocHeapSize: {:?}",
73+
CurrentContext::get_resource_limit(ResourceLimit::MallocHeapSize)
74+
);
75+
println!(
76+
"DeviceRuntimeSynchronizeDepth: {:?}",
77+
CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimeSynchronizeDepth)
78+
);
79+
println!(
80+
"DeviceRuntimePendingLaunchCount: {:?}",
81+
CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimePendingLaunchCount)
82+
);
83+
println!(
84+
"MaxL2FetchGranularity: {:?}",
85+
CurrentContext::get_resource_limit(ResourceLimit::MaxL2FetchGranularity)
86+
);
87+
88+
println!("{:=^80}", "");
89+
}
90+
91+
fn print_kernel_function_attributes(kernel: &Function) {
92+
use rustacuda::function::FunctionAttribute;
93+
94+
println!("{:=^80}", " Kernel Function Attributes ");
95+
96+
println!(
97+
"MaxThreadsPerBlock: {:?}",
98+
kernel.get_attribute(FunctionAttribute::MaxThreadsPerBlock)
99+
);
100+
println!(
101+
"SharedMemorySizeBytes: {:?}",
102+
kernel.get_attribute(FunctionAttribute::SharedMemorySizeBytes)
103+
);
104+
println!(
105+
"ConstSizeBytes: {:?}",
106+
kernel.get_attribute(FunctionAttribute::ConstSizeBytes)
107+
);
108+
println!(
109+
"LocalSizeBytes: {:?}",
110+
kernel.get_attribute(FunctionAttribute::LocalSizeBytes)
111+
);
112+
println!(
113+
"NumRegisters: {:?}",
114+
kernel.get_attribute(FunctionAttribute::NumRegisters)
115+
);
116+
println!(
117+
"PtxVersion: {:?}",
118+
kernel.get_attribute(FunctionAttribute::PtxVersion)
119+
);
120+
println!(
121+
"BinaryVersion: {:?}",
122+
kernel.get_attribute(FunctionAttribute::BinaryVersion)
123+
);
124+
println!(
125+
"CacheModeCa: {:?}",
126+
kernel.get_attribute(FunctionAttribute::CacheModeCa)
127+
);
128+
129+
println!("{:=^80}", "");
130+
}
131+
57132
pub struct CudaSimulation;
58133

59134
impl CudaSimulation {
@@ -88,10 +163,7 @@ impl CudaSimulation {
88163
let lineage_store = IncoherentInMemoryLineageStore::new(sample_percentage, &habitat);
89164
let coalescence_sampler = IndependentCoalescenceSampler::default();
90165
let event_sampler = IndependentEventSampler::default();
91-
let active_lineage_sampler = IndependentActiveLineageSampler::new(
92-
InMemoryLineageReference::from(9780_usize),
93-
&lineage_store,
94-
); // TODO
166+
let active_lineage_sampler = IndependentActiveLineageSampler::default();
95167

96168
// TODO: Should we copy the heap contents back over?
97169
let mut simulation = Simulation::builder()
@@ -107,6 +179,16 @@ impl CudaSimulation {
107179

108180
let mut cuda_rng = CudaRng::from_cloned(rng);
109181

182+
let cuda_block_size = rustacuda::function::BlockSize::xy(16, 16);
183+
let cuda_grid_size = rustacuda::function::GridSize::x({
184+
#[allow(clippy::cast_possible_truncation)]
185+
let total_individuals = simulation.lineage_store().get_number_total_lineages() as u32;
186+
let cuda_block_length = cuda_block_size.x * cuda_block_size.y * cuda_block_size.z;
187+
188+
(total_individuals / cuda_block_length)
189+
+ (total_individuals % cuda_block_length > 0) as u32
190+
});
191+
110192
//let (time, steps) = simulation.simulate(rng, reporter);
111193

112194
let module_data = CString::new(include_str!(env!("KERNEL_PTX_PATH"))).unwrap();
@@ -121,28 +203,26 @@ impl CudaSimulation {
121203

122204
// Create a context associated to this device
123205
with_cuda!(CudaContext::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)? => |context: CudaContext| {
124-
// Load the module containing the function we want to call
206+
// Load the module containing the kernel function
125207
with_cuda!(Module::load_from_string(&module_data)? => |module: Module| {
208+
// Load the kernel function from the module
209+
let simulate_kernel = module.get_function(&CString::new("simulate").unwrap())?;
126210
// Create a stream to submit work to
127211
with_cuda!(Stream::new(StreamFlags::NON_BLOCKING, None)? => |stream: Stream| {
128212

129213
use rustacuda::context::{CurrentContext, ResourceLimit};
130214

131215
CurrentContext::set_resource_limit(ResourceLimit::StackSize, 4096)?;
132216

133-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::StackSize));
134-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::PrintfFifoSize));
135-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MallocHeapSize));
136-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimeSynchronizeDepth));
137-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::DeviceRuntimePendingLaunchCount));
138-
//println!("{:?}", CurrentContext::get_resource_limit(ResourceLimit::MaxL2FetchGranularity));
217+
print_context_resource_limits();
218+
print_kernel_function_attributes(&simulate_kernel);
139219

140220
if let Err(err) = simulation.lend_to_cuda_mut(|simulation_mut_ptr| {
141221
cuda_rng.lend_to_cuda_mut(|cuda_rng_mut_ptr| {
142222
// Launching kernels is unsafe since Rust can't enforce safety - think of kernel launches
143223
// as a foreign-function call. In this case, it is - this kernel is written in CUDA C.
144224
unsafe {
145-
launch!(module.simulate<<<1, 1, 0, stream>>>(
225+
launch!(simulate_kernel<<<cuda_grid_size, cuda_block_size, 0, stream>>>(
146226
simulation_mut_ptr,
147227
cuda_rng_mut_ptr,
148228
1_000_usize // max steps on GPU

necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/mod.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ mod sampler;
88
#[cfg_attr(feature = "cuda", derive(RustToCuda))]
99
#[cfg_attr(feature = "cuda", r2cBound(H: rust_cuda::common::RustToCuda))]
1010
#[cfg_attr(feature = "cuda", r2cBound(D: rust_cuda::common::RustToCuda))]
11-
#[cfg_attr(feature = "cuda", r2cBound(R: rustacuda_core::DeviceCopy))]
11+
#[cfg_attr(feature = "cuda", r2cBound(R: rustacuda_core::DeviceCopy + rust_cuda::common::FromCudaThreadIdx))]
1212
#[cfg_attr(feature = "cuda", r2cBound(S: rust_cuda::common::RustToCuda))]
1313
#[derive(Debug)]
1414
pub struct IndependentActiveLineageSampler<
@@ -17,7 +17,7 @@ pub struct IndependentActiveLineageSampler<
1717
R: LineageReference<H>,
1818
S: IncoherentLineageStore<H, R>,
1919
> {
20-
// TODO: This reference needs to somehow be initialised by the thread index in CUDA whilst allowing for generalisation
20+
#[cfg_attr(feature = "cuda", r2cEval(Some(R::from_cuda_thread_idx())))]
2121
active_lineage_reference: Option<R>,
2222
marker: PhantomData<(H, D, S)>,
2323
}
@@ -39,3 +39,19 @@ impl<
3939
}
4040
}
4141
}
42+
43+
impl<
44+
H: Habitat,
45+
D: DispersalSampler<H>,
46+
R: LineageReference<H>,
47+
S: IncoherentLineageStore<H, R>,
48+
> Default for IndependentActiveLineageSampler<H, D, R, S>
49+
{
50+
#[must_use]
51+
fn default() -> Self {
52+
Self {
53+
active_lineage_reference: None,
54+
marker: PhantomData::<(H, D, S)>,
55+
}
56+
}
57+
}

necsim-impls-no-std/src/cogs/active_lineage_sampler/independent/sampler.rs

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use float_next_after::NextAfter;
33
use necsim_core::cogs::{
44
ActiveLineageSampler, DispersalSampler, Habitat, IncoherentLineageStore, LineageReference,
55
};
6+
use necsim_core::intrinsics::{exp, floor};
67
use necsim_core::landscape::Location;
78
use necsim_core::rng::Rng;
89
use necsim_core::simulation::partial::active_lineager_sampler::PartialSimulation;
@@ -53,19 +54,62 @@ impl<
5354
None => return None,
5455
};
5556

57+
#[allow(clippy::question_mark)]
58+
if simulation
59+
.lineage_store
60+
.get(chosen_lineage_reference.clone())
61+
.is_none()
62+
{
63+
// Check for extraneously simulated lineages
64+
return None;
65+
}
66+
5667
let lineage_location = simulation
5768
.lineage_store
5869
.extract_lineage_from_its_location(chosen_lineage_reference.clone());
5970

60-
// TODO: As we are only doing geometric sampling for now, need to immediately increment discrete time step
61-
// TODO: How do we choose the time step for now?
62-
// TODO: Need to prime incoherent RNG here with location, discrete time step and substep 0
63-
64-
// TODO: Need to get time to next event in while loop with exponential (simplest option)
65-
66-
let event_time = time + rng.sample_exponential(0.5_f64);
67-
68-
// TODO: Need to prime incoherent RNG here with location, discrete time step and substep 0
71+
let delta_t = 0.1_f64;
72+
let lambda = 0.5_f64;
73+
74+
let p = 1.0_f64 - exp(-lambda * delta_t);
75+
76+
#[allow(clippy::cast_possible_truncation)]
77+
#[allow(clippy::cast_sign_loss)]
78+
let mut time_step = floor(time / delta_t) as u64 + 1;
79+
80+
loop {
81+
/*let location_x_bytes = lineage_location.x().to_le_bytes();
82+
let location_y_bytes = lineage_location.y().to_le_bytes();
83+
let time_step_bytes = time_step.to_le_bytes();
84+
85+
rng.prime_with([
86+
location_x_bytes[0],
87+
location_x_bytes[1],
88+
location_x_bytes[2],
89+
location_x_bytes[3],
90+
location_y_bytes[0],
91+
location_y_bytes[1],
92+
location_y_bytes[2],
93+
location_y_bytes[3],
94+
time_step_bytes[0],
95+
time_step_bytes[1],
96+
time_step_bytes[2],
97+
time_step_bytes[3],
98+
time_step_bytes[4],
99+
time_step_bytes[5],
100+
time_step_bytes[6],
101+
time_step_bytes[7],
102+
]);*/
103+
104+
if rng.sample_event(p) {
105+
break;
106+
}
107+
108+
time_step += 1;
109+
}
110+
111+
#[allow(clippy::cast_precision_loss)]
112+
let event_time = (time_step as f64) * delta_t;
69113

70114
let unique_event_time: f64 = if event_time > time {
71115
event_time

necsim-impls-no-std/src/cogs/lineage_reference/in_memory.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,12 @@ impl Into<usize> for InMemoryLineageReference {
2020
self.0
2121
}
2222
}
23+
24+
#[cfg(feature = "cuda")]
25+
impl rust_cuda::common::FromCudaThreadIdx for InMemoryLineageReference {
26+
#[cfg(target_os = "cuda")]
27+
fn from_cuda_thread_idx() -> Self {
28+
#[allow(clippy::cast_sign_loss)]
29+
Self::from(rust_cuda::device::utils::index() as usize)
30+
}
31+
}

rust-cuda-test/Cargo.toml

Lines changed: 0 additions & 19 deletions
This file was deleted.

rust-cuda-test/README

Lines changed: 0 additions & 5 deletions
This file was deleted.

rust-cuda-test/build.rs

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)