Skip to content

Commit 429ee4f

Browse files
committed
Create crashdumps for VMs if process is crashing and creating a dump
Signed-off-by: Simon Davies <simongdavies@users.noreply.github.com>
1 parent f3acc2e commit 429ee4f

File tree

12 files changed

+843
-1
lines changed

12 files changed

+843
-1
lines changed

Cargo.lock

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/hyperlight_host/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ metrics = "0.24.2"
5151
serde_json = "1.0"
5252
elfcore = "2.0"
5353
uuid = { version = "1.18.1", features = ["v4"] }
54+
once_cell = "1.20"
55+
dashmap = "6.1"
5456

5557
[target.'cfg(windows)'.dependencies]
5658
windows = { version = "0.62", features = [
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
// Copyright 2024 The Hyperlight Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD 3-Clause License
3+
// that can be found in the LICENSE.txt file.
4+
// SPDX-License-Identifier: BSD-3-Clause
5+
6+
//! Host process crash handler for generating sandbox dumps.
7+
//!
8+
//! This module provides crash detection and dump generation for sandboxes
9+
//! when the host process crashes due to unhandled signals (Linux) or
10+
//! exceptions (Windows).
11+
//!
12+
//! # Architecture
13+
//!
14+
//! - **Registry**: Global map of sandbox ID -> raw pointer to that sandbox's hypervisor
15+
//! - **Linux**: Signal handlers via `sigaction()` for fatal signals
16+
//! - **Windows**: Vectored exception handler via `AddVectoredExceptionHandler()`
17+
//! - **Automatic**: Initialized on first sandbox registration
18+
//! - **Cleanup**: Entries removed on sandbox Drop
19+
//!
20+
//! # Usage
21+
//!
22+
//! The crash handler is automatically initialized when the first sandbox
23+
//! is created. No explicit setup is required. When the host process crashes,
24+
//! dumps are generated for all registered sandboxes that have `guest_core_dump`
25+
//! enabled in their runtime configuration.
26+
//!
27+
//! # Feature Flag
28+
//!
29+
//! This entire module requires the `crashdump` feature to be enabled.
30+
31+
use std::sync::Mutex as StdMutex;
32+
use std::sync::atomic::{AtomicBool, Ordering};
33+
34+
use dashmap::DashMap;
35+
use once_cell::sync::Lazy;
36+
37+
use crate::hypervisor::Hypervisor;
38+
use crate::{Result, new_error};
39+
40+
/// Entry in the sandbox registry.
41+
///
42+
/// Stores a raw pointer to the hypervisor (unsafe!).
43+
/// This is safe during crash handling because:
44+
/// 1. The sandbox owns the hypervisor and won't drop it while registered
45+
/// 2. During a crash, normal thread-safety doesn't matter
46+
/// 3. We only access these pointers during crash (process is dying anyway)
47+
struct SandboxEntry {
48+
/// Raw pointer to the hypervisor (UNSAFE - only valid while sandbox is alive)
49+
hypervisor_ptr: *const dyn Hypervisor,
50+
}
51+
52+
// SAFETY: We only access these pointers during crash handling, when the process
53+
// is dying anyway and normal thread-safety rules don't apply
54+
unsafe impl Send for SandboxEntry {}
55+
unsafe impl Sync for SandboxEntry {}
56+
57+
/// Global registry of active sandboxes.
58+
///
59+
/// Maps sandbox ID to hypervisor pointer. Uses DashMap for lock-free concurrent access.
60+
/// Entries are removed when sandboxes are dropped.
61+
static SANDBOX_REGISTRY: Lazy<DashMap<u64, SandboxEntry>> = Lazy::new(DashMap::new);
62+
63+
/// Fast check for whether crash handlers have been initialized.
64+
///
65+
/// This atomic bool allows us to skip the initialization lock on the fast path
66+
/// (after first initialization). We use Acquire/Release ordering to ensure
67+
/// proper synchronization with the initialization code.
68+
static INITIALIZED_FAST: AtomicBool = AtomicBool::new(false);
69+
70+
/// Tracks if initialization failed (poisoned mutex or other error).
71+
///
72+
/// If true, we skip all crash handler operations since they won't work anyway.
73+
static INITIALIZATION_FAILED: AtomicBool = AtomicBool::new(false);
74+
75+
/// Mutex-protected initialization flag (only used during first initialization).
76+
/// We use std::sync::Mutex here (not parking_lot) so we can detect poisoning.
77+
static INITIALIZED: Lazy<StdMutex<bool>> = Lazy::new(|| StdMutex::new(false));
78+
79+
/// Register a sandbox with the crash handler.
80+
///
81+
/// This function:
82+
/// 1. Stores a raw pointer to the hypervisor (unsafe but controlled)
83+
/// 2. Initializes crash handlers on first call (lazy init)
84+
///
85+
/// Only registers the sandbox if crash dumps are enabled. If disabled,
86+
/// this function returns immediately without doing anything.
87+
///
88+
/// # Arguments
89+
///
90+
/// * `sandbox_id` - Unique ID of the sandbox
91+
/// * `hypervisor` - Reference to the hypervisor (we store a raw pointer)
92+
///
93+
/// # Safety
94+
///
95+
/// The caller MUST ensure the sandbox is unregistered before the hypervisor is dropped!
96+
/// This is enforced by MultiUseSandbox::Drop.
97+
///
98+
/// # Errors
99+
///
100+
/// Returns an error if the mutex is poisoned (extremely rare, would indicate
101+
/// a serious issue elsewhere in the program).
102+
pub fn register_sandbox(sandbox_id: u64, hypervisor: &dyn Hypervisor) -> Result<()> {
103+
// Check if initialization previously failed - no point trying again
104+
if INITIALIZATION_FAILED.load(Ordering::Acquire) {
105+
return Err(new_error!(
106+
"Crash handler initialization previously failed, skipping registration"
107+
));
108+
}
109+
110+
// Fast path: check if already initialized (lock-free!)
111+
if !INITIALIZED_FAST.load(Ordering::Acquire) {
112+
// Slow path: need to initialize (only happens once)
113+
match INITIALIZED.lock() {
114+
Ok(mut initialized) => {
115+
// Double-check inside the lock (another thread might have initialized)
116+
if !*initialized {
117+
platform::init_crash_handlers();
118+
*initialized = true;
119+
// Mark as initialized atomically (Release ensures all init is visible)
120+
INITIALIZED_FAST.store(true, Ordering::Release);
121+
}
122+
}
123+
Err(e) => {
124+
// Mutex is poisoned - mark as failed and return error
125+
INITIALIZATION_FAILED.store(true, Ordering::Release);
126+
return Err(new_error!(
127+
"INITIALIZED mutex poisoned during crash handler init: {}",
128+
e
129+
));
130+
}
131+
}
132+
}
133+
134+
// Add entry to registry (lock-free with DashMap!)
135+
let hypervisor_ptr = unsafe {
136+
std::mem::transmute::<*const dyn Hypervisor, *const dyn Hypervisor>(
137+
hypervisor as *const dyn Hypervisor,
138+
)
139+
};
140+
141+
SANDBOX_REGISTRY.insert(sandbox_id, SandboxEntry { hypervisor_ptr });
142+
143+
Ok(())
144+
}
145+
146+
/// Unregister a sandbox from the crash handler.
147+
///
148+
/// Called automatically by MultiUseSandbox::Drop.
149+
///
150+
/// # Arguments
151+
///
152+
/// * `sandbox_id` - Unique ID of the sandbox to unregister
153+
pub fn unregister_sandbox(sandbox_id: u64) {
154+
// Lock-free removal with DashMap
155+
SANDBOX_REGISTRY.remove(&sandbox_id);
156+
}
157+
158+
/// Generate dumps for all registered sandboxes.
159+
///
160+
/// Called by platform-specific crash handlers when a fatal signal/exception occurs.
161+
/// Iterates through the registry and generates dumps for all registered sandboxes.
162+
/// Only sandboxes with dumps enabled are registered, so all entries get dumped.
163+
///
164+
/// # Safety
165+
///
166+
/// This function is called during crash handling and:
167+
/// - Dereferences raw pointers (unsafe but acceptable during crash)
168+
/// - May violate async-signal-safety on Linux
169+
/// - Accesses hypervisor state without locks
170+
///
171+
/// All of this is acceptable because the process is crashing anyway.
172+
///
173+
/// # Returns
174+
///
175+
/// Number of dumps successfully generated.
176+
pub(crate) fn generate_crash_dumps() -> usize {
177+
let mut dump_count = 0;
178+
179+
// Iterate over the lock-free registry
180+
for entry_ref in SANDBOX_REGISTRY.iter() {
181+
let entry = entry_ref.value();
182+
183+
// SAFETY: This is unsafe! We're dereferencing a raw pointer.
184+
// This is acceptable because:
185+
// 1. The sandbox registers/unregisters properly via Drop
186+
// 2. During a crash, the process is dying anyway
187+
// 3. We're willing to accept potential UB during crash handling
188+
unsafe {
189+
let hypervisor = &*entry.hypervisor_ptr;
190+
191+
// Try to generate the crash dump
192+
// This is NOT async-signal-safe (file I/O, allocations, etc.)
193+
// but we're crashing, so this is acceptable
194+
//
195+
// Catch panics: If generating one dump panics, it maybe indicates
196+
// a systemic issue so we short-circuit
197+
// rather than risk cascading failures
198+
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
199+
crate::hypervisor::crashdump::generate_crashdump(hypervisor)
200+
}));
201+
202+
match result {
203+
Ok(Ok(())) => {
204+
dump_count += 1;
205+
}
206+
Ok(Err(_)) => {
207+
// Silent failure - dump generation returned an error
208+
}
209+
Err(_) => {
210+
// Panic during dump generation - abort remaining dumps
211+
// This may indicate a systemic issue
212+
break;
213+
}
214+
}
215+
}
216+
}
217+
218+
dump_count
219+
}
220+
221+
// Platform-specific implementations
222+
#[cfg(target_os = "linux")]
223+
#[path = "crash_handler/linux.rs"]
224+
mod platform;
225+
226+
#[cfg(target_os = "windows")]
227+
#[path = "crash_handler/windows.rs"]
228+
mod platform;

0 commit comments

Comments
 (0)