|
| 1 | +// Copyright 2024 The Hyperlight Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD 3-Clause License |
| 3 | +// that can be found in the LICENSE.txt file. |
| 4 | +// SPDX-License-Identifier: BSD-3-Clause |
| 5 | + |
| 6 | +//! Host process crash handler for generating sandbox dumps. |
| 7 | +//! |
| 8 | +//! This module provides crash detection and dump generation for sandboxes |
| 9 | +//! when the host process crashes due to unhandled signals (Linux) or |
| 10 | +//! exceptions (Windows). |
| 11 | +//! |
| 12 | +//! # Architecture |
| 13 | +//! |
| 14 | +//! - **Registry**: Global map of sandbox ID -> raw pointer to that sandbox's hypervisor |
| 15 | +//! - **Linux**: Signal handlers via `sigaction()` for fatal signals |
| 16 | +//! - **Windows**: Vectored exception handler via `AddVectoredExceptionHandler()` |
| 17 | +//! - **Automatic**: Initialized on first sandbox registration |
| 18 | +//! - **Cleanup**: Entries removed on sandbox Drop |
| 19 | +//! |
| 20 | +//! # Usage |
| 21 | +//! |
| 22 | +//! The crash handler is automatically initialized when the first sandbox |
| 23 | +//! is created. No explicit setup is required. When the host process crashes, |
| 24 | +//! dumps are generated for all registered sandboxes that have `guest_core_dump` |
| 25 | +//! enabled in their runtime configuration. |
| 26 | +//! |
| 27 | +//! # Feature Flag |
| 28 | +//! |
| 29 | +//! This entire module requires the `crashdump` feature to be enabled. |
| 30 | +
|
| 31 | +use std::sync::Mutex as StdMutex; |
| 32 | +use std::sync::atomic::{AtomicBool, Ordering}; |
| 33 | + |
| 34 | +use dashmap::DashMap; |
| 35 | +use once_cell::sync::Lazy; |
| 36 | + |
| 37 | +use crate::hypervisor::Hypervisor; |
| 38 | +use crate::{Result, new_error}; |
| 39 | + |
| 40 | +/// Entry in the sandbox registry. |
| 41 | +/// |
| 42 | +/// Stores a raw pointer to the hypervisor (unsafe!). |
| 43 | +/// This is safe during crash handling because: |
| 44 | +/// 1. The sandbox owns the hypervisor and won't drop it while registered |
| 45 | +/// 2. During a crash, normal thread-safety doesn't matter |
| 46 | +/// 3. We only access these pointers during crash (process is dying anyway) |
| 47 | +struct SandboxEntry { |
| 48 | + /// Raw pointer to the hypervisor (UNSAFE - only valid while sandbox is alive) |
| 49 | + hypervisor_ptr: *const dyn Hypervisor, |
| 50 | +} |
| 51 | + |
| 52 | +// SAFETY: We only access these pointers during crash handling, when the process |
| 53 | +// is dying anyway and normal thread-safety rules don't apply |
| 54 | +unsafe impl Send for SandboxEntry {} |
| 55 | +unsafe impl Sync for SandboxEntry {} |
| 56 | + |
| 57 | +/// Global registry of active sandboxes. |
| 58 | +/// |
| 59 | +/// Maps sandbox ID to hypervisor pointer. Uses DashMap for lock-free concurrent access. |
| 60 | +/// Entries are removed when sandboxes are dropped. |
| 61 | +static SANDBOX_REGISTRY: Lazy<DashMap<u64, SandboxEntry>> = Lazy::new(DashMap::new); |
| 62 | + |
/// Cheap "have the crash handlers been installed yet?" flag.
///
/// Once this reads `true`, `register_sandbox` no longer needs to take the
/// `INITIALIZED` mutex. It is stored with `Release` after the platform
/// handlers have been installed and loaded with `Acquire`, so a thread that
/// observes `true` also observes the completed initialization.
static INITIALIZED_FAST: AtomicBool = AtomicBool::new(false);
| 69 | + |
/// Sticky flag recording that crash handler initialization failed.
///
/// Set when the `INITIALIZED` mutex is found poisoned. Once set,
/// `register_sandbox` returns an error straight away instead of retrying,
/// since the crash handler would not work anyway.
static INITIALIZATION_FAILED: AtomicBool = AtomicBool::new(false);
| 74 | + |
| 75 | +/// Mutex-protected initialization flag (only used during first initialization). |
| 76 | +/// We use std::sync::Mutex here (not parking_lot) so we can detect poisoning. |
| 77 | +static INITIALIZED: Lazy<StdMutex<bool>> = Lazy::new(|| StdMutex::new(false)); |
| 78 | + |
| 79 | +/// Register a sandbox with the crash handler. |
| 80 | +/// |
| 81 | +/// This function: |
| 82 | +/// 1. Stores a raw pointer to the hypervisor (unsafe but controlled) |
| 83 | +/// 2. Initializes crash handlers on first call (lazy init) |
| 84 | +/// |
| 85 | +/// Only registers the sandbox if crash dumps are enabled. If disabled, |
| 86 | +/// this function returns immediately without doing anything. |
| 87 | +/// |
| 88 | +/// # Arguments |
| 89 | +/// |
| 90 | +/// * `sandbox_id` - Unique ID of the sandbox |
| 91 | +/// * `hypervisor` - Reference to the hypervisor (we store a raw pointer) |
| 92 | +/// |
| 93 | +/// # Safety |
| 94 | +/// |
| 95 | +/// The caller MUST ensure the sandbox is unregistered before the hypervisor is dropped! |
| 96 | +/// This is enforced by MultiUseSandbox::Drop. |
| 97 | +/// |
| 98 | +/// # Errors |
| 99 | +/// |
| 100 | +/// Returns an error if the mutex is poisoned (extremely rare, would indicate |
| 101 | +/// a serious issue elsewhere in the program). |
| 102 | +pub fn register_sandbox(sandbox_id: u64, hypervisor: &dyn Hypervisor) -> Result<()> { |
| 103 | + // Check if initialization previously failed - no point trying again |
| 104 | + if INITIALIZATION_FAILED.load(Ordering::Acquire) { |
| 105 | + return Err(new_error!( |
| 106 | + "Crash handler initialization previously failed, skipping registration" |
| 107 | + )); |
| 108 | + } |
| 109 | + |
| 110 | + // Fast path: check if already initialized (lock-free!) |
| 111 | + if !INITIALIZED_FAST.load(Ordering::Acquire) { |
| 112 | + // Slow path: need to initialize (only happens once) |
| 113 | + match INITIALIZED.lock() { |
| 114 | + Ok(mut initialized) => { |
| 115 | + // Double-check inside the lock (another thread might have initialized) |
| 116 | + if !*initialized { |
| 117 | + platform::init_crash_handlers(); |
| 118 | + *initialized = true; |
| 119 | + // Mark as initialized atomically (Release ensures all init is visible) |
| 120 | + INITIALIZED_FAST.store(true, Ordering::Release); |
| 121 | + } |
| 122 | + } |
| 123 | + Err(e) => { |
| 124 | + // Mutex is poisoned - mark as failed and return error |
| 125 | + INITIALIZATION_FAILED.store(true, Ordering::Release); |
| 126 | + return Err(new_error!( |
| 127 | + "INITIALIZED mutex poisoned during crash handler init: {}", |
| 128 | + e |
| 129 | + )); |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + // Add entry to registry (lock-free with DashMap!) |
| 135 | + let hypervisor_ptr = unsafe { |
| 136 | + std::mem::transmute::<*const dyn Hypervisor, *const dyn Hypervisor>( |
| 137 | + hypervisor as *const dyn Hypervisor, |
| 138 | + ) |
| 139 | + }; |
| 140 | + |
| 141 | + SANDBOX_REGISTRY.insert(sandbox_id, SandboxEntry { hypervisor_ptr }); |
| 142 | + |
| 143 | + Ok(()) |
| 144 | +} |
| 145 | + |
| 146 | +/// Unregister a sandbox from the crash handler. |
| 147 | +/// |
| 148 | +/// Called automatically by MultiUseSandbox::Drop. |
| 149 | +/// |
| 150 | +/// # Arguments |
| 151 | +/// |
| 152 | +/// * `sandbox_id` - Unique ID of the sandbox to unregister |
| 153 | +pub fn unregister_sandbox(sandbox_id: u64) { |
| 154 | + // Lock-free removal with DashMap |
| 155 | + SANDBOX_REGISTRY.remove(&sandbox_id); |
| 156 | +} |
| 157 | + |
| 158 | +/// Generate dumps for all registered sandboxes. |
| 159 | +/// |
| 160 | +/// Called by platform-specific crash handlers when a fatal signal/exception occurs. |
| 161 | +/// Iterates through the registry and generates dumps for all registered sandboxes. |
| 162 | +/// Only sandboxes with dumps enabled are registered, so all entries get dumped. |
| 163 | +/// |
| 164 | +/// # Safety |
| 165 | +/// |
| 166 | +/// This function is called during crash handling and: |
| 167 | +/// - Dereferences raw pointers (unsafe but acceptable during crash) |
| 168 | +/// - May violate async-signal-safety on Linux |
| 169 | +/// - Accesses hypervisor state without locks |
| 170 | +/// |
| 171 | +/// All of this is acceptable because the process is crashing anyway. |
| 172 | +/// |
| 173 | +/// # Returns |
| 174 | +/// |
| 175 | +/// Number of dumps successfully generated. |
| 176 | +pub(crate) fn generate_crash_dumps() -> usize { |
| 177 | + let mut dump_count = 0; |
| 178 | + |
| 179 | + // Iterate over the lock-free registry |
| 180 | + for entry_ref in SANDBOX_REGISTRY.iter() { |
| 181 | + let entry = entry_ref.value(); |
| 182 | + |
| 183 | + // SAFETY: This is unsafe! We're dereferencing a raw pointer. |
| 184 | + // This is acceptable because: |
| 185 | + // 1. The sandbox registers/unregisters properly via Drop |
| 186 | + // 2. During a crash, the process is dying anyway |
| 187 | + // 3. We're willing to accept potential UB during crash handling |
| 188 | + unsafe { |
| 189 | + let hypervisor = &*entry.hypervisor_ptr; |
| 190 | + |
| 191 | + // Try to generate the crash dump |
| 192 | + // This is NOT async-signal-safe (file I/O, allocations, etc.) |
| 193 | + // but we're crashing, so this is acceptable |
| 194 | + // |
| 195 | + // Catch panics: If generating one dump panics, it maybe indicates |
| 196 | + // a systemic issue so we short-circuit |
| 197 | + // rather than risk cascading failures |
| 198 | + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { |
| 199 | + crate::hypervisor::crashdump::generate_crashdump(hypervisor) |
| 200 | + })); |
| 201 | + |
| 202 | + match result { |
| 203 | + Ok(Ok(())) => { |
| 204 | + dump_count += 1; |
| 205 | + } |
| 206 | + Ok(Err(_)) => { |
| 207 | + // Silent failure - dump generation returned an error |
| 208 | + } |
| 209 | + Err(_) => { |
| 210 | + // Panic during dump generation - abort remaining dumps |
| 211 | + // This may indicate a systemic issue |
| 212 | + break; |
| 213 | + } |
| 214 | + } |
| 215 | + } |
| 216 | + } |
| 217 | + |
| 218 | + dump_count |
| 219 | +} |
| 220 | + |
| 221 | +// Platform-specific implementations |
| 222 | +#[cfg(target_os = "linux")] |
| 223 | +#[path = "crash_handler/linux.rs"] |
| 224 | +mod platform; |
| 225 | + |
| 226 | +#[cfg(target_os = "windows")] |
| 227 | +#[path = "crash_handler/windows.rs"] |
| 228 | +mod platform; |
0 commit comments