diff --git a/.jules/sentinel.md b/.jules/sentinel.md new file mode 100644 index 0000000..fb003fd --- /dev/null +++ b/.jules/sentinel.md @@ -0,0 +1,11 @@ +# Sentinel Journal + +## 2025-01-27 - [Bus Error via Memory Mapping and File Truncation] +**Vulnerability:** A `Bus error` (SIGBUS) occurred when the input file (memory-mapped) was the same as the output file (opened with `File::create`, which truncates it). +**Learning:** Memory mapping a file and then truncating it via another file handle in the same or another process leads to a crash when the memory-mapped region is accessed. Standard path comparison with `canonicalize` is insufficient if hardlinks are used. +**Prevention:** Check if input and output paths refer to the same file using `std::fs::canonicalize` and, on Unix systems, compare device and inode numbers to catch hardlinks. + +## 2025-01-27 - [Decompression Bomb Mitigation] +**Vulnerability:** Lack of size limits during block decompression could lead to resource exhaustion (DoS) if a malicious or malformed bzip2 block is processed. +**Learning:** Even if a format specifies a maximum block size (like 900KB for standard bzip2), a decoder should never trust the input. A 2MB limit was chosen as it is more than double the standard maximum, allowing for safe margins while preventing massive memory allocation from malformed blocks. Caveat: the 900KB figure is the block size after bzip2's initial run-length encoding, so a well-formed block made of long byte runs can legitimately decompress to far more than 900KB (tens of MB); a fixed 2MB cap can therefore reject valid input and should be re-validated against highly repetitive files. +**Prevention:** Use `Read::take()` to limit the amount of data decompressed from a single block and return an error if the limit is exceeded. 
diff --git a/a b/a new file mode 100644 index 0000000..9daeafb --- /dev/null +++ b/a @@ -0,0 +1 @@ +test diff --git a/b b/b new file mode 100644 index 0000000..9daeafb --- /dev/null +++ b/b @@ -0,0 +1 @@ +test diff --git a/bz2zstd/src/main.rs b/bz2zstd/src/main.rs index ced1e01..456b6f3 100644 --- a/bz2zstd/src/main.rs +++ b/bz2zstd/src/main.rs @@ -143,28 +143,57 @@ fn main() -> Result<()> { let (result_sender, result_receiver) = bounded::<(usize, Vec)>(rayon::current_num_threads() * 2); + // Determine output file path + let output_path = if let Some(path) = args.output { + path + } else { + // Auto-generate output filename by replacing .bz2 with .zst + let input_str = args.input.to_string_lossy(); + if input_str.ends_with("bz2") { + PathBuf::from(input_str.replace("bz2", "zst")) + } else { + let mut path = args.input.clone(); + path.set_extension("zst"); + path + } + }; + + // Check if input and output refer to the same file to avoid Bus Error (mmap conflict) + // This handles symlinks via canonicalize and hardlinks via device/inode check on Unix. + let is_same = (|| { + let abs_input = std::fs::canonicalize(&args.input).ok()?; + let abs_output = std::fs::canonicalize(&output_path).ok()?; + if abs_input == abs_output { + return Some(true); + } + + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + let meta_in = std::fs::metadata(&abs_input).ok()?; + let meta_out = std::fs::metadata(&abs_output).ok()?; + if meta_in.dev() == meta_out.dev() && meta_in.ino() == meta_out.ino() { + return Some(true); + } + } + Some(false) + })() + .unwrap_or(false); + + if is_same { + anyhow::bail!( + "Input and output files cannot be the same (preventing Bus Error with mmap)" + ); + } + // === STAGE 3: WRITER THREAD === // // Receives compressed blocks from workers and writes them in order. // Uses a HashMap to buffer out-of-order blocks. 
+ let writer_path = output_path.clone(); let writer_handle = thread::spawn(move || -> Result<()> { - // Determine output file path - let output_path = if let Some(path) = args.output { - path - } else { - // Auto-generate output filename by replacing .bz2 with .zst - let input_str = args.input.to_string_lossy(); - if input_str.ends_with("bz2") { - PathBuf::from(input_str.replace("bz2", "zst")) - } else { - let mut path = args.input.clone(); - path.set_extension("zst"); - path - } - }; - let raw_out: Box = - Box::new(File::create(output_path).context("Failed to create output file")?); + Box::new(File::create(writer_path).context("Failed to create output file")?); let mut out = OutputWriter::new(raw_out)?; // Buffer for out-of-order blocks @@ -281,8 +310,13 @@ fn main() -> Result<()> { // Decompress the bzip2 block // Note: Last block may not have EOS marker, causing UnexpectedEof decomp_buf.clear(); - let mut decoder = BzDecoder::new(&wrapped_data[..]); + // Limit decompression to 2MB to prevent decompression bombs. + // Standard bzip2 blocks are max 900KB. + let mut decoder = BzDecoder::new(&wrapped_data[..]).take(2_000_001); match decoder.read_to_end(decomp_buf) { + Ok(n) if n > 2_000_000 => { + return Err(anyhow::anyhow!("Decompressed block exceeds 2MB limit (potential decompression bomb)")); + } Ok(_) => {} // Expected for last block without EOS marker Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {} diff --git a/parallel_bzip2_decoder/src/lib.rs b/parallel_bzip2_decoder/src/lib.rs index 410e827..62b9e11 100644 --- a/parallel_bzip2_decoder/src/lib.rs +++ b/parallel_bzip2_decoder/src/lib.rs @@ -295,8 +295,13 @@ pub fn decompress_block_into( // Decompress using the bzip2 crate // Note: The last block may not have a proper EOS marker, causing UnexpectedEof out.clear(); - let mut decoder = BzDecoder::new(&scratch[..]); + // Limit decompression to 2MB to prevent decompression bombs. + // Standard bzip2 blocks are max 900KB. 
+ let mut decoder = BzDecoder::new(&scratch[..]).take(2_000_001); match decoder.read_to_end(out) { + Ok(n) if n > 2_000_000 => Err(Bz2Error::InvalidFormat( + "Decompressed block exceeds 2MB limit (potential decompression bomb)".to_string(), + )), Ok(_) => Ok(()), // UnexpectedEof is expected for the last block without EOS marker Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(()), diff --git a/test.bz2 b/test.bz2 new file mode 100644 index 0000000..236db12 Binary files /dev/null and b/test.bz2 differ diff --git a/test_link.bz2 b/test_link.bz2 new file mode 100644 index 0000000..236db12 Binary files /dev/null and b/test_link.bz2 differ