From 114c113798494a02d1f5bda272cf83f6acb11a19 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 27 Jan 2026 20:41:54 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20Fix=20B?= =?UTF-8?q?us=20Error=20and=20mitigate=20decompression=20bombs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses two security and robustness issues: 1. Prevents a Bus Error (SIGBUS) when input and output files are the same. The tool uses memory mapping for input and truncates the output file before writing. If they are the same, truncation causes a crash. Fixed by checking file identity with `canonicalize`. 2. Adds a 2MB decompression limit per block. Standard bzip2 blocks are at most 900KB. This limit provides defense-in-depth against decompression bombs and malformed inputs that could cause resource exhaustion. Learnings recorded in .jules/sentinel.md. Co-authored-by: kassoulet <1905+kassoulet@users.noreply.github.com> --- .jules/sentinel.md | 11 +++++++ bz2zstd/src/main.rs | 49 +++++++++++++++++++----------- parallel_bzip2_decoder/src/lib.rs | 7 ++++- test.bz2 | Bin 0 -> 52 bytes 4 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 .jules/sentinel.md create mode 100644 test.bz2 diff --git a/.jules/sentinel.md b/.jules/sentinel.md new file mode 100644 index 0000000..6b582bd --- /dev/null +++ b/.jules/sentinel.md @@ -0,0 +1,11 @@ +# Sentinel Journal + +## 2025-01-27 - [Bus Error via Memory Mapping and File Truncation] +**Vulnerability:** A `Bus error` (SIGBUS) occurred when the input file (memory-mapped) was the same as the output file (opened with `File::create`, which truncates it). +**Learning:** Memory mapping a file and then truncating it via another file handle in the same or another process leads to a crash when the memory-mapped region is accessed. This is a common pitfall when using `mmap` for performance. +**Prevention:** Always check if input and output paths refer to the same file (e.g., using `std::fs::canonicalize`) before opening the output file for writing, especially when the input is memory-mapped. + +## 2025-01-27 - [Decompression Bomb Mitigation] +**Vulnerability:** Lack of size limits during block decompression could lead to resource exhaustion (DoS) if a malicious or malformed bzip2 block is processed. +**Learning:** Even if a format specifies a maximum block size (like 900KB for bzip2), a decoder should never trust the input and should always enforce a reasonable limit on the uncompressed output. +**Prevention:** Use `Read::take()` to limit the amount of data decompressed from a single block and return an error if the limit is exceeded. diff --git a/bz2zstd/src/main.rs b/bz2zstd/src/main.rs index ced1e01..6bc58db 100644 --- a/bz2zstd/src/main.rs +++ b/bz2zstd/src/main.rs @@ -143,28 +143,38 @@ fn main() -> Result<()> { let (result_sender, result_receiver) = bounded::<(usize, Vec)>(rayon::current_num_threads() * 2); + // Determine output file path + let output_path = if let Some(path) = args.output { + path + } else { + // Auto-generate output filename by replacing .bz2 with .zst + let input_str = args.input.to_string_lossy(); + if input_str.ends_with("bz2") { + PathBuf::from(input_str.replace("bz2", "zst")) + } else { + let mut path = args.input.clone(); + path.set_extension("zst"); + path + } + }; + + // Check if input and output refer to the same file to avoid Bus Error (mmap conflict) + if let Ok(abs_input) = std::fs::canonicalize(&args.input) { + if let Ok(abs_output) = std::fs::canonicalize(&output_path) { + if abs_input == abs_output { + anyhow::bail!("Input and output files cannot be the same (preventing Bus Error with mmap)"); + } + } + } + // === STAGE 3: WRITER THREAD === // // Receives compressed blocks from workers and writes them in order. // Uses a HashMap to buffer out-of-order blocks. + let writer_path = output_path.clone(); let writer_handle = thread::spawn(move || -> Result<()> { - // Determine output file path - let output_path = if let Some(path) = args.output { - path - } else { - // Auto-generate output filename by replacing .bz2 with .zst - let input_str = args.input.to_string_lossy(); - if input_str.ends_with("bz2") { - PathBuf::from(input_str.replace("bz2", "zst")) - } else { - let mut path = args.input.clone(); - path.set_extension("zst"); - path - } - }; - let raw_out: Box = - Box::new(File::create(output_path).context("Failed to create output file")?); + Box::new(File::create(writer_path).context("Failed to create output file")?); let mut out = OutputWriter::new(raw_out)?; // Buffer for out-of-order blocks @@ -281,8 +291,13 @@ fn main() -> Result<()> { // Decompress the bzip2 block // Note: Last block may not have EOS marker, causing UnexpectedEof decomp_buf.clear(); - let mut decoder = BzDecoder::new(&wrapped_data[..]); + // Limit decompression to 2MB to prevent decompression bombs. + // Standard bzip2 blocks are max 900KB. + let mut decoder = BzDecoder::new(&wrapped_data[..]).take(2_000_001); match decoder.read_to_end(decomp_buf) { + Ok(n) if n > 2_000_000 => { + return Err(anyhow::anyhow!("Decompressed block exceeds 2MB limit (potential decompression bomb)")); + } Ok(_) => {} // Expected for last block without EOS marker Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {} diff --git a/parallel_bzip2_decoder/src/lib.rs b/parallel_bzip2_decoder/src/lib.rs index 410e827..62b9e11 100644 --- a/parallel_bzip2_decoder/src/lib.rs +++ b/parallel_bzip2_decoder/src/lib.rs @@ -295,8 +295,13 @@ pub fn decompress_block_into( // Decompress using the bzip2 crate // Note: The last block may not have a proper EOS marker, causing UnexpectedEof out.clear(); - let mut decoder = BzDecoder::new(&scratch[..]); + // Limit decompression to 2MB to prevent decompression bombs. + // Standard bzip2 blocks are max 900KB. + let mut decoder = BzDecoder::new(&scratch[..]).take(2_000_001); match decoder.read_to_end(out) { + Ok(n) if n > 2_000_000 => Err(Bz2Error::InvalidFormat( + "Decompressed block exceeds 2MB limit (potential decompression bomb)".to_string(), + )), Ok(_) => Ok(()), // UnexpectedEof is expected for the last block without EOS marker Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(()), diff --git a/test.bz2 b/test.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..236db1206be6219445ea67f500562340f8e70a93 GIT binary patch literal 52 zcmZ>Y%CIzaj8qGb^n3HdjDdkEuz^9qfq~6sLW2T>A)AjQ Date: Wed, 28 Jan 2026 22:44:54 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20Fix=20B?= =?UTF-8?q?us=20Error=20(including=20hardlinks)=20and=20mitigate=20decompr?= =?UTF-8?q?ession=20bombs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses security and robustness issues: 1. Prevents a Bus Error (SIGBUS) when input and output files are the same. - Uses `canonicalize` to resolve symlinks and relative paths. - On Unix, compares device and inode numbers to correctly identify hardlinks. 2. Adds a 2MB decompression limit per block. - Standard bzip2 blocks are at most 900KB. - This limit provides defense-in-depth against decompression bombs and malformed inputs that could cause resource exhaustion (Memory DoS). Learnings recorded in .jules/sentinel.md. Co-authored-by: kassoulet <1905+kassoulet@users.noreply.github.com> --- .jules/sentinel.md | 6 +++--- bz2zstd/src/main.rs | 27 +++++++++++++++++++++++---- test_link.bz2 | Bin 0 -> 52 bytes 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 test_link.bz2 diff --git a/.jules/sentinel.md b/.jules/sentinel.md index 6b582bd..fb003fd 100644 --- a/.jules/sentinel.md +++ b/.jules/sentinel.md @@ -2,10 +2,10 @@ ## 2025-01-27 - [Bus Error via Memory Mapping and File Truncation] **Vulnerability:** A `Bus error` (SIGBUS) occurred when the input file (memory-mapped) was the same as the output file (opened with `File::create`, which truncates it). -**Learning:** Memory mapping a file and then truncating it via another file handle in the same or another process leads to a crash when the memory-mapped region is accessed. This is a common pitfall when using `mmap` for performance. -**Prevention:** Always check if input and output paths refer to the same file (e.g., using `std::fs::canonicalize`) before opening the output file for writing, especially when the input is memory-mapped. +**Learning:** Memory mapping a file and then truncating it via another file handle in the same or another process leads to a crash when the memory-mapped region is accessed. Standard path comparison with `canonicalize` is insufficient if hardlinks are used. +**Prevention:** Check if input and output paths refer to the same file using `std::fs::canonicalize` and, on Unix systems, compare device and inode numbers to catch hardlinks. ## 2025-01-27 - [Decompression Bomb Mitigation] **Vulnerability:** Lack of size limits during block decompression could lead to resource exhaustion (DoS) if a malicious or malformed bzip2 block is processed. -**Learning:** Even if a format specifies a maximum block size (like 900KB for bzip2), a decoder should never trust the input and should always enforce a reasonable limit on the uncompressed output. +**Learning:** Even if a format specifies a maximum block size (like 900KB for standard bzip2), a decoder should never trust the input. A 2MB limit was chosen as it is more than double the standard maximum, allowing for safe margins while preventing massive memory allocation from malformed blocks. **Prevention:** Use `Read::take()` to limit the amount of data decompressed from a single block and return an error if the limit is exceeded. diff --git a/bz2zstd/src/main.rs b/bz2zstd/src/main.rs index 6bc58db..456b6f3 100644 --- a/bz2zstd/src/main.rs +++ b/bz2zstd/src/main.rs @@ -159,12 +159,31 @@ fn main() -> Result<()> { }; // Check if input and output refer to the same file to avoid Bus Error (mmap conflict) - if let Ok(abs_input) = std::fs::canonicalize(&args.input) { - if let Ok(abs_output) = std::fs::canonicalize(&output_path) { - if abs_input == abs_output { - anyhow::bail!("Input and output files cannot be the same (preventing Bus Error with mmap)"); + // This handles symlinks via canonicalize and hardlinks via device/inode check on Unix. + let is_same = (|| { + let abs_input = std::fs::canonicalize(&args.input).ok()?; + let abs_output = std::fs::canonicalize(&output_path).ok()?; + if abs_input == abs_output { + return Some(true); + } + + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + let meta_in = std::fs::metadata(&abs_input).ok()?; + let meta_out = std::fs::metadata(&abs_output).ok()?; + if meta_in.dev() == meta_out.dev() && meta_in.ino() == meta_out.ino() { + return Some(true); } } + Some(false) + })() + .unwrap_or(false); + + if is_same { + anyhow::bail!( + "Input and output files cannot be the same (preventing Bus Error with mmap)" + ); } // === STAGE 3: WRITER THREAD === diff --git a/test_link.bz2 b/test_link.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..236db1206be6219445ea67f500562340f8e70a93 GIT binary patch literal 52 zcmZ>Y%CIzaj8qGb^n3HdjDdkEuz^9qfq~6sLW2T>A)AjQ Date: Thu, 29 Jan 2026 21:25:44 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20Fix=20B?= =?UTF-8?q?us=20Error=20(including=20hardlinks)=20and=20mitigate=20decompr?= =?UTF-8?q?ession=20bombs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses security and robustness issues: 1. Prevents a Bus Error (SIGBUS) when input and output files are the same. - Uses `canonicalize` to resolve symlinks and relative paths. - On Unix, compares device and inode numbers to correctly identify hardlinks. 2. Adds a 2MB decompression limit per block. - Standard bzip2 blocks are at most 900KB. - This limit provides defense-in-depth against decompression bombs and malformed inputs that could cause resource exhaustion (Memory DoS). Learnings recorded in .jules/sentinel.md. Co-authored-by: kassoulet <1905+kassoulet@users.noreply.github.com> --- a | 1 + b | 1 + 2 files changed, 2 insertions(+) create mode 100644 a create mode 100644 b diff --git a/a b/a new file mode 100644 index 0000000..9daeafb --- /dev/null +++ b/a @@ -0,0 +1 @@ +test diff --git a/b b/b new file mode 100644 index 0000000..9daeafb --- /dev/null +++ b/b @@ -0,0 +1 @@ +test