From 0261c63f52eaedfc19774b95fa6fc7dc5f25127d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Feb 2026 07:27:26 +0000 Subject: [PATCH 1/3] Initial plan From 58d3d10802a5921ce5d40a2391f08cc25c56b439 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Feb 2026 07:36:32 +0000 Subject: [PATCH 2/3] Fix -m and -M options for file utility to support hex escape sequences - Fixed get_magic_files() to only use specified magic file with -m (not default) - Changed Value::String to store Vec<u8> instead of String to handle non-UTF-8 bytes - Implemented hex escape sequence parsing (\xNN) for magic files - Fixed octal escape sequence parsing to work with raw bytes - Updated string_test to compare raw bytes directly - Added test for JBIG2 image detection using hex escape sequences - All existing tests pass Co-authored-by: jgarzik <494411+jgarzik@users.noreply.github.com> --- file/file.rs | 2 +- file/magic.rs | 219 ++++++++++++++++++++++++++++-------- file/tests/file/jbig2.magic | 1 + file/tests/file/mod.rs | 23 ++++ file/tests/file/test.jb2 | 2 + 5 files changed, 196 insertions(+), 51 deletions(-) create mode 100644 file/tests/file/jbig2.magic create mode 100644 file/tests/file/test.jb2 diff --git a/file/file.rs b/file/file.rs index 0c77af90d..8fa266d7f 100644 --- a/file/file.rs +++ b/file/file.rs @@ -102,7 +102,7 @@ fn get_magic_files(args: &Args) -> Vec { } else if let Some(test_file1) = &args.test_file1 { magic_files.push(test_file1.clone()); - if args.test_file2.is_none() && !args.default_tests { + if args.default_tests { magic_files.push(default_magic_file); } } else { diff --git a/file/magic.rs b/file/magic.rs index 8c0d6ecc5..27c2f0792 100644 --- a/file/magic.rs +++ b/file/magic.rs @@ -18,8 +18,6 @@ use std::{ }; // Pre-compiled static regexes for performance -static OCTAL_RE: LazyLock = - LazyLock::new(|| Regex::new(r"\\([0-7]{1,3})").expect("invalid 
regex")); static COMP_OP_RE: LazyLock = LazyLock::new(|| Regex::new(r"^[=<>&^x]").expect("invalid regex")); static HEX_RE: LazyLock = @@ -90,7 +88,7 @@ enum ComparisonOperator { #[derive(Debug)] enum Value { - String(String), + String(Vec), // Store raw bytes instead of String Number(ComparisonOperator, u64), } @@ -98,39 +96,43 @@ impl Value { fn parse(mut input: String, _type: Type) -> Result { match _type { Type::String => { - let mut result = String::new(); - let mut chars = input.chars(); - - // replace character escape sequences with their characters - while let Some(c) = chars.next() { - if c == '\\' { - if let Some(escaped) = chars.next() { - let replacement = match escaped { - '\\' => '\\', - 'a' => '\x07', // alert - 'b' => '\x08', // backspace - 'f' => '\x0C', // form feed - 'n' => '\n', // newline - 'r' => '\r', // carriage return - 't' => '\t', // horizontal tab - 'v' => '\x0B', // vertical tab - ' ' => ' ', // space - _ => { - result.push('\\'); - escaped - } // Treat any other character as itself - }; - result.push(replacement); - } else { - result.push('\\'); - } + // First, replace hex escape sequences + let bytes = Self::replace_all_hex_sequences_with_their_coded_values(&input)?; + + // Then, replace octal escape sequences + let bytes = + Self::replace_all_octal_sequences_with_their_coded_values_bytes(&bytes)?; + + // Replace character escape sequences with their characters + let mut result = Vec::new(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + let replacement = match bytes[i + 1] { + b'\\' => b'\\', + b'a' => 0x07, // alert + b'b' => 0x08, // backspace + b'f' => 0x0C, // form feed + b'n' => b'\n', // newline + b'r' => b'\r', // carriage return + b't' => b'\t', // horizontal tab + b'v' => 0x0B, // vertical tab + b' ' => b' ', // space + _ => { + result.push(b'\\'); + result.push(bytes[i + 1]); + i += 2; + continue; + } + }; + result.push(replacement); + i += 2; } else { - result.push(c); + 
result.push(bytes[i]); + i += 1; } } - result = Self::replace_all_octal_sequences_with_their_coded_values(&input)?; - Ok(Value::String(result)) } _ => { @@ -141,24 +143,104 @@ impl Value { } } - /// Replace the octal sequences in the string with the value they represent - /// in utf8 - fn replace_all_octal_sequences_with_their_coded_values( - input: &str, - ) -> Result { - // replace octal sequences with specific coded values (using pre-compiled regex) - let result = OCTAL_RE - .replace_all(input, |capture: ®ex::Captures| { - let mat = capture.get(1).unwrap().as_str(); - - let v = u32::from_str_radix(mat, 8).unwrap(); - let c = char::from_u32(v).unwrap(); - c.to_string() - }) - .to_string(); + /// Replace the octal sequences in the bytes with the value they represent + fn replace_all_octal_sequences_with_their_coded_values_bytes( + input: &[u8], + ) -> Result, RawMagicLineParseError> { + let mut result = Vec::new(); + let mut i = 0; + + while i < input.len() { + if input[i] == b'\\' && i + 1 < input.len() { + // Check if next characters are octal digits (1-3 digits) + let start = i + 1; + let mut end = start; + let mut count = 0; + + while end < input.len() && count < 3 { + if input[end] >= b'0' && input[end] <= b'7' { + end += 1; + count += 1; + } else { + break; + } + } + + if count > 0 { + // Parse the octal number + let octal_str = std::str::from_utf8(&input[start..end]).unwrap(); + if let Ok(byte_val) = u8::from_str_radix(octal_str, 8) { + result.push(byte_val); + i = end; + continue; + } + } + } + + result.push(input[i]); + i += 1; + } + Ok(result) } + /// Replace hex escape sequences (\xNN) with their byte values + fn replace_all_hex_sequences_with_their_coded_values( + input: &str, + ) -> Result, RawMagicLineParseError> { + let mut result_bytes = Vec::new(); + let mut chars = input.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' { + if chars.peek() == Some(&'x') { + chars.next(); // consume 'x' + + // Collect hex digits (1-2 
digits) + let mut hex_str = String::new(); + for _ in 0..2 { + if let Some(&ch) = chars.peek() { + if ch.is_ascii_hexdigit() { + hex_str.push(ch); + chars.next(); + } else { + break; + } + } else { + break; + } + } + + if !hex_str.is_empty() { + // Parse hex value and add as raw byte + if let Ok(byte_val) = u8::from_str_radix(&hex_str, 16) { + result_bytes.push(byte_val); + } else { + // If parsing fails, keep the original sequence + result_bytes.push(b'\\'); + result_bytes.push(b'x'); + result_bytes.extend_from_slice(hex_str.as_bytes()); + } + } else { + // No hex digits found, keep \x as is + result_bytes.push(b'\\'); + result_bytes.push(b'x'); + } + } else { + // Not a hex escape, keep the backslash + result_bytes.push(b'\\'); + } + } else { + // Regular character - encode as UTF-8 bytes + let mut buf = [0u8; 4]; + let bytes = c.encode_utf8(&mut buf).as_bytes(); + result_bytes.extend_from_slice(bytes); + } + } + + Ok(result_bytes) + } + fn parse_number(input: &mut String) -> Option<(ComparisonOperator, u64)> { let comparision_op = match COMP_OP_RE.find(input) { Some(mat) => { @@ -430,9 +512,8 @@ impl RawMagicFileLine { return false; } - if let Ok(tf_val) = String::from_utf8(buf.clone()) { - return tf_val.as_bytes() == val.as_bytes(); - } + // Compare raw bytes directly + return buf == *val; } false } @@ -506,3 +587,41 @@ fn parse_magic_file_and_test( "Couldn't match any magic number", ))) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hex_escape_replacement() { + let input = r"\x97JB2\x0D\x0A\x1A\x0A"; + let result = Value::replace_all_hex_sequences_with_their_coded_values(input).unwrap(); + + // \x97 should become byte 0x97 (151 in decimal, 227 in octal) + // JB2 stays as is + // \x0D should become CR (13) + // \x0A should become LF (10) + // \x1A should become SUB (26) + // \x0A should become LF (10) + + let expected_bytes: Vec = vec![0x97, b'J', b'B', b'2', 0x0D, 0x0A, 0x1A, 0x0A]; + assert_eq!(result, expected_bytes); + } +} + +#[test] 
+fn test_magic_line_parsing() { + let line = "0 string \\x97 MATCH_97"; + let result = RawMagicFileLine::parse(line.to_string()); + assert!(result.is_ok(), "Failed to parse magic line: {:?}", result); + + let magic_line = result.unwrap(); + assert_eq!(magic_line.offset.num, 0); + assert_eq!(magic_line.message, "MATCH_97"); + + if let Value::String(val) = magic_line.value { + assert_eq!(val, vec![0x97]); + } else { + panic!("Expected String value"); + } +} diff --git a/file/tests/file/jbig2.magic b/file/tests/file/jbig2.magic new file mode 100644 index 000000000..1b9c4802a --- /dev/null +++ b/file/tests/file/jbig2.magic @@ -0,0 +1 @@ +0 string \x97JB2\x0D\x0A\x1A\x0A JBIG2 image data diff --git a/file/tests/file/mod.rs b/file/tests/file/mod.rs index f292b5084..cf3392682 100644 --- a/file/tests/file/mod.rs +++ b/file/tests/file/mod.rs @@ -253,3 +253,26 @@ fn file_magic_file_priority_with_M_and_m_option_as_they_appear_using_cpio_archiv "", ); } + +#[test] +fn file_magic_with_hex_escape_sequences_jbig2() { + use std::env; + use std::path::PathBuf; + + // Get the directory of the Cargo project + let cargo_manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + let jbig2_magic = cargo_manifest_dir.join("tests/file/jbig2.magic"); + let test_jb2_file = cargo_manifest_dir.join("tests/file/test.jb2"); + + // Test that file recognizes JBIG2 image data using hex escape sequences in magic file + file_test( + &[ + "-m", + jbig2_magic.to_str().unwrap(), + test_jb2_file.to_str().unwrap(), + ], + &format!("{}: JBIG2 image data\n", test_jb2_file.to_str().unwrap()), + "", + ); +} diff --git a/file/tests/file/test.jb2 b/file/tests/file/test.jb2 new file mode 100644 index 000000000..537e35d07 --- /dev/null +++ b/file/tests/file/test.jb2 @@ -0,0 +1,2 @@ +—JB2 + From 4288734fe000eac22046e86383bc094c92558e41 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Feb 2026 07:37:50 +0000 Subject: [PATCH 3/3] 
Address code review comments - Fix misleading comment about hex value representation - Move test_magic_line_parsing inside mod tests block Co-authored-by: jgarzik <494411+jgarzik@users.noreply.github.com> --- file/magic.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/file/magic.rs b/file/magic.rs index 27c2f0792..84e2a736b 100644 --- a/file/magic.rs +++ b/file/magic.rs @@ -597,7 +597,7 @@ mod tests { let input = r"\x97JB2\x0D\x0A\x1A\x0A"; let result = Value::replace_all_hex_sequences_with_their_coded_values(input).unwrap(); - // \x97 should become byte 0x97 (151 in decimal, 227 in octal) + // \x97 should become byte 0x97 (151 decimal, 0x97 hex) // JB2 stays as is // \x0D should become CR (13) // \x0A should become LF (10) @@ -607,21 +607,21 @@ mod tests { let expected_bytes: Vec = vec![0x97, b'J', b'B', b'2', 0x0D, 0x0A, 0x1A, 0x0A]; assert_eq!(result, expected_bytes); } -} -#[test] -fn test_magic_line_parsing() { - let line = "0 string \\x97 MATCH_97"; - let result = RawMagicFileLine::parse(line.to_string()); - assert!(result.is_ok(), "Failed to parse magic line: {:?}", result); + #[test] + fn test_magic_line_parsing() { + let line = "0 string \\x97 MATCH_97"; + let result = RawMagicFileLine::parse(line.to_string()); + assert!(result.is_ok(), "Failed to parse magic line: {:?}", result); - let magic_line = result.unwrap(); - assert_eq!(magic_line.offset.num, 0); - assert_eq!(magic_line.message, "MATCH_97"); + let magic_line = result.unwrap(); + assert_eq!(magic_line.offset.num, 0); + assert_eq!(magic_line.message, "MATCH_97"); - if let Value::String(val) = magic_line.value { - assert_eq!(val, vec![0x97]); - } else { - panic!("Expected String value"); + if let Value::String(val) = magic_line.value { + assert_eq!(val, vec![0x97]); + } else { + panic!("Expected String value"); + } } }