From e5e4e841f822739631305823a4c118680f4d0ee2 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 10:58:01 -0400 Subject: [PATCH 01/26] refactor(patch): ParseOpts for configurable parsing behavior --- src/patch/parse.rs | 46 +++++++++++++++++++++++++++++++----------- src/patch_set/parse.rs | 3 ++- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index dd74b15..2c40209 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -12,6 +12,31 @@ use std::borrow::Cow; type Result = std::result::Result; +/// Options that control parsing behavior. +/// +/// Defaults match the [`parse`]/[`parse_bytes`] behavior. +#[derive(Clone, Copy)] +pub(crate) struct ParseOpts { + reject_orphaned_hunks: bool, +} + +impl Default for ParseOpts { + fn default() -> Self { + Self { + reject_orphaned_hunks: false, + } + } +} + +impl ParseOpts { + /// Reject orphaned `@@ ` hunk headers after parsed hunks, + /// matching `git apply` behavior. + pub(crate) fn reject_orphaned_hunks(mut self) -> Self { + self.reject_orphaned_hunks = true; + self + } +} + struct Parser<'a, T: Text + ?Sized> { lines: std::iter::Peekable>, offset: usize, @@ -54,7 +79,7 @@ impl<'a, T: Text + ?Sized> Parser<'a, T> { } pub fn parse(input: &str) -> Result> { - let (result, _consumed) = parse_one(input); + let (result, _consumed) = parse_one(input, ParseOpts::default()); result } @@ -62,7 +87,7 @@ pub fn parse(input: &str) -> Result> { /// /// Always returns consumed bytes alongside the result /// so callers can advance past the parsed or partially parsed content. -pub(crate) fn parse_one(input: &str) -> (Result>, usize) { +pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result>, usize) { let mut parser = Parser::new(input); let header = match patch_header(&mut parser) { @@ -73,6 +98,11 @@ pub(crate) fn parse_one(input: &str) -> (Result>, usize) { Ok(h) => h, Err(e) => return (Err(e), parser.offset()), }; + if opts.reject_orphaned_hunks { + if let Err(e) = reject_orphaned_hunk_headers(&mut parser) { + return (Err(e), parser.offset()); + } + } let patch = Patch::new( header.0.map(convert_cow_to_str), @@ -83,16 +113,8 @@ pub(crate) fn parse_one(input: &str) -> (Result>, usize) { } pub fn parse_strict(input: &str) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; - let hunks = hunks(&mut parser)?; - reject_orphaned_hunk_headers(&mut parser)?; - - Ok(Patch::new( - header.0.map(convert_cow_to_str), - header.1.map(convert_cow_to_str), - hunks, - )) + let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); + result } pub fn parse_bytes(input: &[u8]) -> Result> { diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index c4864f9..cec612b 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -83,7 +83,8 @@ impl<'a> PatchSet<'a> { let patch_input = &remaining[patch_start..]; - let (result, consumed) = parse_one(patch_input); + let opts = crate::patch::parse::ParseOpts::default(); + let (result, consumed) = parse_one(patch_input, opts); // Always advance so the iterator makes progress even on error. let abs_patch_start = self.offset + patch_start; self.offset += patch_start + consumed; From 1a83fe90451af5c497678508cf16b9e816f00344 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 11:05:06 -0400 Subject: [PATCH 02/26] refactor(patch): make preamble skipping configurable --- src/patch/parse.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 2c40209..b6b869b 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -17,18 +17,30 @@ type Result = std::result::Result; /// Defaults match the [`parse`]/[`parse_bytes`] behavior. #[derive(Clone, Copy)] pub(crate) struct ParseOpts { + skip_preamble: bool, reject_orphaned_hunks: bool, } impl Default for ParseOpts { fn default() -> Self { Self { + skip_preamble: true, reject_orphaned_hunks: false, } } } impl ParseOpts { + /// Don't skip preamble lines before `---`/`+++`/`@@`. + /// + /// Useful when the caller has already positioned the input + /// at the start of the patch content. + #[allow(dead_code)] // will be used by patch_set parser + pub(crate) fn no_skip_preamble(mut self) -> Self { + self.skip_preamble = false; + self + } + /// Reject orphaned `@@ ` hunk headers after parsed hunks, /// matching `git apply` behavior. pub(crate) fn reject_orphaned_hunks(mut self) -> Self { @@ -90,7 +102,7 @@ pub fn parse(input: &str) -> Result> { pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result>, usize) { let mut parser = Parser::new(input); - let header = match patch_header(&mut parser) { + let header = match patch_header(&mut parser, &opts) { Ok(h) => h, Err(e) => return (Err(e), parser.offset()), }; @@ -119,7 +131,7 @@ pub fn parse_strict(input: &str) -> Result> { pub fn parse_bytes(input: &[u8]) -> Result> { let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; + let header = patch_header(&mut parser, &ParseOpts::default())?; let hunks = hunks(&mut parser)?; Ok(Patch::new(header.0, header.1, hunks)) @@ -127,7 +139,7 @@ pub fn parse_bytes(input: &[u8]) -> Result> { pub fn parse_bytes_strict(input: &[u8]) -> Result> { let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; + let header = patch_header(&mut parser, &ParseOpts::default())?; let hunks = hunks(&mut parser)?; reject_orphaned_hunk_headers(&mut parser)?; @@ -145,8 +157,11 @@ fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { #[allow(clippy::type_complexity)] fn patch_header<'a, T: Text + ToOwned + ?Sized>( parser: &mut Parser<'a, T>, + opts: &ParseOpts, ) -> Result<(Option>, Option>)> { - skip_header_preamble(parser)?; + if opts.skip_preamble { + skip_header_preamble(parser)?; + } let mut filename1 = None; let mut filename2 = None; From d1ec23656a3089cd1d67a1e6c329aab5c0b5be72 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 11:09:12 -0400 Subject: [PATCH 03/26] refactor(patch): organize item order --- src/patch/parse.rs | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index b6b869b..4400aec 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -95,6 +95,28 @@ pub fn parse(input: &str) -> Result> { result } +pub fn parse_strict(input: &str) -> Result> { + let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); + result +} + +pub fn parse_bytes(input: &[u8]) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser, &ParseOpts::default())?; + let hunks = hunks(&mut parser)?; + + Ok(Patch::new(header.0, header.1, hunks)) +} + +pub fn parse_bytes_strict(input: &[u8]) -> Result> { + let mut parser = Parser::new(input); + let header = patch_header(&mut parser, &ParseOpts::default())?; + let hunks = hunks(&mut parser)?; + reject_orphaned_hunk_headers(&mut parser)?; + + Ok(Patch::new(header.0, header.1, hunks)) +} + /// Parses one patch from input. /// /// Always returns consumed bytes alongside the result @@ -124,28 +146,6 @@ pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result> (Ok(patch), parser.offset()) } -pub fn parse_strict(input: &str) -> Result> { - let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); - result -} - -pub fn parse_bytes(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser, &ParseOpts::default())?; - let hunks = hunks(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) -} - -pub fn parse_bytes_strict(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser, &ParseOpts::default())?; - let hunks = hunks(&mut parser)?; - reject_orphaned_hunk_headers(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) -} - // This is only used when the type originated as a utf8 string fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { match cow { From fe9b1b3be77b04bb8ea61956a37443c1b7043a28 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 11:10:47 -0400 Subject: [PATCH 04/26] refactor(patch): wire up ParseOpts for byte parsing --- src/patch/parse.rs | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 4400aec..3d7f3ca 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -101,20 +101,13 @@ pub fn parse_strict(input: &str) -> Result> { } pub fn parse_bytes(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser, &ParseOpts::default())?; - let hunks = hunks(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) + let (result, _consumed) = parse_bytes_one(input, ParseOpts::default()); + result } pub fn parse_bytes_strict(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser, &ParseOpts::default())?; - let hunks = hunks(&mut parser)?; - reject_orphaned_hunk_headers(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) + let (result, _consumed) = parse_bytes_one(input, ParseOpts::default().reject_orphaned_hunks()); + result } /// Parses one patch from input. @@ -146,6 +139,27 @@ pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result> (Ok(patch), parser.offset()) } +/// Like [`parse_one`] but for bytes. +pub(crate) fn parse_bytes_one(input: &[u8], opts: ParseOpts) -> (Result>, usize) { + let mut parser = Parser::new(input); + + let header = match patch_header(&mut parser, &opts) { + Ok(h) => h, + Err(e) => return (Err(e), parser.offset()), + }; + let hunks = match hunks(&mut parser) { + Ok(h) => h, + Err(e) => return (Err(e), parser.offset()), + }; + if opts.reject_orphaned_hunks { + if let Err(e) = reject_orphaned_hunk_headers(&mut parser) { + return (Err(e), parser.offset()); + } + } + + (Ok(Patch::new(header.0, header.1, hunks)), parser.offset()) +} + // This is only used when the type originated as a utf8 string fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { match cow { From f4e6b2ba297cf8bd1d25f3be0c2986682f3a24d9 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 11:24:20 -0400 Subject: [PATCH 05/26] refactor(patch): deduplicate parse_one logic --- src/patch/parse.rs | 49 ++++++++++------------------------------------ src/utils.rs | 22 ++++++++++++++++----- 2 files changed, 27 insertions(+), 44 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index 3d7f3ca..c58dfd7 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -101,12 +101,12 @@ pub fn parse_strict(input: &str) -> Result> { } pub fn parse_bytes(input: &[u8]) -> Result> { - let (result, _consumed) = parse_bytes_one(input, ParseOpts::default()); + let (result, _consumed) = parse_one(input, ParseOpts::default()); result } pub fn parse_bytes_strict(input: &[u8]) -> Result> { - let (result, _consumed) = parse_bytes_one(input, ParseOpts::default().reject_orphaned_hunks()); + let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); result } @@ -114,7 +114,10 @@ pub fn parse_bytes_strict(input: &[u8]) -> Result> { /// /// Always returns consumed bytes alongside the result /// so callers can advance past the parsed or partially parsed content. -pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result>, usize) { +pub(crate) fn parse_one<'a, T: Text + ?Sized>( + input: &'a T, + opts: ParseOpts, +) -> (Result>, usize) { let mut parser = Parser::new(input); let header = match patch_header(&mut parser, &opts) { @@ -132,44 +135,15 @@ pub(crate) fn parse_one(input: &str, opts: ParseOpts) -> (Result> } let patch = Patch::new( - header.0.map(convert_cow_to_str), - header.1.map(convert_cow_to_str), + header.0.map(T::from_bytes_cow), + header.1.map(T::from_bytes_cow), hunks, ); (Ok(patch), parser.offset()) } -/// Like [`parse_one`] but for bytes. -pub(crate) fn parse_bytes_one(input: &[u8], opts: ParseOpts) -> (Result>, usize) { - let mut parser = Parser::new(input); - - let header = match patch_header(&mut parser, &opts) { - Ok(h) => h, - Err(e) => return (Err(e), parser.offset()), - }; - let hunks = match hunks(&mut parser) { - Ok(h) => h, - Err(e) => return (Err(e), parser.offset()), - }; - if opts.reject_orphaned_hunks { - if let Err(e) = reject_orphaned_hunk_headers(&mut parser) { - return (Err(e), parser.offset()); - } - } - - (Ok(Patch::new(header.0, header.1, hunks)), parser.offset()) -} - -// This is only used when the type originated as a utf8 string -fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { - match cow { - Cow::Borrowed(b) => std::str::from_utf8(b).unwrap().into(), - Cow::Owned(o) => String::from_utf8(o).unwrap().into(), - } -} - #[allow(clippy::type_complexity)] -fn patch_header<'a, T: Text + ToOwned + ?Sized>( +fn patch_header<'a, T: Text + ?Sized>( parser: &mut Parser<'a, T>, opts: &ParseOpts, ) -> Result<(Option>, Option>)> { @@ -212,10 +186,7 @@ fn skip_header_preamble(parser: &mut Parser<'_, T>) -> Result< Ok(()) } -fn parse_filename<'a, T: Text + ToOwned + ?Sized>( - prefix: &str, - line: &'a T, -) -> Result> { +fn parse_filename<'a, T: Text + ?Sized>(prefix: &str, line: &'a T) -> Result> { let line = line .strip_prefix(prefix) .ok_or(ParsePatchErrorKind::InvalidFilename)?; diff --git a/src/utils.rs b/src/utils.rs index c3b1594..4e98c95 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -132,7 +132,7 @@ impl<'a, T: Text + ?Sized> Iterator for LineIter<'a, T> { /// A helper trait for processing text like `str` and `[u8]` /// Useful for abstracting over those types for parsing as well as breaking input into lines -pub trait Text: Eq + Hash { +pub trait Text: Eq + Hash + ToOwned { fn is_empty(&self) -> bool; fn len(&self) -> usize; fn starts_with(&self, prefix: &str) -> bool; @@ -148,6 +148,9 @@ pub trait Text: Eq + Hash { #[allow(unused)] fn lines(&self) -> LineIter<'_, Self>; + /// Converts a byte-oriented `Cow` into a `Cow` of `Self`. + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self>; + fn parse(&self) -> Option { self.as_str().and_then(|s| s.parse().ok()) } @@ -202,6 +205,13 @@ impl Text for str { fn lines(&self) -> LineIter<'_, Self> { LineIter::new(self) } + + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self> { + match cow { + Cow::Borrowed(b) => Cow::Borrowed(std::str::from_utf8(b).unwrap()), + Cow::Owned(o) => Cow::Owned(String::from_utf8(o).unwrap()), + } + } } impl Text for [u8] { @@ -252,6 +262,10 @@ impl Text for [u8] { fn lines(&self) -> LineIter<'_, Self> { LineIter::new(self) } + + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self> { + cow + } } fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { @@ -292,7 +306,7 @@ fn find_byte(haystack: &[u8], byte: u8) -> Option { /// /// See [`byte_needs_quoting`] for the set of characters that /// require quoting. -pub(crate) fn escaped_filename( +pub(crate) fn escaped_filename( filename: &T, ) -> Result, ParsePatchError> { if let Some(inner) = filename @@ -309,9 +323,7 @@ pub(crate) fn escaped_filename( } } -fn decode_escaped( - escaped: &T, -) -> Result, ParsePatchError> { +fn decode_escaped(escaped: &T) -> Result, ParsePatchError> { let bytes = escaped.as_bytes(); let mut result = Vec::new(); let mut i = 0; From edf93e51e5d5945aa8472af6be468d20a60c8bc0 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 12:30:29 -0400 Subject: [PATCH 06/26] refactor(patch_set): extract `line_ending_len` helper `.lines()` strips line endings, so callers tracking byte offsets need to re-add the `\r\n` or `\n` length manually. Extract the repeated inline pattern into a reusable helper. --- src/patch_set/parse.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index cec612b..e93d7b6 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -142,12 +142,7 @@ fn find_patch_start(input: &str) -> Option { return Some(offset); } offset += line.len(); - // Account for the line ending that `.lines()` strips - if input[offset..].starts_with("\r\n") { - offset += 2; - } else if input[offset..].starts_with('\n') { - offset += 1; - } + offset += line_ending_len(&input[offset..]); } None } @@ -222,3 +217,17 @@ pub(crate) fn extract_file_op_unidiff<'a>( } } } + +/// Returns the length of the line ending at the start of `s`. +/// +/// `.lines()` strips line endings, so callers tracking byte offsets need to +/// account for the `\r\n` or `\n` that was consumed. +fn line_ending_len(s: &str) -> usize { + if s.starts_with("\r\n") { + 2 + } else if s.starts_with('\n') { + 1 + } else { + 0 + } +} From 90861e0faee3c4ac143e523aeccb0b79e2ebaa8c Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 12:30:59 -0400 Subject: [PATCH 07/26] feat(patch_set): support git diff application * Parse `diff --git` extended headers * split multi-file git diffs at `diff --git` boundaries --- src/patch/parse.rs | 1 - src/patch_set/error.rs | 14 ++ src/patch_set/mod.rs | 76 +++++++- src/patch_set/parse.rs | 431 ++++++++++++++++++++++++++++++++++++++++- src/patch_set/tests.rs | 121 ++++++++++++ 5 files changed, 638 insertions(+), 5 deletions(-) diff --git a/src/patch/parse.rs b/src/patch/parse.rs index c58dfd7..2ccb409 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -35,7 +35,6 @@ impl ParseOpts { /// /// Useful when the caller has already positioned the input /// at the start of the patch content. - #[allow(dead_code)] // will be used by patch_set parser pub(crate) fn no_skip_preamble(mut self) -> Self { self.skip_preamble = false; self diff --git a/src/patch_set/error.rs b/src/patch_set/error.rs index ba60e86..9cd6aa9 100644 --- a/src/patch_set/error.rs +++ b/src/patch_set/error.rs @@ -70,6 +70,15 @@ pub(crate) enum PatchSetParseErrorKind { /// Create patch missing modified path. CreateMissingModifiedPath, + + /// Invalid file mode string. + InvalidFileMode(String), + + /// Invalid `diff --git` path. + InvalidDiffGitPath, + + /// Binary diff not supported in current configuration. + BinaryNotSupported { path: String }, } impl fmt::Display for PatchSetParseErrorKind { @@ -81,6 +90,11 @@ impl fmt::Display for PatchSetParseErrorKind { Self::BothDevNull => write!(f, "patch has both original and modified as /dev/null"), Self::DeleteMissingOriginalPath => write!(f, "delete patch has no original path"), Self::CreateMissingModifiedPath => write!(f, "create patch has no modified path"), + Self::InvalidFileMode(mode) => write!(f, "invalid file mode: {mode}"), + Self::InvalidDiffGitPath => write!(f, "invalid diff --git path"), + Self::BinaryNotSupported { path } => { + write!(f, "binary diff not supported: {path}") + } } } } diff --git a/src/patch_set/mod.rs b/src/patch_set/mod.rs index 7e15a6e..f83442c 100644 --- a/src/patch_set/mod.rs +++ b/src/patch_set/mod.rs @@ -13,11 +13,25 @@ use std::borrow::Cow; use crate::Patch; pub use error::PatchSetParseError; +use error::PatchSetParseErrorKind; pub use parse::PatchSet; /// Options for parsing patch content. /// -/// Use [`ParseOptions::unidiff()`] to create options for the desired format. +/// Use [`ParseOptions::unidiff()`] or [`ParseOptions::gitdiff()`] +/// to create options for the desired format. +/// +/// ## Binary Files +/// +/// When parsing git diffs, binary file changes are detected by: +/// +/// * `Binary files a/path and b/path differ` (`git diff` without `--binary` flag) +/// * `GIT binary patch` (from `git diff --binary`) +/// +/// Note that this is not a documented Git behavior, +/// so the implementation here is subject to change if Git changes. +/// +/// By default, binary diffs are skipped. /// /// ## Example /// @@ -40,12 +54,26 @@ pub use parse::PatchSet; #[derive(Debug, Clone)] pub struct ParseOptions { pub(crate) format: Format, + pub(crate) binary: Binary, } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Default)] pub(crate) enum Format { /// Standard unified diff format. + #[default] UniDiff, + /// Git extended diff format. + GitDiff, +} + +/// How to handle binary diffs in GitDiff mode. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) enum Binary { + /// Skip binary diffs silently. + #[default] + Skip, + /// Return error if binary diff encountered. + Fail, } impl ParseOptions { @@ -64,8 +92,38 @@ impl ParseOptions { pub fn unidiff() -> Self { Self { format: Format::UniDiff, + binary: Default::default(), } } + + /// Parse as [git extended diff format][git-diff-format]. + /// + /// Supports all features of [`unidiff()`](Self::unidiff) plus: + /// + /// * `diff --git` headers + /// * Extended headers (`new file mode`, `deleted file mode`, etc.) + /// * Rename/copy detection (`rename from`/`rename to`, `copy from`/`copy to`) + /// * Binary file detection (skipped by default) + /// + /// [git-diff-format]: https://git-scm.com/docs/diff-format + pub fn gitdiff() -> Self { + Self { + format: Format::GitDiff, + binary: Default::default(), + } + } + + /// Skip binary diffs silently. + pub fn skip_binary(mut self) -> Self { + self.binary = Binary::Skip; + self + } + + /// Return an error if a binary diff is encountered. + pub fn fail_on_binary(mut self) -> Self { + self.binary = Binary::Fail; + self + } } /// File mode extracted from git extended headers. @@ -81,6 +139,20 @@ pub enum FileMode { Gitlink, } +impl std::str::FromStr for FileMode { + type Err = PatchSetParseError; + + fn from_str(mode: &str) -> Result { + match mode { + "100644" => Ok(Self::Regular), + "100755" => Ok(Self::Executable), + "120000" => Ok(Self::Symlink), + "160000" => Ok(Self::Gitlink), + _ => Err(PatchSetParseErrorKind::InvalidFileMode(mode.to_owned()).into()), + } + } +} + /// The kind of patch content in a [`FilePatch`]. #[derive(Clone, PartialEq, Eq)] pub enum PatchKind<'a, T: ToOwned + ?Sized> { diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index e93d7b6..0d8b994 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -1,10 +1,12 @@ //! Parse multiple file patches from a unified diff. use super::{ - error::PatchSetParseErrorKind, FileOperation, FilePatch, Format, ParseOptions, - PatchSetParseError, + error::PatchSetParseErrorKind, Binary, FileMode, FileOperation, FilePatch, Format, + ParseOptions, PatchSetParseError, }; use crate::patch::parse::parse_one; +use crate::utils::escaped_filename; +use crate::Patch; use std::borrow::Cow; @@ -72,6 +74,71 @@ impl<'a> PatchSet<'a> { PatchSetParseError::new(kind, self.offset..self.offset) } + fn next_gitdiff_patch(&mut self) -> Option, PatchSetParseError>> { + let remaining = &self.input[self.offset..]; + let patch_start = find_gitdiff_start(remaining)?; + self.offset += patch_start; + self.found_any = true; + + let abs_patch_start = self.offset; + + // Parse extended headers incrementally — stops at first unrecognized line + let (header, header_consumed) = GitHeader::parse(&self.input[self.offset..]); + self.offset += header_consumed; + + // Handle binary markers ("Binary files ... differ") + if header.is_binary_marker || header.is_binary_patch { + match self.opts.binary { + Binary::Skip => { + return self.next_gitdiff_patch(); + } + Binary::Fail => { + let path = header.diff_git_line.unwrap_or("").to_owned(); + return Some(Err(PatchSetParseError::new( + PatchSetParseErrorKind::BinaryNotSupported { path }, + abs_patch_start..abs_patch_start, + ))); + } + } + } + + // `git diff` output format is stricter. + // There is no preamble between Git headers and unidiff patch portion, + // so we safely don't perform the preamble skipping. + // + // If we did, it would fail the pure rename/mode-change operation + // since those ops have no unidiff patch portion + // and is directly followed by the next `diff --git` header. + let opts = crate::patch::parse::ParseOpts::default().no_skip_preamble(); + let remaining = &self.input[self.offset..]; + let (result, consumed) = parse_one(remaining, opts); + self.offset += consumed; + let patch = match result { + Ok(patch) => patch, + Err(e) => return Some(Err(e.into())), + }; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_gitdiff(&header, &patch) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + + Some(Ok(FilePatch::new(operation, patch, old_mode, new_mode))) + } + fn next_unidiff_patch(&mut self) -> Option, PatchSetParseError>> { let remaining = &self.input[self.offset..]; if remaining.is_empty() { @@ -125,6 +192,16 @@ impl<'a> Iterator for PatchSet<'a> { } result } + Format::GitDiff => { + let result = self.next_gitdiff_patch(); + if result.is_none() { + self.finished = true; + if !self.found_any { + return Some(Err(self.error(PatchSetParseErrorKind::NoPatchesFound))); + } + } + result + } }; result @@ -174,6 +251,356 @@ fn strip_email_preamble(input: &str) -> &str { } } +/// Finds the byte offset of the first `diff --git` line in `input`. +fn find_gitdiff_start(input: &str) -> Option { + let mut offset = 0; + for line in input.lines() { + if line.starts_with("diff --git ") { + return Some(offset); + } + offset += line.len(); + offset += line_ending_len(&input[offset..]); + } + None +} + +/// Git extended header metadata. +/// +/// Extracted from lines between `diff --git` and `---` (or end of patch). +/// See [git-diff format documentation](https://git-scm.com/docs/diff-format). +#[derive(Debug, Default)] +struct GitHeader<'a> { + /// Raw content after "diff --git " prefix. + /// + /// Only parsed in fallback when `---`/`+++` is absent (mode-only, binary, empty file). + diff_git_line: Option<&'a str>, + /// Source path from `rename from `. + rename_from: Option<&'a str>, + /// Destination path from `rename to `. + rename_to: Option<&'a str>, + /// Source path from `copy from `. + copy_from: Option<&'a str>, + /// Destination path from `copy to `. + copy_to: Option<&'a str>, + /// File mode from `old mode `. + old_mode: Option<&'a str>, + /// File mode from `new mode `. + new_mode: Option<&'a str>, + /// File mode from `new file mode `. + new_file_mode: Option<&'a str>, + /// File mode from `deleted file mode `. + deleted_file_mode: Option<&'a str>, + /// Whether this is a binary diff with no actual patch content. + /// + /// Observed `git diff` output (without `--binary`): + /// + /// ```text + /// diff --git a/image.png b/image.png + /// new file mode 100644 + /// index 0000000..7c4530c + /// Binary files /dev/null and b/image.png differ + /// ``` + is_binary_marker: bool, + /// Whether this is a binary diff with actual patch content. + /// + /// Observed `git diff --binary` output: + /// + /// ```text + /// diff --git a/image.png b/image.png + /// new file mode 100644 + /// index 0000000..7c4530c + /// GIT binary patch + /// literal 67 + /// zcmV-J0KET+... + /// + /// literal 0 + /// KcmV+b0RR6000031 + /// ``` + is_binary_patch: bool, +} + +impl<'a> GitHeader<'a> { + /// Parses git extended headers incrementally from the current position. + /// + /// Consumes the `diff --git` line and all recognized extended header lines, + /// stopping at the first unrecognized line (typically `---`/`+++`/`@@` + /// or the next `diff --git`). + /// + /// Returns the parsed header and the number of bytes consumed. + fn parse(input: &'a str) -> (Self, usize) { + let mut header = GitHeader::default(); + let mut consumed = 0; + + for line in input.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Only accept the first `diff --git` line. + // A second one means we've reached the next patch. + if header.diff_git_line.is_some() { + break; + } + header.diff_git_line = Some(rest); + } else if let Some(path) = line.strip_prefix("rename from ") { + header.rename_from = Some(path); + } else if let Some(path) = line.strip_prefix("rename to ") { + header.rename_to = Some(path); + } else if let Some(path) = line.strip_prefix("copy from ") { + header.copy_from = Some(path); + } else if let Some(path) = line.strip_prefix("copy to ") { + header.copy_to = Some(path); + } else if let Some(mode) = line.strip_prefix("old mode ") { + header.old_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("new mode ") { + header.new_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("new file mode ") { + header.new_file_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("deleted file mode ") { + header.deleted_file_mode = Some(mode); + } else if line.starts_with("index ") + || line.starts_with("similarity index ") + || line.starts_with("dissimilarity index ") + { + // Recognized but nothing to extract. + } else if line.starts_with("Binary files ") { + header.is_binary_marker = true; + } else if line.starts_with("GIT binary patch") { + header.is_binary_patch = true; + } else { + // Unrecognized line: End of extended headers + // (typically `---`/`+++`/`@@` or trailing content). + break; + } + + consumed += line.len(); + consumed += line_ending_len(&input[consumed..]); + } + + (header, consumed) + } +} + +/// Determines the file operation from git headers and patch paths. +fn extract_file_op_gitdiff<'a>( + header: &GitHeader<'a>, + patch: &Patch<'a, str>, +) -> Result, PatchSetParseError> { + // Git headers are authoritative for rename/copy + if let (Some(from), Some(to)) = (header.rename_from, header.rename_to) { + return Ok(FileOperation::Rename { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + if let (Some(from), Some(to)) = (header.copy_from, header.copy_to) { + return Ok(FileOperation::Copy { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + + // Try ---/+++ paths first + if patch.original().is_some() || patch.modified().is_some() { + return extract_file_op_unidiff(patch.original_path(), patch.modified_path()); + } + + // Fall back to `diff --git ` for mode-only and empty file changes + let Some((original, modified)) = header.diff_git_line.and_then(parse_diff_git_path) else { + return Err(PatchSetParseErrorKind::InvalidDiffGitPath.into()); + }; + + if header.new_file_mode.is_some() { + Ok(FileOperation::Create(modified)) + } else if header.deleted_file_mode.is_some() { + Ok(FileOperation::Delete(original)) + } else { + Ok(FileOperation::Modify { original, modified }) + } +} + +/// Parses file modes from git extended headers. +fn parse_file_modes( + header: &GitHeader<'_>, +) -> Result<(Option, Option), PatchSetParseError> { + let old_mode = header + .old_mode + .or(header.deleted_file_mode) + .map(str::parse::) + .transpose()?; + let new_mode = header + .new_mode + .or(header.new_file_mode) + .map(str::parse::) + .transpose()?; + Ok((old_mode, new_mode)) +} + +/// Extracts both old and new paths from `diff --git` line content. +/// +/// ## Assumption #1: old and new paths are the same +/// +/// This extraction has one strong assumption: +/// Beside their prefixes, old and new paths are the same. +/// +/// From [git-diff format documentation]: +/// +/// > The `a/` and `b/` filenames are the same unless rename/copy is involved. +/// > Especially, even for a creation or a deletion, `/dev/null` is not used +/// > in place of the `a/` or `b/` filenames. +/// > +/// > When a rename/copy is involved, file1 and file2 show the name of the +/// > source file of the rename/copy and the name of the file that the +/// > rename/copy produces, respectively. +/// +/// Since rename/copy operations use `rename from/to` and `copy from/to` headers +/// we have handled earlier in [`extract_file_op_gitdiff`], +/// (which have no `a/`/`b/` prefix per git spec), +/// +/// this extraction is only used +/// * when unified diff headers (`---`/`+++`) are absent +/// * Only for mode-only and empty file cases +/// +/// [git-diff format documentation]: https://git-scm.com/docs/diff-format +/// +/// ## Assumption #2: the longest common path suffix is the shared path +/// +/// When custom prefixes contain spaces, +/// multiple splits may produce valid path suffixes. +/// +/// Example: `src/foo.rs src/foo.rs src/foo.rs src/foo.rs` +/// +/// Three splits all produce valid path suffixes (contain `/`): +/// +/// * Position 10 +/// * old path: `src/foo.rs` +/// * new path: `src/foo.rs src/foo.rs src/foo.rs` +/// * common suffix: `foo.rs` +/// * Position 21 +/// * old path: `src/foo.rs src/foo.rs` +/// * new path: `src/foo.rs src/foo.rs` +/// * common suffix: `foo.rs src/foo.rs` +/// * Position 32 +/// * old path: `src/foo.rs src/foo.rs src/foo.rs` +/// * new path: `src/foo.rs` +/// * common suffix: `foo.rs` +/// +/// We observed that `git apply` would pick position 21, +/// which has the longest path suffix, +/// hence this heuristic. +/// +/// ## Supported formats +/// +/// * `a/ b/` (default prefix) +/// * ` ` (`git diff --no-prefix`) +/// * ` ` (custom prefix) +/// * `"" ""` (quoted, with escapes) +/// * Mixed quoted/unquoted +fn parse_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + if line.starts_with('"') || line.ends_with('"') { + parse_quoted_diff_git_path(line) + } else { + parse_unquoted_diff_git_path(line) + } +} + +/// See [`parse_diff_git_path`]. +fn parse_unquoted_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + let mut best_match = None; + let mut longest_path = ""; + + for (i, _) in line.match_indices(' ') { + let left = &line[..i]; + let right = &line[i + 1..]; + if left.is_empty() || right.is_empty() { + continue; + } + // Select split with longest common path suffix (matches Git behavior) + if let Some(path) = longest_common_path_suffix(left, right) { + if path.len() > longest_path.len() { + longest_path = path; + best_match = Some((left, right)); + } + } + } + + best_match.map(|(l, r)| (Cow::Borrowed(l), Cow::Borrowed(r))) +} + +/// See [`parse_diff_git_path`]. +fn parse_quoted_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + let (left_raw, right_raw) = if line.starts_with('"') { + // First token is quoted. + let bytes = line.as_bytes(); + let mut i = 1; // skip starting `"` + let end = loop { + match bytes.get(i)? { + b'"' => break i + 1, + b'\\' => i += 2, + _ => i += 1, + } + }; + let (first, rest) = line.split_at(end); + let rest = rest.strip_prefix(' ')?; + (first, rest) + } else if let Some(pos) = line.find(" \"") { + // First token is unquoted. The second must be quoted. + (&line[..pos], &line[pos + 1..]) + } else { + // Malformed: ends with `"` but no valid quoted path found + return None; + }; + + let left = escaped_filename(left_raw).ok().and_then(|cow| match cow { + Cow::Borrowed(b) => std::str::from_utf8(b).ok().map(Cow::Borrowed), + Cow::Owned(v) => String::from_utf8(v).ok().map(Cow::Owned), + })?; + let right = escaped_filename(right_raw).ok().and_then(|cow| match cow { + Cow::Borrowed(b) => std::str::from_utf8(b).ok().map(Cow::Borrowed), + Cow::Owned(v) => String::from_utf8(v).ok().map(Cow::Owned), + })?; + + // Verify both sides share the same path. + longest_common_path_suffix(&left, &right)?; + Some((left, right)) +} + +/// Extracts the longest common path suffix starting at a path component boundary. +/// +/// Returns `None` if no valid common path exists. +/// +/// Path component boundary means: +/// +/// * At `/` character (e.g., `foo/bar.rs` vs `fooo/bar.rs` → `bar.rs`) +/// * Or the entire string is identical +fn longest_common_path_suffix<'a>(a: &'a str, b: &'a str) -> Option<&'a str> { + if a.is_empty() || b.is_empty() { + return None; + } + + let suffix_len = a + .as_bytes() + .iter() + .rev() + .zip(b.as_bytes().iter().rev()) + .take_while(|(x, y)| x == y) + .count(); + + if suffix_len == 0 { + return None; + } + + // Identical strings + if suffix_len == a.len() && a.len() == b.len() { + return Some(a); + } + + // Find first '/' in suffix and return path after it + let suffix_start = a.len() - suffix_len; + let suffix = &a[suffix_start..]; + suffix + .split_once('/') + .map(|(_, path)| path) + .filter(|p| !p.is_empty()) +} + /// Extracts the file operation from a patch based on its header paths. pub(crate) fn extract_file_op_unidiff<'a>( original: Option<&Cow<'a, str>>, diff --git a/src/patch_set/tests.rs b/src/patch_set/tests.rs index 5bbfd31..9648930 100644 --- a/src/patch_set/tests.rs +++ b/src/patch_set/tests.rs @@ -463,3 +463,124 @@ In a hole in the ground there lived a hobbit ); } } + +mod patchset_gitdiff { + use super::*; + use crate::patch_set::PatchKind; + + fn parse_gitdiff(input: &str) -> Vec> { + PatchSet::parse(input, ParseOptions::gitdiff()) + .collect::, _>>() + .unwrap() + } + + /// `parse_one` must stop at `diff --git` boundaries so that + /// back-to-back patches are split correctly. + /// Without this, the second patch's `diff --git` line would be + /// swallowed as trailing junk by the first patch's hunk parser. + #[test] + fn multi_file_stops_at_diff_git_boundary() { + let input = "\ +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old foo ++new foo +diff --git a/bar b/bar +--- a/bar ++++ b/bar +@@ -1 +1 @@ +-old bar ++new bar +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 2); + } + + #[test] + fn pure_rename() { + let input = "\ +diff --git a/old.rs b/new.rs +similarity index 100% +rename from old.rs +rename to new.rs +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1); + assert_eq!( + patches[0].operation(), + &FileOperation::Rename { + from: "old.rs".into(), + to: "new.rs".into(), + } + ); + } + + /// Empty file creation has no ---/+++ headers, so the path comes + /// from the `diff --git` line and retains the `b/` prefix. + /// Callers use `strip_prefix(1)` to remove it. + #[test] + fn new_empty_file() { + let input = "\ +diff --git a/empty b/empty +new file mode 100644 +index 0000000..e69de29 +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1); + assert_eq!( + patches[0].operation(), + &FileOperation::Create("b/empty".into()) + ); + match patches[0].patch() { + PatchKind::Text(p) => assert!(p.hunks().is_empty()), + } + } + + #[test] + fn rename_then_modify() { + // Rename with no hunks followed by a modify with hunks. + // Tests that offset advances correctly across both. + let input = "\ +diff --git a/old.rs b/new.rs +similarity index 100% +rename from old.rs +rename to new.rs +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old ++new +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 2); + assert!(matches!( + patches[0].operation(), + FileOperation::Rename { .. } + )); + assert!(matches!( + patches[1].operation(), + FileOperation::Modify { .. } + )); + } + + #[test] + fn binary_skip_does_not_stall() { + // Binary marker must be skipped and offset must advance + // to the next patch without infinite loop. + let input = "\ +diff --git a/img.png b/img.png +Binary files a/img.png and b/img.png differ +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old ++new +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1, "binary should be skipped, text parsed"); + } +} From 95836ec24a55ce50266ffd64cf6fbf24aee5f08d Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sat, 11 Apr 2026 12:46:38 -0400 Subject: [PATCH 08/26] test(compat): add git apply compatibility tests Compat test for also `git apply`. --- tests/compat/common.rs | 111 ++++++++++++-- .../compat/git/fail_both_devnull/in/foo.patch | 6 + .../format_patch_diff_in_message/in/file.txt | 1 + .../format_patch_diff_in_message/in/foo.patch | 20 +++ .../format_patch_diff_in_message/out/file.txt | 1 + .../in/file.txt | 1 + .../in/foo.patch | 21 +++ .../out/file.txt | 3 + .../git/format_patch_preamble/in/file.txt | 1 + .../git/format_patch_preamble/in/foo.patch | 20 +++ .../git/format_patch_preamble/out/file.txt | 1 + .../git/format_patch_signature/in/file.txt | 1 + .../git/format_patch_signature/in/foo.patch | 19 +++ .../git/format_patch_signature/out/file.txt | 1 + .../compat/git/junk_between_files/in/bar.txt | 1 + .../git/junk_between_files/in/foo.patch | 17 +++ .../compat/git/junk_between_files/in/foo.txt | 1 + .../compat/git/junk_between_files/out/bar.txt | 1 + .../compat/git/junk_between_files/out/foo.txt | 1 + .../compat/git/junk_between_hunks/in/file.txt | 9 ++ .../git/junk_between_hunks/in/foo.patch | 15 ++ .../git/junk_between_hunks/out/file.txt | 9 ++ tests/compat/git/mod.rs | 138 ++++++++++++++++++ .../git/nested_diff_signature/in/example.rs | 1 + .../git/nested_diff_signature/in/foo.patch | 25 ++++ .../nested_diff_signature/in/mir-test.diff | 12 ++ .../git/nested_diff_signature/out/example.rs | 2 + .../nested_diff_signature/out/mir-test.diff | 0 .../git/path_ambiguous_suffix/in/foo.patch | 3 + .../out/foo.rs src/foo.rs | 0 .../path_containing_space_b/in/foo b/baz.txt | 1 + .../git/path_containing_space_b/in/foo.patch | 7 + .../path_containing_space_b/out/foo b/baz.txt | 1 + tests/compat/git/path_no_prefix/in/file.txt | 1 + tests/compat/git/path_no_prefix/in/foo.patch | 7 + tests/compat/git/path_no_prefix/out/file.txt | 1 + .../git/path_quoted_escapes/in/foo\tbar.txt" | 1 + .../git/path_quoted_escapes/in/foo.patch | 7 + .../git/path_quoted_escapes/out/foo\tbar.txt" | 1 + .../git/path_quoted_named_escape/in/foo.patch | 6 + .../git/path_quoted_named_escape/out/bel\a" | 1 + .../git/path_quoted_octal_escape/in/foo.patch | 6 + .../git/path_quoted_octal_escape/out/tl\033" | 1 + .../git/path_with_spaces/in/foo bar.txt | 1 + .../compat/git/path_with_spaces/in/foo.patch | 7 + .../git/path_with_spaces/out/foo bar.txt | 1 + tests/compat/main.rs | 4 +- 47 files changed, 486 insertions(+), 10 deletions(-) create mode 100644 tests/compat/git/fail_both_devnull/in/foo.patch create mode 100644 tests/compat/git/format_patch_diff_in_message/in/file.txt create mode 100644 tests/compat/git/format_patch_diff_in_message/in/foo.patch create mode 100644 tests/compat/git/format_patch_diff_in_message/out/file.txt create mode 100644 tests/compat/git/format_patch_multiple_separators/in/file.txt create mode 100644 tests/compat/git/format_patch_multiple_separators/in/foo.patch create mode 100644 tests/compat/git/format_patch_multiple_separators/out/file.txt create mode 100644 tests/compat/git/format_patch_preamble/in/file.txt create mode 100644 tests/compat/git/format_patch_preamble/in/foo.patch create mode 100644 tests/compat/git/format_patch_preamble/out/file.txt create mode 100644 tests/compat/git/format_patch_signature/in/file.txt create mode 100644 tests/compat/git/format_patch_signature/in/foo.patch create mode 100644 tests/compat/git/format_patch_signature/out/file.txt create mode 100644 tests/compat/git/junk_between_files/in/bar.txt create mode 100644 tests/compat/git/junk_between_files/in/foo.patch create mode 100644 tests/compat/git/junk_between_files/in/foo.txt create mode 100644 tests/compat/git/junk_between_files/out/bar.txt create mode 100644 tests/compat/git/junk_between_files/out/foo.txt create mode 100644 tests/compat/git/junk_between_hunks/in/file.txt create mode 100644 tests/compat/git/junk_between_hunks/in/foo.patch create mode 100644 tests/compat/git/junk_between_hunks/out/file.txt create mode 100644 tests/compat/git/mod.rs create mode 100644 tests/compat/git/nested_diff_signature/in/example.rs create mode 100644 tests/compat/git/nested_diff_signature/in/foo.patch create mode 100644 tests/compat/git/nested_diff_signature/in/mir-test.diff create mode 100644 tests/compat/git/nested_diff_signature/out/example.rs create mode 100644 tests/compat/git/nested_diff_signature/out/mir-test.diff create mode 100644 tests/compat/git/path_ambiguous_suffix/in/foo.patch create mode 100644 tests/compat/git/path_ambiguous_suffix/out/foo.rs src/foo.rs create mode 100644 tests/compat/git/path_containing_space_b/in/foo b/baz.txt create mode 100644 tests/compat/git/path_containing_space_b/in/foo.patch create mode 100644 tests/compat/git/path_containing_space_b/out/foo b/baz.txt create mode 100644 tests/compat/git/path_no_prefix/in/file.txt create mode 100644 tests/compat/git/path_no_prefix/in/foo.patch create mode 100644 tests/compat/git/path_no_prefix/out/file.txt create mode 100644 "tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" create mode 100644 tests/compat/git/path_quoted_escapes/in/foo.patch create mode 100644 "tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" create mode 100644 tests/compat/git/path_quoted_named_escape/in/foo.patch create mode 100644 "tests/compat/git/path_quoted_named_escape/out/bel\a" create mode 100644 tests/compat/git/path_quoted_octal_escape/in/foo.patch create mode 100644 "tests/compat/git/path_quoted_octal_escape/out/tl\033" create mode 100644 tests/compat/git/path_with_spaces/in/foo bar.txt create mode 100644 tests/compat/git/path_with_spaces/in/foo.patch create mode 100644 tests/compat/git/path_with_spaces/out/foo bar.txt diff --git a/tests/compat/common.rs b/tests/compat/common.rs index 7bc6f37..117ed04 100644 --- a/tests/compat/common.rs +++ b/tests/compat/common.rs @@ -2,16 +2,27 @@ use std::{ fs, + io::Write, path::{Path, PathBuf}, - process::Command, + process::{Command, Stdio}, sync::Once, }; use diffy::patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet, PatchSetParseError}; +/// Which external tool to compare against. +#[derive(Clone, Copy)] +pub enum CompatMode { + /// `git apply` with `ParseOptions::gitdiff()` + Git, + /// GNU `patch` with `ParseOptions::unidiff()` + GnuPatch, +} + /// A test case with fluent builder API. pub struct Case<'a> { case_name: &'a str, + mode: CompatMode, /// Strip level for path prefixes (default: 0) strip_level: u32, /// Whether diffy is expected to succeed (default: true) @@ -21,20 +32,37 @@ pub struct Case<'a> { } impl<'a> Case<'a> { + /// Create a test case for `git apply` comparison. + pub fn git(name: &'a str) -> Self { + Self { + case_name: name, + mode: CompatMode::Git, + strip_level: 0, + expect_success: true, + expect_compat: true, + } + } + /// Create a test case for GNU patch comparison. pub fn gnu_patch(name: &'a str) -> Self { Self { case_name: name, + mode: CompatMode::GnuPatch, strip_level: 0, expect_success: true, expect_compat: true, } } - /// Get the case directory path. + /// Get the case directory path based on mode. fn case_dir(&self) -> PathBuf { + let subdir = match self.mode { + CompatMode::Git => "git", + CompatMode::GnuPatch => "gnu_patch", + }; PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("tests/compat/gnu_patch") + .join("tests/compat") + .join(subdir) .join(self.case_name) } @@ -62,12 +90,19 @@ impl<'a> Case<'a> { .unwrap_or_else(|e| panic!("failed to read {}: {e}", patch_path.display())); let case_name = self.case_name; + let prefix = match self.mode { + CompatMode::Git => "git", + CompatMode::GnuPatch => "gnu", + }; let temp_base = temp_base(); - let diffy_output = temp_base.join(format!("gnu-{case_name}-diffy")); + let diffy_output = temp_base.join(format!("{prefix}-{case_name}-diffy")); create_output_dir(&diffy_output); - let opts = ParseOptions::unidiff(); + let opts = match self.mode { + CompatMode::Git => ParseOptions::gitdiff(), + CompatMode::GnuPatch => ParseOptions::unidiff(), + }; // Apply with diffy let diffy_result = apply_diffy(&in_dir, &patch, &diffy_output, opts, self.strip_level); @@ -81,12 +116,19 @@ impl<'a> Case<'a> { // In CI mode, also verify external tool behavior if is_ci() { - let external_output = temp_base.join(format!("gnu-{case_name}-external")); + let external_output = temp_base.join(format!("{prefix}-{case_name}-external")); create_output_dir(&external_output); - print_patch_version(); - let external_result = - gnu_patch_apply(&in_dir, &patch_path, &external_output, self.strip_level); + let external_result = match self.mode { + CompatMode::Git => { + print_git_version(); + git_apply(&external_output, &patch, self.strip_level, &in_dir) + } + CompatMode::GnuPatch => { + print_patch_version(); + gnu_patch_apply(&in_dir, &patch_path, &external_output, self.strip_level) + } + }; // For success cases where both succeed and are expected to be compatible, // verify outputs match @@ -120,6 +162,39 @@ impl<'a> Case<'a> { // External tool invocations +fn git_apply( + output_dir: &Path, + patch: &str, + strip_level: u32, + in_dir: &Path, +) -> Result<(), String> { + copy_input_files(in_dir, output_dir, &["patch"]); + + let mut cmd = Command::new("git"); + cmd.env("GIT_CONFIG_NOSYSTEM", "1"); + cmd.env("GIT_CONFIG_GLOBAL", "/dev/null"); + cmd.current_dir(output_dir); + cmd.args(["apply", &format!("-p{strip_level}"), "-"]); + cmd.stdin(Stdio::piped()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to spawn git apply"); + child + .stdin + .as_mut() + .unwrap() + .write_all(patch.as_bytes()) + .unwrap(); + + let output = child.wait_with_output().unwrap(); + if output.status.success() { + Ok(()) + } else { + Err(String::from_utf8_lossy(&output.stderr).to_string()) + } +} + fn gnu_patch_apply( in_dir: &Path, patch_path: &Path, @@ -149,6 +224,24 @@ fn gnu_patch_apply( } } +fn print_git_version() { + static ONCE: Once = Once::new(); + ONCE.call_once(|| { + let output = Command::new("git").arg("--version").output(); + match output { + Ok(o) if o.status.success() => { + let version = String::from_utf8_lossy(&o.stdout); + eprintln!( + "git version: {}", + version.lines().next().unwrap_or("unknown") + ); + } + Ok(o) => eprintln!("git --version failed: {}", o.status), + Err(e) => eprintln!("git command not found: {e}"), + } + }); +} + fn print_patch_version() { static ONCE: Once = Once::new(); ONCE.call_once(|| { diff --git a/tests/compat/git/fail_both_devnull/in/foo.patch b/tests/compat/git/fail_both_devnull/in/foo.patch new file mode 100644 index 0000000..26b2827 --- /dev/null +++ b/tests/compat/git/fail_both_devnull/in/foo.patch @@ -0,0 +1,6 @@ +diff --git a/foo b/foo +--- /dev/null ++++ /dev/null +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/format_patch_diff_in_message/in/file.txt b/tests/compat/git/format_patch_diff_in_message/in/file.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_diff_in_message/in/foo.patch b/tests/compat/git/format_patch_diff_in_message/in/foo.patch new file mode 100644 index 0000000..ce33371 --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/in/foo.patch @@ -0,0 +1,20 @@ +From 8a14b135fe7ba10bab09a77c4a687faaf1d92a26 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Fix diff --git parsing bug + +The line `diff --git a/x b/x` in messages was incorrectly parsed. +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_diff_in_message/out/file.txt b/tests/compat/git/format_patch_diff_in_message/out/file.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/format_patch_multiple_separators/in/file.txt b/tests/compat/git/format_patch_multiple_separators/in/file.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_multiple_separators/in/foo.patch b/tests/compat/git/format_patch_multiple_separators/in/foo.patch new file mode 100644 index 0000000..a0da5b9 --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/in/foo.patch @@ -0,0 +1,21 @@ +From 6bfbbfa49a16bb8173145a933fe5ad918ad48a31 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Add content with --- markers + +--- + file.txt | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..c4d4ea8 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1,3 @@ +-old ++--- ++new ++--- +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_multiple_separators/out/file.txt b/tests/compat/git/format_patch_multiple_separators/out/file.txt new file mode 100644 index 0000000..c4d4ea8 --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/out/file.txt @@ -0,0 +1,3 @@ +--- +new +--- diff --git a/tests/compat/git/format_patch_preamble/in/file.txt b/tests/compat/git/format_patch_preamble/in/file.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/format_patch_preamble/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_preamble/in/foo.patch b/tests/compat/git/format_patch_preamble/in/foo.patch new file mode 100644 index 0000000..9d90425 --- /dev/null +++ b/tests/compat/git/format_patch_preamble/in/foo.patch @@ -0,0 +1,20 @@ +From ddbc9053359329dd016ed89f0d6e460b3b8ff5e3 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Add new content + +This is the commit body. +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_preamble/out/file.txt b/tests/compat/git/format_patch_preamble/out/file.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/format_patch_preamble/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/format_patch_signature/in/file.txt b/tests/compat/git/format_patch_signature/in/file.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/format_patch_signature/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_signature/in/foo.patch b/tests/compat/git/format_patch_signature/in/foo.patch new file mode 100644 index 0000000..7614287 --- /dev/null +++ b/tests/compat/git/format_patch_signature/in/foo.patch @@ -0,0 +1,19 @@ +From b3bb3125eff3d2648f15af2a6e0cdcdf6ad8fce1 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] modify + +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_signature/out/file.txt b/tests/compat/git/format_patch_signature/out/file.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/format_patch_signature/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/junk_between_files/in/bar.txt b/tests/compat/git/junk_between_files/in/bar.txt new file mode 100644 index 0000000..601d8ee --- /dev/null +++ b/tests/compat/git/junk_between_files/in/bar.txt @@ -0,0 +1 @@ +bar line1 diff --git a/tests/compat/git/junk_between_files/in/foo.patch b/tests/compat/git/junk_between_files/in/foo.patch new file mode 100644 index 0000000..dbc5939 --- /dev/null +++ b/tests/compat/git/junk_between_files/in/foo.patch @@ -0,0 +1,17 @@ +diff --git a/foo.txt b/foo.txt +index 1234567..89abcdef 100644 +--- a/foo.txt ++++ b/foo.txt +@@ -1 +1 @@ +-foo line1 ++FOO LINE1 +JUNK BETWEEN FILES!!!! +This preamble text should be ignored +by both git apply and diffy +diff --git a/bar.txt b/bar.txt +index 1234567..89abcdef 100644 +--- a/bar.txt ++++ b/bar.txt +@@ -1 +1 @@ +-bar line1 ++BAR LINE1 diff --git a/tests/compat/git/junk_between_files/in/foo.txt b/tests/compat/git/junk_between_files/in/foo.txt new file mode 100644 index 0000000..b11358e --- /dev/null +++ b/tests/compat/git/junk_between_files/in/foo.txt @@ -0,0 +1 @@ +foo line1 diff --git a/tests/compat/git/junk_between_files/out/bar.txt b/tests/compat/git/junk_between_files/out/bar.txt new file mode 100644 index 0000000..76c036d --- /dev/null +++ b/tests/compat/git/junk_between_files/out/bar.txt @@ -0,0 +1 @@ +BAR LINE1 diff --git a/tests/compat/git/junk_between_files/out/foo.txt b/tests/compat/git/junk_between_files/out/foo.txt new file mode 100644 index 0000000..787bc66 --- /dev/null +++ b/tests/compat/git/junk_between_files/out/foo.txt @@ -0,0 +1 @@ +FOO LINE1 diff --git a/tests/compat/git/junk_between_hunks/in/file.txt b/tests/compat/git/junk_between_hunks/in/file.txt new file mode 100644 index 0000000..822aed3 --- /dev/null +++ b/tests/compat/git/junk_between_hunks/in/file.txt @@ -0,0 +1,9 @@ +line1 +line2 +line3 +line4 +line5 +line6 +line7 +line8 +line9 diff --git a/tests/compat/git/junk_between_hunks/in/foo.patch b/tests/compat/git/junk_between_hunks/in/foo.patch new file mode 100644 index 0000000..242e959 --- /dev/null +++ b/tests/compat/git/junk_between_hunks/in/foo.patch @@ -0,0 +1,15 @@ +diff --git a/file.txt b/file.txt +index 1234567..89abcdef 100644 +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line1 ++LINE1 + line2 + line3 +JUNK BETWEEN HUNKS +@@ -7,3 +7,3 @@ + line7 +-line8 ++LINE8 + line9 diff --git a/tests/compat/git/junk_between_hunks/out/file.txt b/tests/compat/git/junk_between_hunks/out/file.txt new file mode 100644 index 0000000..2e5e454 --- /dev/null +++ b/tests/compat/git/junk_between_hunks/out/file.txt @@ -0,0 +1,9 @@ +LINE1 +line2 +line3 +line4 +line5 +line6 +line7 +line8 +line9 diff --git a/tests/compat/git/mod.rs b/tests/compat/git/mod.rs new file mode 100644 index 0000000..e7dc99b --- /dev/null +++ b/tests/compat/git/mod.rs @@ -0,0 +1,138 @@ +//! Git compatibility tests. See [`crate`] for test structure and usage. +//! +//! Focus areas: +//! +//! - `diff --git` path parsing edge cases (quotes, spaces, ambiguous prefixes) +//! - `git format-patch` email format (preamble/signature stripping) +//! - Agreement between diffy and `git apply` + +use crate::common::Case; + +#[test] +fn path_no_prefix() { + Case::git("path_no_prefix").run(); +} + +#[test] +fn path_quoted_escapes() { + Case::git("path_quoted_escapes").strip(1).run(); +} + +// Git uses C-style named escapes (\a, \b, \f, \v) for certain control +// characters in quoted filenames. Both `git apply` and GNU patch decode +// these correctly. +// +// Observed with git 2.53.0: +// $ printf 'x' > "$(printf 'bel\a')" && git add -A +// $ git diff --cached | grep '+++' +// +++ "b/bel\a" +// +// diffy now decodes these correctly. +#[test] +fn path_quoted_named_escape() { + Case::git("path_quoted_named_escape").strip(1).run(); +} + +// Git uses 3-digit octal escapes (\000-\377) for bytes that don't have +// a named escape. Both `git apply` and GNU patch decode these correctly. +// +// Observed with git 2.53.0: +// $ printf 'x' > "$(printf 'tl\033')" && git add -A +// $ git diff --cached | grep '+++' +// +++ "b/tl\033" +// +// Found via full-history replay test against llvm/llvm-project +// (commits 17af06ba..229c95ab, 6c031780..0683a1e5). +#[test] +fn path_quoted_octal_escape() { + Case::git("path_quoted_octal_escape").strip(1).run(); +} + +#[test] +fn path_with_spaces() { + Case::git("path_with_spaces").strip(1).run(); +} + +#[test] +fn path_containing_space_b() { + Case::git("path_containing_space_b").strip(1).run(); +} + +#[test] +fn format_patch_preamble() { + // Ambiguous: where does preamble end? First `\n---\n` - verify matches git + Case::git("format_patch_preamble").strip(1).run(); +} + +#[test] +fn format_patch_diff_in_message() { + // `diff --git` in commit message must NOT trigger early parsing + Case::git("format_patch_diff_in_message").strip(1).run(); +} + +#[test] +fn format_patch_multiple_separators() { + // Git uses first `\n---\n` as separator (observed git mailinfo behavior) + Case::git("format_patch_multiple_separators").strip(1).run(); +} + +#[test] +fn format_patch_signature() { + // Ambiguous: `\n-- \n` could appear in patch content - verify matches git + Case::git("format_patch_signature").strip(1).run(); +} + +#[test] +fn nested_diff_signature() { + // Patch that deletes a diff file containing `-- ` patterns within its content, + // followed by a real email signature at the end. + // + // Tests that we correctly distinguish between: + // - `-- ` appearing as patch content (from inner diff's empty context lines) + // - `-- ` appearing as the actual email signature separator + // + // Both git apply and GNU patch handle this correctly. + Case::git("nested_diff_signature").strip(1).run(); +} + +#[test] +fn path_ambiguous_suffix() { + // Multiple valid splits in `diff --git` line; algorithm picks longest common suffix. + // Tests the pathological case from parse.rs comments where custom prefix + // creates `src/foo.rs src/foo.rs src/foo.rs src/foo.rs` - verify matches git. + Case::git("path_ambiguous_suffix").strip(1).run(); +} + +// Both --- and +++ point to /dev/null. +// git apply rejects: "dev/null: No such file or directory" +#[test] +fn fail_both_devnull() { + Case::git("fail_both_devnull") + .strip(1) + .expect_success(false) + .run(); +} + +// Single-file patch with junk between hunks. +// +// - git apply: errors ("patch fragment without header") +// - diffy: succeeds, ignores trailing junk (matches GNU patch behavior) +#[test] +fn junk_between_hunks() { + Case::git("junk_between_hunks") + .strip(1) + .expect_compat(false) + .run(); +} + +// Multi-file patch with junk/preamble text between different files. +// +// git apply behavior: Ignores content between `diff --git` boundaries. +// In GitDiff mode, splitting occurs at `diff --git`, so junk between +// files becomes trailing content of the previous chunk (harmless). +// +// This is different from junk between HUNKS of the same file (which fails). +#[test] +fn junk_between_files() { + Case::git("junk_between_files").strip(1).run(); +} diff --git a/tests/compat/git/nested_diff_signature/in/example.rs b/tests/compat/git/nested_diff_signature/in/example.rs new file mode 100644 index 0000000..8f3b7ef --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/example.rs @@ -0,0 +1 @@ +fn foo() {} diff --git a/tests/compat/git/nested_diff_signature/in/foo.patch b/tests/compat/git/nested_diff_signature/in/foo.patch new file mode 100644 index 0000000..5d876c6 --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/foo.patch @@ -0,0 +1,25 @@ +diff --git a/mir-test.diff b/mir-test.diff +deleted file mode 100644 +index 98012d7..0000000 +--- a/mir-test.diff ++++ /dev/null +@@ -1,12 +0,0 @@ +-- // MIR before +-+ // MIR after +- +- fn opt() { +- bb0: { +-- nop; +-- } +-- +-- bb1: { +-- nop; +- } +- } +diff --git a/example.rs b/example.rs +index 8f3b7ef..2a40712 100644 +--- a/example.rs ++++ b/example.rs +@@ -1 +1,2 @@ + fn foo() {} ++fn bar() {} diff --git a/tests/compat/git/nested_diff_signature/in/mir-test.diff b/tests/compat/git/nested_diff_signature/in/mir-test.diff new file mode 100644 index 0000000..98012d7 --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/mir-test.diff @@ -0,0 +1,12 @@ +- // MIR before ++ // MIR after + + fn opt() { + bb0: { +- nop; +- } +- +- bb1: { +- nop; + } + } diff --git a/tests/compat/git/nested_diff_signature/out/example.rs b/tests/compat/git/nested_diff_signature/out/example.rs new file mode 100644 index 0000000..2a40712 --- /dev/null +++ b/tests/compat/git/nested_diff_signature/out/example.rs @@ -0,0 +1,2 @@ +fn foo() {} +fn bar() {} diff --git a/tests/compat/git/nested_diff_signature/out/mir-test.diff b/tests/compat/git/nested_diff_signature/out/mir-test.diff new file mode 100644 index 0000000..e69de29 diff --git a/tests/compat/git/path_ambiguous_suffix/in/foo.patch b/tests/compat/git/path_ambiguous_suffix/in/foo.patch new file mode 100644 index 0000000..a6815eb --- /dev/null +++ b/tests/compat/git/path_ambiguous_suffix/in/foo.patch @@ -0,0 +1,3 @@ +diff --git src/foo.rs src/foo.rs src/foo.rs src/foo.rs +new file mode 100644 +index 0000000..e69de29 diff --git a/tests/compat/git/path_ambiguous_suffix/out/foo.rs src/foo.rs b/tests/compat/git/path_ambiguous_suffix/out/foo.rs src/foo.rs new file mode 100644 index 0000000..e69de29 diff --git a/tests/compat/git/path_containing_space_b/in/foo b/baz.txt b/tests/compat/git/path_containing_space_b/in/foo b/baz.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/path_containing_space_b/in/foo b/baz.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_containing_space_b/in/foo.patch b/tests/compat/git/path_containing_space_b/in/foo.patch new file mode 100644 index 0000000..15c6fca --- /dev/null +++ b/tests/compat/git/path_containing_space_b/in/foo.patch @@ -0,0 +1,7 @@ +diff --git a/foo b/baz.txt b/foo b/baz.txt +index 3367afd..3e75765 100644 +--- a/foo b/baz.txt ++++ b/foo b/baz.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_containing_space_b/out/foo b/baz.txt b/tests/compat/git/path_containing_space_b/out/foo b/baz.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/path_containing_space_b/out/foo b/baz.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/path_no_prefix/in/file.txt b/tests/compat/git/path_no_prefix/in/file.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/path_no_prefix/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_no_prefix/in/foo.patch b/tests/compat/git/path_no_prefix/in/foo.patch new file mode 100644 index 0000000..5e2a9f8 --- /dev/null +++ b/tests/compat/git/path_no_prefix/in/foo.patch @@ -0,0 +1,7 @@ +diff --git file.txt file.txt +index 3367afd..3e75765 100644 +--- file.txt ++++ file.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_no_prefix/out/file.txt b/tests/compat/git/path_no_prefix/out/file.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/path_no_prefix/out/file.txt @@ -0,0 +1 @@ +new diff --git "a/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" "b/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" new file mode 100644 index 0000000..3367afd --- /dev/null +++ "b/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_quoted_escapes/in/foo.patch b/tests/compat/git/path_quoted_escapes/in/foo.patch new file mode 100644 index 0000000..2669223 --- /dev/null +++ b/tests/compat/git/path_quoted_escapes/in/foo.patch @@ -0,0 +1,7 @@ +diff --git "a/foo\tbar.txt" "b/foo\tbar.txt" +index 3367afd..3e75765 100644 +--- "a/foo\tbar.txt" ++++ "b/foo\tbar.txt" +@@ -1 +1 @@ +-old ++new diff --git "a/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" "b/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" new file mode 100644 index 0000000..3e75765 --- /dev/null +++ "b/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/path_quoted_named_escape/in/foo.patch b/tests/compat/git/path_quoted_named_escape/in/foo.patch new file mode 100644 index 0000000..cfd3f56 --- /dev/null +++ b/tests/compat/git/path_quoted_named_escape/in/foo.patch @@ -0,0 +1,6 @@ +diff --git "a/bel\a" "b/bel\a" +new file mode 100644 +--- /dev/null ++++ "b/bel\a" +@@ -0,0 +1 @@ ++hello diff --git "a/tests/compat/git/path_quoted_named_escape/out/bel\a" "b/tests/compat/git/path_quoted_named_escape/out/bel\a" new file mode 100644 index 0000000..ce01362 --- /dev/null +++ "b/tests/compat/git/path_quoted_named_escape/out/bel\a" @@ -0,0 +1 @@ +hello diff --git a/tests/compat/git/path_quoted_octal_escape/in/foo.patch b/tests/compat/git/path_quoted_octal_escape/in/foo.patch new file mode 100644 index 0000000..5dda9ec --- /dev/null +++ b/tests/compat/git/path_quoted_octal_escape/in/foo.patch @@ -0,0 +1,6 @@ +diff --git "a/tl\033" "b/tl\033" +new file mode 100644 +--- /dev/null ++++ "b/tl\033" +@@ -0,0 +1 @@ ++hello diff --git "a/tests/compat/git/path_quoted_octal_escape/out/tl\033" "b/tests/compat/git/path_quoted_octal_escape/out/tl\033" new file mode 100644 index 0000000..ce01362 --- /dev/null +++ "b/tests/compat/git/path_quoted_octal_escape/out/tl\033" @@ -0,0 +1 @@ +hello diff --git a/tests/compat/git/path_with_spaces/in/foo bar.txt b/tests/compat/git/path_with_spaces/in/foo bar.txt new file mode 100644 index 0000000..3367afd --- /dev/null +++ b/tests/compat/git/path_with_spaces/in/foo bar.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_with_spaces/in/foo.patch b/tests/compat/git/path_with_spaces/in/foo.patch new file mode 100644 index 0000000..b3d1d46 --- /dev/null +++ b/tests/compat/git/path_with_spaces/in/foo.patch @@ -0,0 +1,7 @@ +diff --git a/foo bar.txt b/foo bar.txt +index 3367afd..3e75765 100644 +--- a/foo bar.txt ++++ b/foo bar.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_with_spaces/out/foo bar.txt b/tests/compat/git/path_with_spaces/out/foo bar.txt new file mode 100644 index 0000000..3e75765 --- /dev/null +++ b/tests/compat/git/path_with_spaces/out/foo bar.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/main.rs b/tests/compat/main.rs index 8faf9ea..e35ed07 100644 --- a/tests/compat/main.rs +++ b/tests/compat/main.rs @@ -39,9 +39,11 @@ //! //! 1. Create `case_name/in/` with input file(s) and `foo.patch` //! 2. Run `SNAPSHOTS=overwrite cargo test --test compat` to generate `out/` -//! 3. Add `#[test] fn case_name() { Case::gnu_patch(...).run(); }` in the module +//! 3. Add `#[test] fn case_name() { Case::{gnu_patch,git}(...).run(); }` in the module //! //! For failure tests, use `.expect_success(false)` and skip step 2. +//! For intentional compat divergence, use `.expect_compat(false)`. mod common; +mod git; mod gnu_patch; From ff392af700443c665ebdf62d85758af931bdb28c Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sat, 11 Apr 2026 13:13:10 -0400 Subject: [PATCH 09/26] test(replay): add gitdiff mode support Unlike unidiff, gitdiff produces patches for empty file creations/deletions (`0\t0` in numstat) because they carry `diff --git` + extended headers even without hunks. Binary files (`-\t-\t`) are skipped in gitdiff mode for now. --- tests/replay.rs | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/replay.rs b/tests/replay.rs index 7eaa56b..1d4e731 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -18,8 +18,7 @@ //! * A range (e.g., `abc123..def456`) for a specific commit range //! //! Defaults to 200. Use `0` to verify entire history. -//! * `DIFFY_TEST_PARSE_MODE`: Parse mode to use. -//! Currently only `unidiff` is supported. +//! * `DIFFY_TEST_PARSE_MODE`: Parse mode to use (`unidiff` or `gitdiff`). //! Defaults to `unidiff`. //! //! ## Requirements @@ -161,12 +160,14 @@ impl CatFile { #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum TestMode { UniDiff, + GitDiff, } impl From for ParseOptions { fn from(value: TestMode) -> Self { match value { TestMode::UniDiff => ParseOptions::unidiff(), + TestMode::GitDiff => ParseOptions::gitdiff(), } } } @@ -228,7 +229,8 @@ fn test_mode() -> TestMode { }; match val.trim().to_lowercase().as_str() { "unidiff" => TestMode::UniDiff, - _ => panic!("invalid DIFFY_TEST_PARSE_MODE='{val}': expected 'unidiff'"), + "gitdiff" => TestMode::GitDiff, + _ => panic!("invalid DIFFY_TEST_PARSE_MODE='{val}': expected 'unidiff' or 'gitdiff'"), } } @@ -323,8 +325,13 @@ fn process_commit( // UniDiff format cannot express pure renames (no ---/+++ headers). // Use `--no-renames` to represent them as delete + create instead. + // GitDiff mode handles renames via extended headers natively. let diff_output = match mode { TestMode::UniDiff => git(repo, &["diff", "--no-renames", parent, child]), + // TODO: pass `--binary` once binary patch support lands, + // so binary files get actual delta/literal data instead of + // "Binary files differ" markers. + TestMode::GitDiff => git(repo, &["diff", parent, child]), }; if diff_output.is_empty() { @@ -373,6 +380,30 @@ fn process_commit( } text_files + type_changes } + TestMode::GitDiff => { + // Can't use `--numstat` for GitDiff: it shows `-\t-\t` for both + // actual binary diffs AND pure binary renames (100% similarity). + // Parser correctly handles pure renames (rename headers, no binary content). + // + // Use `--raw` for total count, subtract actual binary markers from diff. + // + // TODO: once `--binary` is passed above, count ALL `--raw` + // entries — every file will have patch data (delta, literal, or text). + let raw = git(repo, &["diff", "--raw", parent, child]); + let (mut total, mut type_changes) = (0, 0); + for line in raw.lines().filter(|l| !l.is_empty()) { + total += 1; + if is_type_change(line) { + type_changes += 1; + } + } + let binary = diff_output + .lines() + .filter(|l| l.starts_with("Binary files ") || l.starts_with("GIT binary patch")) + .count(); + skipped += binary; + total - binary + type_changes + } }; if expected_file_count == 0 { @@ -518,6 +549,7 @@ fn replay() { .unwrap_or_else(|| ".".to_string()); let mode_name = match mode { TestMode::UniDiff => "unidiff", + TestMode::GitDiff => "gitdiff", }; // Shared state for progress reporting From e5895d96bcca079a625dbe644a6110f1015a8ae8 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 12:59:07 -0400 Subject: [PATCH 10/26] chore(ci): run gitdiff also in replay workflow --- .github/workflows/replay.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/replay.yml b/.github/workflows/replay.yml index a9f8f99..08384d6 100644 --- a/.github/workflows/replay.yml +++ b/.github/workflows/replay.yml @@ -44,7 +44,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - parse_mode: [unidiff] + parse_mode: [unidiff, gitdiff] name: ${{ inputs.name && matrix.parse_mode || format('{0} ({1}, {2})', inputs.repo_url, matrix.parse_mode, inputs.commits) }} steps: - uses: actions/checkout@v6 From f1eb3c68c5e33776dc0dc55ddbf152a8752d854a Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 18:46:08 -0400 Subject: [PATCH 11/26] feat(binary): binary patch types and parser * Added types representing both literal and delta Git binary patches * Added a parser for the `GIT binary patch` format. This doesn't include the patch application (which will be added in later commits) The implementation is based on * Specification from * Behavior observation of Git CLI --- src/binary/mod.rs | 352 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 2 files changed, 353 insertions(+) create mode 100644 src/binary/mod.rs diff --git a/src/binary/mod.rs b/src/binary/mod.rs new file mode 100644 index 0000000..696f0d6 --- /dev/null +++ b/src/binary/mod.rs @@ -0,0 +1,352 @@ +//! Git binary diffs support. +//! +//! This module provides parsing and decoding for Git's binary diff format, +//! as generated by `git diff --binary` or `git format-patch --binary`. +//! +//! Based on [DiffX Binary Diffs specification](https://diffx.org/spec/binary-diffs.html). + +use std::{fmt, ops::Range}; + +/// The type of a binary patch block. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BinaryBlockKind { + /// [Literal](https://diffx.org/spec/binary-diffs.html#git-literal-binary-diffs): + /// contains the full file content, zlib-compressed and Base85-encoded. + Literal, + /// [Delta](https://diffx.org/spec/binary-diffs.html#git-delta-binary-diffs): + /// contains delta instructions to transform one file into another. + Delta, +} + +/// A single block in a binary patch. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BinaryBlock<'a> { + /// The type of this block (literal content or delta instructions). + pub kind: BinaryBlockKind, + /// The encoded data. + pub data: BinaryData<'a>, +} + +/// A parsed binary patch. +/// +/// A binary patch contains encoded binary data that can be decoded +/// to recover the original and modified file contents. +/// +/// Git may use different encodings for each direction: +/// +/// - `literal`: full file content +/// - `delta`: instructions to transform one file into another +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BinaryPatch<'a> { + /// A full binary patch with forward and reverse data. + /// + /// The forward block transforms original -> modified. + /// The reverse block transforms modified -> original. + /// + /// Each block can independently be either `literal` or `delta`. + Full { + /// Forward transformation (original -> modified). + forward: BinaryBlock<'a>, + /// Reverse transformation (modified -> original). + reverse: BinaryBlock<'a>, + }, + /// A Git binary diff marker. + /// + /// This represents the `Binary files a/path and b/path differ` case, + /// where git detected a binary change but didn't include the actual data. + Marker, +} + +/// Represents a single binary payload in a Git binary diff. +/// +/// For example, the following patch block +/// +/// * is parsed as `BinaryData { size: 10, data: "UcmV+l0QLU>0RjUA1qKHQ2>\`DEE&u=k" }` +/// * The line starts with a length indicator (`U` = 21 decoded bytes) +/// +/// +/// ```text +/// literal 10 +/// UcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BinaryData<'a> { + /// Uncompressed size in bytes. + pub size: u64, + /// Raw Base85 lines with length indicators. + pub data: &'a str, +} + +/// Error type for binary patch operations. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BinaryPatchParseError { + pub(crate) kind: BinaryPatchParseErrorKind, + span: Option>, +} + +impl BinaryPatchParseError { + /// Creates a new error with the given kind and span. + pub(crate) fn new(kind: BinaryPatchParseErrorKind, span: Range) -> Self { + Self { + kind, + span: Some(span), + } + } + + /// Returns the byte range in the input where the error occurred. + pub fn span(&self) -> Option> { + self.span.clone() + } +} + +impl fmt::Display for BinaryPatchParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(span) = &self.span { + write!( + f, + "error parsing binary patch at byte {}: {}", + span.start, self.kind + ) + } else { + write!(f, "error parsing binary patch: {}", self.kind) + } + } +} + +impl std::error::Error for BinaryPatchParseError {} + +impl From for BinaryPatchParseError { + fn from(kind: BinaryPatchParseErrorKind) -> Self { + Self { kind, span: None } + } +} + +/// The kind of error that occurred when parsing a binary patch. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub(crate) enum BinaryPatchParseErrorKind { + /// Missing or invalid "GIT binary patch" header. + InvalidHeader, + + /// First binary block (forward) not found. + MissingForwardBlock, + + /// Second binary block (reverse) not found. + MissingReverseBlock, + + /// No binary data available (marker-only patch). + NoBinaryData, + + /// Invalid line length indicator in Base85 data. + InvalidLineLengthIndicator, +} + +impl fmt::Display for BinaryPatchParseErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidHeader => write!(f, "invalid binary patch header"), + Self::MissingForwardBlock => write!(f, "first binary block not found"), + Self::MissingReverseBlock => write!(f, "second binary block not found"), + Self::NoBinaryData => write!(f, "no binary data available"), + Self::InvalidLineLengthIndicator => write!(f, "invalid line length indicator"), + } + } +} + +/// Simple streaming parser for binary patches using lines() iterator. +struct BinaryParser<'a> { + input: &'a str, + offset: usize, +} + +impl<'a> BinaryParser<'a> { + fn new(input: &'a str) -> Self { + Self { input, offset: 0 } + } + + fn peek_line(&self) -> Option<&'a str> { + self.input[self.offset..].lines().next() + } + + /// Creates an error with the current offset as span. + fn error(&self, kind: BinaryPatchParseErrorKind) -> BinaryPatchParseError { + BinaryPatchParseError::new(kind, self.offset..self.offset) + } + + fn next_line(&mut self) -> Option<&'a str> { + let rest = &self.input[self.offset..]; + let line = rest.lines().next()?; + self.offset += line.len(); + + // Skip line ending (CRLF or LF) + let remaining = &self.input[self.offset..]; + if remaining.starts_with("\r\n") { + self.offset += 2; + } else if remaining.starts_with('\n') { + self.offset += 1; + } + + Some(line) + } + + fn slice_from(&self, start_offset: usize) -> &'a str { + let end = self.offset; + // Strip trailing line ending before blank line + let slice = &self.input[start_offset..end]; + slice + .strip_suffix("\r\n") + .or_else(|| slice.strip_suffix('\n')) + .unwrap_or(slice) + } +} + +/// Parses binary patch content after git extended headers. +/// +/// Expects input starting with "GIT binary patch" line. +/// Returns the parsed patch and the number of bytes consumed. +/// +/// Format: +/// +/// ```text +/// GIT binary patch +/// +/// +/// +/// +/// +/// ``` +pub(crate) fn parse_binary_patch( + input: &str, +) -> Result<(BinaryPatch<'_>, usize), BinaryPatchParseError> { + let mut parser = BinaryParser::new(input); + + // Expect "GIT binary patch" marker + if parser.next_line() != Some("GIT binary patch") { + return Err(parser.error(BinaryPatchParseErrorKind::InvalidHeader)); + } + + // Parse first block (forward: original -> modified) + let Some(forward) = parse_binary_block(&mut parser) else { + return Err(parser.error(BinaryPatchParseErrorKind::MissingForwardBlock)); + }; + + // Parse second block (reverse: modified -> original) + let Some(reverse) = parse_binary_block(&mut parser) else { + return Err(parser.error(BinaryPatchParseErrorKind::MissingReverseBlock)); + }; + + Ok((BinaryPatch::Full { forward, reverse }, parser.offset)) +} + +/// Parses a single binary block. +/// +/// Returns a `BinaryBlock` with kind (literal/delta) and data. +fn parse_binary_block<'a>(parser: &mut BinaryParser<'a>) -> Option> { + // Parse "literal 10" or "delta 18" + let format_line = parser.next_line()?; + let (patch_type, size_str) = format_line.split_once(' ')?; + let size: u64 = size_str.parse().ok()?; + + let kind = match patch_type { + "literal" => BinaryBlockKind::Literal, + "delta" => BinaryBlockKind::Delta, + _ => return None, + }; + + // Record start of Base85 data + let data_start = parser.offset; + + // Consume Base85 lines until blank line + while let Some(line) = parser.peek_line() { + if line.is_empty() { + parser.next_line(); // Consume blank line + break; + } + parser.next_line(); + } + + let data = parser.slice_from(data_start); + + Some(BinaryBlock { + kind, + data: BinaryData { size, data }, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_literal_format_simple() { + let input = "GIT binary patch\nliteral 10\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\n\nliteral 0\nKcmV+b0RR6000031\n\n"; + let (patch, consumed) = parse_binary_patch(input).unwrap(); + + assert_eq!(consumed, input.len()); + match &patch { + BinaryPatch::Full { forward, reverse } => { + assert_eq!(forward.kind, BinaryBlockKind::Literal); + assert_eq!(forward.data.size, 10); + assert_eq!(reverse.kind, BinaryBlockKind::Literal); + assert_eq!(reverse.data.size, 0); + } + _ => panic!("expected Full variant"), + } + } + + #[test] + fn parse_delta_format() { + let input = "GIT binary patch\ndelta 18\nccmV+t0PX*P2!IH%^Z^9`00000v-trB0x!=5aR2}S\n\ndelta 18\nccmV+t0PX*P2!IH%^Z^BFm9#}av-trB0zxAOrvLx|\n\n"; + let (patch, _) = parse_binary_patch(input).unwrap(); + + match &patch { + BinaryPatch::Full { forward, reverse } => { + assert_eq!(forward.kind, BinaryBlockKind::Delta); + assert_eq!(forward.data.size, 18); + assert_eq!(reverse.kind, BinaryBlockKind::Delta); + assert_eq!(reverse.data.size, 18); + } + _ => panic!("expected Full variant"), + } + } + + #[test] + fn parse_invalid_header() { + // Without "GIT binary patch" marker, parse_binary_patch returns error + let input = "literal 10\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\n\n"; + let err = parse_binary_patch(input).unwrap_err(); + assert_eq!(err.kind, BinaryPatchParseErrorKind::InvalidHeader); + } + + #[test] + fn parse_with_crlf_line_endings() { + let input = "GIT binary patch\r\nliteral 10\r\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\r\n\r\nliteral 0\r\nKcmV+b0RR6000031\r\n\r\n"; + let (patch, consumed) = parse_binary_patch(input).unwrap(); + + assert_eq!(consumed, input.len()); + match &patch { + BinaryPatch::Full { forward, reverse } => { + assert_eq!(forward.kind, BinaryBlockKind::Literal); + assert_eq!(forward.data.size, 10); + assert_eq!(reverse.kind, BinaryBlockKind::Literal); + assert_eq!(reverse.data.size, 0); + } + _ => panic!("expected Full variant"), + } + } + + #[test] + fn parse_mixed_format() { + // Git can use different encoding for each direction + let input = "GIT binary patch\nliteral 10\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\n\ndelta 18\nccmV+t0PX*P2!IH%^Z^9`00000v-trB0x!=5aR2}S\n\n"; + let (patch, _) = parse_binary_patch(input).unwrap(); + + match &patch { + BinaryPatch::Full { forward, reverse } => { + assert_eq!(forward.kind, BinaryBlockKind::Literal); + assert_eq!(reverse.kind, BinaryBlockKind::Delta); + } + _ => panic!("expected Full variant"), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 33ebbb6..5e11a40 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -223,6 +223,7 @@ //! [`create_patch_bytes`]: fn.create_patch_bytes.html mod apply; +pub mod binary; mod diff; mod merge; mod patch; From 1f7f50f49ca4a4210365a948346823466f54be47 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 18:46:16 -0400 Subject: [PATCH 12/26] feat(patch_set): wire binary patch into gitdiff parser - Add `Binary::Keep` variant (now the default) to `ParseOptions` - Add `PatchKind::Binary` variant for binary patches - Parse `GIT binary patch` payload via `parse_binary_patch` - Handle `Binary files ... differ` as `BinaryPatch::Marker` - Add `extract_file_op_binary` for file ops without ---/+++ headers --- src/patch_set/error.rs | 11 ++++ src/patch_set/mod.rs | 35 +++++++++++-- src/patch_set/parse.rs | 115 +++++++++++++++++++++++++++++++++++++++-- src/patch_set/tests.rs | 24 ++++++++- tests/compat/common.rs | 12 ++++- tests/replay.rs | 5 ++ 6 files changed, 192 insertions(+), 10 deletions(-) diff --git a/src/patch_set/error.rs b/src/patch_set/error.rs index 9cd6aa9..20d7d3e 100644 --- a/src/patch_set/error.rs +++ b/src/patch_set/error.rs @@ -3,6 +3,7 @@ use std::fmt; use std::ops::Range; +use crate::binary::BinaryPatchParseError; use crate::patch::ParsePatchError; /// An error returned when parsing patches fails. @@ -79,6 +80,9 @@ pub(crate) enum PatchSetParseErrorKind { /// Binary diff not supported in current configuration. BinaryNotSupported { path: String }, + + /// Binary patch parsing failed. + BinaryParse(BinaryPatchParseError), } impl fmt::Display for PatchSetParseErrorKind { @@ -95,6 +99,7 @@ impl fmt::Display for PatchSetParseErrorKind { Self::BinaryNotSupported { path } => { write!(f, "binary diff not supported: {path}") } + Self::BinaryParse(e) => write!(f, "{e}"), } } } @@ -104,3 +109,9 @@ impl From for PatchSetParseError { PatchSetParseErrorKind::Patch(e).into() } } + +impl From for PatchSetParseError { + fn from(e: BinaryPatchParseError) -> Self { + PatchSetParseErrorKind::BinaryParse(e).into() + } +} diff --git a/src/patch_set/mod.rs b/src/patch_set/mod.rs index f83442c..6141c54 100644 --- a/src/patch_set/mod.rs +++ b/src/patch_set/mod.rs @@ -10,6 +10,7 @@ mod tests; use std::borrow::Cow; +use crate::binary::BinaryPatch; use crate::Patch; pub use error::PatchSetParseError; @@ -31,7 +32,7 @@ pub use parse::PatchSet; /// Note that this is not a documented Git behavior, /// so the implementation here is subject to change if Git changes. /// -/// By default, binary diffs are skipped. +/// By default, binary diffs are kept in the output. /// /// ## Example /// @@ -70,10 +71,12 @@ pub(crate) enum Format { #[derive(Debug, Clone, Copy, Default)] pub(crate) enum Binary { /// Skip binary diffs silently. - #[default] Skip, /// Return error if binary diff encountered. Fail, + /// Keep binary diffs in the output. + #[default] + Keep, } impl ParseOptions { @@ -103,7 +106,7 @@ impl ParseOptions { /// * `diff --git` headers /// * Extended headers (`new file mode`, `deleted file mode`, etc.) /// * Rename/copy detection (`rename from`/`rename to`, `copy from`/`copy to`) - /// * Binary file detection (skipped by default) + /// * Binary file detection (kept by default) /// /// [git-diff-format]: https://git-scm.com/docs/diff-format pub fn gitdiff() -> Self { @@ -158,6 +161,8 @@ impl std::str::FromStr for FileMode { pub enum PatchKind<'a, T: ToOwned + ?Sized> { /// Text patch with hunks. Text(Patch<'a, T>), + /// Binary patch (literal or delta encoded, or marker-only). + Binary(BinaryPatch<'a>), } impl std::fmt::Debug for PatchKind<'_, T> @@ -168,6 +173,7 @@ where fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { PatchKind::Text(patch) => f.debug_tuple("Text").field(patch).finish(), + PatchKind::Binary(patch) => f.debug_tuple("Binary").field(patch).finish(), } } } @@ -177,6 +183,15 @@ impl<'a, T: ToOwned + ?Sized> PatchKind<'a, T> { pub fn as_text(&self) -> Option<&Patch<'a, T>> { match self { PatchKind::Text(patch) => Some(patch), + PatchKind::Binary(_) => None, + } + } + + /// Returns the binary patch, or `None` if this is a text patch. + pub fn as_binary(&self) -> Option<&BinaryPatch<'a>> { + match self { + PatchKind::Binary(patch) => Some(patch), + PatchKind::Text(_) => None, } } } @@ -224,6 +239,20 @@ impl<'a, T: ToOwned + ?Sized> FilePatch<'a, T> { } } + fn new_binary( + operation: FileOperation<'a>, + patch: BinaryPatch<'a>, + old_mode: Option, + new_mode: Option, + ) -> Self { + Self { + operation, + kind: PatchKind::Binary(patch), + old_mode, + new_mode, + } + } + /// Returns the file operation for this patch. pub fn operation(&self) -> &FileOperation<'a> { &self.operation diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index 0d8b994..b089479 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -4,6 +4,7 @@ use super::{ error::PatchSetParseErrorKind, Binary, FileMode, FileOperation, FilePatch, Format, ParseOptions, PatchSetParseError, }; +use crate::binary::{parse_binary_patch, BinaryPatch}; use crate::patch::parse::parse_one; use crate::utils::escaped_filename; use crate::Patch; @@ -86,8 +87,8 @@ impl<'a> PatchSet<'a> { let (header, header_consumed) = GitHeader::parse(&self.input[self.offset..]); self.offset += header_consumed; - // Handle binary markers ("Binary files ... differ") - if header.is_binary_marker || header.is_binary_patch { + // Handle "Binary files ... differ" (no patch data) + if header.is_binary_marker { match self.opts.binary { Binary::Skip => { return self.next_gitdiff_patch(); @@ -99,6 +100,77 @@ impl<'a> PatchSet<'a> { abs_patch_start..abs_patch_start, ))); } + Binary::Keep => { + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_binary(&header) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + return Some(Ok(FilePatch::new_binary( + operation, + BinaryPatch::Marker, + old_mode, + new_mode, + ))); + } + } + } + + // Handle "GIT binary patch" (has patch data) + if let Some(binary_patch_start) = header.binary_patch_offset { + match self.opts.binary { + Binary::Skip => { + return self.next_gitdiff_patch(); + } + Binary::Fail => { + let path = header.diff_git_line.unwrap_or("").to_owned(); + return Some(Err(PatchSetParseError::new( + PatchSetParseErrorKind::BinaryNotSupported { path }, + abs_patch_start..abs_patch_start, + ))); + } + Binary::Keep => { + // GitHeader::parse consumed the marker line but not the payload. + // Use the recorded offset to pass input from the marker onward. + let binary_input = &self.input[abs_patch_start + binary_patch_start..]; + let (binary_patch, consumed) = match parse_binary_patch(binary_input) { + Ok(result) => result, + Err(e) => return Some(Err(e.into())), + }; + self.offset = abs_patch_start + binary_patch_start + consumed; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_binary(&header) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + return Some(Ok(FilePatch::new_binary( + operation, + binary_patch, + old_mode, + new_mode, + ))); + } } } @@ -301,7 +373,8 @@ struct GitHeader<'a> { /// Binary files /dev/null and b/image.png differ /// ``` is_binary_marker: bool, - /// Whether this is a binary diff with actual patch content. + /// Byte offset of `"GIT binary patch"` line relative to header input, + /// or `None` if no binary patch content was found. /// /// Observed `git diff --binary` output: /// @@ -316,7 +389,7 @@ struct GitHeader<'a> { /// literal 0 /// KcmV+b0RR6000031 /// ``` - is_binary_patch: bool, + binary_patch_offset: Option, } impl<'a> GitHeader<'a> { @@ -363,7 +436,7 @@ impl<'a> GitHeader<'a> { } else if line.starts_with("Binary files ") { header.is_binary_marker = true; } else if line.starts_with("GIT binary patch") { - header.is_binary_patch = true; + header.binary_patch_offset = Some(consumed); } else { // Unrecognized line: End of extended headers // (typically `---`/`+++`/`@@` or trailing content). @@ -416,6 +489,38 @@ fn extract_file_op_gitdiff<'a>( } } +/// Extracts file operation for binary patches (no ---/+++ headers). +fn extract_file_op_binary<'a>( + header: &GitHeader<'a>, +) -> Result, PatchSetParseError> { + // Git headers are authoritative for rename/copy + if let (Some(from), Some(to)) = (header.rename_from, header.rename_to) { + return Ok(FileOperation::Rename { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + if let (Some(from), Some(to)) = (header.copy_from, header.copy_to) { + return Ok(FileOperation::Copy { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + + // Use `diff --git ` for binary patches. + let Some((original, modified)) = header.diff_git_line.and_then(parse_diff_git_path) else { + return Err(PatchSetParseErrorKind::InvalidDiffGitPath.into()); + }; + + if header.new_file_mode.is_some() { + Ok(FileOperation::Create(modified)) + } else if header.deleted_file_mode.is_some() { + Ok(FileOperation::Delete(original)) + } else { + Ok(FileOperation::Modify { original, modified }) + } +} + /// Parses file modes from git extended headers. fn parse_file_modes( header: &GitHeader<'_>, diff --git a/src/patch_set/tests.rs b/src/patch_set/tests.rs index 9648930..0944c54 100644 --- a/src/patch_set/tests.rs +++ b/src/patch_set/tests.rs @@ -535,6 +535,7 @@ index 0000000..e69de29 ); match patches[0].patch() { PatchKind::Text(p) => assert!(p.hunks().is_empty()), + PatchKind::Binary(_) => panic!("expected text patch"), } } @@ -566,6 +567,25 @@ diff --git a/foo b/foo )); } + #[test] + fn binary_marker_kept_by_default() { + // Default is Keep: binary marker is returned as BinaryPatch::Marker. + let input = "\ +diff --git a/img.png b/img.png +Binary files a/img.png and b/img.png differ +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old ++new +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 2); + assert!(patches[0].patch().as_binary().is_some()); + assert!(patches[1].patch().as_text().is_some()); + } + #[test] fn binary_skip_does_not_stall() { // Binary marker must be skipped and offset must advance @@ -580,7 +600,9 @@ diff --git a/foo b/foo -old +new "; - let patches = parse_gitdiff(input); + let patches: Vec<_> = PatchSet::parse(input, ParseOptions::gitdiff().skip_binary()) + .collect::>() + .unwrap(); assert_eq!(patches.len(), 1, "binary should be skipped, text parsed"); } } diff --git a/tests/compat/common.rs b/tests/compat/common.rs index 117ed04..c55cdd1 100644 --- a/tests/compat/common.rs +++ b/tests/compat/common.rs @@ -8,7 +8,10 @@ use std::{ sync::Once, }; -use diffy::patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet, PatchSetParseError}; +use diffy::{ + binary::BinaryPatch, + patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet, PatchSetParseError}, +}; /// Which external tool to compare against. #[derive(Clone, Copy)] @@ -368,6 +371,13 @@ pub fn apply_diffy( } fs::write(&result_path, result.as_bytes()).unwrap(); } + PatchKind::Binary(BinaryPatch::Marker) => { + // Dont do anything if it is just a binary patch marker.[ + } + PatchKind::Binary(_) => { + // Binary patch application requires the `binary` feature. + // Will be wired up when that feature is added. + } } } diff --git a/tests/replay.rs b/tests/replay.rs index 1d4e731..4996151 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -512,6 +512,11 @@ fn process_commit( ); } } + PatchKind::Binary(_) => { + // Binary patch application not yet wired up in replay tests. + // Will be done once the `binary` Cargo feature is added. + skipped += 1; + } } applied += 1; From 4b299387ec0ac56ba9d3e38a1fb28759641ded69 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 2 Feb 2026 07:22:59 +0800 Subject: [PATCH 13/26] refactor: clippy manual_div_ceil The API was stabilized in 1.73. The lint was added in 1.93. This is required for a MSRV bump to 1.75 --- src/diff/myers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diff/myers.rs b/src/diff/myers.rs index 73eb410..502d077 100644 --- a/src/diff/myers.rs +++ b/src/diff/myers.rs @@ -69,7 +69,7 @@ impl ::std::fmt::Display for Snake { fn max_d(len1: usize, len2: usize) -> usize { // XXX look into reducing the need to have the additional '+ 1' - (len1 + len2 + 1) / 2 + 1 + (len1 + len2).div_ceil(2) + 1 } // The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes some of which may From 70eb62f6d8b890b473761b14db24a0cdc003332b Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 21:15:04 -0400 Subject: [PATCH 14/26] chore: dependency flate2 behind binary feature This is a preparation for binary diff application support. * Git binary patch is compressed by zlib hence flate2 * zlib-rs (which is the most performant zlib backend) requires MSRF 1.75.0+ hence the bump. --- .github/workflows/ci.yml | 2 +- Cargo.lock | 39 +++++++++++++++++++++++++++++++++++++++ Cargo.toml | 4 +++- deny.toml | 1 + 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d1c8292..b1eb5a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - rust: [stable, beta, nightly, 1.70.0] + rust: [stable, beta, nightly, 1.75.0] steps: - uses: actions/checkout@v6 diff --git a/Cargo.lock b/Cargo.lock index ad290e3..c726b1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "anstream" version = "0.6.21" @@ -109,6 +115,7 @@ name = "diffy" version = "0.4.2" dependencies = [ "anstyle", + "flate2", "rayon", "snapbox", ] @@ -152,6 +159,16 @@ dependencies = [ "libredox", ] +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "miniz_oxide", + "zlib-rs", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -200,6 +217,16 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -281,6 +308,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + [[package]] name = "similar" version = "2.7.0" @@ -446,3 +479,9 @@ name = "wit-bindgen" version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" diff --git a/Cargo.toml b/Cargo.toml index 25c6e51..68ce3ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,14 +9,16 @@ repository = "https://github.com/bmwill/diffy" readme = "README.md" keywords = ["diff", "patch", "merge"] categories = ["text-processing"] -rust-version = "1.70.0" +rust-version = "1.75.0" edition = "2021" [features] +binary = ["dep:flate2"] color = ["dep:anstyle"] [dependencies] anstyle = { version = "1.0.13", optional = true } +flate2 = { version = "1.1.9", optional = true, default-features = false, features = ["zlib-rs"] } [dev-dependencies] rayon = "1.10.0" diff --git a/deny.toml b/deny.toml index d495b7c..740f35a 100644 --- a/deny.toml +++ b/deny.toml @@ -91,6 +91,7 @@ ignore = [ allow = [ "MIT", "Apache-2.0", + "Zlib", #"Apache-2.0 WITH LLVM-exception", ] # The confidence threshold for detecting a license from license text. From 3540c162477491f1661fd53a854451222fd3c3f3 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 21:25:54 -0400 Subject: [PATCH 15/26] feat(binary): base85/delta decode and patch application * Add base85 encoder/decoder and Git delta format decoder. * Wire them into `BinaryPatch::apply() and `apply_reverse()` for decoding zlib-compressed, base85-encoded binary payload. These are feature-gated behind the `binary` feature. --- src/binary/base85.rs | 229 +++++++++++++++++++++++++++ src/binary/delta.rs | 362 +++++++++++++++++++++++++++++++++++++++++++ src/binary/mod.rs | 210 +++++++++++++++++++++++++ 3 files changed, 801 insertions(+) create mode 100644 src/binary/base85.rs create mode 100644 src/binary/delta.rs diff --git a/src/binary/base85.rs b/src/binary/base85.rs new file mode 100644 index 0000000..182efe8 --- /dev/null +++ b/src/binary/base85.rs @@ -0,0 +1,229 @@ +//! Base85 encoding and decoding using the character set defined in [RFC 1924]. +//! +//! ## References +//! +//! * [RFC 1924] +//! * [Wikipedia: Ascii85 § RFC 1924 version](https://en.wikipedia.org/wiki/Ascii85#RFC_1924_version) +//! +//! [RFC 1924]: https://datatracker.ietf.org/doc/html/rfc1924 + +use std::fmt; + +/// Base85 character set (RFC 1924). +const ALPHABET: &[u8; 85] = b"0123456789\ + ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + !#$%&()*+-;<=>?@^_`{|}~"; + +/// Pre-computed lookup table for Base85 decoding. +/// +/// Maps ASCII byte value → digit value or `0xFF` for invalid characters. +/// This provides O(1) lookup. +const TABLE: [u8; 256] = { + let mut table = [0xFFu8; 256]; + let mut i = 0usize; + while i < 85 { + table[ALPHABET[i] as usize] = i as u8; + i += 1; + } + table +}; + +/// Error type for Base85 operations. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Base85Error { + /// Invalid character that is not in RFC 1924 alphabet. + InvalidCharacter(char), + /// Invalid input length for the operation. + InvalidLength, +} + +impl fmt::Display for Base85Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Base85Error::InvalidCharacter(c) => write!(f, "invalid base85 character: {:?}", c), + Base85Error::InvalidLength => write!(f, "invalid input length"), + } + } +} + +impl std::error::Error for Base85Error {} + +/// Decodes a Base85 string to the provided output. +/// +/// ## Limitations +/// +/// The input length must be a multiple of 5. +/// +/// This function does not handle padding for partial chunks. +/// When decoding data where the original byte count isn't a multiple of 4, +/// callers must handle truncation at a higher level. +/// For example, via a length indicator in Git binary patch. +pub fn decode_into(input: &str, output: &mut impl Extend) -> Result<(), Base85Error> { + let bytes = input.as_bytes(); + + if bytes.len() % 5 != 0 { + return Err(Base85Error::InvalidLength); + } + + // TODO: Use `as_chunks::<5>()` when MSRV >= 1.88 + for chunk in bytes.chunks_exact(5) { + let mut value: u32 = 0; + for &byte in chunk { + let digit = TABLE[byte as usize]; + if digit == 0xFF { + return Err(Base85Error::InvalidCharacter(byte as char)); + } + value = value * 85 + digit as u32; + } + + output.extend(value.to_be_bytes()); + } + + Ok(()) +} + +/// Encodes bytes in Base85 to the provided output. +/// +/// ## Limitations +/// +/// The input length must be a multiple of 4. +/// +/// This function does not handle padding for partial chunks. +/// Callers encoding data where the byte count isn't a multiple of 4 +/// must handle padding at a higher level. +/// For example, via a length indicator in Git binary patch format. +#[allow(dead_code)] // will be used for patch formatting +pub fn encode_into(input: &[u8], output: &mut impl Extend) -> Result<(), Base85Error> { + if input.len() % 4 != 0 { + return Err(Base85Error::InvalidLength); + } + + // TODO: Use `as_chunks::<4>()` when MSRV >= 1.88 + for chunk in input.chunks_exact(4) { + let mut value = u32::from_be_bytes(chunk.try_into().unwrap()); + + // Extract 5 base85 digits (least to most significant order) + let mut digits = [0u8; 5]; + for digit in digits.iter_mut().rev() { + *digit = ALPHABET[(value % 85) as usize]; + value /= 85; + } + output.extend(digits.iter().map(|&b| b as char)); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn decode(input: &str) -> Result, Base85Error> { + let mut result = Vec::with_capacity((input.len() / 5) * 4); + decode_into(input, &mut result)?; + Ok(result) + } + + fn encode(input: &[u8]) -> Result { + let mut result = String::with_capacity((input.len() / 4) * 5); + encode_into(input, &mut result)?; + Ok(result) + } + + const TEST_VECTORS: &[(&[u8], &str)] = &[ + (b"", ""), + (&[0x00, 0x00, 0x00, 0x00], "00000"), + (&[0xff, 0xff, 0xff, 0xff], "|NsC0"), + // Rust ecosystem phrases + (b"Rust", "Qgw55"), + (b"Fearless concurrency", "MrC1gY-MwEAY*TCV|8+JWo~16"), + (b"memory safe!", "ZDnn5a(N(gVP<6^"), + (b"blazing fast", "Vr*f0X>MmAW?^%5"), + ( + b"zero-cost abstraction!??", + "dS!BNEn{zUbRc13b98cHV{~b6ZXrKE", + ), + ]; + + #[test] + fn table_covers_all_alphabet_chars() { + for (i, &c) in ALPHABET.iter().enumerate() { + assert_eq!( + TABLE[c as usize], i as u8, + "mismatch for char '{}' at index {}", + c as char, i + ); + } + } + + #[test] + fn table_rejects_invalid_chars() { + let invalid_chars = b" \t\n\r\"'\\[],:"; + for &c in invalid_chars { + assert_eq!( + TABLE[c as usize], 0xFF, + "char '{}' should be invalid", + c as char + ); + } + } + + #[test] + fn decode_test_vectors() { + for (bytes, encoded) in TEST_VECTORS { + let result = decode(encoded).unwrap(); + assert_eq!(&result, *bytes, "decode({:?}) failed", encoded); + } + } + + #[test] + fn encode_test_vectors() { + for (bytes, encoded) in TEST_VECTORS { + let result = encode(bytes).unwrap(); + assert_eq!(result, *encoded, "encode({:?}) failed", bytes); + } + } + + #[test] + fn decode_invalid_length() { + assert!(matches!(decode("0000"), Err(Base85Error::InvalidLength))); + assert!(matches!(decode("000"), Err(Base85Error::InvalidLength))); + assert!(matches!(decode("00"), Err(Base85Error::InvalidLength))); + assert!(matches!(decode("0"), Err(Base85Error::InvalidLength))); + } + + #[test] + fn decode_invalid_character() { + assert!(matches!( + decode("0000 "), + Err(Base85Error::InvalidCharacter(' ')) + )); + assert!(matches!( + decode("0000\""), + Err(Base85Error::InvalidCharacter('"')) + )); + } + + #[test] + fn encode_invalid_length() { + assert!(matches!(encode(&[0]), Err(Base85Error::InvalidLength))); + assert!(matches!(encode(&[0, 0]), Err(Base85Error::InvalidLength))); + assert!(matches!( + encode(&[0, 0, 0]), + Err(Base85Error::InvalidLength) + )); + assert!(matches!( + encode(&[0, 0, 0, 0, 0]), + Err(Base85Error::InvalidLength) + )); + } + + #[test] + fn round_trip() { + for (bytes, _) in TEST_VECTORS { + let encoded = encode(bytes).unwrap(); + let decoded = decode(&encoded).unwrap(); + assert_eq!(&decoded, *bytes, "round-trip failed for {:?}", bytes); + } + } +} diff --git a/src/binary/delta.rs b/src/binary/delta.rs new file mode 100644 index 0000000..fa1de26 --- /dev/null +++ b/src/binary/delta.rs @@ -0,0 +1,362 @@ +//! Git delta binary diff support. +//! +//! A delta payload contains: +//! +//! 1. Header: variable-length encoded sizes (original_size, modified_size) +//! 2. Instructions: sequence of `ADD` and `COPY` operations +//! +//! Based on Diffx's [Git Delta Binary Diffs](https://diffx.org/spec/binary-diffs.html#git-delta-binary-diffs) + +use std::fmt; + +/// Applies delta instructions to an original file, producing the modified file. +pub fn apply(original: &[u8], delta: &[u8]) -> Result, DeltaError> { + let mut cursor = DeltaCursor::new(delta); + + let header_orig_size = cursor.read_size()?; + let header_mod_size = cursor.read_size()?; + + // Validate original size + if original.len() as u64 != header_orig_size { + return Err(DeltaError::OriginalSizeMismatch { + expected: header_orig_size, + actual: original.len() as u64, + }); + } + + let mut result = Vec::with_capacity(header_mod_size as usize); + + // Process instructions until we've consumed all delta data + while !cursor.is_empty() { + let control = cursor.read_byte()?; + + if control & 0x80 != 0 { + // COPY instruction + let (src_offset, copy_len) = cursor.read_copy_params(control)?; + let src_end = src_offset + .checked_add(copy_len) + .ok_or(DeltaError::InvalidCopyRange)?; + + if src_end > original.len() { + return Err(DeltaError::CopyOutOfBounds { + offset: src_offset, + length: copy_len, + original_size: original.len(), + }); + } + + result.extend_from_slice(&original[src_offset..src_end]); + } else { + // ADD instruction + let add_len = control as usize; + let data = cursor.read_bytes(add_len)?; + result.extend_from_slice(data); + } + } + + // Validate result size + if result.len() as u64 != header_mod_size { + return Err(DeltaError::ModifiedSizeMismatch { + expected: header_mod_size, + actual: result.len() as u64, + }); + } + + Ok(result) +} + +/// Cursor for reading delta instructions. +struct DeltaCursor<'a> { + data: &'a [u8], + offset: usize, +} + +impl<'a> DeltaCursor<'a> { + fn new(data: &'a [u8]) -> Self { + Self { data, offset: 0 } + } + + fn is_empty(&self) -> bool { + self.offset >= self.data.len() + } + + fn read_byte(&mut self) -> Result { + if self.offset >= self.data.len() { + return Err(DeltaError::UnexpectedEof); + } + let byte = self.data[self.offset]; + self.offset += 1; + Ok(byte) + } + + fn read_bytes(&mut self, len: usize) -> Result<&'a [u8], DeltaError> { + let end = self + .offset + .checked_add(len) + .ok_or(DeltaError::UnexpectedEof)?; + if end > self.data.len() { + return Err(DeltaError::UnexpectedEof); + } + let bytes = &self.data[self.offset..end]; + self.offset = end; + Ok(bytes) + } + + /// Reads a variable-length encoded size from the header. + /// + /// Format: each byte uses 7 bits for value, MSB indicates continuation. + /// Bytes are in little-endian order (LSB first). + fn read_size(&mut self) -> Result { + let mut file_len: u64 = 0; + let mut shift: u32 = 0; + + loop { + let byte = self.read_byte()?; + + // Add 7 bits of value at current shift position + let value = (byte & 0x7F) as u64; + file_len |= value.checked_shl(shift).ok_or(DeltaError::SizeOverflow)?; + + // MSB clear means this is the last byte + if byte & 0x80 == 0 { + break; + } + + shift += 7; + } + + Ok(file_len) + } + + /// Reads `COPY` instruction parameters from the control byte. + /// Returns `(src_offset, copy_len)`. + /// + /// Control byte format is `1oooosss`: + /// + /// * Bits 0-3: src_offset bytes + /// * Bits 4-6: copy_len bytes + fn read_copy_params(&mut self, control: u8) -> Result<(usize, usize), DeltaError> { + let mut src_offset: u32 = 0; + for (mask, shift) in [(0x01, 0), (0x02, 8), (0x04, 16), (0x08, 24)] { + if control & mask != 0 { + let byte = self.read_byte()? as u32; + src_offset |= byte.checked_shl(shift).ok_or(DeltaError::SizeOverflow)?; + } + } + + let mut copy_len: u32 = 0; + for (mask, shift) in [(0x10, 0), (0x20, 8), (0x40, 16)] { + if control & mask != 0 { + let byte = self.read_byte()? as u32; + copy_len |= byte.checked_shl(shift).ok_or(DeltaError::SizeOverflow)?; + } + } + + if copy_len == 0 { + // Size of 0 means 65536 + copy_len = 0x10000; + } + + Ok((src_offset as usize, copy_len as usize)) + } +} + +/// Error type for delta operations. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DeltaError { + /// Unexpected end of delta data. + UnexpectedEof, + /// Size value overflowed during decoding. + SizeOverflow, + /// Original file size doesn't match header. + OriginalSizeMismatch { expected: u64, actual: u64 }, + /// Modified file size doesn't match header. + ModifiedSizeMismatch { expected: u64, actual: u64 }, + /// COPY instruction references out-of-bounds data. + CopyOutOfBounds { + offset: usize, + length: usize, + original_size: usize, + }, + /// COPY range calculation overflowed. + InvalidCopyRange, +} + +impl fmt::Display for DeltaError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DeltaError::UnexpectedEof => write!(f, "unexpected end of delta data"), + DeltaError::SizeOverflow => write!(f, "size value overflow"), + DeltaError::OriginalSizeMismatch { expected, actual } => { + write!( + f, + "original size mismatch: expected {expected}, got {actual}" + ) + } + DeltaError::ModifiedSizeMismatch { expected, actual } => { + write!( + f, + "modified size mismatch: expected {expected}, got {actual}" + ) + } + DeltaError::CopyOutOfBounds { + offset, + length, + original_size, + } => { + write!( + f, + "copy out of bounds: offset={offset}, length={length}, original_size={original_size}" + ) + } + DeltaError::InvalidCopyRange => write!(f, "copy range calculation overflow"), + } + } +} + +impl std::error::Error for DeltaError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn read_size_single_byte() { + // 0x0A = 10, MSB clear = end + let data = [0x0A]; + let mut cursor = DeltaCursor::new(&data); + assert_eq!(cursor.read_size().unwrap(), 10); + } + + #[test] + fn read_size_multi_byte() { + // 0x80 | 0x01 = 1, continue; 0x02 = 2 << 7 = 256; total = 257 + let data = [0x81, 0x02]; + let mut cursor = DeltaCursor::new(&data); + assert_eq!(cursor.read_size().unwrap(), 1 + (2 << 7)); + } + + #[test] + fn apply_add_only() { + // Header: orig_size=0, mod_size=5 + // ADD 5 bytes: "hello" + let delta = [ + 0x00, // orig_size = 0 + 0x05, // mod_size = 5 + 0x05, // ADD 5 bytes + b'h', b'e', b'l', b'l', b'o', + ]; + let result = apply(&[], &delta).unwrap(); + assert_eq!(result, b"hello"); + } + + #[test] + fn apply_copy_only() { + // Header: orig_size=5, mod_size=5 + // COPY offset=0, len=5 + let delta = [ + 0x05, // orig_size = 5 + 0x05, // mod_size = 5 + 0x90, // COPY: control=0x90 (0x80 | 0x10), offset=0, size byte present + 0x05, // size = 5 + ]; + let original = b"hello"; + let result = apply(original, &delta).unwrap(); + assert_eq!(result, b"hello"); + } + + #[test] + fn apply_copy_with_offset() { + // Header: orig_size=10, mod_size=5 + // COPY offset=5, len=5 + let delta = [ + 0x0A, // orig_size = 10 + 0x05, // mod_size = 5 + 0x91, // COPY: 0x80 | 0x10 | 0x01 (offset1 + size1 present) + 0x05, // offset = 5 + 0x05, // size = 5 + ]; + let original = b"helloworld"; + let result = apply(original, &delta).unwrap(); + assert_eq!(result, b"world"); + } + + #[test] + fn apply_mixed_instructions() { + // Create "HELLO world" from "hello world" + // Header: orig_size=11, mod_size=11 + // ADD 5: "HELLO" + // COPY offset=5, len=6: " world" + let delta = [ + 0x0B, // orig_size = 11 + 0x0B, // mod_size = 11 + 0x05, // ADD 5 bytes + b'H', b'E', b'L', b'L', b'O', // "HELLO" + 0x91, // COPY: offset1 + size1 present + 0x05, // offset = 5 + 0x06, // size = 6 + ]; + let original = b"hello world"; + let result = apply(original, &delta).unwrap(); + assert_eq!(result, b"HELLO world"); + } + + #[test] + fn apply_copy_size_zero_means_65536() { + // When size bytes result in 0, it means 65536 + // Header: orig_size=65536, mod_size=65536 + // COPY offset=0, len=65536 (encoded as 0) + let original = vec![0xAB; 65536]; + let delta = [ + 0x80, 0x80, 0x04, // orig_size = 65536 (varint) + 0x80, 0x80, 0x04, // mod_size = 65536 (varint) + 0x80, // COPY: no offset bytes, no size bytes = offset 0, size 65536 + ]; + let result = apply(&original, &delta).unwrap(); + assert_eq!(result.len(), 65536); + assert_eq!(result, original); + } + + #[test] + fn error_original_size_mismatch() { + let delta = [ + 0x0A, // orig_size = 10 + 0x05, // mod_size = 5 + ]; + let original = b"short"; // only 5 bytes + let err = apply(original, &delta).unwrap_err(); + assert!(matches!( + err, + DeltaError::OriginalSizeMismatch { + expected: 10, + actual: 5 + } + )); + } + + #[test] + fn error_copy_out_of_bounds() { + let delta = [ + 0x05, // orig_size = 5 + 0x05, // mod_size = 5 + 0x91, // COPY + 0x0A, // offset = 10 (out of bounds!) + 0x05, // size = 5 + ]; + let original = b"hello"; + let err = apply(original, &delta).unwrap_err(); + assert!(matches!(err, DeltaError::CopyOutOfBounds { .. })); + } + + #[test] + fn error_unexpected_eof_in_add() { + let delta = [ + 0x00, // orig_size = 0 + 0x05, // mod_size = 5 + 0x05, // ADD 5 bytes + b'h', b'i', // only 2 bytes provided + ]; + let err = apply(&[], &delta).unwrap_err(); + assert_eq!(err, DeltaError::UnexpectedEof); + } +} diff --git a/src/binary/mod.rs b/src/binary/mod.rs index 696f0d6..c1c7ab4 100644 --- a/src/binary/mod.rs +++ b/src/binary/mod.rs @@ -5,6 +5,11 @@ //! //! Based on [DiffX Binary Diffs specification](https://diffx.org/spec/binary-diffs.html). +#[cfg(feature = "binary")] +mod base85; +#[cfg(feature = "binary")] +mod delta; + use std::{fmt, ops::Range}; /// The type of a binary patch block. @@ -54,9 +59,69 @@ pub enum BinaryPatch<'a> { /// /// This represents the `Binary files a/path and b/path differ` case, /// where git detected a binary change but didn't include the actual data. + /// + /// Calling [`apply()`](Self::apply) on this variant returns an error. Marker, } +impl<'a> BinaryPatch<'a> { + /// Applies a binary patch forward: original -> modified. + /// + /// - If the forward block is `Literal`: returns the decoded content directly. + /// - If the forward block is `Delta`: applies delta instructions to `original`. + /// + /// Unlike `git apply`, this doesn't validate the original content hash. + #[cfg(feature = "binary")] + pub fn apply(&self, original: &[u8]) -> Result, BinaryPatchParseError> { + match self { + BinaryPatch::Full { forward, .. } => Self::apply_block(forward, original), + BinaryPatch::Marker => Err(BinaryPatchParseErrorKind::NoBinaryData.into()), + } + } + + /// Applies a binary patch in reverse: modified -> original. + /// + /// - If the reverse block is `Literal`: returns the decoded content directly. + /// - If the reverse block is `Delta`: applies delta instructions to `modified`. + /// + /// Unlike `git apply`, this doesn't validate the modified content hash. + #[cfg(feature = "binary")] + pub fn apply_reverse(&self, modified: &[u8]) -> Result, BinaryPatchParseError> { + match self { + BinaryPatch::Full { reverse, .. } => Self::apply_block(reverse, modified), + BinaryPatch::Marker => Err(BinaryPatchParseErrorKind::NoBinaryData.into()), + } + } + + /// Applies a single block (either literal or delta). + #[cfg(feature = "binary")] + fn apply_block(block: &BinaryBlock<'_>, base: &[u8]) -> Result, BinaryPatchParseError> { + match block.kind { + BinaryBlockKind::Literal => Self::decode_data(&block.data), + BinaryBlockKind::Delta => { + let delta_instructions = Self::decode_data(&block.data)?; + delta::apply(base, &delta_instructions).map_err(BinaryPatchParseError::from) + } + } + } + + /// See [Decoding Logic](https://diffx.org/spec/binary-diffs.html#decoding-logic) + #[cfg(feature = "binary")] + fn decode_data(binary_data: &BinaryData<'_>) -> Result, BinaryPatchParseError> { + use std::io::Read; + + let compressed = decode_base85_lines(binary_data.data)?; + + let mut decoder = flate2::read::ZlibDecoder::new(&compressed[..]); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .map_err(|e| BinaryPatchParseErrorKind::DecompressionFailed(e.to_string()))?; + + Ok(decompressed) + } +} + /// Represents a single binary payload in a Git binary diff. /// /// For example, the following patch block @@ -115,6 +180,20 @@ impl fmt::Display for BinaryPatchParseError { impl std::error::Error for BinaryPatchParseError {} +#[cfg(feature = "binary")] +impl From for BinaryPatchParseError { + fn from(e: base85::Base85Error) -> Self { + BinaryPatchParseErrorKind::Base85(e).into() + } +} + +#[cfg(feature = "binary")] +impl From for BinaryPatchParseError { + fn from(e: delta::DeltaError) -> Self { + BinaryPatchParseErrorKind::Delta(e).into() + } +} + impl From for BinaryPatchParseError { fn from(kind: BinaryPatchParseErrorKind) -> Self { Self { kind, span: None } @@ -126,6 +205,8 @@ impl From for BinaryPatchParseError { #[non_exhaustive] pub(crate) enum BinaryPatchParseErrorKind { /// Missing or invalid "GIT binary patch" header. + // TODO: Switch to #[expect(dead_code)] when MSRV >= 1.81 + #[cfg_attr(not(feature = "binary"), allow(dead_code))] InvalidHeader, /// First binary block (forward) not found. @@ -135,10 +216,26 @@ pub(crate) enum BinaryPatchParseErrorKind { MissingReverseBlock, /// No binary data available (marker-only patch). + // TODO: Switch to #[expect(dead_code)] when MSRV >= 1.81 + #[cfg_attr(not(feature = "binary"), allow(dead_code))] NoBinaryData, /// Invalid line length indicator in Base85 data. + // TODO: Switch to #[expect(dead_code)] when MSRV >= 1.81 + #[cfg_attr(not(feature = "binary"), allow(dead_code))] InvalidLineLengthIndicator, + + /// Base85 decoding failed. + #[cfg(feature = "binary")] + Base85(base85::Base85Error), + + /// Delta application failed. + #[cfg(feature = "binary")] + Delta(delta::DeltaError), + + /// Zlib decompression failed. + #[cfg(feature = "binary")] + DecompressionFailed(String), } impl fmt::Display for BinaryPatchParseErrorKind { @@ -149,6 +246,12 @@ impl fmt::Display for BinaryPatchParseErrorKind { Self::MissingReverseBlock => write!(f, "second binary block not found"), Self::NoBinaryData => write!(f, "no binary data available"), Self::InvalidLineLengthIndicator => write!(f, "invalid line length indicator"), + #[cfg(feature = "binary")] + Self::Base85(e) => write!(f, "{e}"), + #[cfg(feature = "binary")] + Self::Delta(e) => write!(f, "{e}"), + #[cfg(feature = "binary")] + Self::DecompressionFailed(msg) => write!(f, "decompression failed: {msg}"), } } } @@ -273,6 +376,62 @@ fn parse_binary_block<'a>(parser: &mut BinaryParser<'a>) -> Option` +/// +/// From [5.1.1. Binary Payloads](https://diffx.org/spec/binary-diffs.html#binary-payloads): +/// +/// > Each line represents up to 52 bytes of pre-encoded data. +/// > There may be an unlimited number of lines. +/// > They contain the following fields: +/// > +/// > * `len_c` is a line length character. +/// > This encodes the length of the (pre-encoded) data written on this line. +/// > * `data` is Base85-encoded data for this line. +#[cfg(feature = "binary")] +fn decode_base85_lines(data: &str) -> Result, BinaryPatchParseError> { + // A rough estimate: In Base85, 5 chars -> 4 bytes + let mut result = Vec::with_capacity(data.len() * 4 / 5); + + for line in data.lines() { + if line.is_empty() { + continue; + } + + let line_bytes = line.as_bytes(); + + let length = decode_line_length(line_bytes[0]) + .ok_or(BinaryPatchParseErrorKind::InvalidLineLengthIndicator)?; + let encoded = &line[1..]; + let start = result.len(); + base85::decode_into(encoded, &mut result)?; + result.truncate(start + length); + } + + Ok(result) +} + +/// Decodes a line length character to its numeric value. +/// +/// From [Line Length Characters](https://diffx.org/spec/binary-diffs.html#line-length-characters): +/// +/// > Each encoded line in a binary diff payload is prefixed by a line length character. +/// > This encodes the length of the compressed (but not encoded) data for the line. +/// > +/// > Line length characters always represent a value between 1 and 52: +/// > +/// > * A value of A-Z represents a number between 1..26. +/// > * A value of a-z represents a number between 27..52. +#[cfg(feature = "binary")] +fn decode_line_length(c: u8) -> Option { + match c { + b'A'..=b'Z' => Some((c - b'A' + 1) as usize), + b'a'..=b'z' => Some((c - b'a' + 27) as usize), + _ => None, + } +} + #[cfg(test)] mod tests { use super::*; @@ -350,3 +509,54 @@ mod tests { } } } + +#[cfg(test)] +#[cfg(feature = "binary")] +mod apply_tests { + use super::*; + + #[test] + fn decode_line_length_uppercase() { + assert_eq!(decode_line_length(b'A'), Some(1)); + assert_eq!(decode_line_length(b'B'), Some(2)); + assert_eq!(decode_line_length(b'Z'), Some(26)); + } + + #[test] + fn decode_line_length_lowercase() { + assert_eq!(decode_line_length(b'a'), Some(27)); + assert_eq!(decode_line_length(b'b'), Some(28)); + assert_eq!(decode_line_length(b'z'), Some(52)); + } + + #[test] + fn decode_line_length_invalid() { + assert_eq!(decode_line_length(b'0'), None); + assert_eq!(decode_line_length(b'!'), None); + assert_eq!(decode_line_length(b' '), None); + } + + #[test] + fn apply_literal_patch() { + let input = "GIT binary patch\nliteral 10\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\n\nliteral 0\nKcmV+b0RR6000031\n\n"; + let (patch, _) = parse_binary_patch(input).unwrap(); + + let modified = patch.apply(&[]).unwrap(); + assert_eq!(modified.len(), 10); + assert_eq!(modified, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + + let original = patch.apply_reverse(&[]).unwrap(); + assert_eq!(original.len(), 0); + } + + #[test] + fn apply_with_crlf_line_endings() { + let input = "GIT binary patch\r\nliteral 10\r\nUcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k\r\n\r\nliteral 0\r\nKcmV+b0RR6000031\r\n\r\n"; + let (patch, _) = parse_binary_patch(input).unwrap(); + + let modified = patch.apply(&[]).unwrap(); + assert_eq!(modified, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let original = patch.apply_reverse(&[]).unwrap(); + assert_eq!(original.len(), 0); + } +} From 531e8d99191c933cd1fedd01c8f664ab5e31bc5b Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Sun, 12 Apr 2026 21:54:37 -0400 Subject: [PATCH 16/26] test: add binary patch to compat and replay tests Now both tests require `binary` Cargo feature. --- .github/workflows/ci.yml | 1 + .github/workflows/replay.yml | 2 +- Cargo.toml | 8 ++ tests/compat/common.rs | 47 ++++++---- .../git/binary_and_text_mixed/in/foo.patch | 16 ++++ .../git/binary_and_text_mixed/in/image.png | 2 + .../git/binary_and_text_mixed/in/text.txt | 1 + .../git/binary_and_text_mixed/out/image.png | Bin 0 -> 10 bytes .../git/binary_and_text_mixed/out/text.txt | 1 + tests/compat/git/binary_delta/in/foo.patch | 9 ++ tests/compat/git/binary_delta/in/large.bin | Bin 0 -> 5120 bytes tests/compat/git/binary_delta/out/large.bin | Bin 0 -> 5120 bytes .../binary_delta_wrong_original/in/foo.patch | 9 ++ .../binary_delta_wrong_original/in/large.bin | 1 + tests/compat/git/binary_literal/in/foo.patch | 10 ++ tests/compat/git/binary_literal/out/small.bin | Bin 0 -> 10 bytes .../binary_literal_wrong_original/in/file.bin | 1 + .../in/foo.patch | 8 ++ .../out/file.bin | Bin 0 -> 10 bytes .../binary_mixed_delta_literal/in/favicon.png | Bin 0 -> 2919 bytes .../binary_mixed_delta_literal/in/foo.patch | 86 ++++++++++++++++++ .../out/favicon.png | Bin 0 -> 1125 bytes tests/compat/git/mod.rs | 69 ++++++++++++++ tests/compat/main.rs | 10 +- tests/replay.rs | 76 +++++++++++----- 25 files changed, 309 insertions(+), 48 deletions(-) create mode 100644 tests/compat/git/binary_and_text_mixed/in/foo.patch create mode 100644 tests/compat/git/binary_and_text_mixed/in/image.png create mode 100644 tests/compat/git/binary_and_text_mixed/in/text.txt create mode 100644 tests/compat/git/binary_and_text_mixed/out/image.png create mode 100644 tests/compat/git/binary_and_text_mixed/out/text.txt create mode 100644 tests/compat/git/binary_delta/in/foo.patch create mode 100644 tests/compat/git/binary_delta/in/large.bin create mode 100644 tests/compat/git/binary_delta/out/large.bin create mode 100644 tests/compat/git/binary_delta_wrong_original/in/foo.patch create mode 100644 tests/compat/git/binary_delta_wrong_original/in/large.bin create mode 100644 tests/compat/git/binary_literal/in/foo.patch create mode 100644 tests/compat/git/binary_literal/out/small.bin create mode 100644 tests/compat/git/binary_literal_wrong_original/in/file.bin create mode 100644 tests/compat/git/binary_literal_wrong_original/in/foo.patch create mode 100644 tests/compat/git/binary_literal_wrong_original/out/file.bin create mode 100644 tests/compat/git/binary_mixed_delta_literal/in/favicon.png create mode 100644 tests/compat/git/binary_mixed_delta_literal/in/foo.patch create mode 100644 tests/compat/git/binary_mixed_delta_literal/out/favicon.png diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b1eb5a6..9bbd188 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,7 @@ jobs: - run: rustup toolchain install ${{ matrix.rust }} --profile minimal - run: cargo +${{ matrix.rust }} check --all-targets --all-features - run: cargo +${{ matrix.rust }} test + - run: cargo +${{ matrix.rust }} test -F binary lint: runs-on: ubuntu-latest diff --git a/.github/workflows/replay.yml b/.github/workflows/replay.yml index 08384d6..48368fc 100644 --- a/.github/workflows/replay.yml +++ b/.github/workflows/replay.yml @@ -67,7 +67,7 @@ jobs: exit 1 fi - run: rustup toolchain install stable --profile minimal - - run: cargo test --release --test replay -- --ignored --nocapture + - run: cargo test --release --test replay -F binary -- --ignored --nocapture env: DIFFY_TEST_REPO: ${{ inputs.repo_url == '' && '.' || 'target/test-repo' }} DIFFY_TEST_COMMITS: ${{ inputs.commits }} diff --git a/Cargo.toml b/Cargo.toml index 68ce3ff..125a3c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,11 @@ snapbox = { version = "0.6.24", features = ["dir"] } [[example]] name = "patch_formatter" required-features = ["color"] + +[[test]] +name = "compat" +required-features = ["binary"] + +[[test]] +name = "replay" +required-features = ["binary"] diff --git a/tests/compat/common.rs b/tests/compat/common.rs index c55cdd1..8e5fdda 100644 --- a/tests/compat/common.rs +++ b/tests/compat/common.rs @@ -9,7 +9,7 @@ use std::{ }; use diffy::{ - binary::BinaryPatch, + binary::{BinaryPatch, BinaryPatchParseError}, patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet, PatchSetParseError}, }; @@ -268,6 +268,7 @@ fn print_patch_version() { pub enum TestError { Parse(PatchSetParseError), Apply(diffy::ApplyError), + Binary(BinaryPatchParseError), } impl std::fmt::Display for TestError { @@ -275,6 +276,7 @@ impl std::fmt::Display for TestError { match self { TestError::Parse(e) => write!(f, "parse error: {e}"), TestError::Apply(e) => write!(f, "apply error: {e}"), + TestError::Binary(e) => write!(f, "binary patch error: {e}"), } } } @@ -352,31 +354,40 @@ pub fn apply_diffy( } }; + let read_original = || { + if let Some(name) = original_name { + let original_path = in_dir.join(name); + fs::read(&original_path).unwrap_or_default() + } else { + Vec::new() + } + }; + + let write_modified = |result: &[u8]| { + let result_path = output_dir.join(target_name); + if let Some(parent) = result_path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(&result_path, result).unwrap(); + }; + match file_patch.patch() { PatchKind::Text(patch) => { - let original = if let Some(name) = original_name { - let original_path = in_dir.join(name); - fs::read_to_string(&original_path).unwrap_or_else(|e| { - panic!("failed to read {}: {e}", original_path.display()) - }) - } else { - String::new() - }; + let original = String::from_utf8(read_original()).unwrap(); let result = diffy::apply(&original, patch).map_err(TestError::Apply)?; - let result_path = output_dir.join(target_name); - if let Some(parent) = result_path.parent() { - fs::create_dir_all(parent).unwrap(); - } - fs::write(&result_path, result.as_bytes()).unwrap(); + write_modified(result.as_bytes()); } PatchKind::Binary(BinaryPatch::Marker) => { - // Dont do anything if it is just a binary patch marker.[ + // Dont do anything if it is just a binary patch marker. } - PatchKind::Binary(_) => { - // Binary patch application requires the `binary` feature. - // Will be wired up when that feature is added. + PatchKind::Binary(patch) => { + let original = read_original(); + + let result = patch.apply(&original).map_err(TestError::Binary)?; + + write_modified(&result); } } } diff --git a/tests/compat/git/binary_and_text_mixed/in/foo.patch b/tests/compat/git/binary_and_text_mixed/in/foo.patch new file mode 100644 index 0000000..9ae73cb --- /dev/null +++ b/tests/compat/git/binary_and_text_mixed/in/foo.patch @@ -0,0 +1,16 @@ +diff --git a/image.png b/image.png +index 0dd1608e45a9c4d35bfc1e6f266a796364aa8754..bccac03558b00545e7ea8ced4a3a1ee232cc185a 100644 +GIT binary patch +literal 10 +UcmV+l0QLWgP)&C2V_V*#WK--S zBVwa$Qc>CVe~2gp%FIc0GITmkOWRdRQEMmVnHk@)n0}jy)YRZ2E>S4v)yQrWOqy%C zdgGx1O3NTlf2Us8A}W>x>EtwIF$OTi6@1sv;(4 zSerdbxr+=xZ*CHL1$$Y~T$p4Ql#onXq!jP!-SBRm?3{^{)&p}f(rnh>tgKb z0#OkH>rALj017{~{J53G%CxL`Dy*DE6{&tSlxro?3Fc|vT#w|asfzO_Ul0Z*R^kdE#G>fnm!7AlT4LsQiQA1JQJ$~VH7UW}xT0Ar zR{}HtO#4SxnK5{FQyL3u3R?X$C5`2p`SRH;FOE28UQeuoI4mJxbsBUgASe$R4tsvVn630le?D zMN5q`%O)Rccq~)bw>wPs!fx~VfZcbPP?U9f!pLQO-(_^If@v&kSR@~n4asb zmXL$%jpwQhAcB3){9)QPqT59E_@{H|Pig%{_3x0M6)!E_HFJ|=>M~Gs zQCwuqC7jz*s7kk{zjyQJVA)-#xX>`FoNN*ekORq?UJHH;BJBw3Mf;-h6UTuqy@Eqi zF?|buaQ}k5X>co?%e8nRuBJG-KO+wWbx5OFfGB-)y7VPv1|VYGJk4IMbQ@`b_{z{2 z`}fZoQxe{^$-dg+i{+4y5iPaFmsgx3D&Q7+7FiYU1Pk1#gOJ)x8yav9>4S>u7S0l` z-Vx_v@G{J@^6tq(J(ec$wJ^ZJ1WV$gcCds}_O%9%zav3@9T?o3Q5o`OJ&rv|J_0ZQ z^DS}&B2Gsod;)y2sZM-i5~?~@(3Uh_>qFkF@Uyf`Mno%hxhc$nG$xVyFc zUec{06WnI{V8}@7YU~_h+{b3%Vy`4C>QJo1mF31Os_gZFR470hjHws7t??a}R`X^K zQCB$+d#d5+MU)W1k>&uzMkTg(fFbR6B2mi=e?y+TQ!!?|0K{A@=nf1uqr&(YQcYFI{L$hVD3E@DkNU>X@dKXf+j)=7vwc-Pbly&x0||iaQ~(g zT3D#h6~k5`x}cK8E>9(1wK}^|z1}3x@G$ZL#?ZiTq7f?<*$VpXj>$IT2@~(WC*e*n z6uQVCeQ7fJ4;7isICnMpoSba_2oesDirM%K0o?%a;BzmBtz0Qss(z24_d|aG6&O>( z{{v2hi)5iN?FIK?bC{KeEMqie#(8jM^35J0t_|e}S}k%+S_jg`hVZ8j3K~APN>}Uj zahvzpmhU>;KP|7vqkdCWTF?zBbcm%#ytAt2yNt46;WQWWy5*Bc2(a`Iogup=I97#c zXy(B5W*9~tFC;N(1h>!{9^yoZXUVVMAy+-^fa~5|;3|e_VeU*DL<{drWd&wVMZ6(+M0E##EkDskklG00;4I<+wi1#G$v~Chc`qIg5%jF&Wk=wdmUJV-fw6~=9 zzqEZw&CTp+GA&E|5d*KnFkSsIYp&6(DXE4O?x{^Z(^ChNjqJIi*Z2NyrI(Y#?GMl7 zlps#EOUvXErQ-3$Fe!NXOv3QIwbHToa>p0TXj$Bi^UoQjW!``l z@Nrc%U^^a0l5iz*bAd54vC>TRD5k=TJc`vvVW8O0D#<~+Ia}{&W{^c`(3i)HHdKsU zU-@hy$ReEvxLr9X7W@NG!%WQRq}=owID9M1Djm-dYu=KnvqU`hz}q;Mdv;aG{m3IA zRz_PF5e;LJ-s+9zZW0= zq#jU81@MSGvD#o+?)NsE$J^Ih2qyV=2OyOlr{U%*Dw(`&vL45QUz_ASOl~K~gxt+Q zEJlrYO(2=|J>G1+f*t>4As{B%(k+nE-{Nfh`T3Z*`3@a)2KB?!qNMC|;8Api+BrOg z03oHz&!P#4Ur<+#2|Fvi?vQ$4V&0p3vpUe1Lp>h&|;m6HRaL8j>vu*`q4~2}K55!{sMXA)w1c`(ZPN z)0$Xj_cxW@w!4(ndA!@`?Rz6gMUbf7(|W?Cm#w0m z$gP}{QKad(5UrzwZf{k$h&fT?f!r62RB}{;Wy7Xg8pO-B}0irpZ&%kyAqY!^3=|MUtn1FR~~)u09I0)?;5O}LxXc#Aha zQ;Dhl5?>aDy}9ynzvJzs#(3xekyylTY=eJTQ4ijc+|$YA?s=TvLGt?ai~6OrX9BBg z*2GrN8@vK#m{xAO2OE^2;~*L}ed2Oz%_H^agmv_qHtn95$?vAU0als~BXv7P{H7q~ z2U9;6SCO|q)i+@Uaz>Y(2hUZs zq3z>{I7>;5A{7^<>kPvrN_{>av~k@7>fn*p;0UIFP`$*J)wlYxfgj+5A+wjiSa`@j z)aP`e&u7gM@Y;3tm@@S=dJCQ-XftSRp(!Eg1EplGo?K~0ty&~6)sG(BVk}as!^~|5 z)l*wPiKfYxt&Y)yJF%`~1jNVIbam_2$Qz)w$@g5QpWg=Kp&1=P-o%d~&x}+JxO{Fh zscLvTrX}s7qLQf4kb$!=mCw?+FVx1+2_t!r)+$Bb<0i}_wgyh|?OR{bxG5Rk?u7ud`8|o{i$#bL5Y3j1$32rRr)&EnGpTMTtay?y<|&3IfTW@KcU$%3TN2DuT;OA<1VTsn z9f-0y(t({nRKk~KEuZ)n;$=XJBxOeSFcXWWCLL6~_|CQqU!e(Vyj`su)OI=oyhdy= z5ok1#IPois2V`hupRGSYd*PoM zj0!d^VO{X6y+y_!BEoO@#$MBulI2|(`0lbt= z#nzsXLKycOl?w#&=?Fo;6~}fBY$8h=(~35@IjB0iDaToC>ONJyv?0=M#8+uBLp+ww z&t_ddbUe@@2s{fXlxh>ov*vBN+Dh$9IE*LBHj-&bUGi!y=?6*+knenfQP+7|Vsb=M zU+U(mr-})lCx8z5sO?;EQeYTN0KoVKsklKQG=QLc6^gM?ArA<$(@MYgK#$H9Pmtdk z<9C0eT~>x@MVEZ&G1U{rn&g>Kwpe#X&^$TB+U&&G*a5keR)BvfqEM;#aApH zVzoyFu=(TvSAJ5|Cy!+h_CGz1`NL7)7E6+o2Zbxo-MoFo<2?|_z9zpnIv%o1SuK7x zaZ>=%cUu!X>59F3#I=Ib-4TL+Z9Y|BbGPMNgP)XA$T5eyR{1 z>QO4j!WobyV7xk0F)J6Ui{L+_E~=Q8w3gLUqfCZbIdPhFd>$87JNxB|Mv5o8Iq4rO z9{uM_RA*6!3I`Jw_kWtMLW>*hhC1-O$QaYTtBy~k{^0kqyjLZJ=_1q2wItMCx} zA3EIUBEw)qU>R4wg)aJ(GDeCshXQr`T%IE;=yrYok~UNT7Y^&+;R)bP`biKpV*weBQk2c(_sD5lif zC2MKzR~%}yh(Grk3JHW^Bk%D1o(^+!=E_3 zn7S^}Y^)$5e_mFPiD=~qdIV{c+dm@#R?|PY@9c&yVusFF)+GyXOdnftGM0VqAdKHU z5=+q3I^puAIyw-fn~`2DB7+5h`c{#<6w!xMi7&_BBDdz%F=EZjamcz!Z0B94s#7y8 z>wTX>=!?n)1WQYVq*-SbF`@#jNv32D)}VP8CLc6XQRI~w>GJb}452Hpgu&<5o8*VT zi<$l^&c`DH03?v~g)O7Wv_2Ow$-or2_zy4@#|jLBaf1sQep142KTO`*v_x z$N=RsAzD5zwawueOGUJ?3jT}z<03j8 iFeF}$w^8qC1SDUCXp$k{m_!>&X-}m2V55dcq+r238RXvp literal 0 HcmV?d00001 diff --git a/tests/compat/git/binary_delta/out/large.bin b/tests/compat/git/binary_delta/out/large.bin new file mode 100644 index 0000000000000000000000000000000000000000..a8a025a84f32c7552841b683361d03d3e171294d GIT binary patch literal 5120 zcmV+b6#whcY*DIPbx_e&SH6(E=m?lE#2}*s*|N=_Pn6rew1;C{e>&C2V_V*#WK--S zBVwa$Qc>CVe~2gp%FIc0GITmkOWRdRQEMmVnHk@)n0}jy)YRZ2E>S4v)yQrWOqy%C zdgG!0|Ns9^f2Us8A}W>x>EtwIF$OTi6@1sv;(4 zSerdbxr+=xZ*CHL1$$Y~T$p4Ql#onXq!jP!-SBRm?3{^{)&p}f(rnh>tgKb z0#OkH>rALj017{~{J53G%CxL`Dy*DE6{&tSlxro?3Fc|vT#w|asfzO_Ul0Z*R^kdE#G>fnm!7AlT4LsQiQA1JQJ$~VH7UW}xT0Ar zR{}HtO#4SxnK5{FQyL3u3R?X$C5`2p`SRH;FOE28UQeuoI4mJxbsBUgASe$R4tsvVn630le?D zMN5q`%O)Rccq~)bw>wPs!fx~VfZcbPP?U9f!pLQO-(_^If@v&kSR@~n4asb zmXL$%jpwQhAcB3){9)QPqT59E_@{H|Pig%{_3x0M6)!E_HFJ|=>M~Gs zQCwuqC7jz*s7kk{zjyQJVA)-#xX>`FoNN*ekORq?UJHH;BJBw3Mf;-h6UTuqy@Eqi zF?|buaQ}k5X>co?%e8nRuBJG-KO+wWbx5OFfGB-)y7VPv1|VYGJk4IMbQ@`b_{z{2 z`}fZoQxe{^$-dg+i{+4y5iPaFmsgx3D&Q7+7FiYU1Pk1#gOJ)x8yav9>4S>u7S0l` z-Vx_v@G{J@^6tq(J(ec$wJ^ZJ1WV$gcCds}_O%9%zav3@9T?o3Q5o`OJ&rv|J_0ZQ z^DS}&B2Gsod;)y2sZM-i5~?~@(3Uh_>qFkF@Uyf`Mno%hxhc$nG$xVyFc zUec{06WnI{V8}@7YU~_h+{b3%Vy`4C>QJo1mF31Os_gZFR470hjHws7t??a}R`X^K zQCB$+d#d5+MU)W1k>&uzMkTg(fFbR6B2mi=e?y+TQ!!?|0K{A@=nf1uqr&(YQcYFI{L$hVD3E@DkNU>X@dKXf+j)=7vwc-Pbly&x0||iaQ~(g zT3D#h6~k5`x}cK8E>9(1wK}^|z1}3x@G$ZL#?ZiTq7f?<*$VpXj>$IT2@~(WC*e*n z6uQVCeQ7fJ4;7isICnMpoSba_2oesDirM%K0o?%a;BzmBtz0Qss(z24_d|aG6&O>( z{{v2hi)5iN?FIK?bC{KeEMqie#(8jM^35J0t_|e}S}k%+S_jg`hVZ8j3K~APN>}Uj zahvzpmhU>;KP|7vqkdCWTF?zBbcm%#ytAt2yNt46;WQWWy5*Bc2(a`Iogup=I97#c zXy(B5W*9~tFC;N(1h>!{9^yoZXUVVMAy+-^fa~5|;3|e_VeU*DL<{drWd&wVMZ6(+M0E##EkDskklG00;4I<+wi1#G$v~Chc`qIg5%jF&Wk=wdmUJV-fw6~=9 zzqEZw&CTp+GA&E|5d*KnFkSsIYp&6(DXE4O?x{^Z(^ChNjqJIi*Z2NyrI(Y#?GMl7 zlps#EOUvXErQ-3$Fe!NXOv3QIwbHToa>p0TXj$Bi^UoQjW!``l z@Nrc%U^^a0l5iz*bAd54vC>TRD5k=TJc`vvVW8O0D#<~+Ia}{&W{^c`(3i)HHdKsU zU-@hy$ReEvxLr9X7W@NG!%WQRq}=owID9M1Djm-dYu=KnvqU`hz}q;Mdv;aG{m3IA zRz_PF5e;LJ-s+9zZW0= zq#jU81@MSGvD#o+?)NsE$J^Ih2qyV=2OyOlr{U%*Dw(`&vL45QUz_ASOl~K~gxt+Q zEJlrYO(2=|J>G1+f*t>4As{B%(k+nE-{Nfh`T3Z*`3@a)2KB?!qNMC|;8Api+BrOg z03oHz&!P#4Ur<+#2|Fvi?vQ$4V&0p3vpUe1Lp>h&|;m6HRaL8j>vu*`q4~2}K55!{sMXA)w1c`(ZPN z)0$Xj_cxW@w!4(ndA!@`?Rz6gMUbf7(|W?Cm#w0m z$gP}{QKad(5UrzwZf{k$h&fT?f!r62RB}{;Wy7Xg8pO-B}0irpZ&%kyAqY!^3=|MUtn1FR~~)u09I0)?;5O}LxXc#Aha zQ;Dhl5?>aDy}9ynzvJzs#(3xekyylTY=eJTQ4ijc+|$YA?s=TvLGt?ai~6OrX9BBg z*2GrN8@vK#m{xAO2OE^2;~*L}ed2Oz%_H^agmv_qHtn95$?vAU0als~BXv7P{H7q~ z2U9;6SCO|q)i+@Uaz>Y(2hUZs zq3z>{I7>;5A{7^<>kPvrN_{>av~k@7>fn*p;0UIFP`$*J)wlYxfgj+5A+wjiSa`@j z)aP`e&u7gM@Y;3tm@@S=dJCQ-XftSRp(!Eg1EplGo?K~0ty&~6)sG(BVk}as!^~|5 z)l*wPiKfYxt&Y)yJF%`~1jNVIbam_2$Qz)w$@g5QpWg=Kp&1=P-o%d~&x}+JxO{Fh zscLvTrX}s7qLQf4kb$!=mCw?+FVx1+2_t!r)+$Bb<0i}_wgyh|?OR{bxG5Rk?u7ud`8|o{i$#bL5Y3j1$32rRr)&EnGpTMTtay?y<|&3IfTW@KcU$%3TN2DuT;OA<1VTsn z9f-0y(t({nRKk~KEuZ)n;$=XJBxOeSFcXWWCLL6~_|CQqU!e(Vyj`su)OI=oyhdy= z5ok1#IPois2V`hupRGSYd*PoM zj0!d^VO{X6y+y_!BEoO@#$MBulI2|(`0lbt= z#nzsXLKycOl?w#&=?Fo;6~}fBY$8h=(~35@IjB0iDaToC>ONJyv?0=M#8+uBLp+ww z&t_ddbUe@@2s{fXlxh>ov*vBN+Dh$9IE*LBHj-&bUGi!y=?6*+knenfQP+7|Vsb=M zU+U(mr-})lCx8z5sO?;EQeYTN0KoVKsklKQG=QLc6^gM?ArA<$(@MYgK#$H9Pmtdk z<9C0eT~>x@MVEZ&G1U{rn&g>Kwpe#X&^$TB+U&&G*a5keR)BvfqEM;#aApH zVzoyFu=(TvSAJ5|Cy!+h_CGz1`NL7)7E6+o2Zbxo-MoFo<2?|_z9zpnIv%o1SuK7x zaZ>=%cUu!X>59F3#I=Ib-4TL+Z9Y|BbGPMNgP)XA$T5eyR{1 z>QO4j!WobyV7xk0F)J6Ui{L+_E~=Q8w3gLUqfCZbIdPhFd>$87JNxB|Mv5o8Iq4rO z9{uM_RA*6!3I`Jw_kWtMLW>*hhC1-O$QaYTtBy~k{^0kqyjLZJ=_1q2wItMCx} zA3EIUBEw)qU>R4wg)aJ(GDeCshXQr`T%IE;=yrYok~UNT7Y^&+;R)bP`biKpV*weBQk2c(_sD5lif zC2MKzR~%}yh(Grk3JHW^Bk%D1o(^+!=E_3 zn7S^}Y^)$5e_mFPiD=~qdIV{c+dm@#R?|PY@9c&yVusFF)+GyXOdnftGM0VqAdKHU z5=+q3I^puAIyw-fn~`2DB7+5h`c{#<6w!xMi7&_BBDdz%F=EZjamcz!Z0B94s#7y8 z>wTX>=!?n)1WQYVq*-SbF`@#jNv32D)}VP8CLc6XQRI~w>GJb}452Hpgu&<5o8*VT zi<$l^&c`DH03?v~g)O7Wv_2Ow$-or2_zy4@#|jLBaf1sQep142KTO`*v_x z$N=RsAzD5zwawueOGUJ?3jT}z<03j8 iFeF}$w^8qC1SDUCXp$k{m_!>&X-}m2V55dcq+r3GG3TWK literal 0 HcmV?d00001 diff --git a/tests/compat/git/binary_delta_wrong_original/in/foo.patch b/tests/compat/git/binary_delta_wrong_original/in/foo.patch new file mode 100644 index 0000000..9abf334 --- /dev/null +++ b/tests/compat/git/binary_delta_wrong_original/in/foo.patch @@ -0,0 +1,9 @@ +diff --git a/large.bin b/large.bin +index ffba9ca51637158e8f46f0a2a9014372778eeef4..a8a025a84f32c7552841b683361d03d3e171294d 100644 +GIT binary patch +delta 15 +ZcmV+q0Pz2SD1a!CWCZ{J|NpUQm=imI2nhfH + +delta 15 +ZcmV+q0Pz2SD1a!CWCQ_9%OJ66m=h@q1w#M; + diff --git a/tests/compat/git/binary_delta_wrong_original/in/large.bin b/tests/compat/git/binary_delta_wrong_original/in/large.bin new file mode 100644 index 0000000..0d6307b --- /dev/null +++ b/tests/compat/git/binary_delta_wrong_original/in/large.bin @@ -0,0 +1 @@ +WRONG CONTENT \ No newline at end of file diff --git a/tests/compat/git/binary_literal/in/foo.patch b/tests/compat/git/binary_literal/in/foo.patch new file mode 100644 index 0000000..61ea8bf --- /dev/null +++ b/tests/compat/git/binary_literal/in/foo.patch @@ -0,0 +1,10 @@ +diff --git a/small.bin b/small.bin +new file mode 100644 +index 0000000000000000000000000000000000000000..df93f5f3f72487244976c34e85525cf445016566 +GIT binary patch +literal 10 +UcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k + +literal 0 +KcmV+b0RR6000031 + diff --git a/tests/compat/git/binary_literal/out/small.bin b/tests/compat/git/binary_literal/out/small.bin new file mode 100644 index 0000000000000000000000000000000000000000..df93f5f3f72487244976c34e85525cf445016566 GIT binary patch literal 10 RcmZQzWMXDvWn<^y1ONc904@Lk literal 0 HcmV?d00001 diff --git a/tests/compat/git/binary_literal_wrong_original/in/file.bin b/tests/compat/git/binary_literal_wrong_original/in/file.bin new file mode 100644 index 0000000..605cc40 --- /dev/null +++ b/tests/compat/git/binary_literal_wrong_original/in/file.bin @@ -0,0 +1 @@ +WRONG CONTENT - not matching the hash in patch \ No newline at end of file diff --git a/tests/compat/git/binary_literal_wrong_original/in/foo.patch b/tests/compat/git/binary_literal_wrong_original/in/foo.patch new file mode 100644 index 0000000..d3e39d9 --- /dev/null +++ b/tests/compat/git/binary_literal_wrong_original/in/foo.patch @@ -0,0 +1,8 @@ +diff --git a/file.bin b/file.bin +index df93f5f3f72487244976c34e85525cf445016566..a8a025a84f32c7552841b683361d03d3e171294d 100644 +GIT binary patch +literal 10 +UcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k + +literal 10 +UcmV+l0QLU>0RjUA1qKHQ2>`DEE&u=k diff --git a/tests/compat/git/binary_literal_wrong_original/out/file.bin b/tests/compat/git/binary_literal_wrong_original/out/file.bin new file mode 100644 index 0000000000000000000000000000000000000000..df93f5f3f72487244976c34e85525cf445016566 GIT binary patch literal 10 RcmZQzWMXDvWn<^y1ONc904@Lk literal 0 HcmV?d00001 diff --git a/tests/compat/git/binary_mixed_delta_literal/in/favicon.png b/tests/compat/git/binary_mixed_delta_literal/in/favicon.png new file mode 100644 index 0000000000000000000000000000000000000000..5109c1de8bea744180448b347fb44ddef23b90f4 GIT binary patch literal 2919 zcmai030P8T7skEJu&Hdcv?0>65vtoO=gysTc){dn_8A;mgY3Mwv7J;8=I#3|L3_2=X>9CzO#MjlI-o}Hd}k4HUfc| zO?4;xz;Be|o}mdpp_rIq1VX(S^!1nc(>w`mA>W+E5rzWha=r-05eTA#T*P9B0}@my z5C#fJ=%K5nXcWjHq5Z6AIGV@>;DPQjVt^jw<;#uoG@8VBUyC+h;}WB70)unGigVUEN8lP&?d-=HgyFX$qkP$GnQLeX>ppKUP> zf1wDc@9QdZmgs9_8ts46`TWnqfFzVCI1wL{`k7CEZvgp5ivX+-012gHHb9921d^r7 zOepd~@CM~Tgg+VN0|E%nE(yIF|1U7$8=#9YLMW!gO$Ct97D^pO<_J_iD-7@hITGHq ztxxNofFSJOfWor~siL$D0SL!3wZBrw{}V<++u-4*QWRVX1$-5nt^+~o9ZkrAom;Kc z`UFv3T)f3XE*Jr0kdK=)ib`>@wy?9d#^BAT0)dN0fOSD`G?^uVS$G^Chr!ukEO5RS zHUvvN0cQn^1l*KAN;Z5JI4lY4|M3+ajUvJtG#bGjgd{?7v~p>x&FMholr$wqfXegak(Iz&S(^TbignHAV!(e zSYp&_3zQ<56`$+McmQno+fiaa`iNDWy}z9#V!sH3s3iR3u)=y%Yw!?&rwMj?oWR2L zKmr8t6cWSZCPU1#K_FBTLAXF)6b6AXO_8QFtRka%~Izh4ror z*EZ_|=QE<;c-&c(KbCDA2%Is!@H+g)rZQ&0D#{5Cz@kMoLfJM}sdZNl#oCX>4%v0a znisL7DOS0$j+g^6mUBlNs`}n%hY?$wJRPq86v`{7)93jX?p_lVRnlJ&&afZLZ#&hz zO*L;r_Yz7At=pnIS&x1(ID65shfw2gS^Mv+f$LH_YO+OYq_KaMk=!7=N$s8 zYVb^P@k%r7@*~gE_dVG>kS=f8ngJ&BpPbVtC(o&BusyzO&arf-3>V3FCjyziw}vL} z9Oh8PdKp&CrT}9PWUANt^pi0gLS2UocM;D-O-_!s-5S<7^r8#bN(^WZTqFPeacoP0 zy)NeEyBWEQyN`L_-b6@Dk~P_QYzX;bVSD#L!3h)D<2bbqiwl3s9^aQ+(V3xBlAVY} z((ZcO3=N+xYIsemXzw|b7b4Ro^u(3l)Me_0bv4ljj>4afnIiW92!WV2PjRatG7sv* zpP6bZ*~!;`Yi0U6&-pUA8W*JpZr^s@LEElz(fOFOSxl}IQgcO)DY)q9iK>b%>u&__ zvRF9OMNB(k`N6GOoqZPEZY5um`P|9V%DR}L>-`rGr4p1@kcTSTc%3vbz{xP`*qS+< zRl?zRy}OoBI<{=+M)PZfMxH~$tT?`djmk)EW4fNbO1!E#X-UWeTiuEXf-vFTtUuHS z%c>{aPOG#ca`;4~smG4gTVB7OEBF6ZZRp{SU~?`nB*f81&>>G@IbC>r8?!JFg9}h z32oc#$D||Zkr31zZ$H2E5}cLQ$@}*M)^!a&yv*?R?TI>OXd04|dAgFp%d+PC2!;8l zk|J<;{INS(HnDoo>swFLhh7Tjp3`(h%_%E5>*@Eb%#s#0aLuc*=6Z?Q$bk9d*ai8|Oy~^8zMPyKV`^t-=c~-|`ud_RhdcWd7D=8|%sj<> zA7A=zVZ6Jv>@X*rAYDtLP#zQjJCTOAsM><&`NGa(hEys=@*n4N#$HYUv+miL+Q_QJePvpR)YMcigQ0iSd;4*>-39u=C2}WQ!>F$6s;U`Vok8)> zNF&uFB_#(}ht)T>wN-X>bZBR7R4>U&`(V;9b#d9Esik%3RtbL3Y$H1{`GS946ZEnj z)7IL0;h&yB z^p~mghPtj97>Q&~mJ1n?k&&4}bb4yGLqc9k%6GjzJw+4wiJI<(-Hv-%0sGtYHon}` z(C0xfVU=q#n9SUQ0v{5Iw72I~7NR3A6DYM+KY^*`<- zHk2kfhb5Je4BJD-A+*@(Gwzg%9mTv3ayQk{wy-2a6X6N$1FCQN-$_u`h zo18qe^Qvs!8RogQw9RQ1N3ymRCRN~cdV70oENMpeakk-FFJ8RhWafCb46iJ+^&ad^ zbP5f6i)%}2YFfYXikJcbK!CyS>ZjW3s>QKmc`-3D9aXiWUpTLR8`n#Y>~Ar~&P}d! zh#PM#ivK`>$!Te6A(`0*94>bo)K}kh|2~z)S^~G|Br#^#G~juD(&ERuvQ~k5((c6I z$LnlbhQ{cDfgV)q%*x8jil>pTZf*vKhCL51h1b>9$tRdzq4z!*>5mR~sTq8qYG(R0 z^v`Ab#-mp!`BolVW9>Fmp7j>C(^~?|`pkkF0tfqmbA6Ml6`Ne%l@YX&ivPz{iWfQ0 IIpoLx0#^I3DgXcg literal 0 HcmV?d00001 diff --git a/tests/compat/git/binary_mixed_delta_literal/in/foo.patch b/tests/compat/git/binary_mixed_delta_literal/in/foo.patch new file mode 100644 index 0000000..7f52479 --- /dev/null +++ b/tests/compat/git/binary_mixed_delta_literal/in/foo.patch @@ -0,0 +1,86 @@ +diff --git a/favicon.png b/favicon.png +index 5109c1de8bea744180448b347fb44ddef23b90f4..69b8613ce1506e1c92b864f91cc46df06348c62b 100644 +GIT binary patch +delta 1115 +zcmV-h1f=`t7Uc+#8Gi%-007x@vVQ;o00d`2O+f$vv5yP +zfP?@5`Tzg`fam}Kbua(`>RI+y?e7jT@qQ9J+u00Lr5M??VshmXv^00009a7bBm +z000DZ000DZ0m8#+P5=M^2XskIMF-{q4Gj?pmKmEP0008eNq +zS*l<~QA$1JAY?<@4F!`%H{0#bjE9}fZuU?Ayt&Np_x{ZHeczk+=q00aIO{BOMm>NW +zGb^R2YFbUrw129ilv&5_?R*#op%sCs1zv6w<_>da(*43C8z5GS?N}X +z{^V`QxQx)j*!`A8b;E|$ExHA=4hUt88|nlQ7tEk9hz=YqEbOPg@0Wz`s +zmN!~(9dV=u?I~Mvh)g?mp_)jf?D@Nr{DQJb>GW45GWc;+ig_8yt0(H +zchI)mFn`!(GAY|t|C#flvV9n{2WCVfKibO<1Qi=LY`EQ$z;UqGqDW*0&cq!s6My4; +zPXgEE5_G3UA~k24J=gwQb;>^P#8wX*Y*kE)MBeQ{FfT7}!Y4_A1FwN^c9)O_GgkHj~RdK~NC^L5i~IUXh_ +z95o@^6-U;DqjIVuk>ys^VPVp56?GT72DqTE5?dt$r77DZ?T}ZyX3UaF&vYMYG>7U( +z=6{7=Hn2lG;1!X1AGZ(HW&;lyamBia;`62jOIGoOLtacMykp3iKdiS0Xh_|vJ_mY~ +zb4{&%v9!zfYAbcDwo>=i)=iqrbxKXm)PpS7rrhUx{~ymQhCvo@p6LJp03~!qSaf7z +zbY(hYa%Ew3WdJfTGBPbNF)cAOR53F;F@HBYFgPnPFgh?WYdBZ^0000bbVXQnWMOn= +zI&E)cX=Zrz>%8FWQhbW?9;ba!EL +zWdL_~cP?peYja~^aAhuUa%Y?FJQ@H109SfcSaechcOYjeH +zXlY1#a%EF`PE=!hYhyWNB0oL~Ja{^IZE$U6bYUQPZES9HI(R)IVPtP&WjbziI&Eci +zVJ{*ecsh7(aCB=uB3MmOAVY6*Wgs;!H7+nBJ_;Z_a%5&YQba}|cx`NMb2@TlW<4Tk +zbaZe!FE4j@cP@7`E^l&YFEKeeIWI6WFETPMa%5&Lb9rubVR$WWb0Z=?3Lqdna%5&Y +zL}hbha%pgMX>V>Ia%5&YVPbD}bUh*>3LqdLAb4$TZgVKLZ*V;#XmoUNIxjD7b1q?IZ(?OG +zV{dIQaA>gzG9n5fARu&UW@b8AQe|^*Y;|;LZ*DyzH!?0T +zA_^cNAarSFW;$6?Wpi(Ab#!TOZapG5GA=M83LqdLaA4UZB6CtlLLf;+LpCuvHa0CXE-@ksARr)k +zZE!kGZ)9m^c|>7!Wj!J?FfuSLFgGnRFjO%&Iy5pmFf}bOH##sdA_^cNAb4$XI!$GC +zVPs)+VMJkcWj!J?FfuSLFgGnRFjO%&Iy5pmFf}bOH##sdB0dTrARs(=ZE#IZI!I}A +zbZ>HbJ_;ZpARs()WM(>3WpO?VARr(hAUtwpW;$$X3LqdLARr)fbVYV_I$>jUX>V>l +zB5-nVWOZX@WFiV6ARr(hAais@c62&(Z)S9NVRB_bXJu}5Jt9G7W@&C|ba@~|Wpim~ +zZe?;HC{1BkLr`;4M?xS;MME|*IW{&eGA=Oy04R}lk;GOVI$ud@Qx+*L$C!pq+mEwKumw3~KnQ4h_;;k4&i4exmIHQaZ +zqL)FLLv#_OTUJ!a@A=K2WJz)rnKf3?StLsilTJ~WrFvRoM)b6dJgav9|Mu0$^aY*j +z-Z0cWS=L)Sc(&ks)3QDfE$jOTsvhB@P|9CAfPr_>H%nSz9~#!-?6RaKci?;jS{}57 +zp7@oj#NC+;yq@B{6@$N$x+0n`AdZ9EPrONx`oPL8d^SdIhl+lpQ;W@unwKsRpO +zc#|V9Z5{cG8F|O!TM*a<%mH87Q8-w?emyNME%f#E;q&>pbLS2d6B8Uec8uxiX~xIL +zQ^j*gMBf4I1foDBQ{-RT#+^tE!_dKl2X*u2P0i2GOOo#2zpwoKe3h1#>dBKQO5*Y3 +z$J)PtzYN20XJZcd;!TDP;60!s^KAyZ*Q-sfow1`GNR3!H+vQ6&eW(=@9S?8tp-3K5WupmOjqW>G);Ew*g<1s +zBYXDj!LlsE;V=~y6+C_VG%5Dd&U7sG>@MJY7O&;n&z@CjX{nlx}S9Ri@dyqq&<&d}A>#oXN73Y%R7yqlE$CFylTLxTnf2c5{gGLQ`& +zIdVj?Sj^FL>((vR)YPQPbR>3Yr1gZLaJO0X!YUx +zE2XA(xfmWEroMjRXwcu^&%nTdThy`bTtRblbH)s`x3?=C4l5iEYiMXl`}Xbg%tR;@ +za`aSJSG)Pn6*Q3;CIo2j-n|(eaCbp88dY&|ahhCHQyXdqOH;gI!vu@%7>Cz=h +zQb$L}QqQgg@YxQ01I)WCWGUFadpE0Bug+F{9%uzf{uF)%UIQ&ji9jGgVPRoLO-Z9` +z)~w<3<;$EteVWP1$ptcFV`D_4(bP{r0>8O+O)YsRi&dyFfd-g1? +zuNTAk__(fKy{d_c3EjJQPpej~N=+C +z_VnHZ85ff*DC1)CCKx34yZ>!9ONDzmeMRQwwANG8TfpZmT+dtW=Vh(u{{qBcL;Juw +R3Jw4O002ovPDHLkV1icrt||Zk + diff --git a/tests/compat/git/binary_mixed_delta_literal/out/favicon.png b/tests/compat/git/binary_mixed_delta_literal/out/favicon.png new file mode 100644 index 0000000000000000000000000000000000000000..69b8613ce1506e1c92b864f91cc46df06348c62b GIT binary patch literal 1125 zcmeAS@N?(olHy`uVBq!ia0vp^3LwnE0wix1Z>k4UEa{HEjtmSN`?>!lvVtU&J%W50 z7^>757#dm_7=8hT8eT9klo~KFyh>nTu$sZZAYL$MSD+10f+@+{-G$+Qd;gjJKptm- zM`SV3z!DH1*13q>1OJmp-&A)3@J_A;A0V7u0bkfJ)-`Krrb zGey1(YybBo_r#8#3kPrfo#tA4xb3R$F4j$a9H~9huUPE$JQDstTM~O>^@Ps(;>UH<3Fx0=LBZUre@8723HjVToWun1;~KVtGY7SF1E7liM5< zHa%KaZ1y+v;MF5PX0dXM#bh1)zI}?P`JCclPp)GktoyD%Uv%1HzQp-VwViVJr}FN4 zBVAi3pdsabJ2zzio=sD>mtWX++u%m3k>>5t|1&=?+*B*EnLW)#$^O=9J{D1Fvz#4w zCmkrSML-}_v8Imc2?OP1;|%KWmLM+u&^dKy+fI{C57UY0UhRg-3U_ zKl;3k)jRBCi*uZh#-8L8Gwj!FXV37syULEeYD%&1+S-jgUC&wB|>?y4oO5hW>!C8<`)MX5lF!N|bKNY}tn*U&h` z(Adh*+{(a0+rYrez#!Wq{4a`z-29Zxv`X9>q*C7l^C^QQ$cEtjw370~qEv?R@^Zb* zyzJuS#DY}4{G#;P?`))iio&ZxB1(c1%M}WW^3yVNQWZ)n3sMy_3rdn17%JvG{=~yk z7^b0d%K!8k&!<5Q%*xz)$=t%q!rqfbn1vNw8cYtSFe`5kQ8<0$%84Uqj>sHgKi%N5 cz)O$emAGKZCnwXXKr0wLUHx3vIVCg!0EmFw6951J literal 0 HcmV?d00001 diff --git a/tests/compat/git/mod.rs b/tests/compat/git/mod.rs index e7dc99b..25df395 100644 --- a/tests/compat/git/mod.rs +++ b/tests/compat/git/mod.rs @@ -125,6 +125,75 @@ fn junk_between_hunks() { .run(); } +// Mixed binary and text patch. +// +// Both git apply and diffy should apply both the binary and text changes. +#[test] +fn binary_and_text_mixed() { + Case::git("binary_and_text_mixed").strip(1).run(); +} + +// Binary patch in literal format (new file creation). +#[test] +#[cfg(feature = "binary")] +fn binary_literal() { + Case::git("binary_literal").strip(1).run(); +} + +// Binary patch in delta format (modify existing file). +#[test] +#[cfg(feature = "binary")] +fn binary_delta() { + Case::git("binary_delta").strip(1).run(); +} + +// Binary literal patch applied to wrong original content. +// +// This documents a behavioral difference: +// - diffy: succeeds (skips validation, ignores original for literal format) +// - git: fails (validates original content via index hash before applying) +// +// diffy's behavior is intentional - we don't have access to git's object database +// to verify hashes, and for literal format the original content isn't needed anyway. +#[test] +#[cfg(feature = "binary")] +fn binary_literal_wrong_original() { + Case::git("binary_literal_wrong_original") + .strip(1) + .expect_compat(false) + .run(); +} + +// Binary delta patch applied to wrong original content. +// +// Both diffy and git fail, but for different reasons: +// - diffy: fails because delta instructions reference wrong offsets/sizes +// - git: fails because index hash doesn't match before even trying to apply +// +// This test verifies diffy correctly rejects invalid delta applications. +#[test] +#[cfg(feature = "binary")] +fn binary_delta_wrong_original() { + Case::git("binary_delta_wrong_original") + .strip(1) + .expect_success(false) + .run(); +} + +// Binary patch with mixed delta/literal format. +// +// Git can choose different encodings for forward and reverse transformations +// based on which is more efficient. This patch has: +// - forward (original -> modified): delta +// - reverse (modified -> original): literal +// +// From rust-lang/rust@ad46af24 (favicon-32x32.png update). +#[test] +#[cfg(feature = "binary")] +fn binary_mixed_delta_literal() { + Case::git("binary_mixed_delta_literal").strip(1).run(); +} + // Multi-file patch with junk/preamble text between different files. // // git apply behavior: Ignores content between `diff --git` boundaries. diff --git a/tests/compat/main.rs b/tests/compat/main.rs index e35ed07..2c0afe7 100644 --- a/tests/compat/main.rs +++ b/tests/compat/main.rs @@ -20,25 +20,25 @@ //! //! ```sh //! # Run all compat tests -//! cargo test --test compat +//! cargo test --test compat -F binary //! //! # Run with reference tool comparison (CI mode) -//! CI=1 cargo test --test compat +//! CI=1 cargo test --test compat -F binary //! //! # For Nix users, run this to ensure you have GNU patch -//! CI=1 nix shell nixpkgs#gnupatch -c cargo test --test compat +//! CI=1 nix shell nixpkgs#gnupatch -c cargo test --test compat -F binary //! ``` //! //! ## Regenerating snapshots //! //! ```sh -//! SNAPSHOTS=overwrite cargo test --test compat +//! SNAPSHOTS=overwrite cargo test --test compat -F binary //! ``` //! //! ## Adding new test cases //! //! 1. Create `case_name/in/` with input file(s) and `foo.patch` -//! 2. Run `SNAPSHOTS=overwrite cargo test --test compat` to generate `out/` +//! 2. Run `SNAPSHOTS=overwrite cargo test --test compat -F binary` to generate `out/` //! 3. Add `#[test] fn case_name() { Case::{gnu_patch,git}(...).run(); }` in the module //! //! For failure tests, use `.expect_success(false)` and skip step 2. diff --git a/tests/replay.rs b/tests/replay.rs index 4996151..8a50065 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -6,7 +6,7 @@ //! ## Usage //! //! ```console -//! $ cargo test --test replay -- --ignored --nocapture +//! $ cargo test --test replay -F binary -- --ignored --nocapture //! ``` //! //! ## Environment Variables @@ -59,7 +59,10 @@ use std::{ sync::Mutex, }; -use diffy::patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet}; +use diffy::{ + binary::BinaryPatch, + patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet}, +}; use rayon::prelude::*; /// Persistent `git cat-file --batch` process for fast object lookups. @@ -325,13 +328,10 @@ fn process_commit( // UniDiff format cannot express pure renames (no ---/+++ headers). // Use `--no-renames` to represent them as delete + create instead. - // GitDiff mode handles renames via extended headers natively. + // GitDiff mode uses `--binary` to get actual binary patch data. let diff_output = match mode { TestMode::UniDiff => git(repo, &["diff", "--no-renames", parent, child]), - // TODO: pass `--binary` once binary patch support lands, - // so binary files get actual delta/literal data instead of - // "Binary files differ" markers. - TestMode::GitDiff => git(repo, &["diff", parent, child]), + TestMode::GitDiff => git(repo, &["diff", "--binary", parent, child]), }; if diff_output.is_empty() { @@ -381,14 +381,7 @@ fn process_commit( text_files + type_changes } TestMode::GitDiff => { - // Can't use `--numstat` for GitDiff: it shows `-\t-\t` for both - // actual binary diffs AND pure binary renames (100% similarity). - // Parser correctly handles pure renames (rename headers, no binary content). - // - // Use `--raw` for total count, subtract actual binary markers from diff. - // - // TODO: once `--binary` is passed above, count ALL `--raw` - // entries — every file will have patch data (delta, literal, or text). + // With `--binary`, all files including binary ones have patch data. let raw = git(repo, &["diff", "--raw", parent, child]); let (mut total, mut type_changes) = (0, 0); for line in raw.lines().filter(|l| !l.is_empty()) { @@ -397,12 +390,7 @@ fn process_commit( type_changes += 1; } } - let binary = diff_output - .lines() - .filter(|l| l.starts_with("Binary files ") || l.starts_with("GIT binary patch")) - .count(); - skipped += binary; - total - binary + type_changes + total + type_changes } }; @@ -512,10 +500,50 @@ fn process_commit( ); } } - PatchKind::Binary(_) => { - // Binary patch application not yet wired up in replay tests. - // Will be done once the `binary` Cargo feature is added. + PatchKind::Binary(BinaryPatch::Marker) => { + // `Binary files differ` marker - no actual data to apply skipped += 1; + continue; + } + PatchKind::Binary(patch) => { + // Get content as bytes + let base_content = if let Some(path) = base_path { + let Some(content) = cat.get(parent, path) else { + skipped += 1; + continue; + }; + content + } else { + Vec::new() + }; + + let expected_content = if let Some(path) = target_path { + let Some(content) = cat.get(child, path) else { + skipped += 1; + continue; + }; + content + } else { + Vec::new() + }; + + let result = match patch.apply(&base_content) { + Ok(r) => r, + Err(e) => { + panic!( + "Failed to apply binary patch at {parent_short}..{child_short} for {desc}: {e}" + ); + } + }; + + if result != expected_content { + panic!( + "Binary content mismatch at {parent_short}..{child_short} for {desc}\n\n\ + Expected {} bytes, got {} bytes", + expected_content.len(), + result.len() + ); + } } } From c255bbf9ae7cae1a18022531575d71c030362cc2 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 13 Apr 2026 19:58:59 -0400 Subject: [PATCH 17/26] refactor(patch_set): make free helpers generic over Text Preparation for `PatchSet::parse_bytes(&[u8])` support. No behavior change. --- src/patch_set/parse.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index b089479..5e6aa5e 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -6,7 +6,7 @@ use super::{ }; use crate::binary::{parse_binary_patch, BinaryPatch}; use crate::patch::parse::parse_one; -use crate::utils::escaped_filename; +use crate::utils::{escaped_filename, Text}; use crate::Patch; use std::borrow::Cow; @@ -284,14 +284,13 @@ impl<'a> Iterator for PatchSet<'a> { /// /// A patch header starts with `--- ` or `+++ ` (the file path lines). /// Returns `None` if no header is found. -fn find_patch_start(input: &str) -> Option { +fn find_patch_start(input: &T) -> Option { let mut offset = 0; for line in input.lines() { if line.starts_with(ORIGINAL_PREFIX) || line.starts_with(MODIFIED_PREFIX) { return Some(offset); } offset += line.len(); - offset += line_ending_len(&input[offset..]); } None } @@ -311,27 +310,29 @@ fn find_patch_start(input: &str) -> Option { /// > The log message and the patch are separated by a line with a three-dash line. /// /// [`git format-patch`]: https://git-scm.com/docs/git-format-patch -fn strip_email_preamble(input: &str) -> &str { +fn strip_email_preamble(input: &T) -> &T { // only strip preamble for mbox-formatted input if !input.starts_with("From ") { return input; } match input.find(EMAIL_PREAMBLE_SEPARATOR) { - Some(pos) => &input[pos + EMAIL_PREAMBLE_SEPARATOR.len()..], + Some(pos) => { + let (_, rest) = input.split_at(pos + EMAIL_PREAMBLE_SEPARATOR.len()); + rest + } None => input, } } /// Finds the byte offset of the first `diff --git` line in `input`. -fn find_gitdiff_start(input: &str) -> Option { +fn find_gitdiff_start(input: &T) -> Option { let mut offset = 0; for line in input.lines() { if line.starts_with("diff --git ") { return Some(offset); } offset += line.len(); - offset += line_ending_len(&input[offset..]); } None } From 8f5c1d1198478f66279fdffabec3bc32c4abc8f6 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 13 Apr 2026 20:27:26 -0400 Subject: [PATCH 18/26] refactor(patch_set): make GitHeader::parse generic over Text Assumption: Header lines are always ASCII (with `core.quotePath=true`, git's default). --- src/patch_set/parse.rs | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index 5e6aa5e..78d8611 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -401,42 +401,51 @@ impl<'a> GitHeader<'a> { /// or the next `diff --git`). /// /// Returns the parsed header and the number of bytes consumed. - fn parse(input: &'a str) -> (Self, usize) { + /// + /// Assumption: Header lines are always ASCII + /// (with `core.quotePath=true`, git's default). + fn parse(input: &'a T) -> (Self, usize) { let mut header = GitHeader::default(); let mut consumed = 0; for line in input.lines() { - if let Some(rest) = line.strip_prefix("diff --git ") { + let trimmed = strip_line_ending(line); + let Some(s) = trimmed.as_str() else { + // Non-UTF-8 line — not a recognized header + break; + }; + + if let Some(rest) = s.strip_prefix("diff --git ") { // Only accept the first `diff --git` line. // A second one means we've reached the next patch. if header.diff_git_line.is_some() { break; } header.diff_git_line = Some(rest); - } else if let Some(path) = line.strip_prefix("rename from ") { + } else if let Some(path) = s.strip_prefix("rename from ") { header.rename_from = Some(path); - } else if let Some(path) = line.strip_prefix("rename to ") { + } else if let Some(path) = s.strip_prefix("rename to ") { header.rename_to = Some(path); - } else if let Some(path) = line.strip_prefix("copy from ") { + } else if let Some(path) = s.strip_prefix("copy from ") { header.copy_from = Some(path); - } else if let Some(path) = line.strip_prefix("copy to ") { + } else if let Some(path) = s.strip_prefix("copy to ") { header.copy_to = Some(path); - } else if let Some(mode) = line.strip_prefix("old mode ") { + } else if let Some(mode) = s.strip_prefix("old mode ") { header.old_mode = Some(mode); - } else if let Some(mode) = line.strip_prefix("new mode ") { + } else if let Some(mode) = s.strip_prefix("new mode ") { header.new_mode = Some(mode); - } else if let Some(mode) = line.strip_prefix("new file mode ") { + } else if let Some(mode) = s.strip_prefix("new file mode ") { header.new_file_mode = Some(mode); - } else if let Some(mode) = line.strip_prefix("deleted file mode ") { + } else if let Some(mode) = s.strip_prefix("deleted file mode ") { header.deleted_file_mode = Some(mode); - } else if line.starts_with("index ") - || line.starts_with("similarity index ") - || line.starts_with("dissimilarity index ") + } else if s.starts_with("index ") + || s.starts_with("similarity index ") + || s.starts_with("dissimilarity index ") { // Recognized but nothing to extract. - } else if line.starts_with("Binary files ") { + } else if s.starts_with("Binary files ") { header.is_binary_marker = true; - } else if line.starts_with("GIT binary patch") { + } else if s.starts_with("GIT binary patch") { header.binary_patch_offset = Some(consumed); } else { // Unrecognized line: End of extended headers @@ -445,7 +454,6 @@ impl<'a> GitHeader<'a> { } consumed += line.len(); - consumed += line_ending_len(&input[consumed..]); } (header, consumed) @@ -751,16 +759,12 @@ pub(crate) fn extract_file_op_unidiff<'a>( } } -/// Returns the length of the line ending at the start of `s`. +/// Strips the trailing `\n` from a line yielded by [`Text::lines`]. /// -/// `.lines()` strips line endings, so callers tracking byte offsets need to -/// account for the `\r\n` or `\n` that was consumed. -fn line_ending_len(s: &str) -> usize { - if s.starts_with("\r\n") { - 2 - } else if s.starts_with('\n') { - 1 - } else { - 0 - } +/// [`Text::lines`] includes line endings; strip for matching. +fn strip_line_ending(line: &T) -> &T { + // TODO: GNU patch strips trailing CRs from CRLF patches automatically. + // We should consider adding compat tests for GNU patch. + // And `git apply` seems to reject. Worth adding tests as well. + line.strip_suffix("\n").unwrap_or(line) } From 79b3804e3baf5ad2a0acd25e2ec785fe54900d70 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Mon, 13 Apr 2026 21:02:48 -0400 Subject: [PATCH 19/26] refactor(patch_set): extract `PatchSet` methods to free fns Move dispatch logic to free functions, so we can have private trait bound on them. * `next_gitdiff_patch` * `next_unidiff_patch` * `Iterator::next` -> `next_patch` --- src/patch_set/parse.rs | 368 ++++++++++++++++++++--------------------- 1 file changed, 183 insertions(+), 185 deletions(-) diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index 78d8611..fc62424 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -69,215 +69,208 @@ impl<'a> PatchSet<'a> { found_any: false, } } +} - /// Creates an error with the current offset as span. - fn error(&self, kind: PatchSetParseErrorKind) -> PatchSetParseError { - PatchSetParseError::new(kind, self.offset..self.offset) - } +impl<'a> Iterator for PatchSet<'a> { + type Item = Result, PatchSetParseError>; - fn next_gitdiff_patch(&mut self) -> Option, PatchSetParseError>> { - let remaining = &self.input[self.offset..]; - let patch_start = find_gitdiff_start(remaining)?; - self.offset += patch_start; - self.found_any = true; + fn next(&mut self) -> Option { + next_patch(self) + } +} - let abs_patch_start = self.offset; +fn next_patch<'a>(ps: &mut PatchSet<'a>) -> Option, PatchSetParseError>> { + if ps.finished { + return None; + } - // Parse extended headers incrementally — stops at first unrecognized line - let (header, header_consumed) = GitHeader::parse(&self.input[self.offset..]); - self.offset += header_consumed; + let result = match ps.opts.format { + Format::UniDiff => next_unidiff_patch(ps), + Format::GitDiff => next_gitdiff_patch(ps), + }; - // Handle "Binary files ... differ" (no patch data) - if header.is_binary_marker { - match self.opts.binary { - Binary::Skip => { - return self.next_gitdiff_patch(); - } - Binary::Fail => { - let path = header.diff_git_line.unwrap_or("").to_owned(); - return Some(Err(PatchSetParseError::new( - PatchSetParseErrorKind::BinaryNotSupported { path }, - abs_patch_start..abs_patch_start, - ))); - } - Binary::Keep => { - // FIXME: error spans point at `diff --git` line, not the specific offending line - let operation = match extract_file_op_binary(&header) { - Ok(op) => op, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; - let (old_mode, new_mode) = match parse_file_modes(&header) { - Ok(modes) => modes, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; - return Some(Ok(FilePatch::new_binary( - operation, - BinaryPatch::Marker, - old_mode, - new_mode, - ))); - } - } + if result.is_none() { + ps.finished = true; + if !ps.found_any { + let err = PatchSetParseError::new( + PatchSetParseErrorKind::NoPatchesFound, + ps.offset..ps.offset, + ); + return Some(Err(err)); } + } - // Handle "GIT binary patch" (has patch data) - if let Some(binary_patch_start) = header.binary_patch_offset { - match self.opts.binary { - Binary::Skip => { - return self.next_gitdiff_patch(); - } - Binary::Fail => { - let path = header.diff_git_line.unwrap_or("").to_owned(); - return Some(Err(PatchSetParseError::new( - PatchSetParseErrorKind::BinaryNotSupported { path }, - abs_patch_start..abs_patch_start, - ))); - } - Binary::Keep => { - // GitHeader::parse consumed the marker line but not the payload. - // Use the recorded offset to pass input from the marker onward. - let binary_input = &self.input[abs_patch_start + binary_patch_start..]; - let (binary_patch, consumed) = match parse_binary_patch(binary_input) { - Ok(result) => result, - Err(e) => return Some(Err(e.into())), - }; - self.offset = abs_patch_start + binary_patch_start + consumed; - - // FIXME: error spans point at `diff --git` line, not the specific offending line - let operation = match extract_file_op_binary(&header) { - Ok(op) => op, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; - let (old_mode, new_mode) = match parse_file_modes(&header) { - Ok(modes) => modes, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; - return Some(Ok(FilePatch::new_binary( - operation, - binary_patch, - old_mode, - new_mode, - ))); - } - } - } + result +} - // `git diff` output format is stricter. - // There is no preamble between Git headers and unidiff patch portion, - // so we safely don't perform the preamble skipping. - // - // If we did, it would fail the pure rename/mode-change operation - // since those ops have no unidiff patch portion - // and is directly followed by the next `diff --git` header. - let opts = crate::patch::parse::ParseOpts::default().no_skip_preamble(); - let remaining = &self.input[self.offset..]; - let (result, consumed) = parse_one(remaining, opts); - self.offset += consumed; - let patch = match result { - Ok(patch) => patch, - Err(e) => return Some(Err(e.into())), - }; +fn next_gitdiff_patch<'a>( + ps: &mut PatchSet<'a>, +) -> Option, PatchSetParseError>> { + let patch_start = find_gitdiff_start(remaining(ps))?; + ps.offset += patch_start; + ps.found_any = true; - // FIXME: error spans point at `diff --git` line, not the specific offending line - let operation = match extract_file_op_gitdiff(&header, &patch) { - Ok(op) => op, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; + let abs_patch_start = ps.offset; - // FIXME: error spans point at `diff --git` line, not the specific offending line - let (old_mode, new_mode) = match parse_file_modes(&header) { - Ok(modes) => modes, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; + // Parse extended headers incrementally — stops at first unrecognized line + let (header, header_consumed) = GitHeader::parse(remaining(ps)); + ps.offset += header_consumed; - Some(Ok(FilePatch::new(operation, patch, old_mode, new_mode))) + // Handle "Binary files ... differ" (no patch data) + if header.is_binary_marker { + match ps.opts.binary { + Binary::Skip => { + return next_gitdiff_patch(ps); + } + Binary::Fail => { + let path = header.diff_git_line.unwrap_or("").to_owned(); + return Some(Err(PatchSetParseError::new( + PatchSetParseErrorKind::BinaryNotSupported { path }, + abs_patch_start..abs_patch_start, + ))); + } + Binary::Keep => { + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_binary(&header) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + return Some(Ok(FilePatch::new_binary( + operation, + BinaryPatch::Marker, + old_mode, + new_mode, + ))); + } + } } - fn next_unidiff_patch(&mut self) -> Option, PatchSetParseError>> { - let remaining = &self.input[self.offset..]; - if remaining.is_empty() { - return None; + // Handle "GIT binary patch" (has patch data) + if let Some(binary_patch_start) = header.binary_patch_offset { + match ps.opts.binary { + Binary::Skip => { + return next_gitdiff_patch(ps); + } + Binary::Fail => { + let path = header.diff_git_line.unwrap_or("").to_owned(); + return Some(Err(PatchSetParseError::new( + PatchSetParseErrorKind::BinaryNotSupported { path }, + abs_patch_start..abs_patch_start, + ))); + } + Binary::Keep => { + // GitHeader::parse consumed the marker line but not the payload. + // Use the recorded offset to pass input from the marker onward. + let binary_input = &ps.input[abs_patch_start + binary_patch_start..]; + let (binary_patch, consumed) = match parse_binary_patch(binary_input) { + Ok(result) => result, + Err(e) => return Some(Err(e.into())), + }; + ps.offset = abs_patch_start + binary_patch_start + consumed; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_binary(&header) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + return Some(Ok(FilePatch::new_binary( + operation, + binary_patch, + old_mode, + new_mode, + ))); + } } + } - let patch_start = find_patch_start(remaining)?; - self.found_any = true; + // `git diff` output format is stricter. + // There is no preamble between Git headers and unidiff patch portion, + // so we safely don't perform the preamble skipping. + // + // If we did, it would fail the pure rename/mode-change operation + // since those ops have no unidiff patch portion + // and is directly followed by the next `diff --git` header. + let opts = crate::patch::parse::ParseOpts::default().no_skip_preamble(); + let (result, consumed) = parse_one(remaining(ps), opts); + ps.offset += consumed; + let patch = match result { + Ok(patch) => patch, + Err(e) => return Some(Err(e.into())), + }; - let patch_input = &remaining[patch_start..]; + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_gitdiff(&header, &patch) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; - let opts = crate::patch::parse::ParseOpts::default(); - let (result, consumed) = parse_one(patch_input, opts); - // Always advance so the iterator makes progress even on error. - let abs_patch_start = self.offset + patch_start; - self.offset += patch_start + consumed; + // FIXME: error spans point at `diff --git` line, not the specific offending line + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; - let patch = match result { - Ok(patch) => patch, - Err(e) => return Some(Err(e.into())), - }; - let operation = match extract_file_op_unidiff(patch.original_path(), patch.modified_path()) - { - Ok(op) => op, - Err(mut e) => { - e.set_span(abs_patch_start..abs_patch_start); - return Some(Err(e)); - } - }; + Some(Ok(FilePatch::new(operation, patch, old_mode, new_mode))) +} - Some(Ok(FilePatch::new(operation, patch, None, None))) +fn next_unidiff_patch<'a>( + ps: &mut PatchSet<'a>, +) -> Option, PatchSetParseError>> { + let remaining = remaining(ps); + if remaining.is_empty() { + return None; } -} -impl<'a> Iterator for PatchSet<'a> { - type Item = Result, PatchSetParseError>; + let patch_start = find_patch_start(remaining)?; + ps.found_any = true; - fn next(&mut self) -> Option { - if self.finished { - return None; - } + let patch_input = &remaining[patch_start..]; - let result = match self.opts.format { - Format::UniDiff => { - let result = self.next_unidiff_patch(); - if result.is_none() { - self.finished = true; - if !self.found_any { - return Some(Err(self.error(PatchSetParseErrorKind::NoPatchesFound))); - } - } - result - } - Format::GitDiff => { - let result = self.next_gitdiff_patch(); - if result.is_none() { - self.finished = true; - if !self.found_any { - return Some(Err(self.error(PatchSetParseErrorKind::NoPatchesFound))); - } - } - result - } - }; + let opts = crate::patch::parse::ParseOpts::default(); + let (result, consumed) = parse_one(patch_input, opts); + // Always advance so the iterator makes progress even on error. + let abs_patch_start = ps.offset + patch_start; + ps.offset += patch_start + consumed; - result - } + let patch = match result { + Ok(patch) => patch, + Err(e) => return Some(Err(e.into())), + }; + let operation = match extract_file_op_unidiff(patch.original_path(), patch.modified_path()) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + + Some(Ok(FilePatch::new(operation, patch, None, None))) } /// Finds the byte offset of the first patch header in the input. @@ -768,3 +761,8 @@ fn strip_line_ending(line: &T) -> &T { // And `git apply` seems to reject. Worth adding tests as well. line.strip_suffix("\n").unwrap_or(line) } + +fn remaining<'a>(ps: &PatchSet<'a>) -> &'a str { + let (_, rest) = ps.input.split_at(ps.offset); + rest +} From 33b4e38be48491297b3b7a129e3f62b97f80cc7c Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 01:33:51 -0400 Subject: [PATCH 20/26] refactor: add `Text::as_str_prefix` method Returns the longest valid UTF-8 prefix of the input. This will be used to safely convert binary patch data to `&str` without validating the entire remaining input. --- src/utils.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/utils.rs b/src/utils.rs index 4e98c95..6047050 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -144,6 +144,11 @@ pub trait Text: Eq + Hash + ToOwned { fn find(&self, needle: &str) -> Option; fn split_at(&self, mid: usize) -> (&Self, &Self); fn as_str(&self) -> Option<&str>; + /// Returns the longest valid UTF-8 prefix. + /// + /// For `str` this is the entire input. + /// For `[u8]` this returns up to the first invalid UTF-8 byte. + fn as_str_prefix(&self) -> &str; fn as_bytes(&self) -> &[u8]; #[allow(unused)] fn lines(&self) -> LineIter<'_, Self>; @@ -198,6 +203,10 @@ impl Text for str { Some(self) } + fn as_str_prefix(&self) -> &str { + self + } + fn as_bytes(&self) -> &[u8] { self.as_bytes() } @@ -255,6 +264,13 @@ impl Text for [u8] { std::str::from_utf8(self).ok() } + fn as_str_prefix(&self) -> &str { + match std::str::from_utf8(self) { + Ok(s) => s, + Err(e) => std::str::from_utf8(&self[..e.valid_up_to()]).unwrap(), + } + } + fn as_bytes(&self) -> &[u8] { self } From e50b8de7b3671ab18fe6e083771e290125cd9f8a Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 00:00:52 -0400 Subject: [PATCH 21/26] refactor(patch_set): make `PatchSet` internal generic over `T: Text` We have the assumption that file path from hunk hader is UTF-8 (This is actually not true but we can leave it for future fix) Only the `str` constructor and Iterator impl are exposed for now. `parse_bytes` support comes in a follow-up commit. --- src/patch_set/error.rs | 4 +++ src/patch_set/parse.rs | 76 ++++++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/src/patch_set/error.rs b/src/patch_set/error.rs index 20d7d3e..77ccd0c 100644 --- a/src/patch_set/error.rs +++ b/src/patch_set/error.rs @@ -78,6 +78,9 @@ pub(crate) enum PatchSetParseErrorKind { /// Invalid `diff --git` path. InvalidDiffGitPath, + /// File path contains invalid UTF-8. + InvalidUtf8Path, + /// Binary diff not supported in current configuration. BinaryNotSupported { path: String }, @@ -96,6 +99,7 @@ impl fmt::Display for PatchSetParseErrorKind { Self::CreateMissingModifiedPath => write!(f, "create patch has no modified path"), Self::InvalidFileMode(mode) => write!(f, "invalid file mode: {mode}"), Self::InvalidDiffGitPath => write!(f, "invalid diff --git path"), + Self::InvalidUtf8Path => write!(f, "file path is not valid UTF-8"), Self::BinaryNotSupported { path } => { write!(f, "binary diff not supported: {path}") } diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index fc62424..c021f02 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -48,15 +48,15 @@ const EMAIL_PREAMBLE_SEPARATOR: &str = "\n---\n"; /// println!("{:?}", patch.operation()); /// } /// ``` -pub struct PatchSet<'a> { - input: &'a str, +pub struct PatchSet<'a, T: ?Sized = str> { + input: &'a T, offset: usize, opts: ParseOptions, finished: bool, found_any: bool, } -impl<'a> PatchSet<'a> { +impl<'a> PatchSet<'a, str> { /// Creates a streaming parser for multiple file patches. pub fn parse(input: &'a str, opts: ParseOptions) -> Self { // Strip email preamble once at construction @@ -71,7 +71,7 @@ impl<'a> PatchSet<'a> { } } -impl<'a> Iterator for PatchSet<'a> { +impl<'a> Iterator for PatchSet<'a, str> { type Item = Result, PatchSetParseError>; fn next(&mut self) -> Option { @@ -79,7 +79,9 @@ impl<'a> Iterator for PatchSet<'a> { } } -fn next_patch<'a>(ps: &mut PatchSet<'a>) -> Option, PatchSetParseError>> { +fn next_patch<'a, T: Text + ?Sized>( + ps: &mut PatchSet<'a, T>, +) -> Option, PatchSetParseError>> { if ps.finished { return None; } @@ -103,9 +105,9 @@ fn next_patch<'a>(ps: &mut PatchSet<'a>) -> Option, Pa result } -fn next_gitdiff_patch<'a>( - ps: &mut PatchSet<'a>, -) -> Option, PatchSetParseError>> { +fn next_gitdiff_patch<'a, T: Text + ?Sized>( + ps: &mut PatchSet<'a, T>, +) -> Option, PatchSetParseError>> { let patch_start = find_gitdiff_start(remaining(ps))?; ps.offset += patch_start; ps.found_any = true; @@ -171,7 +173,9 @@ fn next_gitdiff_patch<'a>( Binary::Keep => { // GitHeader::parse consumed the marker line but not the payload. // Use the recorded offset to pass input from the marker onward. - let binary_input = &ps.input[abs_patch_start + binary_patch_start..]; + let (_, binary_input) = ps.input.split_at(abs_patch_start + binary_patch_start); + // Binary patch data is always ASCII (base85-encoded). + let binary_input = binary_input.as_str_prefix(); let (binary_patch, consumed) = match parse_binary_patch(binary_input) { Ok(result) => result, Err(e) => return Some(Err(e.into())), @@ -239,9 +243,9 @@ fn next_gitdiff_patch<'a>( Some(Ok(FilePatch::new(operation, patch, old_mode, new_mode))) } -fn next_unidiff_patch<'a>( - ps: &mut PatchSet<'a>, -) -> Option, PatchSetParseError>> { +fn next_unidiff_patch<'a, T: Text + ?Sized>( + ps: &mut PatchSet<'a, T>, +) -> Option, PatchSetParseError>> { let remaining = remaining(ps); if remaining.is_empty() { return None; @@ -250,7 +254,7 @@ fn next_unidiff_patch<'a>( let patch_start = find_patch_start(remaining)?; ps.found_any = true; - let patch_input = &remaining[patch_start..]; + let (_, patch_input) = remaining.split_at(patch_start); let opts = crate::patch::parse::ParseOpts::default(); let (result, consumed) = parse_one(patch_input, opts); @@ -454,9 +458,9 @@ impl<'a> GitHeader<'a> { } /// Determines the file operation from git headers and patch paths. -fn extract_file_op_gitdiff<'a>( +fn extract_file_op_gitdiff<'a, T: Text + ?Sized>( header: &GitHeader<'a>, - patch: &Patch<'a, str>, + patch: &Patch<'a, T>, ) -> Result, PatchSetParseError> { // Git headers are authoritative for rename/copy if let (Some(from), Some(to)) = (header.rename_from, header.rename_to) { @@ -709,12 +713,29 @@ fn longest_common_path_suffix<'a>(a: &'a str, b: &'a str) -> Option<&'a str> { } /// Extracts the file operation from a patch based on its header paths. -pub(crate) fn extract_file_op_unidiff<'a>( - original: Option<&Cow<'a, str>>, - modified: Option<&Cow<'a, str>>, +/// +/// Converts paths from `Cow<'a, T>` to `Cow<'a, str>` via [`Text::as_str`], +/// since [`FileOperation`] always uses `Cow` for paths. +fn extract_file_op_unidiff<'a, T: Text + ?Sized>( + original: Option<&Cow<'a, T>>, + modified: Option<&Cow<'a, T>>, ) -> Result, PatchSetParseError> { - let is_create = original.map(Cow::as_ref) == Some(DEV_NULL); - let is_delete = modified.map(Cow::as_ref) == Some(DEV_NULL); + let to_str = |cow: &Cow<'a, T>| -> Option> { + match cow { + Cow::Borrowed(b) => b.as_str().map(Cow::Borrowed), + Cow::Owned(_) => cow.as_ref().as_str().map(|s| Cow::Owned(s.to_owned())), + } + }; + // TODO: support non-UTF8 paths + let original = original + .map(|cow| to_str(cow).ok_or(PatchSetParseErrorKind::InvalidUtf8Path)) + .transpose()?; + let modified = modified + .map(|cow| to_str(cow).ok_or(PatchSetParseErrorKind::InvalidUtf8Path)) + .transpose()?; + + let is_create = original.as_deref() == Some(DEV_NULL); + let is_delete = modified.as_deref() == Some(DEV_NULL); if is_create && is_delete { return Err(PatchSetParseErrorKind::BothDevNull.into()); @@ -722,29 +743,26 @@ pub(crate) fn extract_file_op_unidiff<'a>( if is_delete { let path = original.ok_or(PatchSetParseErrorKind::DeleteMissingOriginalPath)?; - Ok(FileOperation::Delete(path.clone())) + Ok(FileOperation::Delete(path)) } else if is_create { let path = modified.ok_or(PatchSetParseErrorKind::CreateMissingModifiedPath)?; - Ok(FileOperation::Create(path.clone())) + Ok(FileOperation::Create(path)) } else { match (original, modified) { - (Some(original), Some(modified)) => Ok(FileOperation::Modify { - original: original.clone(), - modified: modified.clone(), - }), + (Some(original), Some(modified)) => Ok(FileOperation::Modify { original, modified }), (None, Some(modified)) => { // No original path, but has modified path. // Observed that GNU patch reads from the modified path in this case. Ok(FileOperation::Modify { original: modified.clone(), - modified: modified.clone(), + modified, }) } (Some(original), None) => { // No modified path, but has original path. Ok(FileOperation::Modify { modified: original.clone(), - original: original.clone(), + original, }) } (None, None) => Err(PatchSetParseErrorKind::NoFilePath.into()), @@ -762,7 +780,7 @@ fn strip_line_ending(line: &T) -> &T { line.strip_suffix("\n").unwrap_or(line) } -fn remaining<'a>(ps: &PatchSet<'a>) -> &'a str { +fn remaining<'a, T: Text + ?Sized>(ps: &PatchSet<'a, T>) -> &'a T { let (_, rest) = ps.input.split_at(ps.offset); rest } From c189ab8008fee62f99caf113c1779837ab1bd945 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 00:12:01 -0400 Subject: [PATCH 22/26] feat(patch_set): `PatchSet::parse_bytes` for raw byte input --- src/patch_set/parse.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index c021f02..08502d4 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -79,6 +79,31 @@ impl<'a> Iterator for PatchSet<'a, str> { } } +impl<'a> PatchSet<'a, [u8]> { + /// Creates a streaming parser for multiple file patches from raw bytes. + /// + /// Unlike [`PatchSet::parse`], this preserves non-UTF-8 content in hunks + /// (e.g., binary files that git misdetects as text). + pub fn parse_bytes(input: &'a [u8], opts: ParseOptions) -> Self { + let input = strip_email_preamble(input); + Self { + input, + offset: 0, + opts, + finished: false, + found_any: false, + } + } +} + +impl<'a> Iterator for PatchSet<'a, [u8]> { + type Item = Result, PatchSetParseError>; + + fn next(&mut self) -> Option { + next_patch(self) + } +} + fn next_patch<'a, T: Text + ?Sized>( ps: &mut PatchSet<'a, T>, ) -> Option, PatchSetParseError>> { From 035b7d6f7b88d22e224e1df8c6a12bb5a22f6a51 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 01:21:13 -0400 Subject: [PATCH 23/26] test(patch_set): add unit tests for `parse_bytes` --- src/patch_set/tests.rs | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/patch_set/tests.rs b/src/patch_set/tests.rs index 0944c54..5a9951e 100644 --- a/src/patch_set/tests.rs +++ b/src/patch_set/tests.rs @@ -606,3 +606,53 @@ diff --git a/foo b/foo assert_eq!(patches.len(), 1, "binary should be skipped, text parsed"); } } + +mod patchset_parse_bytes { + use super::*; + + #[test] + fn non_utf8_hunk_content() { + // Hunk lines contain invalid UTF-8 bytes (0x80, 0xff). + let mut input: Vec = Vec::new(); + input.extend(b"--- a/file.bin\n"); + input.extend(b"+++ b/file.bin\n"); + input.extend(b"@@ -1,3 +1,3 @@\n"); + input.extend(b" valid ascii\n"); + input.extend(b"-old \x80\xff bytes\n"); + input.extend(b"+new \x80\xff bytes\n"); + input.extend(b" more ascii\n"); + + let patches = PatchSet::parse_bytes(&input, ParseOptions::unidiff()) + .collect::, _>>() + .unwrap(); + assert_eq!(patches.len(), 1); + + let patch = patches[0].patch().as_text().expect("expected text patch"); + let hunks = patch.hunks(); + assert_eq!(hunks.len(), 1); + + use crate::patch::Line; + let lines = hunks[0].lines(); + assert_eq!(lines.len(), 4); + assert_eq!(lines[0], Line::Context(b"valid ascii\n" as &[u8])); + assert_eq!(lines[1], Line::Delete(b"old \x80\xff bytes\n" as &[u8])); + assert_eq!(lines[2], Line::Insert(b"new \x80\xff bytes\n" as &[u8])); + assert_eq!(lines[3], Line::Context(b"more ascii\n" as &[u8])); + } + + #[test] + fn non_utf8_path_returns_error() { + // File paths (currently) must be valid UTF-8 even in byte mode. + let mut input: Vec = Vec::new(); + input.extend(b"--- a/bad\xffpath\n"); + input.extend(b"+++ b/good\n"); + input.extend(b"@@ -1 +1 @@\n"); + input.extend(b"-old\n"); + input.extend(b"+new\n"); + + let result: Result, _> = + PatchSet::parse_bytes(&input, ParseOptions::unidiff()).collect(); + let err = result.unwrap_err(); + assert_eq!(err.kind, PatchSetParseErrorKind::InvalidUtf8Path); + } +} From 095fa47d6856f242e254ee4d346d7ea00027b8ff Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 01:21:14 -0400 Subject: [PATCH 24/26] test(replay): switch to `PatchSet::parse_bytes~ Avoid lossy UTF-8 conversion of diff output so non-UTF8 content round-trips correctly. No more skips in GitDiff mode (except submodule of course) --- tests/replay.rs | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/replay.rs b/tests/replay.rs index 8a50065..cbf07ac 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -152,11 +152,6 @@ impl CatFile { Some(buf) } - - /// Like [`CatFile::get`] but returns only UTF-8 string. - fn get_text(&mut self, rev: &str, path: &str) -> Option { - self.get(rev, path).and_then(|b| String::from_utf8(b).ok()) - } } /// Local enum for test configuration (maps to ParseOptions). @@ -237,7 +232,7 @@ fn test_mode() -> TestMode { } } -fn git(repo: &Path, args: &[&str]) -> String { +fn git_bytes(repo: &Path, args: &[&str]) -> Vec { let mut cmd = Command::new("git"); cmd.env("GIT_CONFIG_NOSYSTEM", "1"); cmd.env("GIT_CONFIG_GLOBAL", "/dev/null"); @@ -251,7 +246,11 @@ fn git(repo: &Path, args: &[&str]) -> String { panic!("git {args:?} failed: {stderr}"); } - String::from_utf8_lossy(&output.stdout).into_owned() + output.stdout +} + +fn git(repo: &Path, args: &[&str]) -> String { + String::from_utf8_lossy(&git_bytes(repo, args)).into_owned() } /// Get the list of commits from oldest to newest. @@ -330,8 +329,8 @@ fn process_commit( // Use `--no-renames` to represent them as delete + create instead. // GitDiff mode uses `--binary` to get actual binary patch data. let diff_output = match mode { - TestMode::UniDiff => git(repo, &["diff", "--no-renames", parent, child]), - TestMode::GitDiff => git(repo, &["diff", "--binary", parent, child]), + TestMode::UniDiff => git_bytes(repo, &["diff", "--no-renames", parent, child]), + TestMode::GitDiff => git_bytes(repo, &["diff", "--binary", parent, child]), }; if diff_output.is_empty() { @@ -404,9 +403,10 @@ fn process_commit( }; } - let patchset: Vec<_> = match PatchSet::parse(&diff_output, mode.into()).collect() { + let patchset: Vec<_> = match PatchSet::parse_bytes(&diff_output, mode.into()).collect() { Ok(ps) => ps, Err(e) => { + let diff_output = String::from_utf8_lossy(&diff_output); panic!( "Failed to parse patch for {parent_short}..{child_short}: {e}\n\n\ Diff:\n{diff_output}" @@ -418,6 +418,7 @@ fn process_commit( // This catches both missing and spurious patches. if patchset.len() != expected_file_count { let n = patchset.len(); + let diff_output = String::from_utf8_lossy(&diff_output); panic!( "Patch count mismatch for {parent_short}..{child_short}: \ expected {expected_file_count} files, parsed {n} patches\n\n\ @@ -461,42 +462,43 @@ fn process_commit( match file_patch.patch() { PatchKind::Text(patch) => { let base_content = if let Some(path) = base_path { - let Some(content) = cat.get_text(parent, path) else { + let Some(content) = cat.get(parent, path) else { skipped += 1; continue; }; content } else { - String::new() + Vec::new() }; let expected_content = if let Some(path) = target_path { - let Some(content) = cat.get_text(child, path) else { + let Some(content) = cat.get(child, path) else { skipped += 1; continue; }; content } else { - String::new() + Vec::new() }; - let result = match diffy::apply(&base_content, patch) { + let result = match diffy::apply_bytes(&base_content, patch) { Ok(r) => r, Err(e) => { + let base_content = String::from_utf8_lossy(&base_content); panic!( "Failed to apply patch at {parent_short}..{child_short} for {desc}: {e}\n\n\ - Patch:\n{patch}\n\n\ Base content:\n{base_content}" ); } }; if result != expected_content { + let expected_content = String::from_utf8_lossy(&expected_content); + let result = String::from_utf8_lossy(&result); panic!( "Content mismatch at {parent_short}..{child_short} for {desc}\n\n\ --- Expected ---\n{expected_content}\n\n\ - --- Got ---\n{result}\n\n\ - --- Patch ---\n{patch}" + --- Got ---\n{result}" ); } } From 2edeadb8fae4d8a34797a53ae50c7ba1045c7d14 Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 01:21:15 -0400 Subject: [PATCH 25/26] test(compat): switch to byte-aware API --- tests/compat/common.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/compat/common.rs b/tests/compat/common.rs index 8e5fdda..83f27ad 100644 --- a/tests/compat/common.rs +++ b/tests/compat/common.rs @@ -89,7 +89,7 @@ impl<'a> Case<'a> { let case_dir = self.case_dir(); let in_dir = case_dir.join("in"); let patch_path = in_dir.join("foo.patch"); - let patch = fs::read_to_string(&patch_path) + let patch = fs::read(&patch_path) .unwrap_or_else(|e| panic!("failed to read {}: {e}", patch_path.display())); let case_name = self.case_name; @@ -167,7 +167,7 @@ impl<'a> Case<'a> { fn git_apply( output_dir: &Path, - patch: &str, + patch: &[u8], strip_level: u32, in_dir: &Path, ) -> Result<(), String> { @@ -183,12 +183,7 @@ fn git_apply( cmd.stderr(Stdio::piped()); let mut child = cmd.spawn().expect("failed to spawn git apply"); - child - .stdin - .as_mut() - .unwrap() - .write_all(patch.as_bytes()) - .unwrap(); + child.stdin.as_mut().unwrap().write_all(patch).unwrap(); let output = child.wait_with_output().unwrap(); if output.status.success() { @@ -331,12 +326,12 @@ fn copy_input_files_impl(src: &Path, dst: &Path, base: &Path, skip_extensions: & /// Apply patch using diffy to output directory. pub fn apply_diffy( in_dir: &Path, - patch: &str, + patch: &[u8], output_dir: &Path, opts: ParseOptions, strip_prefix: u32, ) -> Result<(), TestError> { - let patches: Vec<_> = PatchSet::parse(patch, opts) + let patches: Vec<_> = PatchSet::parse_bytes(patch, opts) .collect::>() .map_err(TestError::Parse)?; @@ -373,11 +368,11 @@ pub fn apply_diffy( match file_patch.patch() { PatchKind::Text(patch) => { - let original = String::from_utf8(read_original()).unwrap(); + let original = read_original(); - let result = diffy::apply(&original, patch).map_err(TestError::Apply)?; + let result = diffy::apply_bytes(&original, patch).map_err(TestError::Apply)?; - write_modified(result.as_bytes()); + write_modified(&result); } PatchKind::Binary(BinaryPatch::Marker) => { // Dont do anything if it is just a binary patch marker. From 05cdc37fd97cebb9d4c6ee079ab78ba6c9d844cd Mon Sep 17 00:00:00 2001 From: Weihang Lo Date: Tue, 14 Apr 2026 01:21:16 -0400 Subject: [PATCH 26/26] test(compat): add non-UTF8 hunk tests --- tests/compat/git/mod.rs | 7 +++++++ tests/compat/git/non_utf8_hunk_content/in/file.bin | 1 + tests/compat/git/non_utf8_hunk_content/in/foo.patch | 6 ++++++ tests/compat/git/non_utf8_hunk_content/out/file.bin | 1 + tests/compat/gnu_patch/mod.rs | 7 +++++++ tests/compat/gnu_patch/non_utf8_hunk_content/in/file.bin | 1 + tests/compat/gnu_patch/non_utf8_hunk_content/in/foo.patch | 5 +++++ tests/compat/gnu_patch/non_utf8_hunk_content/out/file.bin | 1 + 8 files changed, 29 insertions(+) create mode 100644 tests/compat/git/non_utf8_hunk_content/in/file.bin create mode 100644 tests/compat/git/non_utf8_hunk_content/in/foo.patch create mode 100644 tests/compat/git/non_utf8_hunk_content/out/file.bin create mode 100644 tests/compat/gnu_patch/non_utf8_hunk_content/in/file.bin create mode 100644 tests/compat/gnu_patch/non_utf8_hunk_content/in/foo.patch create mode 100644 tests/compat/gnu_patch/non_utf8_hunk_content/out/file.bin diff --git a/tests/compat/git/mod.rs b/tests/compat/git/mod.rs index 25df395..335a63c 100644 --- a/tests/compat/git/mod.rs +++ b/tests/compat/git/mod.rs @@ -194,6 +194,13 @@ fn binary_mixed_delta_literal() { Case::git("binary_mixed_delta_literal").strip(1).run(); } +// Patch with non-UTF-8 bytes (0x80, 0xff) in hunk content. +// Both git apply and diffy handle raw bytes correctly. +#[test] +fn non_utf8_hunk_content() { + Case::git("non_utf8_hunk_content").strip(1).run(); +} + // Multi-file patch with junk/preamble text between different files. // // git apply behavior: Ignores content between `diff --git` boundaries. diff --git a/tests/compat/git/non_utf8_hunk_content/in/file.bin b/tests/compat/git/non_utf8_hunk_content/in/file.bin new file mode 100644 index 0000000..5ddb01f --- /dev/null +++ b/tests/compat/git/non_utf8_hunk_content/in/file.bin @@ -0,0 +1 @@ +hello €ÿ world diff --git a/tests/compat/git/non_utf8_hunk_content/in/foo.patch b/tests/compat/git/non_utf8_hunk_content/in/foo.patch new file mode 100644 index 0000000..0c3f72f --- /dev/null +++ b/tests/compat/git/non_utf8_hunk_content/in/foo.patch @@ -0,0 +1,6 @@ +diff --git a/file.bin b/file.bin +--- a/file.bin ++++ b/file.bin +@@ -1 +1 @@ +-hello €ÿ world ++hello €ÿ universe diff --git a/tests/compat/git/non_utf8_hunk_content/out/file.bin b/tests/compat/git/non_utf8_hunk_content/out/file.bin new file mode 100644 index 0000000..715e61f --- /dev/null +++ b/tests/compat/git/non_utf8_hunk_content/out/file.bin @@ -0,0 +1 @@ +hello €ÿ universe diff --git a/tests/compat/gnu_patch/mod.rs b/tests/compat/gnu_patch/mod.rs index fe59fcb..643d16e 100644 --- a/tests/compat/gnu_patch/mod.rs +++ b/tests/compat/gnu_patch/mod.rs @@ -151,6 +151,13 @@ fn junk_between_hunks() { Case::gnu_patch("junk_between_hunks").run(); } +// Patch with non-UTF-8 bytes (0x80, 0xff) in hunk content. +// Both GNU patch and diffy handle raw bytes correctly. +#[test] +fn non_utf8_hunk_content() { + Case::gnu_patch("non_utf8_hunk_content").run(); +} + // Failure cases #[test] diff --git a/tests/compat/gnu_patch/non_utf8_hunk_content/in/file.bin b/tests/compat/gnu_patch/non_utf8_hunk_content/in/file.bin new file mode 100644 index 0000000..5ddb01f --- /dev/null +++ b/tests/compat/gnu_patch/non_utf8_hunk_content/in/file.bin @@ -0,0 +1 @@ +hello €ÿ world diff --git a/tests/compat/gnu_patch/non_utf8_hunk_content/in/foo.patch b/tests/compat/gnu_patch/non_utf8_hunk_content/in/foo.patch new file mode 100644 index 0000000..10e9997 --- /dev/null +++ b/tests/compat/gnu_patch/non_utf8_hunk_content/in/foo.patch @@ -0,0 +1,5 @@ +--- file.bin ++++ file.bin +@@ -1 +1 @@ +-hello €ÿ world ++hello €ÿ universe diff --git a/tests/compat/gnu_patch/non_utf8_hunk_content/out/file.bin b/tests/compat/gnu_patch/non_utf8_hunk_content/out/file.bin new file mode 100644 index 0000000..715e61f --- /dev/null +++ b/tests/compat/gnu_patch/non_utf8_hunk_content/out/file.bin @@ -0,0 +1 @@ +hello €ÿ universe