diff --git a/.github/workflows/replay.yml b/.github/workflows/replay.yml index a9f8f997..08384d61 100644 --- a/.github/workflows/replay.yml +++ b/.github/workflows/replay.yml @@ -44,7 +44,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - parse_mode: [unidiff] + parse_mode: [unidiff, gitdiff] name: ${{ inputs.name && matrix.parse_mode || format('{0} ({1}, {2})', inputs.repo_url, matrix.parse_mode, inputs.commits) }} steps: - uses: actions/checkout@v6 diff --git a/src/patch/parse.rs b/src/patch/parse.rs index dd74b15c..2ccb409a 100644 --- a/src/patch/parse.rs +++ b/src/patch/parse.rs @@ -12,6 +12,42 @@ use std::borrow::Cow; type Result = std::result::Result; +/// Options that control parsing behavior. +/// +/// Defaults match the [`parse`]/[`parse_bytes`] behavior. +#[derive(Clone, Copy)] +pub(crate) struct ParseOpts { + skip_preamble: bool, + reject_orphaned_hunks: bool, +} + +impl Default for ParseOpts { + fn default() -> Self { + Self { + skip_preamble: true, + reject_orphaned_hunks: false, + } + } +} + +impl ParseOpts { + /// Don't skip preamble lines before `---`/`+++`/`@@`. + /// + /// Useful when the caller has already positioned the input + /// at the start of the patch content. + pub(crate) fn no_skip_preamble(mut self) -> Self { + self.skip_preamble = false; + self + } + + /// Reject orphaned `@@ ` hunk headers after parsed hunks, + /// matching `git apply` behavior. + pub(crate) fn reject_orphaned_hunks(mut self) -> Self { + self.reject_orphaned_hunks = true; + self + } +} + struct Parser<'a, T: Text + ?Sized> { lines: std::iter::Peekable>, offset: usize, @@ -54,7 +90,22 @@ impl<'a, T: Text + ?Sized> Parser<'a, T> { } pub fn parse(input: &str) -> Result> { - let (result, _consumed) = parse_one(input); + let (result, _consumed) = parse_one(input, ParseOpts::default()); + result +} + +pub fn parse_strict(input: &str) -> Result> { + let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); + result +} + +pub fn parse_bytes(input: &[u8]) -> Result> { + let (result, _consumed) = parse_one(input, ParseOpts::default()); + result +} + +pub fn parse_bytes_strict(input: &[u8]) -> Result> { + let (result, _consumed) = parse_one(input, ParseOpts::default().reject_orphaned_hunks()); result } @@ -62,10 +113,13 @@ pub fn parse(input: &str) -> Result> { /// /// Always returns consumed bytes alongside the result /// so callers can advance past the parsed or partially parsed content. -pub(crate) fn parse_one(input: &str) -> (Result>, usize) { +pub(crate) fn parse_one<'a, T: Text + ?Sized>( + input: &'a T, + opts: ParseOpts, +) -> (Result>, usize) { let mut parser = Parser::new(input); - let header = match patch_header(&mut parser) { + let header = match patch_header(&mut parser, &opts) { Ok(h) => h, Err(e) => return (Err(e), parser.offset()), }; @@ -73,58 +127,28 @@ pub(crate) fn parse_one(input: &str) -> (Result>, usize) { Ok(h) => h, Err(e) => return (Err(e), parser.offset()), }; + if opts.reject_orphaned_hunks { + if let Err(e) = reject_orphaned_hunk_headers(&mut parser) { + return (Err(e), parser.offset()); + } + } let patch = Patch::new( - header.0.map(convert_cow_to_str), - header.1.map(convert_cow_to_str), + header.0.map(T::from_bytes_cow), + header.1.map(T::from_bytes_cow), hunks, ); (Ok(patch), parser.offset()) } -pub fn parse_strict(input: &str) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; - let hunks = hunks(&mut parser)?; - reject_orphaned_hunk_headers(&mut parser)?; - - Ok(Patch::new( - header.0.map(convert_cow_to_str), - header.1.map(convert_cow_to_str), - hunks, - )) -} - -pub fn parse_bytes(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; - let hunks = hunks(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) -} - -pub fn parse_bytes_strict(input: &[u8]) -> Result> { - let mut parser = Parser::new(input); - let header = patch_header(&mut parser)?; - let hunks = hunks(&mut parser)?; - reject_orphaned_hunk_headers(&mut parser)?; - - Ok(Patch::new(header.0, header.1, hunks)) -} - -// This is only used when the type originated as a utf8 string -fn convert_cow_to_str(cow: Cow<'_, [u8]>) -> Cow<'_, str> { - match cow { - Cow::Borrowed(b) => std::str::from_utf8(b).unwrap().into(), - Cow::Owned(o) => String::from_utf8(o).unwrap().into(), - } -} - #[allow(clippy::type_complexity)] -fn patch_header<'a, T: Text + ToOwned + ?Sized>( +fn patch_header<'a, T: Text + ?Sized>( parser: &mut Parser<'a, T>, + opts: &ParseOpts, ) -> Result<(Option>, Option>)> { - skip_header_preamble(parser)?; + if opts.skip_preamble { + skip_header_preamble(parser)?; + } let mut filename1 = None; let mut filename2 = None; @@ -161,10 +185,7 @@ fn skip_header_preamble(parser: &mut Parser<'_, T>) -> Result< Ok(()) } -fn parse_filename<'a, T: Text + ToOwned + ?Sized>( - prefix: &str, - line: &'a T, -) -> Result> { +fn parse_filename<'a, T: Text + ?Sized>(prefix: &str, line: &'a T) -> Result> { let line = line .strip_prefix(prefix) .ok_or(ParsePatchErrorKind::InvalidFilename)?; diff --git a/src/patch_set/error.rs b/src/patch_set/error.rs index ba60e861..9cd6aa9e 100644 --- a/src/patch_set/error.rs +++ b/src/patch_set/error.rs @@ -70,6 +70,15 @@ pub(crate) enum PatchSetParseErrorKind { /// Create patch missing modified path. CreateMissingModifiedPath, + + /// Invalid file mode string. + InvalidFileMode(String), + + /// Invalid `diff --git` path. + InvalidDiffGitPath, + + /// Binary diff not supported in current configuration. + BinaryNotSupported { path: String }, } impl fmt::Display for PatchSetParseErrorKind { @@ -81,6 +90,11 @@ impl fmt::Display for PatchSetParseErrorKind { Self::BothDevNull => write!(f, "patch has both original and modified as /dev/null"), Self::DeleteMissingOriginalPath => write!(f, "delete patch has no original path"), Self::CreateMissingModifiedPath => write!(f, "create patch has no modified path"), + Self::InvalidFileMode(mode) => write!(f, "invalid file mode: {mode}"), + Self::InvalidDiffGitPath => write!(f, "invalid diff --git path"), + Self::BinaryNotSupported { path } => { + write!(f, "binary diff not supported: {path}") + } } } } diff --git a/src/patch_set/mod.rs b/src/patch_set/mod.rs index 7e15a6e3..f83442c9 100644 --- a/src/patch_set/mod.rs +++ b/src/patch_set/mod.rs @@ -13,11 +13,25 @@ use std::borrow::Cow; use crate::Patch; pub use error::PatchSetParseError; +use error::PatchSetParseErrorKind; pub use parse::PatchSet; /// Options for parsing patch content. /// -/// Use [`ParseOptions::unidiff()`] to create options for the desired format. +/// Use [`ParseOptions::unidiff()`] or [`ParseOptions::gitdiff()`] +/// to create options for the desired format. +/// +/// ## Binary Files +/// +/// When parsing git diffs, binary file changes are detected by: +/// +/// * `Binary files a/path and b/path differ` (`git diff` without `--binary` flag) +/// * `GIT binary patch` (from `git diff --binary`) +/// +/// Note that this is not a documented Git behavior, +/// so the implementation here is subject to change if Git changes. +/// +/// By default, binary diffs are skipped. /// /// ## Example /// @@ -40,12 +54,26 @@ pub use parse::PatchSet; #[derive(Debug, Clone)] pub struct ParseOptions { pub(crate) format: Format, + pub(crate) binary: Binary, } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Default)] pub(crate) enum Format { /// Standard unified diff format. + #[default] UniDiff, + /// Git extended diff format. + GitDiff, +} + +/// How to handle binary diffs in GitDiff mode. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) enum Binary { + /// Skip binary diffs silently. + #[default] + Skip, + /// Return error if binary diff encountered. + Fail, } impl ParseOptions { @@ -64,8 +92,38 @@ impl ParseOptions { pub fn unidiff() -> Self { Self { format: Format::UniDiff, + binary: Default::default(), } } + + /// Parse as [git extended diff format][git-diff-format]. + /// + /// Supports all features of [`unidiff()`](Self::unidiff) plus: + /// + /// * `diff --git` headers + /// * Extended headers (`new file mode`, `deleted file mode`, etc.) + /// * Rename/copy detection (`rename from`/`rename to`, `copy from`/`copy to`) + /// * Binary file detection (skipped by default) + /// + /// [git-diff-format]: https://git-scm.com/docs/diff-format + pub fn gitdiff() -> Self { + Self { + format: Format::GitDiff, + binary: Default::default(), + } + } + + /// Skip binary diffs silently. + pub fn skip_binary(mut self) -> Self { + self.binary = Binary::Skip; + self + } + + /// Return an error if a binary diff is encountered. + pub fn fail_on_binary(mut self) -> Self { + self.binary = Binary::Fail; + self + } } /// File mode extracted from git extended headers. @@ -81,6 +139,20 @@ pub enum FileMode { Gitlink, } +impl std::str::FromStr for FileMode { + type Err = PatchSetParseError; + + fn from_str(mode: &str) -> Result { + match mode { + "100644" => Ok(Self::Regular), + "100755" => Ok(Self::Executable), + "120000" => Ok(Self::Symlink), + "160000" => Ok(Self::Gitlink), + _ => Err(PatchSetParseErrorKind::InvalidFileMode(mode.to_owned()).into()), + } + } +} + /// The kind of patch content in a [`FilePatch`]. #[derive(Clone, PartialEq, Eq)] pub enum PatchKind<'a, T: ToOwned + ?Sized> { diff --git a/src/patch_set/parse.rs b/src/patch_set/parse.rs index c4864f95..0d8b9945 100644 --- a/src/patch_set/parse.rs +++ b/src/patch_set/parse.rs @@ -1,10 +1,12 @@ //! Parse multiple file patches from a unified diff. use super::{ - error::PatchSetParseErrorKind, FileOperation, FilePatch, Format, ParseOptions, - PatchSetParseError, + error::PatchSetParseErrorKind, Binary, FileMode, FileOperation, FilePatch, Format, + ParseOptions, PatchSetParseError, }; use crate::patch::parse::parse_one; +use crate::utils::escaped_filename; +use crate::Patch; use std::borrow::Cow; @@ -72,6 +74,71 @@ impl<'a> PatchSet<'a> { PatchSetParseError::new(kind, self.offset..self.offset) } + fn next_gitdiff_patch(&mut self) -> Option, PatchSetParseError>> { + let remaining = &self.input[self.offset..]; + let patch_start = find_gitdiff_start(remaining)?; + self.offset += patch_start; + self.found_any = true; + + let abs_patch_start = self.offset; + + // Parse extended headers incrementally — stops at first unrecognized line + let (header, header_consumed) = GitHeader::parse(&self.input[self.offset..]); + self.offset += header_consumed; + + // Handle binary markers ("Binary files ... differ") + if header.is_binary_marker || header.is_binary_patch { + match self.opts.binary { + Binary::Skip => { + return self.next_gitdiff_patch(); + } + Binary::Fail => { + let path = header.diff_git_line.unwrap_or("").to_owned(); + return Some(Err(PatchSetParseError::new( + PatchSetParseErrorKind::BinaryNotSupported { path }, + abs_patch_start..abs_patch_start, + ))); + } + } + } + + // `git diff` output format is stricter. + // There is no preamble between Git headers and unidiff patch portion, + // so we safely don't perform the preamble skipping. + // + // If we did, it would fail the pure rename/mode-change operation + // since those ops have no unidiff patch portion + // and is directly followed by the next `diff --git` header. + let opts = crate::patch::parse::ParseOpts::default().no_skip_preamble(); + let remaining = &self.input[self.offset..]; + let (result, consumed) = parse_one(remaining, opts); + self.offset += consumed; + let patch = match result { + Ok(patch) => patch, + Err(e) => return Some(Err(e.into())), + }; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let operation = match extract_file_op_gitdiff(&header, &patch) { + Ok(op) => op, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + + // FIXME: error spans point at `diff --git` line, not the specific offending line + let (old_mode, new_mode) = match parse_file_modes(&header) { + Ok(modes) => modes, + Err(mut e) => { + e.set_span(abs_patch_start..abs_patch_start); + return Some(Err(e)); + } + }; + + Some(Ok(FilePatch::new(operation, patch, old_mode, new_mode))) + } + fn next_unidiff_patch(&mut self) -> Option, PatchSetParseError>> { let remaining = &self.input[self.offset..]; if remaining.is_empty() { @@ -83,7 +150,8 @@ impl<'a> PatchSet<'a> { let patch_input = &remaining[patch_start..]; - let (result, consumed) = parse_one(patch_input); + let opts = crate::patch::parse::ParseOpts::default(); + let (result, consumed) = parse_one(patch_input, opts); // Always advance so the iterator makes progress even on error. let abs_patch_start = self.offset + patch_start; self.offset += patch_start + consumed; @@ -124,6 +192,16 @@ impl<'a> Iterator for PatchSet<'a> { } result } + Format::GitDiff => { + let result = self.next_gitdiff_patch(); + if result.is_none() { + self.finished = true; + if !self.found_any { + return Some(Err(self.error(PatchSetParseErrorKind::NoPatchesFound))); + } + } + result + } }; result @@ -141,12 +219,7 @@ fn find_patch_start(input: &str) -> Option { return Some(offset); } offset += line.len(); - // Account for the line ending that `.lines()` strips - if input[offset..].starts_with("\r\n") { - offset += 2; - } else if input[offset..].starts_with('\n') { - offset += 1; - } + offset += line_ending_len(&input[offset..]); } None } @@ -178,6 +251,356 @@ fn strip_email_preamble(input: &str) -> &str { } } +/// Finds the byte offset of the first `diff --git` line in `input`. +fn find_gitdiff_start(input: &str) -> Option { + let mut offset = 0; + for line in input.lines() { + if line.starts_with("diff --git ") { + return Some(offset); + } + offset += line.len(); + offset += line_ending_len(&input[offset..]); + } + None +} + +/// Git extended header metadata. +/// +/// Extracted from lines between `diff --git` and `---` (or end of patch). +/// See [git-diff format documentation](https://git-scm.com/docs/diff-format). +#[derive(Debug, Default)] +struct GitHeader<'a> { + /// Raw content after "diff --git " prefix. + /// + /// Only parsed in fallback when `---`/`+++` is absent (mode-only, binary, empty file). + diff_git_line: Option<&'a str>, + /// Source path from `rename from `. + rename_from: Option<&'a str>, + /// Destination path from `rename to `. + rename_to: Option<&'a str>, + /// Source path from `copy from `. + copy_from: Option<&'a str>, + /// Destination path from `copy to `. + copy_to: Option<&'a str>, + /// File mode from `old mode `. + old_mode: Option<&'a str>, + /// File mode from `new mode `. + new_mode: Option<&'a str>, + /// File mode from `new file mode `. + new_file_mode: Option<&'a str>, + /// File mode from `deleted file mode `. + deleted_file_mode: Option<&'a str>, + /// Whether this is a binary diff with no actual patch content. + /// + /// Observed `git diff` output (without `--binary`): + /// + /// ```text + /// diff --git a/image.png b/image.png + /// new file mode 100644 + /// index 0000000..7c4530c + /// Binary files /dev/null and b/image.png differ + /// ``` + is_binary_marker: bool, + /// Whether this is a binary diff with actual patch content. + /// + /// Observed `git diff --binary` output: + /// + /// ```text + /// diff --git a/image.png b/image.png + /// new file mode 100644 + /// index 0000000..7c4530c + /// GIT binary patch + /// literal 67 + /// zcmV-J0KET+... + /// + /// literal 0 + /// KcmV+b0RR6000031 + /// ``` + is_binary_patch: bool, +} + +impl<'a> GitHeader<'a> { + /// Parses git extended headers incrementally from the current position. + /// + /// Consumes the `diff --git` line and all recognized extended header lines, + /// stopping at the first unrecognized line (typically `---`/`+++`/`@@` + /// or the next `diff --git`). + /// + /// Returns the parsed header and the number of bytes consumed. + fn parse(input: &'a str) -> (Self, usize) { + let mut header = GitHeader::default(); + let mut consumed = 0; + + for line in input.lines() { + if let Some(rest) = line.strip_prefix("diff --git ") { + // Only accept the first `diff --git` line. + // A second one means we've reached the next patch. + if header.diff_git_line.is_some() { + break; + } + header.diff_git_line = Some(rest); + } else if let Some(path) = line.strip_prefix("rename from ") { + header.rename_from = Some(path); + } else if let Some(path) = line.strip_prefix("rename to ") { + header.rename_to = Some(path); + } else if let Some(path) = line.strip_prefix("copy from ") { + header.copy_from = Some(path); + } else if let Some(path) = line.strip_prefix("copy to ") { + header.copy_to = Some(path); + } else if let Some(mode) = line.strip_prefix("old mode ") { + header.old_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("new mode ") { + header.new_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("new file mode ") { + header.new_file_mode = Some(mode); + } else if let Some(mode) = line.strip_prefix("deleted file mode ") { + header.deleted_file_mode = Some(mode); + } else if line.starts_with("index ") + || line.starts_with("similarity index ") + || line.starts_with("dissimilarity index ") + { + // Recognized but nothing to extract. + } else if line.starts_with("Binary files ") { + header.is_binary_marker = true; + } else if line.starts_with("GIT binary patch") { + header.is_binary_patch = true; + } else { + // Unrecognized line: End of extended headers + // (typically `---`/`+++`/`@@` or trailing content). + break; + } + + consumed += line.len(); + consumed += line_ending_len(&input[consumed..]); + } + + (header, consumed) + } +} + +/// Determines the file operation from git headers and patch paths. +fn extract_file_op_gitdiff<'a>( + header: &GitHeader<'a>, + patch: &Patch<'a, str>, +) -> Result, PatchSetParseError> { + // Git headers are authoritative for rename/copy + if let (Some(from), Some(to)) = (header.rename_from, header.rename_to) { + return Ok(FileOperation::Rename { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + if let (Some(from), Some(to)) = (header.copy_from, header.copy_to) { + return Ok(FileOperation::Copy { + from: Cow::Borrowed(from), + to: Cow::Borrowed(to), + }); + } + + // Try ---/+++ paths first + if patch.original().is_some() || patch.modified().is_some() { + return extract_file_op_unidiff(patch.original_path(), patch.modified_path()); + } + + // Fall back to `diff --git ` for mode-only and empty file changes + let Some((original, modified)) = header.diff_git_line.and_then(parse_diff_git_path) else { + return Err(PatchSetParseErrorKind::InvalidDiffGitPath.into()); + }; + + if header.new_file_mode.is_some() { + Ok(FileOperation::Create(modified)) + } else if header.deleted_file_mode.is_some() { + Ok(FileOperation::Delete(original)) + } else { + Ok(FileOperation::Modify { original, modified }) + } +} + +/// Parses file modes from git extended headers. +fn parse_file_modes( + header: &GitHeader<'_>, +) -> Result<(Option, Option), PatchSetParseError> { + let old_mode = header + .old_mode + .or(header.deleted_file_mode) + .map(str::parse::) + .transpose()?; + let new_mode = header + .new_mode + .or(header.new_file_mode) + .map(str::parse::) + .transpose()?; + Ok((old_mode, new_mode)) +} + +/// Extracts both old and new paths from `diff --git` line content. +/// +/// ## Assumption #1: old and new paths are the same +/// +/// This extraction has one strong assumption: +/// Beside their prefixes, old and new paths are the same. +/// +/// From [git-diff format documentation]: +/// +/// > The `a/` and `b/` filenames are the same unless rename/copy is involved. +/// > Especially, even for a creation or a deletion, `/dev/null` is not used +/// > in place of the `a/` or `b/` filenames. +/// > +/// > When a rename/copy is involved, file1 and file2 show the name of the +/// > source file of the rename/copy and the name of the file that the +/// > rename/copy produces, respectively. +/// +/// Since rename/copy operations use `rename from/to` and `copy from/to` headers +/// we have handled earlier in [`extract_file_op_gitdiff`], +/// (which have no `a/`/`b/` prefix per git spec), +/// +/// this extraction is only used +/// * when unified diff headers (`---`/`+++`) are absent +/// * Only for mode-only and empty file cases +/// +/// [git-diff format documentation]: https://git-scm.com/docs/diff-format +/// +/// ## Assumption #2: the longest common path suffix is the shared path +/// +/// When custom prefixes contain spaces, +/// multiple splits may produce valid path suffixes. +/// +/// Example: `src/foo.rs src/foo.rs src/foo.rs src/foo.rs` +/// +/// Three splits all produce valid path suffixes (contain `/`): +/// +/// * Position 10 +/// * old path: `src/foo.rs` +/// * new path: `src/foo.rs src/foo.rs src/foo.rs` +/// * common suffix: `foo.rs` +/// * Position 21 +/// * old path: `src/foo.rs src/foo.rs` +/// * new path: `src/foo.rs src/foo.rs` +/// * common suffix: `foo.rs src/foo.rs` +/// * Position 32 +/// * old path: `src/foo.rs src/foo.rs src/foo.rs` +/// * new path: `src/foo.rs` +/// * common suffix: `foo.rs` +/// +/// We observed that `git apply` would pick position 21, +/// which has the longest path suffix, +/// hence this heuristic. +/// +/// ## Supported formats +/// +/// * `a/ b/` (default prefix) +/// * ` ` (`git diff --no-prefix`) +/// * ` ` (custom prefix) +/// * `"" ""` (quoted, with escapes) +/// * Mixed quoted/unquoted +fn parse_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + if line.starts_with('"') || line.ends_with('"') { + parse_quoted_diff_git_path(line) + } else { + parse_unquoted_diff_git_path(line) + } +} + +/// See [`parse_diff_git_path`]. +fn parse_unquoted_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + let mut best_match = None; + let mut longest_path = ""; + + for (i, _) in line.match_indices(' ') { + let left = &line[..i]; + let right = &line[i + 1..]; + if left.is_empty() || right.is_empty() { + continue; + } + // Select split with longest common path suffix (matches Git behavior) + if let Some(path) = longest_common_path_suffix(left, right) { + if path.len() > longest_path.len() { + longest_path = path; + best_match = Some((left, right)); + } + } + } + + best_match.map(|(l, r)| (Cow::Borrowed(l), Cow::Borrowed(r))) +} + +/// See [`parse_diff_git_path`]. +fn parse_quoted_diff_git_path(line: &str) -> Option<(Cow<'_, str>, Cow<'_, str>)> { + let (left_raw, right_raw) = if line.starts_with('"') { + // First token is quoted. + let bytes = line.as_bytes(); + let mut i = 1; // skip starting `"` + let end = loop { + match bytes.get(i)? { + b'"' => break i + 1, + b'\\' => i += 2, + _ => i += 1, + } + }; + let (first, rest) = line.split_at(end); + let rest = rest.strip_prefix(' ')?; + (first, rest) + } else if let Some(pos) = line.find(" \"") { + // First token is unquoted. The second must be quoted. + (&line[..pos], &line[pos + 1..]) + } else { + // Malformed: ends with `"` but no valid quoted path found + return None; + }; + + let left = escaped_filename(left_raw).ok().and_then(|cow| match cow { + Cow::Borrowed(b) => std::str::from_utf8(b).ok().map(Cow::Borrowed), + Cow::Owned(v) => String::from_utf8(v).ok().map(Cow::Owned), + })?; + let right = escaped_filename(right_raw).ok().and_then(|cow| match cow { + Cow::Borrowed(b) => std::str::from_utf8(b).ok().map(Cow::Borrowed), + Cow::Owned(v) => String::from_utf8(v).ok().map(Cow::Owned), + })?; + + // Verify both sides share the same path. + longest_common_path_suffix(&left, &right)?; + Some((left, right)) +} + +/// Extracts the longest common path suffix starting at a path component boundary. +/// +/// Returns `None` if no valid common path exists. +/// +/// Path component boundary means: +/// +/// * At `/` character (e.g., `foo/bar.rs` vs `fooo/bar.rs` → `bar.rs`) +/// * Or the entire string is identical +fn longest_common_path_suffix<'a>(a: &'a str, b: &'a str) -> Option<&'a str> { + if a.is_empty() || b.is_empty() { + return None; + } + + let suffix_len = a + .as_bytes() + .iter() + .rev() + .zip(b.as_bytes().iter().rev()) + .take_while(|(x, y)| x == y) + .count(); + + if suffix_len == 0 { + return None; + } + + // Identical strings + if suffix_len == a.len() && a.len() == b.len() { + return Some(a); + } + + // Find first '/' in suffix and return path after it + let suffix_start = a.len() - suffix_len; + let suffix = &a[suffix_start..]; + suffix + .split_once('/') + .map(|(_, path)| path) + .filter(|p| !p.is_empty()) +} + /// Extracts the file operation from a patch based on its header paths. pub(crate) fn extract_file_op_unidiff<'a>( original: Option<&Cow<'a, str>>, @@ -221,3 +644,17 @@ pub(crate) fn extract_file_op_unidiff<'a>( } } } + +/// Returns the length of the line ending at the start of `s`. +/// +/// `.lines()` strips line endings, so callers tracking byte offsets need to +/// account for the `\r\n` or `\n` that was consumed. +fn line_ending_len(s: &str) -> usize { + if s.starts_with("\r\n") { + 2 + } else if s.starts_with('\n') { + 1 + } else { + 0 + } +} diff --git a/src/patch_set/tests.rs b/src/patch_set/tests.rs index 5bbfd316..9648930b 100644 --- a/src/patch_set/tests.rs +++ b/src/patch_set/tests.rs @@ -463,3 +463,124 @@ In a hole in the ground there lived a hobbit ); } } + +mod patchset_gitdiff { + use super::*; + use crate::patch_set::PatchKind; + + fn parse_gitdiff(input: &str) -> Vec> { + PatchSet::parse(input, ParseOptions::gitdiff()) + .collect::, _>>() + .unwrap() + } + + /// `parse_one` must stop at `diff --git` boundaries so that + /// back-to-back patches are split correctly. + /// Without this, the second patch's `diff --git` line would be + /// swallowed as trailing junk by the first patch's hunk parser. + #[test] + fn multi_file_stops_at_diff_git_boundary() { + let input = "\ +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old foo ++new foo +diff --git a/bar b/bar +--- a/bar ++++ b/bar +@@ -1 +1 @@ +-old bar ++new bar +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 2); + } + + #[test] + fn pure_rename() { + let input = "\ +diff --git a/old.rs b/new.rs +similarity index 100% +rename from old.rs +rename to new.rs +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1); + assert_eq!( + patches[0].operation(), + &FileOperation::Rename { + from: "old.rs".into(), + to: "new.rs".into(), + } + ); + } + + /// Empty file creation has no ---/+++ headers, so the path comes + /// from the `diff --git` line and retains the `b/` prefix. + /// Callers use `strip_prefix(1)` to remove it. + #[test] + fn new_empty_file() { + let input = "\ +diff --git a/empty b/empty +new file mode 100644 +index 0000000..e69de29 +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1); + assert_eq!( + patches[0].operation(), + &FileOperation::Create("b/empty".into()) + ); + match patches[0].patch() { + PatchKind::Text(p) => assert!(p.hunks().is_empty()), + } + } + + #[test] + fn rename_then_modify() { + // Rename with no hunks followed by a modify with hunks. + // Tests that offset advances correctly across both. + let input = "\ +diff --git a/old.rs b/new.rs +similarity index 100% +rename from old.rs +rename to new.rs +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old ++new +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 2); + assert!(matches!( + patches[0].operation(), + FileOperation::Rename { .. } + )); + assert!(matches!( + patches[1].operation(), + FileOperation::Modify { .. } + )); + } + + #[test] + fn binary_skip_does_not_stall() { + // Binary marker must be skipped and offset must advance + // to the next patch without infinite loop. + let input = "\ +diff --git a/img.png b/img.png +Binary files a/img.png and b/img.png differ +diff --git a/foo b/foo +--- a/foo ++++ b/foo +@@ -1 +1 @@ +-old ++new +"; + let patches = parse_gitdiff(input); + assert_eq!(patches.len(), 1, "binary should be skipped, text parsed"); + } +} diff --git a/src/utils.rs b/src/utils.rs index c3b15944..4e98c95f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -132,7 +132,7 @@ impl<'a, T: Text + ?Sized> Iterator for LineIter<'a, T> { /// A helper trait for processing text like `str` and `[u8]` /// Useful for abstracting over those types for parsing as well as breaking input into lines -pub trait Text: Eq + Hash { +pub trait Text: Eq + Hash + ToOwned { fn is_empty(&self) -> bool; fn len(&self) -> usize; fn starts_with(&self, prefix: &str) -> bool; @@ -148,6 +148,9 @@ pub trait Text: Eq + Hash { #[allow(unused)] fn lines(&self) -> LineIter<'_, Self>; + /// Converts a byte-oriented `Cow` into a `Cow` of `Self`. + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self>; + fn parse(&self) -> Option { self.as_str().and_then(|s| s.parse().ok()) } @@ -202,6 +205,13 @@ impl Text for str { fn lines(&self) -> LineIter<'_, Self> { LineIter::new(self) } + + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self> { + match cow { + Cow::Borrowed(b) => Cow::Borrowed(std::str::from_utf8(b).unwrap()), + Cow::Owned(o) => Cow::Owned(String::from_utf8(o).unwrap()), + } + } } impl Text for [u8] { @@ -252,6 +262,10 @@ impl Text for [u8] { fn lines(&self) -> LineIter<'_, Self> { LineIter::new(self) } + + fn from_bytes_cow(cow: Cow<'_, [u8]>) -> Cow<'_, Self> { + cow + } } fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { @@ -292,7 +306,7 @@ fn find_byte(haystack: &[u8], byte: u8) -> Option { /// /// See [`byte_needs_quoting`] for the set of characters that /// require quoting. -pub(crate) fn escaped_filename( +pub(crate) fn escaped_filename( filename: &T, ) -> Result, ParsePatchError> { if let Some(inner) = filename @@ -309,9 +323,7 @@ pub(crate) fn escaped_filename( } } -fn decode_escaped( - escaped: &T, -) -> Result, ParsePatchError> { +fn decode_escaped(escaped: &T) -> Result, ParsePatchError> { let bytes = escaped.as_bytes(); let mut result = Vec::new(); let mut i = 0; diff --git a/tests/compat/common.rs b/tests/compat/common.rs index 7bc6f37a..117ed043 100644 --- a/tests/compat/common.rs +++ b/tests/compat/common.rs @@ -2,16 +2,27 @@ use std::{ fs, + io::Write, path::{Path, PathBuf}, - process::Command, + process::{Command, Stdio}, sync::Once, }; use diffy::patch_set::{FileOperation, ParseOptions, PatchKind, PatchSet, PatchSetParseError}; +/// Which external tool to compare against. +#[derive(Clone, Copy)] +pub enum CompatMode { + /// `git apply` with `ParseOptions::gitdiff()` + Git, + /// GNU `patch` with `ParseOptions::unidiff()` + GnuPatch, +} + /// A test case with fluent builder API. pub struct Case<'a> { case_name: &'a str, + mode: CompatMode, /// Strip level for path prefixes (default: 0) strip_level: u32, /// Whether diffy is expected to succeed (default: true) @@ -21,20 +32,37 @@ pub struct Case<'a> { } impl<'a> Case<'a> { + /// Create a test case for `git apply` comparison. + pub fn git(name: &'a str) -> Self { + Self { + case_name: name, + mode: CompatMode::Git, + strip_level: 0, + expect_success: true, + expect_compat: true, + } + } + /// Create a test case for GNU patch comparison. pub fn gnu_patch(name: &'a str) -> Self { Self { case_name: name, + mode: CompatMode::GnuPatch, strip_level: 0, expect_success: true, expect_compat: true, } } - /// Get the case directory path. + /// Get the case directory path based on mode. fn case_dir(&self) -> PathBuf { + let subdir = match self.mode { + CompatMode::Git => "git", + CompatMode::GnuPatch => "gnu_patch", + }; PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("tests/compat/gnu_patch") + .join("tests/compat") + .join(subdir) .join(self.case_name) } @@ -62,12 +90,19 @@ impl<'a> Case<'a> { .unwrap_or_else(|e| panic!("failed to read {}: {e}", patch_path.display())); let case_name = self.case_name; + let prefix = match self.mode { + CompatMode::Git => "git", + CompatMode::GnuPatch => "gnu", + }; let temp_base = temp_base(); - let diffy_output = temp_base.join(format!("gnu-{case_name}-diffy")); + let diffy_output = temp_base.join(format!("{prefix}-{case_name}-diffy")); create_output_dir(&diffy_output); - let opts = ParseOptions::unidiff(); + let opts = match self.mode { + CompatMode::Git => ParseOptions::gitdiff(), + CompatMode::GnuPatch => ParseOptions::unidiff(), + }; // Apply with diffy let diffy_result = apply_diffy(&in_dir, &patch, &diffy_output, opts, self.strip_level); @@ -81,12 +116,19 @@ impl<'a> Case<'a> { // In CI mode, also verify external tool behavior if is_ci() { - let external_output = temp_base.join(format!("gnu-{case_name}-external")); + let external_output = temp_base.join(format!("{prefix}-{case_name}-external")); create_output_dir(&external_output); - print_patch_version(); - let external_result = - gnu_patch_apply(&in_dir, &patch_path, &external_output, self.strip_level); + let external_result = match self.mode { + CompatMode::Git => { + print_git_version(); + git_apply(&external_output, &patch, self.strip_level, &in_dir) + } + CompatMode::GnuPatch => { + print_patch_version(); + gnu_patch_apply(&in_dir, &patch_path, &external_output, self.strip_level) + } + }; // For success cases where both succeed and are expected to be compatible, // verify outputs match @@ -120,6 +162,39 @@ impl<'a> Case<'a> { // External tool invocations +fn git_apply( + output_dir: &Path, + patch: &str, + strip_level: u32, + in_dir: &Path, +) -> Result<(), String> { + copy_input_files(in_dir, output_dir, &["patch"]); + + let mut cmd = Command::new("git"); + cmd.env("GIT_CONFIG_NOSYSTEM", "1"); + cmd.env("GIT_CONFIG_GLOBAL", "/dev/null"); + cmd.current_dir(output_dir); + cmd.args(["apply", &format!("-p{strip_level}"), "-"]); + cmd.stdin(Stdio::piped()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to spawn git apply"); + child + .stdin + .as_mut() + .unwrap() + .write_all(patch.as_bytes()) + .unwrap(); + + let output = child.wait_with_output().unwrap(); + if output.status.success() { + Ok(()) + } else { + Err(String::from_utf8_lossy(&output.stderr).to_string()) + } +} + fn gnu_patch_apply( in_dir: &Path, patch_path: &Path, @@ -149,6 +224,24 @@ fn gnu_patch_apply( } } +fn print_git_version() { + static ONCE: Once = Once::new(); + ONCE.call_once(|| { + let output = Command::new("git").arg("--version").output(); + match output { + Ok(o) if o.status.success() => { + let version = String::from_utf8_lossy(&o.stdout); + eprintln!( + "git version: {}", + version.lines().next().unwrap_or("unknown") + ); + } + Ok(o) => eprintln!("git --version failed: {}", o.status), + Err(e) => eprintln!("git command not found: {e}"), + } + }); +} + fn print_patch_version() { static ONCE: Once = Once::new(); ONCE.call_once(|| { diff --git a/tests/compat/git/fail_both_devnull/in/foo.patch b/tests/compat/git/fail_both_devnull/in/foo.patch new file mode 100644 index 00000000..26b28273 --- /dev/null +++ b/tests/compat/git/fail_both_devnull/in/foo.patch @@ -0,0 +1,6 @@ +diff --git a/foo b/foo +--- /dev/null ++++ /dev/null +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/format_patch_diff_in_message/in/file.txt b/tests/compat/git/format_patch_diff_in_message/in/file.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_diff_in_message/in/foo.patch b/tests/compat/git/format_patch_diff_in_message/in/foo.patch new file mode 100644 index 00000000..ce333713 --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/in/foo.patch @@ -0,0 +1,20 @@ +From 8a14b135fe7ba10bab09a77c4a687faaf1d92a26 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Fix diff --git parsing bug + +The line `diff --git a/x b/x` in messages was incorrectly parsed. +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_diff_in_message/out/file.txt b/tests/compat/git/format_patch_diff_in_message/out/file.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/format_patch_diff_in_message/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/format_patch_multiple_separators/in/file.txt b/tests/compat/git/format_patch_multiple_separators/in/file.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_multiple_separators/in/foo.patch b/tests/compat/git/format_patch_multiple_separators/in/foo.patch new file mode 100644 index 00000000..a0da5b94 --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/in/foo.patch @@ -0,0 +1,21 @@ +From 6bfbbfa49a16bb8173145a933fe5ad918ad48a31 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Add content with --- markers + +--- + file.txt | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..c4d4ea8 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1,3 @@ +-old ++--- ++new ++--- +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_multiple_separators/out/file.txt b/tests/compat/git/format_patch_multiple_separators/out/file.txt new file mode 100644 index 00000000..c4d4ea8d --- /dev/null +++ b/tests/compat/git/format_patch_multiple_separators/out/file.txt @@ -0,0 +1,3 @@ +--- +new +--- diff --git a/tests/compat/git/format_patch_preamble/in/file.txt b/tests/compat/git/format_patch_preamble/in/file.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/format_patch_preamble/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_preamble/in/foo.patch b/tests/compat/git/format_patch_preamble/in/foo.patch new file mode 100644 index 00000000..9d904250 --- /dev/null +++ b/tests/compat/git/format_patch_preamble/in/foo.patch @@ -0,0 +1,20 @@ +From ddbc9053359329dd016ed89f0d6e460b3b8ff5e3 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] Add new content + +This is the commit body. +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_preamble/out/file.txt b/tests/compat/git/format_patch_preamble/out/file.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/format_patch_preamble/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/format_patch_signature/in/file.txt b/tests/compat/git/format_patch_signature/in/file.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/format_patch_signature/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/format_patch_signature/in/foo.patch b/tests/compat/git/format_patch_signature/in/foo.patch new file mode 100644 index 00000000..7614287a --- /dev/null +++ b/tests/compat/git/format_patch_signature/in/foo.patch @@ -0,0 +1,19 @@ +From b3bb3125eff3d2648f15af2a6e0cdcdf6ad8fce1 Mon Sep 17 00:00:00 2001 +From: Test +Date: Wed, 1 Jan 2020 00:00:00 +0000 +Subject: [PATCH] modify + +--- + file.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/file.txt b/file.txt +index 3367afd..3e75765 100644 +--- a/file.txt ++++ b/file.txt +@@ -1 +1 @@ +-old ++new +-- +2.52.0 + diff --git a/tests/compat/git/format_patch_signature/out/file.txt b/tests/compat/git/format_patch_signature/out/file.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/format_patch_signature/out/file.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/junk_between_files/in/bar.txt b/tests/compat/git/junk_between_files/in/bar.txt new file mode 100644 index 00000000..601d8ee1 --- /dev/null +++ b/tests/compat/git/junk_between_files/in/bar.txt @@ -0,0 +1 @@ +bar line1 diff --git a/tests/compat/git/junk_between_files/in/foo.patch b/tests/compat/git/junk_between_files/in/foo.patch new file mode 100644 index 00000000..dbc59398 --- /dev/null +++ b/tests/compat/git/junk_between_files/in/foo.patch @@ -0,0 +1,17 @@ +diff --git a/foo.txt b/foo.txt +index 1234567..89abcdef 100644 +--- a/foo.txt ++++ b/foo.txt +@@ -1 +1 @@ +-foo line1 ++FOO LINE1 +JUNK BETWEEN FILES!!!! +This preamble text should be ignored +by both git apply and diffy +diff --git a/bar.txt b/bar.txt +index 1234567..89abcdef 100644 +--- a/bar.txt ++++ b/bar.txt +@@ -1 +1 @@ +-bar line1 ++BAR LINE1 diff --git a/tests/compat/git/junk_between_files/in/foo.txt b/tests/compat/git/junk_between_files/in/foo.txt new file mode 100644 index 00000000..b11358e1 --- /dev/null +++ b/tests/compat/git/junk_between_files/in/foo.txt @@ -0,0 +1 @@ +foo line1 diff --git a/tests/compat/git/junk_between_files/out/bar.txt b/tests/compat/git/junk_between_files/out/bar.txt new file mode 100644 index 00000000..76c036d0 --- /dev/null +++ b/tests/compat/git/junk_between_files/out/bar.txt @@ -0,0 +1 @@ +BAR LINE1 diff --git a/tests/compat/git/junk_between_files/out/foo.txt b/tests/compat/git/junk_between_files/out/foo.txt new file mode 100644 index 00000000..787bc665 --- /dev/null +++ b/tests/compat/git/junk_between_files/out/foo.txt @@ -0,0 +1 @@ +FOO LINE1 diff --git a/tests/compat/git/junk_between_hunks/in/file.txt b/tests/compat/git/junk_between_hunks/in/file.txt new file mode 100644 index 00000000..822aed3f --- /dev/null +++ b/tests/compat/git/junk_between_hunks/in/file.txt @@ -0,0 +1,9 @@ +line1 +line2 +line3 +line4 +line5 +line6 +line7 +line8 +line9 diff --git a/tests/compat/git/junk_between_hunks/in/foo.patch b/tests/compat/git/junk_between_hunks/in/foo.patch new file mode 100644 index 00000000..242e959c --- /dev/null +++ b/tests/compat/git/junk_between_hunks/in/foo.patch @@ -0,0 +1,15 @@ +diff --git a/file.txt b/file.txt +index 1234567..89abcdef 100644 +--- a/file.txt ++++ b/file.txt +@@ -1,3 +1,3 @@ +-line1 ++LINE1 + line2 + line3 +JUNK BETWEEN HUNKS +@@ -7,3 +7,3 @@ + line7 +-line8 ++LINE8 + line9 diff --git a/tests/compat/git/junk_between_hunks/out/file.txt b/tests/compat/git/junk_between_hunks/out/file.txt new file mode 100644 index 00000000..2e5e454d --- /dev/null +++ b/tests/compat/git/junk_between_hunks/out/file.txt @@ -0,0 +1,9 @@ +LINE1 +line2 +line3 +line4 +line5 +line6 +line7 +line8 +line9 diff --git a/tests/compat/git/mod.rs b/tests/compat/git/mod.rs new file mode 100644 index 00000000..e7dc99b3 --- /dev/null +++ b/tests/compat/git/mod.rs @@ -0,0 +1,138 @@ +//! Git compatibility tests. See [`crate`] for test structure and usage. +//! +//! Focus areas: +//! +//! - `diff --git` path parsing edge cases (quotes, spaces, ambiguous prefixes) +//! - `git format-patch` email format (preamble/signature stripping) +//! - Agreement between diffy and `git apply` + +use crate::common::Case; + +#[test] +fn path_no_prefix() { + Case::git("path_no_prefix").run(); +} + +#[test] +fn path_quoted_escapes() { + Case::git("path_quoted_escapes").strip(1).run(); +} + +// Git uses C-style named escapes (\a, \b, \f, \v) for certain control +// characters in quoted filenames. Both `git apply` and GNU patch decode +// these correctly. +// +// Observed with git 2.53.0: +// $ printf 'x' > "$(printf 'bel\a')" && git add -A +// $ git diff --cached | grep '+++' +// +++ "b/bel\a" +// +// diffy now decodes these correctly. +#[test] +fn path_quoted_named_escape() { + Case::git("path_quoted_named_escape").strip(1).run(); +} + +// Git uses 3-digit octal escapes (\000-\377) for bytes that don't have +// a named escape. Both `git apply` and GNU patch decode these correctly. +// +// Observed with git 2.53.0: +// $ printf 'x' > "$(printf 'tl\033')" && git add -A +// $ git diff --cached | grep '+++' +// +++ "b/tl\033" +// +// Found via full-history replay test against llvm/llvm-project +// (commits 17af06ba..229c95ab, 6c031780..0683a1e5). +#[test] +fn path_quoted_octal_escape() { + Case::git("path_quoted_octal_escape").strip(1).run(); +} + +#[test] +fn path_with_spaces() { + Case::git("path_with_spaces").strip(1).run(); +} + +#[test] +fn path_containing_space_b() { + Case::git("path_containing_space_b").strip(1).run(); +} + +#[test] +fn format_patch_preamble() { + // Ambiguous: where does preamble end? First `\n---\n` - verify matches git + Case::git("format_patch_preamble").strip(1).run(); +} + +#[test] +fn format_patch_diff_in_message() { + // `diff --git` in commit message must NOT trigger early parsing + Case::git("format_patch_diff_in_message").strip(1).run(); +} + +#[test] +fn format_patch_multiple_separators() { + // Git uses first `\n---\n` as separator (observed git mailinfo behavior) + Case::git("format_patch_multiple_separators").strip(1).run(); +} + +#[test] +fn format_patch_signature() { + // Ambiguous: `\n-- \n` could appear in patch content - verify matches git + Case::git("format_patch_signature").strip(1).run(); +} + +#[test] +fn nested_diff_signature() { + // Patch that deletes a diff file containing `-- ` patterns within its content, + // followed by a real email signature at the end. + // + // Tests that we correctly distinguish between: + // - `-- ` appearing as patch content (from inner diff's empty context lines) + // - `-- ` appearing as the actual email signature separator + // + // Both git apply and GNU patch handle this correctly. + Case::git("nested_diff_signature").strip(1).run(); +} + +#[test] +fn path_ambiguous_suffix() { + // Multiple valid splits in `diff --git` line; algorithm picks longest common suffix. + // Tests the pathological case from parse.rs comments where custom prefix + // creates `src/foo.rs src/foo.rs src/foo.rs src/foo.rs` - verify matches git. + Case::git("path_ambiguous_suffix").strip(1).run(); +} + +// Both --- and +++ point to /dev/null. +// git apply rejects: "dev/null: No such file or directory" +#[test] +fn fail_both_devnull() { + Case::git("fail_both_devnull") + .strip(1) + .expect_success(false) + .run(); +} + +// Single-file patch with junk between hunks. +// +// - git apply: errors ("patch fragment without header") +// - diffy: succeeds, ignores trailing junk (matches GNU patch behavior) +#[test] +fn junk_between_hunks() { + Case::git("junk_between_hunks") + .strip(1) + .expect_compat(false) + .run(); +} + +// Multi-file patch with junk/preamble text between different files. +// +// git apply behavior: Ignores content between `diff --git` boundaries. +// In GitDiff mode, splitting occurs at `diff --git`, so junk between +// files becomes trailing content of the previous chunk (harmless). +// +// This is different from junk between HUNKS of the same file (which fails). +#[test] +fn junk_between_files() { + Case::git("junk_between_files").strip(1).run(); +} diff --git a/tests/compat/git/nested_diff_signature/in/example.rs b/tests/compat/git/nested_diff_signature/in/example.rs new file mode 100644 index 00000000..8f3b7ef1 --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/example.rs @@ -0,0 +1 @@ +fn foo() {} diff --git a/tests/compat/git/nested_diff_signature/in/foo.patch b/tests/compat/git/nested_diff_signature/in/foo.patch new file mode 100644 index 00000000..5d876c61 --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/foo.patch @@ -0,0 +1,25 @@ +diff --git a/mir-test.diff b/mir-test.diff +deleted file mode 100644 +index 98012d7..0000000 +--- a/mir-test.diff ++++ /dev/null +@@ -1,12 +0,0 @@ +-- // MIR before +-+ // MIR after +- +- fn opt() { +- bb0: { +-- nop; +-- } +-- +-- bb1: { +-- nop; +- } +- } +diff --git a/example.rs b/example.rs +index 8f3b7ef..2a40712 100644 +--- a/example.rs ++++ b/example.rs +@@ -1 +1,2 @@ + fn foo() {} ++fn bar() {} diff --git a/tests/compat/git/nested_diff_signature/in/mir-test.diff b/tests/compat/git/nested_diff_signature/in/mir-test.diff new file mode 100644 index 00000000..98012d7e --- /dev/null +++ b/tests/compat/git/nested_diff_signature/in/mir-test.diff @@ -0,0 +1,12 @@ +- // MIR before ++ // MIR after + + fn opt() { + bb0: { +- nop; +- } +- +- bb1: { +- nop; + } + } diff --git a/tests/compat/git/nested_diff_signature/out/example.rs b/tests/compat/git/nested_diff_signature/out/example.rs new file mode 100644 index 00000000..2a40712e --- /dev/null +++ b/tests/compat/git/nested_diff_signature/out/example.rs @@ -0,0 +1,2 @@ +fn foo() {} +fn bar() {} diff --git a/tests/compat/git/nested_diff_signature/out/mir-test.diff b/tests/compat/git/nested_diff_signature/out/mir-test.diff new file mode 100644 index 00000000..e69de29b diff --git a/tests/compat/git/path_ambiguous_suffix/in/foo.patch b/tests/compat/git/path_ambiguous_suffix/in/foo.patch new file mode 100644 index 00000000..a6815eb7 --- /dev/null +++ b/tests/compat/git/path_ambiguous_suffix/in/foo.patch @@ -0,0 +1,3 @@ +diff --git src/foo.rs src/foo.rs src/foo.rs src/foo.rs +new file mode 100644 +index 0000000..e69de29 diff --git a/tests/compat/git/path_ambiguous_suffix/out/foo.rs src/foo.rs b/tests/compat/git/path_ambiguous_suffix/out/foo.rs src/foo.rs new file mode 100644 index 00000000..e69de29b diff --git a/tests/compat/git/path_containing_space_b/in/foo b/baz.txt b/tests/compat/git/path_containing_space_b/in/foo b/baz.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/path_containing_space_b/in/foo b/baz.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_containing_space_b/in/foo.patch b/tests/compat/git/path_containing_space_b/in/foo.patch new file mode 100644 index 00000000..15c6fca2 --- /dev/null +++ b/tests/compat/git/path_containing_space_b/in/foo.patch @@ -0,0 +1,7 @@ +diff --git a/foo b/baz.txt b/foo b/baz.txt +index 3367afd..3e75765 100644 +--- a/foo b/baz.txt ++++ b/foo b/baz.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_containing_space_b/out/foo b/baz.txt b/tests/compat/git/path_containing_space_b/out/foo b/baz.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/path_containing_space_b/out/foo b/baz.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/path_no_prefix/in/file.txt b/tests/compat/git/path_no_prefix/in/file.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/path_no_prefix/in/file.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_no_prefix/in/foo.patch b/tests/compat/git/path_no_prefix/in/foo.patch new file mode 100644 index 00000000..5e2a9f8f --- /dev/null +++ b/tests/compat/git/path_no_prefix/in/foo.patch @@ -0,0 +1,7 @@ +diff --git file.txt file.txt +index 3367afd..3e75765 100644 +--- file.txt ++++ file.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_no_prefix/out/file.txt b/tests/compat/git/path_no_prefix/out/file.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/path_no_prefix/out/file.txt @@ -0,0 +1 @@ +new diff --git "a/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" "b/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" new file mode 100644 index 00000000..3367afdb --- /dev/null +++ "b/tests/compat/git/path_quoted_escapes/in/foo\tbar.txt" @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_quoted_escapes/in/foo.patch b/tests/compat/git/path_quoted_escapes/in/foo.patch new file mode 100644 index 00000000..26692238 --- /dev/null +++ b/tests/compat/git/path_quoted_escapes/in/foo.patch @@ -0,0 +1,7 @@ +diff --git "a/foo\tbar.txt" "b/foo\tbar.txt" +index 3367afd..3e75765 100644 +--- "a/foo\tbar.txt" ++++ "b/foo\tbar.txt" +@@ -1 +1 @@ +-old ++new diff --git "a/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" "b/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" new file mode 100644 index 00000000..3e757656 --- /dev/null +++ "b/tests/compat/git/path_quoted_escapes/out/foo\tbar.txt" @@ -0,0 +1 @@ +new diff --git a/tests/compat/git/path_quoted_named_escape/in/foo.patch b/tests/compat/git/path_quoted_named_escape/in/foo.patch new file mode 100644 index 00000000..cfd3f562 --- /dev/null +++ b/tests/compat/git/path_quoted_named_escape/in/foo.patch @@ -0,0 +1,6 @@ +diff --git "a/bel\a" "b/bel\a" +new file mode 100644 +--- /dev/null ++++ "b/bel\a" +@@ -0,0 +1 @@ ++hello diff --git "a/tests/compat/git/path_quoted_named_escape/out/bel\a" "b/tests/compat/git/path_quoted_named_escape/out/bel\a" new file mode 100644 index 00000000..ce013625 --- /dev/null +++ "b/tests/compat/git/path_quoted_named_escape/out/bel\a" @@ -0,0 +1 @@ +hello diff --git a/tests/compat/git/path_quoted_octal_escape/in/foo.patch b/tests/compat/git/path_quoted_octal_escape/in/foo.patch new file mode 100644 index 00000000..5dda9ec6 --- /dev/null +++ b/tests/compat/git/path_quoted_octal_escape/in/foo.patch @@ -0,0 +1,6 @@ +diff --git "a/tl\033" "b/tl\033" +new file mode 100644 +--- /dev/null ++++ "b/tl\033" +@@ -0,0 +1 @@ ++hello diff --git "a/tests/compat/git/path_quoted_octal_escape/out/tl\033" "b/tests/compat/git/path_quoted_octal_escape/out/tl\033" new file mode 100644 index 00000000..ce013625 --- /dev/null +++ "b/tests/compat/git/path_quoted_octal_escape/out/tl\033" @@ -0,0 +1 @@ +hello diff --git a/tests/compat/git/path_with_spaces/in/foo bar.txt b/tests/compat/git/path_with_spaces/in/foo bar.txt new file mode 100644 index 00000000..3367afdb --- /dev/null +++ b/tests/compat/git/path_with_spaces/in/foo bar.txt @@ -0,0 +1 @@ +old diff --git a/tests/compat/git/path_with_spaces/in/foo.patch b/tests/compat/git/path_with_spaces/in/foo.patch new file mode 100644 index 00000000..b3d1d463 --- /dev/null +++ b/tests/compat/git/path_with_spaces/in/foo.patch @@ -0,0 +1,7 @@ +diff --git a/foo bar.txt b/foo bar.txt +index 3367afd..3e75765 100644 +--- a/foo bar.txt ++++ b/foo bar.txt +@@ -1 +1 @@ +-old ++new diff --git a/tests/compat/git/path_with_spaces/out/foo bar.txt b/tests/compat/git/path_with_spaces/out/foo bar.txt new file mode 100644 index 00000000..3e757656 --- /dev/null +++ b/tests/compat/git/path_with_spaces/out/foo bar.txt @@ -0,0 +1 @@ +new diff --git a/tests/compat/main.rs b/tests/compat/main.rs index 8faf9eac..e35ed079 100644 --- a/tests/compat/main.rs +++ b/tests/compat/main.rs @@ -39,9 +39,11 @@ //! //! 1. Create `case_name/in/` with input file(s) and `foo.patch` //! 2. Run `SNAPSHOTS=overwrite cargo test --test compat` to generate `out/` -//! 3. Add `#[test] fn case_name() { Case::gnu_patch(...).run(); }` in the module +//! 3. Add `#[test] fn case_name() { Case::{gnu_patch,git}(...).run(); }` in the module //! //! For failure tests, use `.expect_success(false)` and skip step 2. +//! For intentional compat divergence, use `.expect_compat(false)`. mod common; +mod git; mod gnu_patch; diff --git a/tests/replay.rs b/tests/replay.rs index 7eaa56ba..1d4e731d 100644 --- a/tests/replay.rs +++ b/tests/replay.rs @@ -18,8 +18,7 @@ //! * A range (e.g., `abc123..def456`) for a specific commit range //! //! Defaults to 200. Use `0` to verify entire history. -//! * `DIFFY_TEST_PARSE_MODE`: Parse mode to use. -//! Currently only `unidiff` is supported. +//! * `DIFFY_TEST_PARSE_MODE`: Parse mode to use (`unidiff` or `gitdiff`). //! Defaults to `unidiff`. //! //! ## Requirements @@ -161,12 +160,14 @@ impl CatFile { #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum TestMode { UniDiff, + GitDiff, } impl From for ParseOptions { fn from(value: TestMode) -> Self { match value { TestMode::UniDiff => ParseOptions::unidiff(), + TestMode::GitDiff => ParseOptions::gitdiff(), } } } @@ -228,7 +229,8 @@ fn test_mode() -> TestMode { }; match val.trim().to_lowercase().as_str() { "unidiff" => TestMode::UniDiff, - _ => panic!("invalid DIFFY_TEST_PARSE_MODE='{val}': expected 'unidiff'"), + "gitdiff" => TestMode::GitDiff, + _ => panic!("invalid DIFFY_TEST_PARSE_MODE='{val}': expected 'unidiff' or 'gitdiff'"), } } @@ -323,8 +325,13 @@ fn process_commit( // UniDiff format cannot express pure renames (no ---/+++ headers). // Use `--no-renames` to represent them as delete + create instead. + // GitDiff mode handles renames via extended headers natively. let diff_output = match mode { TestMode::UniDiff => git(repo, &["diff", "--no-renames", parent, child]), + // TODO: pass `--binary` once binary patch support lands, + // so binary files get actual delta/literal data instead of + // "Binary files differ" markers. + TestMode::GitDiff => git(repo, &["diff", parent, child]), }; if diff_output.is_empty() { @@ -373,6 +380,30 @@ fn process_commit( } text_files + type_changes } + TestMode::GitDiff => { + // Can't use `--numstat` for GitDiff: it shows `-\t-\t` for both + // actual binary diffs AND pure binary renames (100% similarity). + // Parser correctly handles pure renames (rename headers, no binary content). + // + // Use `--raw` for total count, subtract actual binary markers from diff. + // + // TODO: once `--binary` is passed above, count ALL `--raw` + // entries — every file will have patch data (delta, literal, or text). + let raw = git(repo, &["diff", "--raw", parent, child]); + let (mut total, mut type_changes) = (0, 0); + for line in raw.lines().filter(|l| !l.is_empty()) { + total += 1; + if is_type_change(line) { + type_changes += 1; + } + } + let binary = diff_output + .lines() + .filter(|l| l.starts_with("Binary files ") || l.starts_with("GIT binary patch")) + .count(); + skipped += binary; + total - binary + type_changes + } }; if expected_file_count == 0 { @@ -518,6 +549,7 @@ fn replay() { .unwrap_or_else(|| ".".to_string()); let mode_name = match mode { TestMode::UniDiff => "unidiff", + TestMode::GitDiff => "gitdiff", }; // Shared state for progress reporting