Summary
On the pass path (the golden file exists and matches the obtained contents),
`FileRegressionFixture.check` still:
- Writes an `.obtained` file to disk via `dump_fn`.
- Reads both the `.obtained` file and the expected file back from disk.
- Splits both into lines and compares them with `==`.
- Sets up `difflib.unified_diff` / `difflib.HtmlDiff` machinery that is
  only needed on a mismatch.
For suites with thousands of fast, text-only regression checks the disk
round-trip dominates wall-clock time. Short-circuiting the match case by
encoding `contents` in memory and comparing it byte-exact against the
expected file — falling back to the current code path on mismatch or
when `--force-regen` / `--regen-all` is set — preserves the existing
error output while removing the `.obtained` write on the hot path.
Reproducer
test_bench.py:
"""Benchmark ``FileRegressionFixture.check`` pass path vs in-memory
fast paths (splitlines equality and byte-exact), both for the match
and the miss case.
"""
import locale
import os
import time
from pathlib import Path

import pytest
from pytest_regressions.file_regression import FileRegressionFixture
# Number of timed calls made by each match-case benchmark.
ITERATIONS = 1000
# Number of lines in the synthetic golden text.
LINES = 200
# Golden text: LINES payload lines joined by "\n", plus one trailing newline.
CONTENTS = (
    "\n".join([f"line {index} with some payload text" for index in range(LINES)])
    + "\n"
)
@pytest.fixture(name="golden")
def fixture_golden(tmp_path: Path) -> Path:
    """Create the golden file holding ``CONTENTS`` and return its path.

    The file is written with ``newline=""`` so that no ``"\\n"`` ->
    ``os.linesep`` translation happens. Every benchmark in this module calls
    ``check`` with ``newline=""``, so the bytes on disk must contain bare
    ``"\\n"`` for the byte-exact fast path to be able to match on platforms
    whose line separator is not ``"\\n"`` (e.g. Windows).

    Args:
        tmp_path: Per-test temporary directory supplied by pytest.

    Returns:
        Path of the freshly written ``golden.txt``.
    """
    path = tmp_path / "golden.txt"
    # The encoding is left at the platform default on purpose: the benchmarks
    # pass ``encoding=None`` (or nothing) to ``check``.
    path.write_text(CONTENTS, newline="")
    return path
def _fast_path_splitlines(
    *,
    file_regression: FileRegressionFixture,
    contents: str,
    golden_path: Path,
    extension: str,
    newline: str | None,
) -> None:
    """Run a regression check, returning early when the lines already match.

    When regeneration is not requested and ``golden_path`` is an existing
    file whose ``splitlines()`` equal those of ``contents``, nothing else is
    done. In every other case the call is forwarded to
    ``file_regression.check``.

    Args:
        file_regression: Fixture whose ``check`` is used as the fallback.
        contents: Text obtained by the test.
        golden_path: Expected ("golden") file, passed on as ``fullpath``.
        extension: Forwarded to ``check``.
        newline: Forwarded to ``check``.
    """
    options = file_regression.request.config
    regenerating = file_regression.force_regen or bool(
        options.getoption(name="regen_all")
        or options.getoption(name="force_regen"),
    )

    if not regenerating and golden_path.is_file():
        obtained_lines = contents.splitlines()
        expected_lines = golden_path.read_text().splitlines()
        if obtained_lines == expected_lines:
            return

    file_regression.check(
        contents=contents,
        extension=extension,
        newline=newline,
        fullpath=golden_path,
    )
def _fast_path_byte_exact(
    *,
    file_regression: FileRegressionFixture,
    contents: str,
    golden_path: Path,
    extension: str,
    encoding: str | None,
    newline: str | None,
) -> None:
    """Run a regression check, returning early on a byte-exact match.

    ``contents`` is turned into the bytes that a text-mode
    ``open(..., "w", encoding=encoding, newline=newline)`` would write and
    compared against the raw bytes of ``golden_path``. On a match the
    function returns without touching the disk again. On a mismatch, a
    missing golden file, or when regeneration is requested, the call is
    forwarded to ``file_regression.check``.

    NOTE(review): this assumes ``check`` dumps text through a text-mode
    ``open`` with these ``encoding``/``newline`` arguments — confirm against
    the installed pytest-regressions version.

    Args:
        file_regression: Fixture whose ``check`` is used as the fallback.
        contents: Text obtained by the test.
        golden_path: Expected ("golden") file, passed on as ``fullpath``.
        extension: Forwarded to ``check``.
        encoding: Text encoding; ``None`` means the locale's preferred
            encoding, as for text-mode ``open``.
        newline: Newline mode; ``None`` translates ``"\\n"`` to
            ``os.linesep``, ``""`` and ``"\\n"`` write text untouched, any
            other value replaces ``"\\n"``.
    """
    config = file_regression.request.config
    regen = file_regression.force_regen or bool(
        config.getoption(name="regen_all")
        or config.getoption(name="force_regen"),
    )
    if not regen and golden_path.is_file():
        # Translate newlines on the *text*, before encoding: replacing
        # b"\n" inside already-encoded bytes corrupts multi-byte encodings
        # such as UTF-16/32 (and would splice a BOM in at every newline).
        line_ending = os.linesep if newline is None else newline
        text = contents
        if line_ending not in ("", "\n"):
            text = contents.replace("\n", line_ending)
        # ``encoding=None`` follows the locale default rather than a
        # hard-coded UTF-8, so the fast path cannot accept a golden file
        # that the regular text comparison would decode differently.
        encoded = text.encode(encoding or locale.getpreferredencoding(False))
        if golden_path.read_bytes() == encoded:
            return
    file_regression.check(
        contents=contents,
        extension=extension,
        encoding=encoding,
        newline=newline,
        fullpath=golden_path,
    )
def test_upstream(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``ITERATIONS`` plain ``file_regression.check`` calls on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        file_regression.check(
            contents=CONTENTS,
            extension=".txt",
            newline="",
            fullpath=golden,
        )
    elapsed = time.perf_counter() - began

    report = (
        f"\nupstream file_regression.check: "
        f"{ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
def test_fast_path_splitlines(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``ITERATIONS`` calls of the splitlines-equality fast path on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        _fast_path_splitlines(
            file_regression=file_regression,
            contents=CONTENTS,
            golden_path=golden,
            extension=".txt",
            newline="",
        )
    elapsed = time.perf_counter() - began

    report = (
        f"\nfast path (splitlines equality): "
        f"{ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
def test_fast_path_byte_exact(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``ITERATIONS`` calls of the byte-exact fast path on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        _fast_path_byte_exact(
            file_regression=file_regression,
            contents=CONTENTS,
            golden_path=golden,
            extension=".txt",
            encoding=None,
            newline="",
        )
    elapsed = time.perf_counter() - began

    report = (
        f"\nfast path (byte-exact, no .obtained write): "
        f"{ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
# Number of timed calls made by each miss-case benchmark.
MISS_ITERATIONS = 200
# CONTENTS plus one extra trailing line, so it differs from the golden file.
MISS_CONTENTS = CONTENTS + "extra line\n"
def test_miss_upstream(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``MISS_ITERATIONS`` plain ``check`` calls that each fail with ``AssertionError``."""
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        # Every call is expected to fail: MISS_CONTENTS differs from the golden.
        with pytest.raises(AssertionError):
            file_regression.check(
                contents=MISS_CONTENTS,
                extension=".txt",
                newline="",
                fullpath=golden,
            )
    elapsed = time.perf_counter() - began

    report = (
        f"\nmiss upstream file_regression.check: "
        f"{MISS_ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / MISS_ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
def test_miss_fast_path_splitlines(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``MISS_ITERATIONS`` splitlines fast-path calls that fall through and fail."""
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        # Every call is expected to fail: MISS_CONTENTS differs from the golden.
        with pytest.raises(AssertionError):
            _fast_path_splitlines(
                file_regression=file_regression,
                contents=MISS_CONTENTS,
                golden_path=golden,
                extension=".txt",
                newline="",
            )
    elapsed = time.perf_counter() - began

    report = (
        f"\nmiss fast path (splitlines equality, then fall through): "
        f"{MISS_ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / MISS_ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
def test_miss_fast_path_byte_exact(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time ``MISS_ITERATIONS`` byte-exact fast-path calls that fall through and fail."""
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        # Every call is expected to fail: MISS_CONTENTS differs from the golden.
        with pytest.raises(AssertionError):
            _fast_path_byte_exact(
                file_regression=file_regression,
                contents=MISS_CONTENTS,
                golden_path=golden,
                extension=".txt",
                encoding=None,
                newline="",
            )
    elapsed = time.perf_counter() - began

    report = (
        f"\nmiss fast path (byte-exact, then fall through): "
        f"{MISS_ITERATIONS} calls in {elapsed:.3f}s "
        f"({elapsed / MISS_ITERATIONS * 1e6:.1f} us/call)"
    )
    print(report)
Run with (`-s` disables output capture so the timing lines are printed):

    pytest -s test_bench.py
Timings
- pytest-regressions 2.10.0
- Python 3.13.9
- macOS (Darwin 25.4.0, Apple Silicon), SSD
- 200-line text golden, 1000 iterations per run, 3 runs
Match case (1000 iterations)
| Run | Upstream `check` | Byte-exact fast path | Speedup |
| --- | --- | --- | --- |
| 1 | 86.8 us/call | 11.9 us/call | 7.3x |
| 2 | 99.0 us/call | 11.9 us/call | 8.3x |
| 3 | 93.9 us/call | 12.9 us/call | 7.3x |
In a downstream suite with ~13k file_regression.check calls per run,
the fast path removed several seconds of wall-clock time end to end.
Miss case (200 iterations)
When contents does not match the golden, the fast path falls through
to the existing check code path. The added overhead is one
read_bytes() + one encode() before the fallthrough; the ~800 us of
.obtained write and difflib.HtmlDiff rendering dominate either way.
| Run | Miss upstream | Miss byte-exact fast path | Overhead |
| --- | --- | --- | --- |
| 1 | 882.4 us/call | 910.1 us/call | +28 us |
| 2 | 816.9 us/call | 861.1 us/call | +44 us |
| 3 | 864.4 us/call | 841.2 us/call | −23 us |
The overhead is within run-to-run noise (~1–3%) on a branch that only
runs for failing tests.
Proposal
Add an in-memory byte-exact short-circuit to
FileRegressionFixture.check for the non-binary, default-check_fn
path. Encode contents using the same encoding and newline
semantics that dump_fn would apply, compare the result against
expected_filename.read_bytes(), and return without calling dump_fn
or check_fn on a match. On mismatch or when --force-regen /
--regen-all is set, fall through to the current code path so the
.obtained file and HTML diff are produced exactly as today.
Summary
On the pass path (the golden file exists and matches the obtained contents),
`FileRegressionFixture.check` still:
- Writes an `.obtained` file to disk via `dump_fn`.
- Reads both the `.obtained` file and the expected file back from disk.
- Splits both into lines and compares them with `==`.
- Sets up `difflib.unified_diff` / `difflib.HtmlDiff` machinery that is
  only needed on a mismatch.
For suites with thousands of fast, text-only regression checks the disk
round-trip dominates wall-clock time. Short-circuiting the match case by
encoding `contents` in memory and comparing it byte-exact against the
expected file — falling back to the current code path on mismatch or
when `--force-regen` / `--regen-all` is set — preserves the existing
error output while removing the `.obtained` write on the hot path.
Reproducer
`test_bench.py`:
Run with:
Timings
pytest-regressions 2.10.0
Match case (1000 iterations)
[timing table not reproduced in this copy; surviving header fragment: `check`]
In a downstream suite with ~13k `file_regression.check` calls per run,
the fast path removed several seconds of wall-clock time end to end.
Miss case (200 iterations)
When `contents` does not match the golden, the fast path falls through
to the existing `check` code path. The added overhead is one
`read_bytes()` + one `encode()` before the fallthrough; the ~800 us of
`.obtained` write and `difflib.HtmlDiff` rendering dominate either way.
The overhead is within run-to-run noise (~1–3%) on a branch that only
runs for failing tests.
Proposal
Add an in-memory byte-exact short-circuit to
`FileRegressionFixture.check` for the non-binary, default-`check_fn`
path. Encode `contents` using the same `encoding` and `newline`
semantics that `dump_fn` would apply, compare the result against
`expected_filename.read_bytes()`, and return without calling `dump_fn`
or `check_fn` on a match. On mismatch or when `--force-regen` /
`--regen-all` is set, fall through to the current code path so the
`.obtained` file and HTML diff are produced exactly as today.