Skip to content

Fast path for FileRegressionFixture.check when contents already match #240

@adamtheturtle

Description

@adamtheturtle

Summary

On the pass path (the golden file exists and matches the obtained contents),
FileRegressionFixture.check still:

  1. Writes an .obtained file to disk via dump_fn.
  2. Reads both the .obtained file and the expected file back from disk.
  3. Splits both into lines and compares them with ==.
  4. Sets up difflib.unified_diff / difflib.HtmlDiff machinery that is
    only needed on a mismatch.

For suites with thousands of fast, text-only regression checks, the disk
round-trip dominates wall-clock time. Short-circuiting the match case by
encoding contents in memory and comparing the result byte-for-byte against
the expected file — falling back to the current code path on mismatch or
when --force-regen / --regen-all is set — preserves the existing
error output while removing the .obtained write from the hot path.

Reproducer

test_bench.py:

"""Benchmark ``FileRegressionFixture.check`` pass path vs in-memory
fast paths (splitlines equality and byte-exact), both for the match
and the miss case.
"""

import locale
import os
import time
from pathlib import Path

import pytest
from pytest_regressions.file_regression import FileRegressionFixture


# Number of timed calls in each match-case benchmark.
ITERATIONS = 1000
# Number of lines in the synthetic golden text.
LINES = 200
# Golden text: LINES lines, each one (including the last) terminated by "\n".
CONTENTS = "".join(
    [f"line {index} with some payload text\n" for index in range(LINES)]
)


@pytest.fixture(name="golden")
def fixture_golden(tmp_path: Path) -> Path:
    """Create the golden file that every benchmark compares against.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        Path to a ``golden.txt`` file holding ``CONTENTS``.
    """
    path = tmp_path / "golden.txt"
    # Every benchmark in this module calls ``check`` with ``newline=""``
    # (no newline translation). Write the golden the same way; with the
    # default ``newline=None`` each "\n" would become ``os.linesep`` on
    # disk, so on Windows the file could never be byte-identical to the
    # obtained contents and the byte-exact "match" benchmark would silently
    # measure the miss path instead.
    path.write_text(CONTENTS, newline="")
    return path


def _fast_path_splitlines(
    *,
    file_regression: FileRegressionFixture,
    contents: str,
    golden_path: Path,
    extension: str,
    newline: str | None,
) -> None:
    config = file_regression.request.config
    regen = file_regression.force_regen or bool(
        config.getoption(name="regen_all")
        or config.getoption(name="force_regen"),
    )
    if (
        not regen
        and golden_path.is_file()
        and contents.splitlines() == golden_path.read_text().splitlines()
    ):
        return
    file_regression.check(
        contents=contents,
        extension=extension,
        newline=newline,
        fullpath=golden_path,
    )


def _fast_path_byte_exact(
    *,
    file_regression: FileRegressionFixture,
    contents: str,
    golden_path: Path,
    extension: str,
    encoding: str | None,
    newline: str | None,
) -> None:
    config = file_regression.request.config
    regen = file_regression.force_regen or bool(
        config.getoption(name="regen_all")
        or config.getoption(name="force_regen"),
    )
    if not regen and golden_path.is_file():
        expected = golden_path.read_bytes()
        encoded = contents.encode(encoding or "utf-8")
        if newline not in (None, ""):
            encoded = encoded.replace(b"\n", newline.encode(encoding or "utf-8"))
        if expected == encoded:
            return
    file_regression.check(
        contents=contents,
        extension=extension,
        encoding=encoding,
        newline=newline,
        fullpath=golden_path,
    )


def test_upstream(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the unmodified ``file_regression.check`` on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        file_regression.check(
            contents=CONTENTS,
            extension=".txt",
            newline="",
            fullpath=golden,
        )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / ITERATIONS * 1e6
    print(
        f"\nupstream file_regression.check: {ITERATIONS} calls "
        f"in {duration:.3f}s ({per_call_us:.1f} us/call)"
    )


def test_fast_path_splitlines(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the splitlines-equality fast path on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        _fast_path_splitlines(
            file_regression=file_regression,
            contents=CONTENTS,
            golden_path=golden,
            extension=".txt",
            newline="",
        )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / ITERATIONS * 1e6
    print(
        f"\nfast path (splitlines equality): {ITERATIONS} calls "
        f"in {duration:.3f}s ({per_call_us:.1f} us/call)"
    )


def test_fast_path_byte_exact(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the byte-exact fast path on matching contents."""
    began = time.perf_counter()
    for _ in range(ITERATIONS):
        _fast_path_byte_exact(
            file_regression=file_regression,
            contents=CONTENTS,
            golden_path=golden,
            extension=".txt",
            encoding=None,
            newline="",
        )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / ITERATIONS * 1e6
    print(
        f"\nfast path (byte-exact, no .obtained write): {ITERATIONS} calls "
        f"in {duration:.3f}s ({per_call_us:.1f} us/call)"
    )


# Number of timed calls in each miss-case benchmark.
MISS_ITERATIONS = 200
# Obtained text for the miss case: the golden text plus one trailing line.
MISS_CONTENTS = f"{CONTENTS}extra line\n"


def test_miss_upstream(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the unmodified ``file_regression.check`` on mismatching contents.

    Every call is expected to raise ``AssertionError``.
    """
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        with pytest.raises(AssertionError):
            file_regression.check(
                contents=MISS_CONTENTS,
                extension=".txt",
                newline="",
                fullpath=golden,
            )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / MISS_ITERATIONS * 1e6
    print(
        f"\nmiss upstream file_regression.check: {MISS_ITERATIONS} calls "
        f"in {duration:.3f}s ({per_call_us:.1f} us/call)"
    )


def test_miss_fast_path_splitlines(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the splitlines fast path when it must fall through to ``check``.

    Every call is expected to raise ``AssertionError``.
    """
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        with pytest.raises(AssertionError):
            _fast_path_splitlines(
                file_regression=file_regression,
                contents=MISS_CONTENTS,
                golden_path=golden,
                extension=".txt",
                newline="",
            )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / MISS_ITERATIONS * 1e6
    print(
        f"\nmiss fast path (splitlines equality, then fall through): "
        f"{MISS_ITERATIONS} calls in {duration:.3f}s "
        f"({per_call_us:.1f} us/call)"
    )


def test_miss_fast_path_byte_exact(
    file_regression: FileRegressionFixture,
    golden: Path,
) -> None:
    """Time the byte-exact fast path when it must fall through to ``check``.

    Every call is expected to raise ``AssertionError``.
    """
    began = time.perf_counter()
    for _ in range(MISS_ITERATIONS):
        with pytest.raises(AssertionError):
            _fast_path_byte_exact(
                file_regression=file_regression,
                contents=MISS_CONTENTS,
                golden_path=golden,
                extension=".txt",
                encoding=None,
                newline="",
            )
    duration = time.perf_counter() - began
    # Average cost of one call, in microseconds.
    per_call_us = duration / MISS_ITERATIONS * 1e6
    print(
        f"\nmiss fast path (byte-exact, then fall through): "
        f"{MISS_ITERATIONS} calls in {duration:.3f}s "
        f"({per_call_us:.1f} us/call)"
    )

Run with:

pytest test_bench.py -s

Timings

  • pytest-regressions 2.10.0
  • Python 3.13.9
  • macOS (Darwin 25.4.0, Apple Silicon), SSD
  • 200-line text golden, 1000 iterations per run, 3 runs

Match case (1000 iterations)

| Run | Upstream check | Byte-exact fast path | Speedup |
| --- | --- | --- | --- |
| 1 | 86.8 us/call | 11.9 us/call | 7.3x |
| 2 | 99.0 us/call | 11.9 us/call | 8.3x |
| 3 | 93.9 us/call | 12.9 us/call | 7.3x |

In a downstream suite with ~13k file_regression.check calls per run,
the fast path removed several seconds of wall-clock time end to end.

Miss case (200 iterations)

When contents does not match the golden, the fast path falls through
to the existing check code path. The added overhead is one
read_bytes() + one encode() before the fallthrough; the ~800 us of
.obtained write and difflib.HtmlDiff rendering dominate either way.

| Run | Miss upstream | Miss byte-exact fast path | Overhead |
| --- | --- | --- | --- |
| 1 | 882.4 us/call | 910.1 us/call | +28 us |
| 2 | 816.9 us/call | 861.1 us/call | +44 us |
| 3 | 864.4 us/call | 841.2 us/call | −23 us |

The measured difference (−23 us to +44 us, roughly −3% to +5% of a miss)
is within run-to-run noise, and it applies only on a branch that runs for
failing tests.

Proposal

Add an in-memory byte-exact short-circuit to
FileRegressionFixture.check for the non-binary, default-check_fn
path. Encode contents using the same encoding and newline
semantics that dump_fn would apply, compare the result against
expected_filename.read_bytes(), and return without calling dump_fn
or check_fn on a match. On mismatch or when --force-regen /
--regen-all is set, fall through to the current code path so the
.obtained file and HTML diff are produced exactly as today.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions