From f12cda9a4f278afe1a131f6db459e5a4481951fa Mon Sep 17 00:00:00 2001 From: Joost Jansen <2032793+joost-j@users.noreply.github.com> Date: Tue, 12 Aug 2025 16:11:16 +0200 Subject: [PATCH 1/4] Patch copyfileobj function --- acquire/outputs/tar.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_outputs_tar.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py index 69b72906..fe0b0877 100644 --- a/acquire/outputs/tar.py +++ b/acquire/outputs/tar.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import shutil import tarfile from typing import TYPE_CHECKING, BinaryIO @@ -15,6 +16,44 @@ TAR_COMPRESSION_METHODS = {"gzip": "gz", "bzip2": "bz2", "xz": "xz"} +def copyfileobj( + src: BinaryIO, dst: BinaryIO, length: int | None = None, _: Exception = OSError, bufsize: int | None = None +) -> None: + """ + Patched version of the copyfileobj function from the Python stdlib (tarfile.py), + to handle cases where the source file is actually shorter than expected. + By patching the missing bytes with zeroes, we avoid raising an exception + and potentially corrupting the tar file. + """ + bufsize = bufsize or 16 * 1024 + if length == 0: + return + if length is None: + shutil.copyfileobj(src, dst, bufsize) + return + + blocks, remainder = divmod(length, bufsize) + for _ in range(blocks): + buf = src.read(bufsize) + if len(buf) < bufsize: + # raise exception("unexpected end of data") + # PATCH; instead of raising an exception, pad the data to the desired length + buf += b"\x00" * (bufsize - len(buf)) + dst.write(buf) + + if remainder != 0: + buf = src.read(remainder) + if len(buf) < remainder: + # raise exception("unexpected end of data") + # PATCH; instead of raising an exception, pad the data to the desired length + buf += b"\x00" * (remainder - len(buf)) + dst.write(buf) + return + + +tarfile.copyfileobj = copyfileobj + + class TarOutput(Output): """Tar archive acquire output format. Output can be compressed and/or encrypted. diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py index 81059bb2..fbba5eb3 100644 --- a/tests/test_outputs_tar.py +++ b/tests/test_outputs_tar.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io import tarfile from pathlib import Path from typing import TYPE_CHECKING @@ -63,3 +64,40 @@ def test_tar_output_encrypt(mock_fs: VirtualFilesystem, public_key: bytes, tmp_p with tarfile.open(name=decrypted_path, mode="r") as tar_file: assert entry.open().read() == tar_file.extractfile(entry_name).read() + + +def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None: + class ShrinkingFile(io.BytesIO): + """ + A file-like object that returns 5 bytes less than required. + Simulates a file on disk that has shrunk in between the time of + determining the size and actually reading the data. + """ + + def __init__(self, data: bytes): + super().__init__(data) + + def read(self, size: int) -> bytes: + return super().read(size - 5) + + content = b"some text" + + content_padded = content[:-5] + b"\x00" * 5 + file = ShrinkingFile(content) + + tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key) + tar_output.write("file.log", file) + tar_output.close() + file.close() + + encrypted_stream = EncryptedFile(tar_output.path.open("rb"), Path("tests/_data/private_key.pem")) + decrypted_path = tmp_path / "decrypted.tar" + + # Direct streaming is not an option because tarfile needs seek when reading from encrypted files directly + Path(decrypted_path).write_bytes(encrypted_stream.read()) + + with tarfile.open(name=decrypted_path, mode="r") as tar_file: + member = tar_file.getmember("file.log") + extracted = tar_file.extractfile(member).read() + # The content should be padded with zeros to match the original size, despite the fact that the file shrunk + assert extracted == content_padded From 12d8ae743d1ecdc1149bc90c99aa3b76ceb91757 Mon Sep 17 00:00:00 2001 From: Joost Jansen <2032793+joost-j@users.noreply.github.com> Date: Tue, 12 Aug 2025 16:37:13 +0200 Subject: [PATCH 2/4] Applied changes according to feedback --- acquire/outputs/tar.py | 9 +++++---- tests/test_outputs_tar.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py index fe0b0877..2cf8c679 100644 --- a/acquire/outputs/tar.py +++ b/acquire/outputs/tar.py @@ -4,6 +4,7 @@ import shutil import tarfile from typing import TYPE_CHECKING, BinaryIO +from unittest.mock import patch from acquire.crypt import EncryptedStream from acquire.outputs.base import Output @@ -34,6 +35,7 @@ def copyfileobj( blocks, remainder = divmod(length, bufsize) for _ in range(blocks): + # Already prevents "long reads" because it reads at max bufsize bytes at a time buf = src.read(bufsize) if len(buf) < bufsize: # raise exception("unexpected end of data") @@ -42,6 +44,7 @@ def copyfileobj( dst.write(buf) if remainder != 0: + # Already prevents "long reads" because it reads at max bufsize bytes at a time buf = src.read(remainder) if len(buf) < remainder: # raise exception("unexpected end of data") @@ -51,9 +54,6 @@ def copyfileobj( return -tarfile.copyfileobj = copyfileobj - - class TarOutput(Output): """Tar archive acquire output format. Output can be compressed and/or encrypted. @@ -139,7 +139,8 @@ def write( if stat: info.mtime = stat.st_mtime - self.tar.addfile(info, fh) + with patch("tarfile.copyfileobj", copyfileobj): + self.tar.addfile(info, fh) def close(self) -> None: """Closes the tar file.""" diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py index fbba5eb3..c1a0c873 100644 --- a/tests/test_outputs_tar.py +++ b/tests/test_outputs_tar.py @@ -66,7 +66,7 @@ def test_tar_output_encrypt(mock_fs: VirtualFilesystem, public_key: bytes, tmp_p assert entry.open().read() == tar_file.extractfile(entry_name).read() -def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None: +def test_tar_output_race_condition_with_shrinking_file(tmp_path: Path, public_key: bytes) -> None: class ShrinkingFile(io.BytesIO): """ A file-like object that returns 5 bytes less than required. @@ -101,3 +101,40 @@ def read(self, size: int) -> bytes: extracted = tar_file.extractfile(member).read() # The content should be padded with zeros to match the original size, despite the fact that the file shrunk assert extracted == content_padded + + +def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None: + class GrowingFile(io.BytesIO): + """ + A file-like object that returns 3 extra bytes. + Simulates a file on disk that has grown in between the time of + determining the size and actually reading the data. + """ + + def __init__(self, data: bytes): + super().__init__(data) + + def read(self, size: int) -> bytes: + return super().read(size) + b"FOX" + + content = b"some text" + + file = GrowingFile(content) + + tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key) + tar_output.write("file.log", file) + tar_output.close() + file.close() + + encrypted_stream = EncryptedFile(tar_output.path.open("rb"), Path("tests/_data/private_key.pem")) + decrypted_path = tmp_path / "decrypted.tar" + + # Direct streaming is not an option because tarfile needs seek when reading from encrypted files directly + Path(decrypted_path).write_bytes(encrypted_stream.read()) + + with tarfile.open(name=decrypted_path, mode="r") as tar_file: + member = tar_file.getmember("file.log") + extracted = tar_file.extractfile(member).read() + # The content should match the original content, without the extra bytes + # because the file was read with the original size + assert extracted == content From 5b22f2cb2f12ce463a99e73d13afa380fafc4922 Mon Sep 17 00:00:00 2001 From: Joost Jansen <2032793+joost-j@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:10:59 +0200 Subject: [PATCH 3/4] Also inline 'addfile' function of tarfile --- acquire/outputs/tar.py | 89 +++++++++++++++++++++------------------ tests/test_outputs_tar.py | 2 +- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py index 2cf8c679..8dd6eccb 100644 --- a/acquire/outputs/tar.py +++ b/acquire/outputs/tar.py @@ -1,10 +1,10 @@ from __future__ import annotations +import copy import io import shutil import tarfile from typing import TYPE_CHECKING, BinaryIO -from unittest.mock import patch from acquire.crypt import EncryptedStream from acquire.outputs.base import Output @@ -17,43 +17,6 @@ TAR_COMPRESSION_METHODS = {"gzip": "gz", "bzip2": "bz2", "xz": "xz"} -def copyfileobj( - src: BinaryIO, dst: BinaryIO, length: int | None = None, _: Exception = OSError, bufsize: int | None = None -) -> None: - """ - Patched version of the copyfileobj function from the Python stdlib (tarfile.py), - to handle cases where the source file is actually shorter than expected. - By patching the missing bytes with zeroes, we avoid raising an exception - and potentially corrupting the tar file. - """ - bufsize = bufsize or 16 * 1024 - if length == 0: - return - if length is None: - shutil.copyfileobj(src, dst, bufsize) - return - - blocks, remainder = divmod(length, bufsize) - for _ in range(blocks): - # Already prevents "long reads" because it reads at max bufsize bytes at a time - buf = src.read(bufsize) - if len(buf) < bufsize: - # raise exception("unexpected end of data") - # PATCH; instead of raising an exception, pad the data to the desired length - buf += b"\x00" * (bufsize - len(buf)) - dst.write(buf) - - if remainder != 0: - # Already prevents "long reads" because it reads at max bufsize bytes at a time - buf = src.read(remainder) - if len(buf) < remainder: - # raise exception("unexpected end of data") - # PATCH; instead of raising an exception, pad the data to the desired length - buf += b"\x00" * (remainder - len(buf)) - dst.write(buf) - return - - class TarOutput(Output): """Tar archive acquire output format. Output can be compressed and/or encrypted. @@ -139,8 +102,54 @@ def write( if stat: info.mtime = stat.st_mtime - with patch("tarfile.copyfileobj", copyfileobj): - self.tar.addfile(info, fh) + # Inline version of Python stdlib's tarfile.addfile & tarfile.copyfileobj, + # to allow for padding and more control over the tar file writing. + self.tar._check("awx") + + if fh is None and info.isreg() and info.size != 0: + raise ValueError("fileobj not provided for non zero-size regular file") + + info = copy.copy(info) + + buf = info.tobuf(self.tar.format, self.tar.encoding, self.tar.errors) + self.tar.fileobj.write(buf) + self.tar.offset += len(buf) + bufsize = self.tar.copybufsize + if fh is not None: + bufsize = bufsize or 16 * 1024 + + if info.size == 0: + return + if info.size is None: + shutil.copyfileobj(fh, self.tar.fileobj, bufsize) + return + + blocks, remainder = divmod(info.size, bufsize) + for _ in range(blocks): + # Prevents "long reads" because it reads at max bufsize bytes at a time + buf = fh.read(bufsize) + if len(buf) < bufsize: + # raise exception("unexpected end of data") + # PATCH; instead of raising an exception, pad the data to the desired length + buf += tarfile.NUL * (bufsize - len(buf)) + self.tar.fileobj.write(buf) + + if remainder != 0: + # Prevents "long reads" because it reads at max bufsize bytes at a time + buf = fh.read(remainder) + if len(buf) < remainder: + # raise exception("unexpected end of data") + # PATCH; instead of raising an exception, pad the data to the desired length + buf += tarfile.NUL * (remainder - len(buf)) + self.tar.fileobj.write(buf) + + blocks, remainder = divmod(info.size, tarfile.BLOCKSIZE) + if remainder > 0: + self.tar.fileobj.write(tarfile.NUL * (tarfile.BLOCKSIZE - remainder)) + blocks += 1 + self.tar.offset += blocks * tarfile.BLOCKSIZE + + self.tar.members.append(info) def close(self) -> None: """Closes the tar file.""" diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py index c1a0c873..49d00f86 100644 --- a/tests/test_outputs_tar.py +++ b/tests/test_outputs_tar.py @@ -82,7 +82,7 @@ def read(self, size: int) -> bytes: content = b"some text" - content_padded = content[:-5] + b"\x00" * 5 + content_padded = content[:-5] + tarfile.NUL * 5 file = ShrinkingFile(content) tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key) From 00a211c942f288e25a902d5e28633a0ffebe2acb Mon Sep 17 00:00:00 2001 From: Joost Jansen <12032793+joost-j@users.noreply.github.com> Date: Thu, 21 Aug 2025 14:44:35 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- acquire/outputs/tar.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py index 8dd6eccb..73237d25 100644 --- a/acquire/outputs/tar.py +++ b/acquire/outputs/tar.py @@ -129,7 +129,6 @@ def write( # Prevents "long reads" because it reads at max bufsize bytes at a time buf = fh.read(bufsize) if len(buf) < bufsize: - # raise exception("unexpected end of data") # PATCH; instead of raising an exception, pad the data to the desired length buf += tarfile.NUL * (bufsize - len(buf)) self.tar.fileobj.write(buf) @@ -138,7 +137,6 @@ def write( # Prevents "long reads" because it reads at max bufsize bytes at a time buf = fh.read(remainder) if len(buf) < remainder: - # raise exception("unexpected end of data") # PATCH; instead of raising an exception, pad the data to the desired length buf += tarfile.NUL * (remainder - len(buf)) self.tar.fileobj.write(buf)