From f12cda9a4f278afe1a131f6db459e5a4481951fa Mon Sep 17 00:00:00 2001
From: Joost Jansen <2032793+joost-j@users.noreply.github.com>
Date: Tue, 12 Aug 2025 16:11:16 +0200
Subject: [PATCH 1/4] Patch copyfileobj function

---
 acquire/outputs/tar.py    | 39 +++++++++++++++++++++++++++++++++++++++
 tests/test_outputs_tar.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py
index 69b72906..fe0b0877 100644
--- a/acquire/outputs/tar.py
+++ b/acquire/outputs/tar.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import io
+import shutil
 import tarfile
 from typing import TYPE_CHECKING, BinaryIO
 
@@ -15,6 +16,44 @@
 TAR_COMPRESSION_METHODS = {"gzip": "gz", "bzip2": "bz2", "xz": "xz"}
 
 
+def copyfileobj(
+    src: BinaryIO, dst: BinaryIO, length: int | None = None, _: Exception = OSError, bufsize: int | None = None
+) -> None:
+    """
+    Patched version of the copyfileobj function from the Python stdlib (tarfile.py),
+    to handle cases where the source file is actually shorter than expected.
+    By patching the missing bytes with zeroes, we avoid raising an exception
+    and potentially corrupting the tar file.
+    """
+    bufsize = bufsize or 16 * 1024
+    if length == 0:
+        return
+    if length is None:
+        shutil.copyfileobj(src, dst, bufsize)
+        return
+
+    blocks, remainder = divmod(length, bufsize)
+    for _ in range(blocks):
+        buf = src.read(bufsize)
+        if len(buf) < bufsize:
+            # raise exception("unexpected end of data")
+            # PATCH; instead of raising an exception, pad the data to the desired length
+            buf += b"\x00" * (bufsize - len(buf))
+        dst.write(buf)
+
+    if remainder != 0:
+        buf = src.read(remainder)
+        if len(buf) < remainder:
+            # raise exception("unexpected end of data")
+            # PATCH; instead of raising an exception, pad the data to the desired length
+            buf += b"\x00" * (remainder - len(buf))
+        dst.write(buf)
+    return
+
+
+tarfile.copyfileobj = copyfileobj
+
+
 class TarOutput(Output):
     """Tar archive acquire output format. Output can be compressed and/or encrypted.
 
diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py
index 81059bb2..fbba5eb3 100644
--- a/tests/test_outputs_tar.py
+++ b/tests/test_outputs_tar.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import io
 import tarfile
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -63,3 +64,40 @@ def test_tar_output_encrypt(mock_fs: VirtualFilesystem, public_key: bytes, tmp_p
 
     with tarfile.open(name=decrypted_path, mode="r") as tar_file:
         assert entry.open().read() == tar_file.extractfile(entry_name).read()
+
+
+def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None:
+    class ShrinkingFile(io.BytesIO):
+        """
+        A file-like object that returns 5 bytes less than required.
+        Simulates a file on disk that has shrunk in between the time of
+        determining the size and actually reading the data.
+        """
+
+        def __init__(self, data: bytes):
+            super().__init__(data)
+
+        def read(self, size: int) -> bytes:
+            return super().read(size - 5)
+
+    content = b"some text"
+
+    content_padded = content[:-5] + b"\x00" * 5
+    file = ShrinkingFile(content)
+
+    tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key)
+    tar_output.write("file.log", file)
+    tar_output.close()
+    file.close()
+
+    encrypted_stream = EncryptedFile(tar_output.path.open("rb"), Path("tests/_data/private_key.pem"))
+    decrypted_path = tmp_path / "decrypted.tar"
+
+    # Direct streaming is not an option because tarfile needs seek when reading from encrypted files directly
+    Path(decrypted_path).write_bytes(encrypted_stream.read())
+
+    with tarfile.open(name=decrypted_path, mode="r") as tar_file:
+        member = tar_file.getmember("file.log")
+        extracted = tar_file.extractfile(member).read()
+        # The content should be padded with zeros to match the original size, despite the fact that the file shrunk
+        assert extracted == content_padded

From 12d8ae743d1ecdc1149bc90c99aa3b76ceb91757 Mon Sep 17 00:00:00 2001
From: Joost Jansen <2032793+joost-j@users.noreply.github.com>
Date: Tue, 12 Aug 2025 16:37:13 +0200
Subject: [PATCH 2/4] Applied changes according to feedback

---
 acquire/outputs/tar.py    |  9 +++++----
 tests/test_outputs_tar.py | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py
index fe0b0877..2cf8c679 100644
--- a/acquire/outputs/tar.py
+++ b/acquire/outputs/tar.py
@@ -4,6 +4,7 @@
 import shutil
 import tarfile
 from typing import TYPE_CHECKING, BinaryIO
+from unittest.mock import patch
 
 from acquire.crypt import EncryptedStream
 from acquire.outputs.base import Output
@@ -34,6 +35,7 @@ def copyfileobj(
 
     blocks, remainder = divmod(length, bufsize)
     for _ in range(blocks):
+        # Already prevents "long reads" because it reads at max bufsize bytes at a time
         buf = src.read(bufsize)
         if len(buf) < bufsize:
             # raise exception("unexpected end of data")
@@ -42,6 +44,7 @@ def copyfileobj(
         dst.write(buf)
 
     if remainder != 0:
+        # Already prevents "long reads" because it reads at max bufsize bytes at a time
         buf = src.read(remainder)
         if len(buf) < remainder:
             # raise exception("unexpected end of data")
@@ -51,9 +54,6 @@ def copyfileobj(
     return
 
 
-tarfile.copyfileobj = copyfileobj
-
-
 class TarOutput(Output):
     """Tar archive acquire output format. Output can be compressed and/or encrypted.
 
@@ -139,7 +139,8 @@ def write(
             if stat:
                 info.mtime = stat.st_mtime
 
-        self.tar.addfile(info, fh)
+        with patch("tarfile.copyfileobj", copyfileobj):
+            self.tar.addfile(info, fh)
 
     def close(self) -> None:
         """Closes the tar file."""
diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py
index fbba5eb3..c1a0c873 100644
--- a/tests/test_outputs_tar.py
+++ b/tests/test_outputs_tar.py
@@ -66,7 +66,7 @@ def test_tar_output_encrypt(mock_fs: VirtualFilesystem, public_key: bytes, tmp_p
         assert entry.open().read() == tar_file.extractfile(entry_name).read()
 
 
-def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None:
+def test_tar_output_race_condition_with_shrinking_file(tmp_path: Path, public_key: bytes) -> None:
     class ShrinkingFile(io.BytesIO):
         """
         A file-like object that returns 5 bytes less than required.
@@ -101,3 +101,40 @@ def read(self, size: int) -> bytes:
         extracted = tar_file.extractfile(member).read()
         # The content should be padded with zeros to match the original size, despite the fact that the file shrunk
         assert extracted == content_padded
+
+
+def test_tar_output_race_condition_with_growing_file(tmp_path: Path, public_key: bytes) -> None:
+    class GrowingFile(io.BytesIO):
+        """
+        A file-like object that returns 3 extra bytes.
+        Simulates a file on disk that has grown in between the time of
+        determining the size and actually reading the data.
+        """
+
+        def __init__(self, data: bytes):
+            super().__init__(data)
+
+        def read(self, size: int) -> bytes:
+            return super().read(size) + b"FOX"
+
+    content = b"some text"
+
+    file = GrowingFile(content)
+
+    tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key)
+    tar_output.write("file.log", file)
+    tar_output.close()
+    file.close()
+
+    encrypted_stream = EncryptedFile(tar_output.path.open("rb"), Path("tests/_data/private_key.pem"))
+    decrypted_path = tmp_path / "decrypted.tar"
+
+    # Direct streaming is not an option because tarfile needs seek when reading from encrypted files directly
+    Path(decrypted_path).write_bytes(encrypted_stream.read())
+
+    with tarfile.open(name=decrypted_path, mode="r") as tar_file:
+        member = tar_file.getmember("file.log")
+        extracted = tar_file.extractfile(member).read()
+        # The content should match the original content, without the extra bytes
+        # because the file was read with the original size
+        assert extracted == content

From 5b22f2cb2f12ce463a99e73d13afa380fafc4922 Mon Sep 17 00:00:00 2001
From: Joost Jansen <2032793+joost-j@users.noreply.github.com>
Date: Tue, 12 Aug 2025 17:10:59 +0200
Subject: [PATCH 3/4] Also inline 'addfile' function of tarfile

---
 acquire/outputs/tar.py    | 89 +++++++++++++++++++++------------------
 tests/test_outputs_tar.py |  2 +-
 2 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py
index 2cf8c679..8dd6eccb 100644
--- a/acquire/outputs/tar.py
+++ b/acquire/outputs/tar.py
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
+import copy
 import io
 import shutil
 import tarfile
 from typing import TYPE_CHECKING, BinaryIO
-from unittest.mock import patch
 
 from acquire.crypt import EncryptedStream
 from acquire.outputs.base import Output
@@ -17,43 +17,6 @@
 TAR_COMPRESSION_METHODS = {"gzip": "gz", "bzip2": "bz2", "xz": "xz"}
 
 
-def copyfileobj(
-    src: BinaryIO, dst: BinaryIO, length: int | None = None, _: Exception = OSError, bufsize: int | None = None
-) -> None:
-    """
-    Patched version of the copyfileobj function from the Python stdlib (tarfile.py),
-    to handle cases where the source file is actually shorter than expected.
-    By patching the missing bytes with zeroes, we avoid raising an exception
-    and potentially corrupting the tar file.
-    """
-    bufsize = bufsize or 16 * 1024
-    if length == 0:
-        return
-    if length is None:
-        shutil.copyfileobj(src, dst, bufsize)
-        return
-
-    blocks, remainder = divmod(length, bufsize)
-    for _ in range(blocks):
-        # Already prevents "long reads" because it reads at max bufsize bytes at a time
-        buf = src.read(bufsize)
-        if len(buf) < bufsize:
-            # raise exception("unexpected end of data")
-            # PATCH; instead of raising an exception, pad the data to the desired length
-            buf += b"\x00" * (bufsize - len(buf))
-        dst.write(buf)
-
-    if remainder != 0:
-        # Already prevents "long reads" because it reads at max bufsize bytes at a time
-        buf = src.read(remainder)
-        if len(buf) < remainder:
-            # raise exception("unexpected end of data")
-            # PATCH; instead of raising an exception, pad the data to the desired length
-            buf += b"\x00" * (remainder - len(buf))
-        dst.write(buf)
-    return
-
-
 class TarOutput(Output):
     """Tar archive acquire output format. Output can be compressed and/or encrypted.
 
@@ -139,8 +102,54 @@ def write(
             if stat:
                 info.mtime = stat.st_mtime
 
-        with patch("tarfile.copyfileobj", copyfileobj):
-            self.tar.addfile(info, fh)
+        # Inline version of Python stdlib's tarfile.addfile & tarfile.copyfileobj,
+        # to allow for padding and more control over the tar file writing.
+        self.tar._check("awx")
+
+        if fh is None and info.isreg() and info.size != 0:
+            raise ValueError("fileobj not provided for non zero-size regular file")
+
+        info = copy.copy(info)
+
+        buf = info.tobuf(self.tar.format, self.tar.encoding, self.tar.errors)
+        self.tar.fileobj.write(buf)
+        self.tar.offset += len(buf)
+        bufsize = self.tar.copybufsize
+        if fh is not None:
+            bufsize = bufsize or 16 * 1024
+
+            if info.size == 0:
+                return
+            if info.size is None:
+                shutil.copyfileobj(fh, self.tar.fileobj, bufsize)
+                return
+
+            blocks, remainder = divmod(info.size, bufsize)
+            for _ in range(blocks):
+                # Prevents "long reads" because it reads at max bufsize bytes at a time
+                buf = fh.read(bufsize)
+                if len(buf) < bufsize:
+                    # raise exception("unexpected end of data")
+                    # PATCH; instead of raising an exception, pad the data to the desired length
+                    buf += tarfile.NUL * (bufsize - len(buf))
+                self.tar.fileobj.write(buf)
+
+            if remainder != 0:
+                # Prevents "long reads" because it reads at max bufsize bytes at a time
+                buf = fh.read(remainder)
+                if len(buf) < remainder:
+                    # raise exception("unexpected end of data")
+                    # PATCH; instead of raising an exception, pad the data to the desired length
+                    buf += tarfile.NUL * (remainder - len(buf))
+                self.tar.fileobj.write(buf)
+
+            blocks, remainder = divmod(info.size, tarfile.BLOCKSIZE)
+            if remainder > 0:
+                self.tar.fileobj.write(tarfile.NUL * (tarfile.BLOCKSIZE - remainder))
+                blocks += 1
+            self.tar.offset += blocks * tarfile.BLOCKSIZE
+
+        self.tar.members.append(info)
 
     def close(self) -> None:
         """Closes the tar file."""
diff --git a/tests/test_outputs_tar.py b/tests/test_outputs_tar.py
index c1a0c873..49d00f86 100644
--- a/tests/test_outputs_tar.py
+++ b/tests/test_outputs_tar.py
@@ -82,7 +82,7 @@ def read(self, size: int) -> bytes:
 
     content = b"some text"
 
-    content_padded = content[:-5] + b"\x00" * 5
+    content_padded = content[:-5] + tarfile.NUL * 5
     file = ShrinkingFile(content)
 
     tar_output = TarOutput(tmp_path / "race.tar", encrypt=True, public_key=public_key)

From 00a211c942f288e25a902d5e28633a0ffebe2acb Mon Sep 17 00:00:00 2001
From: Joost Jansen <12032793+joost-j@users.noreply.github.com>
Date: Thu, 21 Aug 2025 14:44:35 +0200
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com>
---
 acquire/outputs/tar.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/acquire/outputs/tar.py b/acquire/outputs/tar.py
index 8dd6eccb..73237d25 100644
--- a/acquire/outputs/tar.py
+++ b/acquire/outputs/tar.py
@@ -129,7 +129,6 @@ def write(
                 # Prevents "long reads" because it reads at max bufsize bytes at a time
                 buf = fh.read(bufsize)
                 if len(buf) < bufsize:
-                    # raise exception("unexpected end of data")
                     # PATCH; instead of raising an exception, pad the data to the desired length
                     buf += tarfile.NUL * (bufsize - len(buf))
                 self.tar.fileobj.write(buf)
@@ -138,7 +137,6 @@ def write(
                 # Prevents "long reads" because it reads at max bufsize bytes at a time
                 buf = fh.read(remainder)
                 if len(buf) < remainder:
-                    # raise exception("unexpected end of data")
                     # PATCH; instead of raising an exception, pad the data to the desired length
                     buf += tarfile.NUL * (remainder - len(buf))
                 self.tar.fileobj.write(buf)