From 48f97fe362c890da924ee5793a0c4b95306f24aa Mon Sep 17 00:00:00 2001 From: Benjamin Drung Date: Tue, 21 Apr 2026 12:41:32 +0200 Subject: [PATCH] problem_report: let load() return skipped keys The documentation for `ProblemReport.load()` says: > If binary is False, binary data is not loaded; the dictionary key is > created, but its value will be an empty string. Commit 64820a0e2de0 ("apport-unpack: Fix ValueError: ['separator'] has no binary content") changed this behavior to fix a crash in apport-unpack. Setting non-loaded values to `None` will make type hints more complicated and less useful. Values in the problem report should not be allowed to be `None`. Instead of adding dictionary keys with a special value for not loaded keys, just let `load()` return the keys that were skipped. This reduces guesswork and makes the code more robust. Remove `ProblemReport.has_removed_fields()` since that information is lost and this function was only used in Apport tests. --- bin/apport-unpack | 12 +++--------- problem_report.py | 28 ++++++++++++---------------- tests/unit/test_problem_report.py | 26 ++++++++++++-------------- 3 files changed, 27 insertions(+), 39 deletions(-) diff --git a/bin/apport-unpack b/bin/apport-unpack index 96abc2a26..88cfc4be5 100755 --- a/bin/apport-unpack +++ b/bin/apport-unpack @@ -56,22 +56,16 @@ def open_report(report_filename: str) -> Iterator[(BinaryIO | gzip.GzipFile)]: def unpack_report_to_directory( report: problem_report.ProblemReport, target_directory: str -) -> list[str]: +) -> None: """Write each report entry into a separate file. - Return a list of keys that were not loaded. 
""" - missing_keys = [] for key, value in report.items(): - if value is None: - missing_keys.append(key) - continue with open(os.path.join(target_directory, key), "wb") as key_file: if isinstance(value, str): key_file.write(value.encode("UTF-8")) else: key_file.write(value) - return missing_keys # pylint: disable-next=missing-function-docstring @@ -95,10 +90,10 @@ def main() -> None: # In case of passing the report to stdin, # the report needs to be loaded in one go. # The current implementation loads the whole report into memory. - report.load(report_file, binary=args.report == "-") + bin_keys = report.load(report_file, binary=args.report == "-") except (OSError, problem_report.MalformedProblemReport) as error: fatal("%s", str(error)) - bin_keys = unpack_report_to_directory(report, args.target_directory) + unpack_report_to_directory(report, args.target_directory) if bin_keys: try: with open_report(args.report) as report_file: diff --git a/problem_report.py b/problem_report.py index 63afb2eeb..ceac800ce 100644 --- a/problem_report.py +++ b/problem_report.py @@ -358,11 +358,10 @@ def load( file: gzip.GzipFile | typing.IO[bytes], binary: bool | typing.Literal["compressed"] = True, key_filter: Iterable[str] | None = None, - ) -> None: + ) -> list[str]: """Initialize problem report from a file-like object. - If binary is False, binary data is not loaded; the dictionary key is - created, but its value will be an empty string. If it is True, it is + If binary is False, binary data is not loaded. If it is True, it is transparently uncompressed and available as dictionary byte array values. If binary is 'compressed', the compressed value is retained, and the dictionary value will be a CompressedValue object. This is @@ -371,9 +370,13 @@ def load( file needs to be opened in binary mode. - If key_filter is given, only those keys will be loaded. + If key_filter is given, only those keys will be loaded. If all + specified keys were loaded, the remaining file will be skipped. 
Files are in RFC822 format, but with case sensitive keys. + + Return a list of dictionary keys that were skipped. Note: This list + might not be exhaustive in case key_filter is given. """ self._assert_bin_mode(file) self.data.clear() @@ -382,14 +385,16 @@ def load( else: remaining_keys = None + skipped_keys = [] for entry in _EntryParser(file): key, iterator, base64_encoded = _parse_entry(entry) if remaining_keys is not None and key not in remaining_keys: + skipped_keys.append(key) continue if base64_encoded: if binary is False: - self.data[key] = None + skipped_keys.append(key) elif binary == "compressed": self.data[key] = CompressedValue( name=key, compressed_value=b"".join(iterator) @@ -408,6 +413,7 @@ def load( break self.old_keys = set(self.data.keys()) + return skipped_keys def extract_keys( self, @@ -475,13 +481,6 @@ def get_timestamp(self) -> int | None: except locale.Error: return None - def has_removed_fields(self) -> bool: - """Check if the report has any keys which were not loaded. - - This could happen when using binary=False in load(). - """ - return None in self.values() - @staticmethod def is_binary(string: bytes | str) -> bool: """Check if the given strings contains binary data.""" @@ -508,8 +507,7 @@ def sorted_items( The most interesting fields will be returned first. The remaining items will be returned in alphabetical order. Keys starting with - an underscore are considered internal and are skipped. Also values - that are None will be skipped. + an underscore are considered internal and are skipped. If keys is provided, only those keys will be iterated over. 
""" @@ -534,8 +532,6 @@ def sorted_items( if key.startswith("_"): continue value = self[key] - if value is None: - continue yield key, value def write(self, file: typing.IO[bytes], only_new: bool = False) -> None: diff --git a/tests/unit/test_problem_report.py b/tests/unit/test_problem_report.py index 186f1ed99..b62c227d9 100644 --- a/tests/unit/test_problem_report.py +++ b/tests/unit/test_problem_report.py @@ -426,19 +426,18 @@ def test_read_file(self) -> None: # test with reading everything pr = problem_report.ProblemReport() - pr.load(io.BytesIO(bin_report)) + skipped_keys = pr.load(io.BytesIO(bin_report)) self.assertEqual(pr["File"], BIN_DATA) - self.assertEqual(pr.has_removed_fields(), False) + self.assertEqual(skipped_keys, []) # test with skipping binary data - pr.load(io.BytesIO(bin_report), binary=False) - self.assertIsNone(pr["File"]) - self.assertEqual(pr.has_removed_fields(), True) + skipped_keys = pr.load(io.BytesIO(bin_report), binary=False) + self.assertEqual(skipped_keys, ["File"]) # test with keeping compressed binary data - pr.load(io.BytesIO(bin_report), binary="compressed") + skipped_keys = pr.load(io.BytesIO(bin_report), binary="compressed") self.assertEqual(pr["Foo"], "Bar") - self.assertEqual(pr.has_removed_fields(), False) + self.assertEqual(skipped_keys, []) self.assertTrue(isinstance(pr["File"], problem_report.CompressedValue)) self.assertEqual(len(pr["File"]), len(BIN_DATA)) self.assertEqual(pr["File"].get_value(), BIN_DATA) @@ -458,18 +457,17 @@ def test_read_file_legacy(self) -> None: # test with reading everything pr = problem_report.ProblemReport() - pr.load(io.BytesIO(bin_report)) + skipped_keys = pr.load(io.BytesIO(bin_report)) self.assertEqual(pr["File"], b"AB" * 10 + b"\0" * 10 + b"Z") - self.assertEqual(pr.has_removed_fields(), False) + self.assertEqual(skipped_keys, []) # test with skipping binary data - pr.load(io.BytesIO(bin_report), binary=False) - self.assertIsNone(pr["File"]) - self.assertEqual(pr.has_removed_fields(), 
True) + skipped_keys = pr.load(io.BytesIO(bin_report), binary=False) + self.assertEqual(skipped_keys, ["File"]) # test with keeping CompressedValues - pr.load(io.BytesIO(bin_report), binary="compressed") - self.assertEqual(pr.has_removed_fields(), False) + skipped_keys = pr.load(io.BytesIO(bin_report), binary="compressed") + self.assertEqual(skipped_keys, []) self.assertEqual(len(pr["File"]), 31) self.assertEqual(pr["File"].get_value(), b"AB" * 10 + b"\0" * 10 + b"Z") self.assertEqual(pr["File"].name, "File")