From e4cfd35b900a3047c532e700a09b0f2bba0ae4fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 22:07:33 +0000 Subject: [PATCH 1/6] Initial plan From 15193d9deaaa920660de5ac238af1cbe4c62d0f1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 22:17:55 +0000 Subject: [PATCH 2/6] fix: wait for loop device partition nodes after losetup to avoid race condition After `losetup --find --show --partscan`, the kernel creates partition device nodes (e.g. /dev/loop7p1, /dev/loop7p2) asynchronously via udev. If code tries to mount a partition immediately after attaching, it may fail with "special device /dev/loopNpM does not exist". Fix this by adding `_wait_for_loopdev_partitions()` which: 1. Calls `udevadm settle` to flush all pending udev events 2. Polls for each expected partition device node with a configurable timeout 3. Raises ImageError if nodes don't appear within the timeout The wait is called from: - `Image.attach_loopdev()` in imagecraft/pack/image.py - `ImageService.attach_images()` in imagecraft/services/image.py (fresh attaches only) Adds unit tests for the new helper function and the integration points. Agent-Logs-Url: https://github.com/canonical/imagecraft/sessions/dfb57e1c-b428-4e70-b2b1-83a7428353fc Co-authored-by: lengau <4305943+lengau@users.noreply.github.com> --- imagecraft/pack/image.py | 49 ++++++++++++++++ imagecraft/services/image.py | 13 +++++ tests/unit/pack/test_image.py | 92 +++++++++++++++++++++++++++++++ tests/unit/services/test_image.py | 42 ++++++++++++++ 4 files changed, 196 insertions(+) diff --git a/imagecraft/pack/image.py b/imagecraft/pack/image.py index 04ecc756..42863d11 100644 --- a/imagecraft/pack/image.py +++ b/imagecraft/pack/image.py @@ -16,6 +16,7 @@ import contextlib import json +import time from collections.abc import Generator, Iterator from pathlib import Path from typing import Any, cast @@ -27,6 +28,8 @@ from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" +_UDEVADM_BIN = "udevadm" +_PARTITION_WAIT_TIMEOUT = 10.0 def _get_loop_devices() -> list[dict[str, Any]]: @@ -43,6 +46,47 @@ def _detach_loop_device( run(_LOSETUP_BIN, "-d", loop_device) +def _wait_for_loopdev_partitions( + loop_device: str, + partition_nums: list[int], + timeout: float = _PARTITION_WAIT_TIMEOUT, +) -> None: + """Wait for loop device partition nodes to appear in /dev. + + After attaching a loop device with --partscan, the kernel may not + immediately create the partition device nodes. This function waits + until all expected partition nodes are available. + + :param loop_device: Path to the loop device (e.g. '/dev/loop7') + :param partition_nums: List of partition numbers to wait for + :param timeout: Maximum time to wait in seconds + :raises errors.ImageError: If partition nodes do not appear within the timeout + """ + if not partition_nums: + return + + expected = [Path(f"{loop_device}p{n}") for n in partition_nums] + + # Ask udev to settle so device nodes are created before we poll. + try: + run(_UDEVADM_BIN, "settle", "--timeout", str(int(timeout))) + except Exception: # noqa: BLE001 + emit.debug("udevadm settle failed or unavailable; falling back to polling") + + # Poll until all partition nodes appear or the timeout is reached. 
+ deadline = time.monotonic() + timeout + missing = [p for p in expected if not p.exists()] + while missing and time.monotonic() < deadline: + time.sleep(0.1) + missing = [p for p in expected if not p.exists()] + + if missing: + raise errors.ImageError( + f"Loop device partition nodes did not appear within {timeout:.0f}s: " + + ", ".join(str(p) for p in missing) + ) + + class Image: """Image. @@ -91,6 +135,11 @@ def attach_loopdev(self) -> Iterator[str]: emit.debug( f"Attached image {self.disk_path} as loop device {self.loop_device}" ) + partition_nums = [ + (s.partition_number or i) + for i, s in enumerate(self.volume.structure, start=1) + ] + _wait_for_loopdev_partitions(self.loop_device, partition_nums) try: yield self.loop_device finally: diff --git a/imagecraft/services/image.py b/imagecraft/services/image.py index 18bf48f7..1c9751fb 100644 --- a/imagecraft/services/image.py +++ b/imagecraft/services/image.py @@ -30,6 +30,7 @@ from imagecraft.models import Project from imagecraft.models.volume import PartitionSchema from imagecraft.pack import gptutil +from imagecraft.pack.image import _wait_for_loopdev_partitions from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" @@ -116,9 +117,11 @@ def attach_images(self) -> Mapping[str, str]: raise ValueError("Images must be created before attaching.") all_devices = self._get_all_loop_devices() + project = cast(Project, self._services.get("project").get()) for name, image_path in self._images.items(): attached_device: str | None = None + is_fresh_attach = False # 1. Check for existing devices pointing to this file. for dev in all_devices: @@ -149,6 +152,7 @@ def attach_images(self) -> Mapping[str, str]: str(image_path), ).stdout.strip() emit.debug(f"Attached {image_path} as {attached_device}") + is_fresh_attach = True except subprocess.CalledProcessError as err: raise CraftError( f"Failed to attach loop device for {image_path}.", @@ -156,6 +160,15 @@ def attach_images(self) -> Mapping[str, str]: resolution="Ensure loop devices are available and you have sufficient permissions (sudo).", ) from err + # 3. Wait for partition nodes after a fresh attach to avoid race conditions. 
+ if is_fresh_attach: + volume = project.volumes[name] + partition_nums = [ + (s.partition_number or i) + for i, s in enumerate(volume.structure, start=1) + ] + _wait_for_loopdev_partitions(attached_device, partition_nums) + self._loop_devices[name] = attached_device if not self._atexit_registered: diff --git a/tests/unit/pack/test_image.py b/tests/unit/pack/test_image.py index ba441213..58780d31 100644 --- a/tests/unit/pack/test_image.py +++ b/tests/unit/pack/test_image.py @@ -48,6 +48,7 @@ def run(cmd: str, *args: Any, **kwargs: Any) -> CompletedProcess[str]: class TestImage: def test_loopdev(self, mocker, new_dir: Path): mock_run = mocker.patch("imagecraft.pack.image.run", side_effect=run) + mock_wait = mocker.patch("imagecraft.pack.image._wait_for_loopdev_partitions") volume = Volume.unmarshal( { @@ -78,6 +79,7 @@ def test_loopdev(self, mocker, new_dir: Path): call("losetup", "--json"), call("losetup", "-d", "/dev/loop99"), ] + mock_wait.assert_called_once_with("loop99", [1]) @pytest.mark.parametrize( ("volume_data", "has_data_partition"), @@ -248,3 +250,93 @@ def test_has_boot_partition( ) assert image.has_boot_partition == has_boot_partition + + +class TestWaitForLoopdevPartitions: + """Tests for the _wait_for_loopdev_partitions helper.""" + + def test_empty_partition_list_returns_immediately(self, mocker): + """No waiting when there are no partitions to check.""" + from imagecraft.pack.image import _wait_for_loopdev_partitions + + mock_run = mocker.patch("imagecraft.pack.image.run") + mock_sleep = mocker.patch("time.sleep") + + _wait_for_loopdev_partitions("/dev/loop0", []) + + mock_run.assert_not_called() + mock_sleep.assert_not_called() + + def test_waits_until_partitions_appear(self, mocker, tmp_path): + """Polls until partition device nodes exist.""" + from imagecraft.pack.image import _wait_for_loopdev_partitions + + mocker.patch("imagecraft.pack.image.run") + + part1 = tmp_path / "loop0p1" + part2 = tmp_path / "loop0p2" + + # Simulate partition nodes appearing after two poll cycles. 
+ call_count = 0 + + def fake_exists(self: Path) -> bool: + nonlocal call_count + call_count += 1 + # Make nodes appear on the third check (after first poll cycle) + return call_count > 4 + + mocker.patch.object(Path, "exists", fake_exists) + mocker.patch("time.sleep") + + _wait_for_loopdev_partitions( + str(tmp_path / "loop0"), [1, 2], timeout=5.0 + ) + + def test_raises_on_timeout(self, mocker): + """Raises ImageError when partition nodes don't appear in time.""" + from imagecraft.errors import ImageError + from imagecraft.pack.image import _wait_for_loopdev_partitions + + mocker.patch("imagecraft.pack.image.run") + mocker.patch.object(Path, "exists", return_value=False) + mocker.patch("time.sleep") + # Simulate timeout immediately + mocker.patch( + "time.monotonic", + side_effect=[0.0, 0.0, 100.0], + ) + + with pytest.raises(ImageError, match="partition nodes did not appear"): + _wait_for_loopdev_partitions("/dev/loop0", [1, 2], timeout=1.0) + + def test_udevadm_failure_falls_back_to_polling(self, mocker, tmp_path): + """Falls back to polling when udevadm settle fails.""" + import subprocess + + from imagecraft.pack.image import _wait_for_loopdev_partitions + + part = tmp_path / "loop0p1" + part.touch() + + # udevadm settle raises CalledProcessError + mocker.patch( + "imagecraft.pack.image.run", + side_effect=subprocess.CalledProcessError(1, "udevadm"), + ) + mocker.patch.object(Path, "exists", return_value=True) + + # Should not raise even though udevadm failed + _wait_for_loopdev_partitions(str(tmp_path / "loop0"), [1], timeout=5.0) + + def test_calls_udevadm_settle_with_timeout(self, mocker, tmp_path): + """Calls udevadm settle with the specified timeout.""" + from unittest.mock import call as mcall + + from imagecraft.pack.image import _wait_for_loopdev_partitions + + mock_run = mocker.patch("imagecraft.pack.image.run") + mocker.patch.object(Path, "exists", return_value=True) + + _wait_for_loopdev_partitions("/dev/loop5", [1, 2], timeout=7.0) + + mock_run.assert_called_once_with("udevadm", "settle", "--timeout", "7") diff --git a/tests/unit/services/test_image.py b/tests/unit/services/test_image.py index 221d84af..6c0d68d4 100644 --- a/tests/unit/services/test_image.py +++ b/tests/unit/services/test_image.py @@ -157,6 +157,7 @@ def test_attach_images_stale_inode(image_service, project_dir, mocker): mocker.patch("pathlib.Path.samefile", side_effect=FileNotFoundError) mock_run = mocker.patch("imagecraft.services.image.run") mock_run.return_value.stdout = "/dev/loop11\n" + mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") devices = image_service.attach_images() @@ -169,6 +170,47 @@ def test_attach_images_stale_inode(image_service, project_dir, mocker): ) +def test_attach_images_waits_for_partitions(image_service, project_dir, mocker, mock_project, mock_services): + """attach_images() waits for partition nodes after a fresh losetup attach.""" + image_service._images = {"pc": project_dir / ".pc.img.tmp"} + + mocker.patch.object(image_service, "_get_all_loop_devices", return_value=[]) + mock_run = mocker.patch("imagecraft.services.image.run") + mock_run.return_value.stdout = "/dev/loop8\n" + + mock_project_service = MagicMock() + mock_project_service.get.return_value = mock_project + mock_services.get.return_value = mock_project_service + + mock_wait = mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") + + with patch("atexit.register"): + image_service.attach_images() + + # efi has no explicit partition_number -> 1; rootfs has partition_number=2 -> 
2 + mock_wait.assert_called_once_with("/dev/loop8", [1, 2]) + + +def test_attach_images_no_wait_on_reuse(image_service, project_dir, mocker): + """attach_images() does NOT wait when reusing an existing loop device.""" + image_path = project_dir / ".pc.img.tmp" + image_path.touch() + image_service._images = {"pc": image_path} + + mocker.patch.object( + image_service, + "_get_all_loop_devices", + return_value=[{"name": "/dev/loop9", "back-file": str(image_path)}], + ) + mocker.patch("pathlib.Path.samefile", return_value=True) + mocker.patch("imagecraft.services.image.run") + mock_wait = mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") + + image_service.attach_images() + + mock_wait.assert_not_called() + + def test_detach_images_success(image_service, mocker): image_service._loop_devices = {"pc": "/dev/loop8"} mock_run = mocker.patch("imagecraft.services.image.run") From 79c9ccfda6b958bbd9c64878aab266ca6fe8ad26 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 22:20:05 +0000 Subject: [PATCH 3/6] refactor: address code review - specific exception catch + extract partition number helper - Replace broad `except Exception` with `except (subprocess.CalledProcessError, OSError)` to be precise about what failures are expected from udevadm settle - Extract duplicated partition-number calculation into `_volume_partition_nums()` helper function and import it into services/image.py to avoid code duplication Agent-Logs-Url: https://github.com/canonical/imagecraft/sessions/dfb57e1c-b428-4e70-b2b1-83a7428353fc Co-authored-by: lengau <4305943+lengau@users.noreply.github.com> --- imagecraft/pack/image.py | 22 ++++++++++++++++------ imagecraft/services/image.py | 11 ++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/imagecraft/pack/image.py b/imagecraft/pack/image.py index 42863d11..87856d2a 100644 --- a/imagecraft/pack/image.py +++ b/imagecraft/pack/image.py @@ -16,6 +16,7 @@ import contextlib import json +import subprocess import time from collections.abc import Generator, Iterator from pathlib import Path @@ -46,6 +47,17 @@ def _detach_loop_device( run(_LOSETUP_BIN, "-d", loop_device) +def _volume_partition_nums(volume: Volume) -> list[int]: + """Return the partition numbers for each structure item in a volume. + + Partition numbers are either explicitly set via ``structure_item.partition_number`` + or implicitly assigned as the 1-based index of the item in the structure list. + """ + return [ + (s.partition_number or i) for i, s in enumerate(volume.structure, start=1) + ] + + def _wait_for_loopdev_partitions( loop_device: str, partition_nums: list[int], @@ -70,7 +82,7 @@ def _wait_for_loopdev_partitions( # Ask udev to settle so device nodes are created before we poll. try: run(_UDEVADM_BIN, "settle", "--timeout", str(int(timeout))) - except Exception: # noqa: BLE001 + except (subprocess.CalledProcessError, OSError): emit.debug("udevadm settle failed or unavailable; falling back to polling") # Poll until all partition nodes appear or the timeout is reached. 
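As an aside, the race that this helper (and patch 2's commit message) describes can be reproduced outside imagecraft with a few lines of Python. This is only an illustrative sketch, not part of the patch: "scratch.img" is a placeholder disk image with a partition table, losetup needs root, and the window is timing-dependent, so the first check may or may not fail on a given run.

import subprocess
from pathlib import Path

loop_dev = subprocess.run(
    ["losetup", "--find", "--show", "--partscan", "scratch.img"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

# Right after losetup returns, the partition node may not be there yet; it is
# published asynchronously, which is the window the helper above closes.
print("immediately after attach:", Path(f"{loop_dev}p1").exists())

# Once udev has settled, the node should exist (assuming scratch.img carries a
# partition table with at least one partition).
subprocess.run(["udevadm", "settle", "--timeout", "10"], check=True)
print("after udevadm settle:", Path(f"{loop_dev}p1").exists())

subprocess.run(["losetup", "-d", loop_dev], check=True)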
@@ -135,11 +147,9 @@ def attach_loopdev(self) -> Iterator[str]: emit.debug( f"Attached image {self.disk_path} as loop device {self.loop_device}" ) - partition_nums = [ - (s.partition_number or i) - for i, s in enumerate(self.volume.structure, start=1) - ] - _wait_for_loopdev_partitions(self.loop_device, partition_nums) + _wait_for_loopdev_partitions( + self.loop_device, _volume_partition_nums(self.volume) + ) try: yield self.loop_device finally: diff --git a/imagecraft/services/image.py b/imagecraft/services/image.py index 1c9751fb..5b9bac88 100644 --- a/imagecraft/services/image.py +++ b/imagecraft/services/image.py @@ -30,7 +30,7 @@ from imagecraft.models import Project from imagecraft.models.volume import PartitionSchema from imagecraft.pack import gptutil -from imagecraft.pack.image import _wait_for_loopdev_partitions +from imagecraft.pack.image import _volume_partition_nums, _wait_for_loopdev_partitions from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" @@ -162,12 +162,9 @@ def attach_images(self) -> Mapping[str, str]: # 3. Wait for partition nodes after a fresh attach to avoid race conditions. if is_fresh_attach: - volume = project.volumes[name] - partition_nums = [ - (s.partition_number or i) - for i, s in enumerate(volume.structure, start=1) - ] - _wait_for_loopdev_partitions(attached_device, partition_nums) + _wait_for_loopdev_partitions( + attached_device, _volume_partition_nums(project.volumes[name]) + ) self._loop_devices[name] = attached_device From f7b72b24a9b4b022e24430cd41780e8637a09905 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 05:32:19 +0000 Subject: [PATCH 4/6] fix: make wait/partition-nums helpers public and move wait before debug log - Rename _wait_for_loopdev_partitions -> wait_for_loopdev_partitions - Rename _volume_partition_nums -> volume_partition_nums (both are imported by services/image.py so they can't be private) - In Image.attach_loopdev(), call wait_for_loopdev_partitions() before emit.debug() so the log only fires once the partitions are actually ready - Update all references in services/image.py and both test files Agent-Logs-Url: https://github.com/canonical/imagecraft/sessions/065c4f4e-7f04-47a7-9ac7-f5d74ae39dfc Co-authored-by: lengau <4305943+lengau@users.noreply.github.com> --- imagecraft/pack/image.py | 10 +++++----- imagecraft/services/image.py | 6 +++--- tests/unit/pack/test_image.py | 24 ++++++++++++------------ tests/unit/services/test_image.py | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/imagecraft/pack/image.py b/imagecraft/pack/image.py index 87856d2a..b56dc07f 100644 --- a/imagecraft/pack/image.py +++ b/imagecraft/pack/image.py @@ -47,7 +47,7 @@ def _detach_loop_device( run(_LOSETUP_BIN, "-d", loop_device) -def _volume_partition_nums(volume: Volume) -> list[int]: +def volume_partition_nums(volume: Volume) -> list[int]: """Return the partition numbers for each structure item in a volume. 
Partition numbers are either explicitly set via ``structure_item.partition_number`` @@ -58,7 +58,7 @@ def _volume_partition_nums(volume: Volume) -> list[int]: ] -def _wait_for_loopdev_partitions( +def wait_for_loopdev_partitions( loop_device: str, partition_nums: list[int], timeout: float = _PARTITION_WAIT_TIMEOUT, @@ -144,12 +144,12 @@ def attach_loopdev(self) -> Iterator[str]: "--partscan", self.disk_path, ).stdout.strip() + wait_for_loopdev_partitions( + self.loop_device, volume_partition_nums(self.volume) + ) emit.debug( f"Attached image {self.disk_path} as loop device {self.loop_device}" ) - _wait_for_loopdev_partitions( - self.loop_device, _volume_partition_nums(self.volume) - ) try: yield self.loop_device finally: diff --git a/imagecraft/services/image.py b/imagecraft/services/image.py index 5b9bac88..d0bfe588 100644 --- a/imagecraft/services/image.py +++ b/imagecraft/services/image.py @@ -30,7 +30,7 @@ from imagecraft.models import Project from imagecraft.models.volume import PartitionSchema from imagecraft.pack import gptutil -from imagecraft.pack.image import _volume_partition_nums, _wait_for_loopdev_partitions +from imagecraft.pack.image import volume_partition_nums, wait_for_loopdev_partitions from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" @@ -162,8 +162,8 @@ def attach_images(self) -> Mapping[str, str]: # 3. Wait for partition nodes after a fresh attach to avoid race conditions. if is_fresh_attach: - _wait_for_loopdev_partitions( - attached_device, _volume_partition_nums(project.volumes[name]) + wait_for_loopdev_partitions( + attached_device, volume_partition_nums(project.volumes[name]) ) self._loop_devices[name] = attached_device diff --git a/tests/unit/pack/test_image.py b/tests/unit/pack/test_image.py index 58780d31..475db750 100644 --- a/tests/unit/pack/test_image.py +++ b/tests/unit/pack/test_image.py @@ -48,7 +48,7 @@ def run(cmd: str, *args: Any, **kwargs: Any) -> CompletedProcess[str]: class TestImage: def test_loopdev(self, mocker, new_dir: Path): mock_run = mocker.patch("imagecraft.pack.image.run", side_effect=run) - mock_wait = mocker.patch("imagecraft.pack.image._wait_for_loopdev_partitions") + mock_wait = mocker.patch("imagecraft.pack.image.wait_for_loopdev_partitions") volume = Volume.unmarshal( { @@ -253,23 +253,23 @@ def test_has_boot_partition( class TestWaitForLoopdevPartitions: - """Tests for the _wait_for_loopdev_partitions helper.""" + """Tests for the wait_for_loopdev_partitions helper.""" def test_empty_partition_list_returns_immediately(self, mocker): """No waiting when there are no partitions to check.""" - from imagecraft.pack.image import _wait_for_loopdev_partitions + from imagecraft.pack.image import wait_for_loopdev_partitions mock_run = mocker.patch("imagecraft.pack.image.run") mock_sleep = mocker.patch("time.sleep") - _wait_for_loopdev_partitions("/dev/loop0", []) + wait_for_loopdev_partitions("/dev/loop0", []) mock_run.assert_not_called() mock_sleep.assert_not_called() def test_waits_until_partitions_appear(self, mocker, tmp_path): """Polls until partition device nodes exist.""" - from imagecraft.pack.image import _wait_for_loopdev_partitions + from imagecraft.pack.image import wait_for_loopdev_partitions mocker.patch("imagecraft.pack.image.run") @@ -288,14 +288,14 @@ def fake_exists(self: Path) -> bool: mocker.patch.object(Path, "exists", fake_exists) mocker.patch("time.sleep") - _wait_for_loopdev_partitions( + wait_for_loopdev_partitions( str(tmp_path / "loop0"), [1, 2], timeout=5.0 ) def test_raises_on_timeout(self, 
mocker): """Raises ImageError when partition nodes don't appear in time.""" from imagecraft.errors import ImageError - from imagecraft.pack.image import _wait_for_loopdev_partitions + from imagecraft.pack.image import wait_for_loopdev_partitions mocker.patch("imagecraft.pack.image.run") mocker.patch.object(Path, "exists", return_value=False) @@ -307,13 +307,13 @@ def test_raises_on_timeout(self, mocker): ) with pytest.raises(ImageError, match="partition nodes did not appear"): - _wait_for_loopdev_partitions("/dev/loop0", [1, 2], timeout=1.0) + wait_for_loopdev_partitions("/dev/loop0", [1, 2], timeout=1.0) def test_udevadm_failure_falls_back_to_polling(self, mocker, tmp_path): """Falls back to polling when udevadm settle fails.""" import subprocess - from imagecraft.pack.image import _wait_for_loopdev_partitions + from imagecraft.pack.image import wait_for_loopdev_partitions part = tmp_path / "loop0p1" part.touch() @@ -326,17 +326,17 @@ def test_udevadm_failure_falls_back_to_polling(self, mocker, tmp_path): mocker.patch.object(Path, "exists", return_value=True) # Should not raise even though udevadm failed - _wait_for_loopdev_partitions(str(tmp_path / "loop0"), [1], timeout=5.0) + wait_for_loopdev_partitions(str(tmp_path / "loop0"), [1], timeout=5.0) def test_calls_udevadm_settle_with_timeout(self, mocker, tmp_path): """Calls udevadm settle with the specified timeout.""" from unittest.mock import call as mcall - from imagecraft.pack.image import _wait_for_loopdev_partitions + from imagecraft.pack.image import wait_for_loopdev_partitions mock_run = mocker.patch("imagecraft.pack.image.run") mocker.patch.object(Path, "exists", return_value=True) - _wait_for_loopdev_partitions("/dev/loop5", [1, 2], timeout=7.0) + wait_for_loopdev_partitions("/dev/loop5", [1, 2], timeout=7.0) mock_run.assert_called_once_with("udevadm", "settle", "--timeout", "7") diff --git a/tests/unit/services/test_image.py b/tests/unit/services/test_image.py index 6c0d68d4..c0c30b02 100644 --- a/tests/unit/services/test_image.py +++ b/tests/unit/services/test_image.py @@ -157,7 +157,7 @@ def test_attach_images_stale_inode(image_service, project_dir, mocker): mocker.patch("pathlib.Path.samefile", side_effect=FileNotFoundError) mock_run = mocker.patch("imagecraft.services.image.run") mock_run.return_value.stdout = "/dev/loop11\n" - mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") + mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") devices = image_service.attach_images() @@ -182,7 +182,7 @@ def test_attach_images_waits_for_partitions(image_service, project_dir, mocker, mock_project_service.get.return_value = mock_project mock_services.get.return_value = mock_project_service - mock_wait = mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") + mock_wait = mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") with patch("atexit.register"): image_service.attach_images() @@ -204,7 +204,7 @@ def test_attach_images_no_wait_on_reuse(image_service, project_dir, mocker): ) mocker.patch("pathlib.Path.samefile", return_value=True) mocker.patch("imagecraft.services.image.run") - mock_wait = mocker.patch("imagecraft.services.image._wait_for_loopdev_partitions") + mock_wait = mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") image_service.attach_images() From b33fb8f88df3a4888cfdba6236484c143ddf2721 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 05:43:47 +0000 Subject: 
[PATCH 5/6] fix: replace udevadm polling with BSD flock synchronization Per systemd BLOCK_DEVICE_LOCKING, the correct fix for the partition-node race is to hold a shared BSD flock on the loop device while using its partitions. udev holds an exclusive lock while processing; a shared lock blocks until udev is done and then keeps it held during our operations. - imagecraft/pack/image.py: - Remove volume_partition_nums() and wait_for_loopdev_partitions() - Remove _UDEVADM_BIN, _PARTITION_WAIT_TIMEOUT, time, subprocess imports - In Image.attach_loopdev(): open loop device and acquire LOCK_SH for the duration of the context manager - imagecraft/services/image.py: - Remove import of removed helpers - Add _loop_fds list to track open file handles holding flocks - In attach_images(): open device and acquire LOCK_SH after attaching (both fresh attach and reuse); store fd in _loop_fds - In detach_images(): close all fds (releasing flocks) before detaching - tests: remove TestWaitForLoopdevPartitions; update all tests to mock fcntl.flock and builtins.open; add test for flock release on detach Agent-Logs-Url: https://github.com/canonical/imagecraft/sessions/1d13f3fb-01b2-4424-b470-cbc2611b70e0 Co-authored-by: lengau <4305943+lengau@users.noreply.github.com> --- imagecraft/pack/image.py | 74 ++++-------------------- imagecraft/services/image.py | 26 +++++---- tests/unit/pack/test_image.py | 95 ++----------------------------- tests/unit/services/test_image.py | 52 ++++++++--------- 4 files changed, 54 insertions(+), 193 deletions(-) diff --git a/imagecraft/pack/image.py b/imagecraft/pack/image.py index b56dc07f..92b76166 100644 --- a/imagecraft/pack/image.py +++ b/imagecraft/pack/image.py @@ -15,9 +15,8 @@ """Image handling.""" import contextlib +import fcntl import json -import subprocess -import time from collections.abc import Generator, Iterator from pathlib import Path from typing import Any, cast @@ -29,8 +28,6 @@ from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" -_UDEVADM_BIN = "udevadm" -_PARTITION_WAIT_TIMEOUT = 10.0 def _get_loop_devices() -> list[dict[str, Any]]: @@ -47,58 +44,6 @@ def _detach_loop_device( run(_LOSETUP_BIN, "-d", loop_device) -def volume_partition_nums(volume: Volume) -> list[int]: - """Return the partition numbers for each structure item in a volume. - - Partition numbers are either explicitly set via ``structure_item.partition_number`` - or implicitly assigned as the 1-based index of the item in the structure list. - """ - return [ - (s.partition_number or i) for i, s in enumerate(volume.structure, start=1) - ] - - -def wait_for_loopdev_partitions( - loop_device: str, - partition_nums: list[int], - timeout: float = _PARTITION_WAIT_TIMEOUT, -) -> None: - """Wait for loop device partition nodes to appear in /dev. - - After attaching a loop device with --partscan, the kernel may not - immediately create the partition device nodes. This function waits - until all expected partition nodes are available. - - :param loop_device: Path to the loop device (e.g. '/dev/loop7') - :param partition_nums: List of partition numbers to wait for - :param timeout: Maximum time to wait in seconds - :raises errors.ImageError: If partition nodes do not appear within the timeout - """ - if not partition_nums: - return - - expected = [Path(f"{loop_device}p{n}") for n in partition_nums] - - # Ask udev to settle so device nodes are created before we poll. 
- try: - run(_UDEVADM_BIN, "settle", "--timeout", str(int(timeout))) - except (subprocess.CalledProcessError, OSError): - emit.debug("udevadm settle failed or unavailable; falling back to polling") - - # Poll until all partition nodes appear or the timeout is reached. - deadline = time.monotonic() + timeout - missing = [p for p in expected if not p.exists()] - while missing and time.monotonic() < deadline: - time.sleep(0.1) - missing = [p for p in expected if not p.exists()] - - if missing: - raise errors.ImageError( - f"Loop device partition nodes did not appear within {timeout:.0f}s: " - + ", ".join(str(p) for p in missing) - ) - - class Image: """Image. @@ -144,14 +89,17 @@ def attach_loopdev(self) -> Iterator[str]: "--partscan", self.disk_path, ).stdout.strip() - wait_for_loopdev_partitions( - self.loop_device, volume_partition_nums(self.volume) - ) - emit.debug( - f"Attached image {self.disk_path} as loop device {self.loop_device}" - ) try: - yield self.loop_device + # Acquire a shared BSD lock on the loop device. udev holds an + # exclusive lock while it processes the new device; taking a shared + # lock here blocks until udev is done and then holds it for the + # duration of the context so udev does not interfere with our use. + with open(self.loop_device, "rb") as loop_fd: + fcntl.flock(loop_fd, fcntl.LOCK_SH) + emit.debug( + f"Attached image {self.disk_path} as loop device {self.loop_device}" + ) + yield self.loop_device finally: self._detach_loopdevs() diff --git a/imagecraft/services/image.py b/imagecraft/services/image.py index d0bfe588..8f8c33ee 100644 --- a/imagecraft/services/image.py +++ b/imagecraft/services/image.py @@ -16,13 +16,14 @@ import atexit import contextlib +import fcntl import json import pathlib import shutil import subprocess import time from collections.abc import Mapping -from typing import Any, cast +from typing import IO, Any, cast from craft_application import AppMetadata, AppService, ServiceFactory from craft_cli import CraftError, emit @@ -30,7 +31,6 @@ from imagecraft.models import Project from imagecraft.models.volume import PartitionSchema from imagecraft.pack import gptutil -from imagecraft.pack.image import volume_partition_nums, wait_for_loopdev_partitions from imagecraft.subprocesses import run _LOSETUP_BIN = "losetup" @@ -51,6 +51,7 @@ def __init__( self._sector_size = gptutil.SECTOR_SIZE_512 self._images: dict[str, pathlib.Path] | None = None self._loop_devices: dict[str, str] = {} + self._loop_fds: list[IO[bytes]] = [] self._atexit_registered = False def get_images(self) -> Mapping[str, pathlib.Path]: @@ -117,11 +118,9 @@ def attach_images(self) -> Mapping[str, str]: raise ValueError("Images must be created before attaching.") all_devices = self._get_all_loop_devices() - project = cast(Project, self._services.get("project").get()) for name, image_path in self._images.items(): attached_device: str | None = None - is_fresh_attach = False # 1. Check for existing devices pointing to this file. for dev in all_devices: @@ -152,7 +151,6 @@ def attach_images(self) -> Mapping[str, str]: str(image_path), ).stdout.strip() emit.debug(f"Attached {image_path} as {attached_device}") - is_fresh_attach = True except subprocess.CalledProcessError as err: raise CraftError( f"Failed to attach loop device for {image_path}.", @@ -160,11 +158,13 @@ def attach_images(self) -> Mapping[str, str]: resolution="Ensure loop devices are available and you have sufficient permissions (sudo).", ) from err - # 3. 
Wait for partition nodes after a fresh attach to avoid race conditions. - if is_fresh_attach: - wait_for_loopdev_partitions( - attached_device, volume_partition_nums(project.volumes[name]) - ) + # 3. Acquire a shared BSD lock on the loop device. udev holds an + # exclusive lock while it processes the device; a shared lock here + # blocks until udev is done and then holds it while we use the + # device's partitions, preventing udev from interfering. + loop_fd = open(attached_device, "rb") # noqa: SIM115 (held deliberately) + fcntl.flock(loop_fd, fcntl.LOCK_SH) + self._loop_fds.append(loop_fd) self._loop_devices[name] = attached_device @@ -179,6 +179,12 @@ def detach_images(self) -> None: Includes a retry loop for busy devices. Safe to call as an atexit handler. """ + # Release shared flocks so the devices are no longer considered in use. + for fd in self._loop_fds: + with contextlib.suppress(Exception): + fd.close() + self._loop_fds.clear() + for name, device in list(self._loop_devices.items()): success = False start_time = time.monotonic() diff --git a/tests/unit/pack/test_image.py b/tests/unit/pack/test_image.py index 475db750..a84eda7f 100644 --- a/tests/unit/pack/test_image.py +++ b/tests/unit/pack/test_image.py @@ -48,7 +48,8 @@ def run(cmd: str, *args: Any, **kwargs: Any) -> CompletedProcess[str]: class TestImage: def test_loopdev(self, mocker, new_dir: Path): mock_run = mocker.patch("imagecraft.pack.image.run", side_effect=run) - mock_wait = mocker.patch("imagecraft.pack.image.wait_for_loopdev_partitions") + mock_flock = mocker.patch("fcntl.flock") + mocker.patch("builtins.open", mocker.mock_open()) volume = Volume.unmarshal( { @@ -79,7 +80,8 @@ def test_loopdev(self, mocker, new_dir: Path): call("losetup", "--json"), call("losetup", "-d", "/dev/loop99"), ] - mock_wait.assert_called_once_with("loop99", [1]) + import fcntl as fcntl_mod + mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) @pytest.mark.parametrize( ("volume_data", "has_data_partition"), @@ -251,92 +253,3 @@ def test_has_boot_partition( assert image.has_boot_partition == has_boot_partition - -class TestWaitForLoopdevPartitions: - """Tests for the wait_for_loopdev_partitions helper.""" - - def test_empty_partition_list_returns_immediately(self, mocker): - """No waiting when there are no partitions to check.""" - from imagecraft.pack.image import wait_for_loopdev_partitions - - mock_run = mocker.patch("imagecraft.pack.image.run") - mock_sleep = mocker.patch("time.sleep") - - wait_for_loopdev_partitions("/dev/loop0", []) - - mock_run.assert_not_called() - mock_sleep.assert_not_called() - - def test_waits_until_partitions_appear(self, mocker, tmp_path): - """Polls until partition device nodes exist.""" - from imagecraft.pack.image import wait_for_loopdev_partitions - - mocker.patch("imagecraft.pack.image.run") - - part1 = tmp_path / "loop0p1" - part2 = tmp_path / "loop0p2" - - # Simulate partition nodes appearing after two poll cycles. 
- call_count = 0 - - def fake_exists(self: Path) -> bool: - nonlocal call_count - call_count += 1 - # Make nodes appear on the third check (after first poll cycle) - return call_count > 4 - - mocker.patch.object(Path, "exists", fake_exists) - mocker.patch("time.sleep") - - wait_for_loopdev_partitions( - str(tmp_path / "loop0"), [1, 2], timeout=5.0 - ) - - def test_raises_on_timeout(self, mocker): - """Raises ImageError when partition nodes don't appear in time.""" - from imagecraft.errors import ImageError - from imagecraft.pack.image import wait_for_loopdev_partitions - - mocker.patch("imagecraft.pack.image.run") - mocker.patch.object(Path, "exists", return_value=False) - mocker.patch("time.sleep") - # Simulate timeout immediately - mocker.patch( - "time.monotonic", - side_effect=[0.0, 0.0, 100.0], - ) - - with pytest.raises(ImageError, match="partition nodes did not appear"): - wait_for_loopdev_partitions("/dev/loop0", [1, 2], timeout=1.0) - - def test_udevadm_failure_falls_back_to_polling(self, mocker, tmp_path): - """Falls back to polling when udevadm settle fails.""" - import subprocess - - from imagecraft.pack.image import wait_for_loopdev_partitions - - part = tmp_path / "loop0p1" - part.touch() - - # udevadm settle raises CalledProcessError - mocker.patch( - "imagecraft.pack.image.run", - side_effect=subprocess.CalledProcessError(1, "udevadm"), - ) - mocker.patch.object(Path, "exists", return_value=True) - - # Should not raise even though udevadm failed - wait_for_loopdev_partitions(str(tmp_path / "loop0"), [1], timeout=5.0) - - def test_calls_udevadm_settle_with_timeout(self, mocker, tmp_path): - """Calls udevadm settle with the specified timeout.""" - from unittest.mock import call as mcall - - from imagecraft.pack.image import wait_for_loopdev_partitions - - mock_run = mocker.patch("imagecraft.pack.image.run") - mocker.patch.object(Path, "exists", return_value=True) - - wait_for_loopdev_partitions("/dev/loop5", [1, 2], timeout=7.0) - - mock_run.assert_called_once_with("udevadm", "settle", "--timeout", "7") diff --git a/tests/unit/services/test_image.py b/tests/unit/services/test_image.py index c0c30b02..f207b697 100644 --- a/tests/unit/services/test_image.py +++ b/tests/unit/services/test_image.py @@ -104,6 +104,8 @@ def test_attach_images_new(image_service, project_dir, mocker): mocker.patch.object(image_service, "_get_all_loop_devices", return_value=[]) mock_run.return_value.stdout = "/dev/loop8\n" + mock_flock = mocker.patch("fcntl.flock") + mocker.patch("builtins.open", return_value=mocker.MagicMock()) with patch("atexit.register") as mock_atexit: devices = image_service.attach_images() @@ -118,6 +120,9 @@ def test_attach_images_new(image_service, project_dir, mocker): ) mock_atexit.assert_called_once_with(image_service.detach_images) + import fcntl as fcntl_mod + mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) + def test_attach_images_reuse(image_service, project_dir, mocker): image_path = project_dir / ".pc.img.tmp" @@ -134,12 +139,17 @@ def test_attach_images_reuse(image_service, project_dir, mocker): # Mock samefile to return True mocker.patch("pathlib.Path.samefile", return_value=True) mock_run = mocker.patch("imagecraft.services.image.run") + mock_flock = mocker.patch("fcntl.flock") + mocker.patch("builtins.open", return_value=mocker.MagicMock()) devices = image_service.attach_images() assert devices == {"pc": "/dev/loop9"} mock_run.assert_not_called() # Should not call losetup attach + import fcntl as fcntl_mod + 
mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) + def test_attach_images_stale_inode(image_service, project_dir, mocker): image_path = project_dir / ".pc.img.tmp" @@ -157,7 +167,8 @@ def test_attach_images_stale_inode(image_service, project_dir, mocker): mocker.patch("pathlib.Path.samefile", side_effect=FileNotFoundError) mock_run = mocker.patch("imagecraft.services.image.run") mock_run.return_value.stdout = "/dev/loop11\n" - mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") + mocker.patch("fcntl.flock") + mocker.patch("builtins.open", return_value=mocker.MagicMock()) devices = image_service.attach_images() @@ -170,45 +181,28 @@ def test_attach_images_stale_inode(image_service, project_dir, mocker): ) -def test_attach_images_waits_for_partitions(image_service, project_dir, mocker, mock_project, mock_services): - """attach_images() waits for partition nodes after a fresh losetup attach.""" +def test_attach_images_flock_released_on_detach(image_service, project_dir, mocker): + """attach_images() holds a shared flock; detach_images() closes the fd to release it.""" image_service._images = {"pc": project_dir / ".pc.img.tmp"} mocker.patch.object(image_service, "_get_all_loop_devices", return_value=[]) mock_run = mocker.patch("imagecraft.services.image.run") mock_run.return_value.stdout = "/dev/loop8\n" - - mock_project_service = MagicMock() - mock_project_service.get.return_value = mock_project - mock_services.get.return_value = mock_project_service - - mock_wait = mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") + mocker.patch("fcntl.flock") + mock_fd = mocker.MagicMock() + mocker.patch("builtins.open", return_value=mock_fd) with patch("atexit.register"): image_service.attach_images() - # efi has no explicit partition_number -> 1; rootfs has partition_number=2 -> 2 - mock_wait.assert_called_once_with("/dev/loop8", [1, 2]) - - -def test_attach_images_no_wait_on_reuse(image_service, project_dir, mocker): - """attach_images() does NOT wait when reusing an existing loop device.""" - image_path = project_dir / ".pc.img.tmp" - image_path.touch() - image_service._images = {"pc": image_path} - - mocker.patch.object( - image_service, - "_get_all_loop_devices", - return_value=[{"name": "/dev/loop9", "back-file": str(image_path)}], - ) - mocker.patch("pathlib.Path.samefile", return_value=True) - mocker.patch("imagecraft.services.image.run") - mock_wait = mocker.patch("imagecraft.services.image.wait_for_loopdev_partitions") + assert len(image_service._loop_fds) == 1 - image_service.attach_images() + # After detach the fd should be closed and the list cleared + mock_run.return_value = mocker.MagicMock() # losetup -d + image_service.detach_images() - mock_wait.assert_not_called() + mock_fd.close.assert_called_once() + assert image_service._loop_fds == [] def test_detach_images_success(image_service, mocker): From 3814238b2b540fee2dba0cc2a43c47d151269fd3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 05:45:42 +0000 Subject: [PATCH 6/6] fix: close fd on flock failure and move fcntl imports to module level - Wrap fcntl.flock in try/except in attach_images() to close the fd if flock raises, preventing a resource leak - Move inline `import fcntl` statements to module level in both test files Agent-Logs-Url: https://github.com/canonical/imagecraft/sessions/1d13f3fb-01b2-4424-b470-cbc2611b70e0 Co-authored-by: lengau <4305943+lengau@users.noreply.github.com> --- 
imagecraft/services/image.py | 6 +++++- tests/unit/pack/test_image.py | 4 ++-- tests/unit/services/test_image.py | 7 +++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/imagecraft/services/image.py b/imagecraft/services/image.py index 8f8c33ee..83dff47f 100644 --- a/imagecraft/services/image.py +++ b/imagecraft/services/image.py @@ -163,7 +163,11 @@ def attach_images(self) -> Mapping[str, str]: # blocks until udev is done and then holds it while we use the # device's partitions, preventing udev from interfering. loop_fd = open(attached_device, "rb") # noqa: SIM115 (held deliberately) - fcntl.flock(loop_fd, fcntl.LOCK_SH) + try: + fcntl.flock(loop_fd, fcntl.LOCK_SH) + except Exception: + loop_fd.close() + raise self._loop_fds.append(loop_fd) self._loop_devices[name] = attached_device diff --git a/tests/unit/pack/test_image.py b/tests/unit/pack/test_image.py index a84eda7f..0fe103c8 100644 --- a/tests/unit/pack/test_image.py +++ b/tests/unit/pack/test_image.py @@ -17,6 +17,7 @@ from typing import Any from unittest.mock import call +import fcntl import pytest from imagecraft.models import Volume from imagecraft.pack.image import Image @@ -80,8 +81,7 @@ def test_loopdev(self, mocker, new_dir: Path): call("losetup", "--json"), call("losetup", "-d", "/dev/loop99"), ] - import fcntl as fcntl_mod - mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) + mock_flock.assert_called_once_with(mocker.ANY, fcntl.LOCK_SH) @pytest.mark.parametrize( ("volume_data", "has_data_partition"), diff --git a/tests/unit/services/test_image.py b/tests/unit/services/test_image.py index f207b697..dfcadd6b 100644 --- a/tests/unit/services/test_image.py +++ b/tests/unit/services/test_image.py @@ -12,6 +12,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import fcntl import subprocess from unittest.mock import MagicMock, patch @@ -120,8 +121,7 @@ def test_attach_images_new(image_service, project_dir, mocker): ) mock_atexit.assert_called_once_with(image_service.detach_images) - import fcntl as fcntl_mod - mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) + mock_flock.assert_called_once_with(mocker.ANY, fcntl.LOCK_SH) def test_attach_images_reuse(image_service, project_dir, mocker): @@ -147,8 +147,7 @@ def test_attach_images_reuse(image_service, project_dir, mocker): assert devices == {"pc": "/dev/loop9"} mock_run.assert_not_called() # Should not call losetup attach - import fcntl as fcntl_mod - mock_flock.assert_called_once_with(mocker.ANY, fcntl_mod.LOCK_SH) + mock_flock.assert_called_once_with(mocker.ANY, fcntl.LOCK_SH) def test_attach_images_stale_inode(image_service, project_dir, mocker):
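Taken together, the series converges on the pattern sketched below. This is a minimal standalone illustration, not the project code itself: it assumes a placeholder "scratch.img" with at least one partition, root privileges for losetup, and collapses error handling into a single try/finally.

import fcntl
import subprocess
from pathlib import Path

loop_dev = subprocess.run(
    ["losetup", "--find", "--show", "--partscan", "scratch.img"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

try:
    # Take a shared BSD flock on the whole loop device. systemd-udevd holds an
    # exclusive flock while it processes the newly attached device, so acquiring
    # LOCK_SH blocks until that processing is done, and keeping the file open
    # holds the lock while the partitions are in use.
    with open(loop_dev, "rb") as loop_fd:
        fcntl.flock(loop_fd, fcntl.LOCK_SH)
        first_partition = Path(f"{loop_dev}p1")
        print("partition node present:", first_partition.exists())
        # ... format/mount the partitions here while the lock is held ...
finally:
    subprocess.run(["losetup", "-d", loop_dev], check=True)

Releasing the lock is simply closing the file object, which mirrors what detach_images() now does before running `losetup -d`.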