From cf1f8299cb4eae6588cd8e12a47df9493c8bbece Mon Sep 17 00:00:00 2001 From: Varun Valada Date: Wed, 24 Sep 2025 09:34:04 -0500 Subject: [PATCH 1/3] Add reservation phase to device connectors --- .../devices/__init__.py | 212 ++++++------- .../devices/multi/multi.py | 45 ++- .../devices/multi/tests/test_multi.py | 124 ++++++++ .../devices/tests/test_devices.py | 282 +++++++++++++++++- 4 files changed, 552 insertions(+), 111 deletions(-) diff --git a/device-connectors/src/testflinger_device_connectors/devices/__init__.py b/device-connectors/src/testflinger_device_connectors/devices/__init__.py index f16d2e2b0..e99771407 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/__init__.py +++ b/device-connectors/src/testflinger_device_connectors/devices/__init__.py @@ -72,6 +72,114 @@ def SerialLogger(host=None, port=None, filename=None): return StubSerialLogger(host, port, filename) +def import_ssh_key(key: str, keyfile: str = "key.pub") -> None: + """Import SSH key provided in Reserve data. + + :param key: SSH key to import. + :param keyfile: Output file where to store the imported key + :raises RuntimeError: If failure during import ssh keys + """ + cmd = ["ssh-import-id", "-o", keyfile, key] + for retry in range(10): + try: + subprocess.run( + cmd, + timeout=30, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + check=True, + ) + logger.info("Successfully imported key: %s", key) + break + + except subprocess.TimeoutExpired: + pass + except subprocess.CalledProcessError as exc: + output = (exc.stdout or b"").decode() + if "status_code=404" in output: + raise RuntimeError( + f"Failed to import ssh key: {key}. User not found." + ) from exc + + logger.error("Unable to import ssh key from: %s", key) + logger.info("Retrying...") + time.sleep(min(2**retry, 100)) + else: + raise RuntimeError( + f"Failed to import ssh key: {key}. 
Maximum retries reached" + ) + + +def copy_ssh_key( + device_ip: str, + username: str, + password: Optional[str] = None, + key: Optional[str] = None, +): + """If provided, copy the SSH `key` to the DUT, + otherwise copy the agent's using password authentication. + + :raises RuntimeError in case it can't copy the SSH keys + """ + if not key and not password: + raise ValueError("Cannot copy the agent's SSH key w/o password") + + if password: + cmd = ["sshpass", "-p", password] + else: + cmd = [] + + cmd.extend(["ssh-copy-id", "-f"]) + + if key: + cmd.extend(["-i", key]) + + cmd.extend( + [ + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "{}@{}".format(username, device_ip), + ] + ) + + for _retry in range(10): + # Retry ssh key copy just in case it's rebooting + try: + subprocess.check_call(cmd, timeout=30) + break + except ( + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + ): + logger.error("Error copying ssh key to device for: %s", key) + logger.info("Retrying...") + time.sleep(60) + + else: + logger.error("Failed to copy ssh key: %s", key) + raise RuntimeError + + +def copy_ssh_keys_to_devices(ssh_keys, device_ips, test_username="ubuntu"): + """Copy list of ssh keys to list of devices.""" + for key in ssh_keys: + with contextlib.suppress(FileNotFoundError): + os.unlink("key.pub") + + try: + # Import SSH Keys with ssh-import-id + import_ssh_key(key, keyfile="key.pub") + + # Attempt to copy keys only if import succeeds + with contextlib.suppress(RuntimeError): + for device_ip in device_ips: + copy_ssh_key(device_ip, test_username, key="key.pub") + except RuntimeError as exc: + logger.error(exc) + + class StubSerialLogger: """Fake SerialLogger when we don't have Serial Logger data defined.""" @@ -269,95 +377,6 @@ def allocate(self): """Allocate devices for multi-agent jobs (default method).""" pass - def import_ssh_key(self, key: str, keyfile: str = "key.pub") -> None: - """Import SSH key provided in Reserve data. 
- - :param key: SSH key to import. - :param keyfile: Output file where to store the imported key - :raises RuntimeError: If failure during import ssh keys - """ - cmd = ["ssh-import-id", "-o", keyfile, key] - for retry in range(10): - try: - subprocess.run( - cmd, - timeout=30, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - check=True, - ) - logger.info("Successfully imported key: %s", key) - break - - except subprocess.TimeoutExpired: - pass - except subprocess.CalledProcessError as exc: - output = (exc.stdout or b"").decode() - if "status_code=404" in output: - raise RuntimeError( - f"Failed to import ssh key: {key}. User not found." - ) from exc - - logger.error("Unable to import ssh key from: %s", key) - logger.info("Retrying...") - time.sleep(min(2**retry, 100)) - else: - raise RuntimeError( - f"Failed to import ssh key: {key}. Maximum retries reached" - ) - - def copy_ssh_key( - self, - device_ip: str, - username: str, - password: Optional[str] = None, - key: Optional[str] = None, - ): - """If provided, copy the SSH `key` to the DUT, - otherwise copy the agent's using password authentication. 
- - :raises RuntimeError in case it can't copy the SSH keys - """ - if not key and not password: - raise ValueError("Cannot copy the agent's SSH key w/o password") - - if password: - cmd = ["sshpass", "-p", password] - else: - cmd = [] - - cmd.extend(["ssh-copy-id", "-f"]) - - if key: - cmd.extend(["-i", key]) - - cmd.extend( - [ - "-o", - "StrictHostKeyChecking=no", - "-o", - "UserKnownHostsFile=/dev/null", - "{}@{}".format(username, device_ip), - ] - ) - - for _retry in range(10): - # Retry ssh key copy just in case it's rebooting - try: - subprocess.check_call(cmd, timeout=30) - break - except ( - subprocess.CalledProcessError, - subprocess.TimeoutExpired, - ): - logger.error("Error copying ssh key to device for: %s", key) - logger.info("Retrying...") - time.sleep(60) - - else: - logger.error("Failed to copy ssh key: %s", key) - raise RuntimeError - def reserve(self, args): """Reserve systems (default method).""" with open(args.config) as configfile: @@ -373,20 +392,7 @@ def reserve(self, args): device_ip = config["device_ip"] reserve_data = job_data["reserve_data"] ssh_keys = reserve_data.get("ssh_keys", []) - for key in ssh_keys: - with contextlib.suppress(FileNotFoundError): - os.unlink("key.pub") - - try: - # Import SSH Keys with ssh-import-id - self.import_ssh_key(key, keyfile="key.pub") - - # Attempt to copy keys only if import succeeds - with contextlib.suppress(RuntimeError): - self.copy_ssh_key(device_ip, test_username, key="key.pub") - except RuntimeError as exc: - logger.error(exc) - + copy_ssh_keys_to_devices(ssh_keys, [device_ip], test_username) # default reservation timeout is 1 hour timeout = int(reserve_data.get("timeout", "3600")) serial_host = config.get("serial_host") diff --git a/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py b/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py index 7e805d2a9..d088bc5e1 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py 
+++ b/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py @@ -18,10 +18,14 @@ import logging import os import time +from datetime import datetime, timedelta import requests -from testflinger_device_connectors.devices import ProvisioningError +from testflinger_device_connectors.devices import ( + ProvisioningError, + copy_ssh_keys_to_devices, +) logger = logging.getLogger(__name__) @@ -80,6 +84,45 @@ def provision(self): self.save_job_list_file() + def reserve(self): + """Push ssh keys to each device in reservation phase.""" + logger.info("BEGIN multi device reservation") + job_data = self.job_data + try: + test_username = job_data["test_data"]["test_username"] + except KeyError: + test_username = "ubuntu" + reserve_data = job_data["reserve_data"] + ssh_keys = reserve_data.get("ssh_keys", []) + with open("job_list.json", "r") as json_file: + job_list = json.load(json_file) + device_ips = [job["device_info"]["device_ip"] for job in job_list] + copy_ssh_keys_to_devices(ssh_keys, device_ips, test_username) + print("*** TESTFLINGER SYSTEMS RESERVED ***") + print("You can now connect to the following devices:") + for job in job_list: + device_ip = job["device_info"]["device_ip"] + print(f"{test_username}@{device_ip}") + + timeout = int(reserve_data.get("timeout", "3600")) + now = datetime.now().astimezone().isoformat() + expire_time = ( + datetime.now().astimezone() + timedelta(seconds=timeout) + ).isoformat() + print("Current time: [{}]".format(now)) + print("Reservation expires at: [{}]".format(expire_time)) + print( + "Reservation will automatically timeout in {} seconds".format( + timeout + ) + ) + job_id = job_data.get("job_id", "") + print( + "To end the reservation sooner use: " + + "testflinger-cli cancel {}".format(job_id) + ) + time.sleep(timeout) + def terminate_if_parent_completed(self): """If parent job is completed or cancelled, cancel sub jobs.""" if self.this_job_completed(): diff --git 
a/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py b/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py index f5a516f73..48825ab22 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py +++ b/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py @@ -14,6 +14,9 @@ """Unit tests for multi-device support code.""" +import json +import tempfile +from unittest.mock import patch from uuid import uuid4 import pytest @@ -98,3 +101,124 @@ def test_this_job_completed(): incomplete_client.get_status = lambda job_id: "something else" test_agent = Multi(test_config, job_data, incomplete_client) assert test_agent.this_job_completed() is False + + +@patch( + "testflinger_device_connectors.devices.multi.multi.copy_ssh_keys_to_devices" +) +@patch("time.sleep") +def test_multi_reserve(mock_sleep, mock_copy_keys): + """Test Multi.reserve method functionality.""" + test_config = {"agent_name": "test_agent"} + job_data = { + "job_id": "test-job-123", + "reserve_data": {"ssh_keys": ["key1", "key2"], "timeout": "1800"}, + "test_data": {"test_username": "testuser"}, + } + + # Create job_list.json file with mock data + job_list = [ + {"device_info": {"device_ip": "192.168.1.1"}}, + {"device_info": {"device_ip": "192.168.1.2"}}, + ] + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp_file: + json.dump(job_list, tmp_file) + + # Mock builtins.open to return our temp file + # when job_list.json is requested + with patch("builtins.open", create=True) as mo: + mo.return_value.__enter__.return_value.read.return_value = json.dumps( + job_list + ) + + # Mock print to capture output + with patch("builtins.print") as mock_print: + test_agent = Multi( + test_config, job_data, MockTFClient("http://localhost") + ) + test_agent.reserve() + + # Verify copy_ssh_keys_to_devices was called with correct parameters + 
mock_copy_keys.assert_called_once_with( + ["key1", "key2"], ["192.168.1.1", "192.168.1.2"], "testuser" + ) + + # Verify time.sleep was called with timeout + mock_sleep.assert_called_once_with(1800) + + # Verify print statements were made + assert ( + mock_print.call_count >= 5 + ) # Multiple print statements in reserve method + + +@patch( + "testflinger_device_connectors.devices.multi.multi.copy_ssh_keys_to_devices" +) +@patch("time.sleep") +def test_multi_reserve_default_username(mock_sleep, mock_copy_keys): + """Test Multi.reserve method with default username.""" + test_config = {"agent_name": "test_agent"} + job_data = { + "job_id": "test-job-123", + "reserve_data": {"ssh_keys": ["key1"], "timeout": "3600"}, + # No test_data section - should default to ubuntu + } + + job_list = [{"device_info": {"device_ip": "192.168.1.1"}}] + + with patch("builtins.open", create=True) as mock_open: + mock_open.return_value.__enter__.return_value.read.return_value = ( + json.dumps(job_list) + ) + + with patch("builtins.print"): + test_agent = Multi( + test_config, job_data, MockTFClient("http://localhost") + ) + test_agent.reserve() + + # Verify copy_ssh_keys_to_devices was called + mock_copy_keys.assert_called_once_with(["key1"], ["192.168.1.1"], "ubuntu") + + # Verify time.sleep was called with timeout + mock_sleep.assert_called_once_with(3600) + + +@patch( + "testflinger_device_connectors.devices.multi.multi.copy_ssh_keys_to_devices" +) +@patch("time.sleep") +def test_multi_reserve_no_ssh_keys(mock_sleep, mock_copy_keys): + """Test Multi.reserve method with no SSH keys.""" + test_config = {"agent_name": "test_agent"} + job_data = { + "job_id": "test-job-123", + "reserve_data": { + "timeout": "1800" + # No ssh_keys provided + }, + "test_data": {"test_username": "testuser"}, + } + + job_list = [{"device_info": {"device_ip": "192.168.1.1"}}] + + with patch("builtins.open", create=True) as mock_open: + mock_open.return_value.__enter__.return_value.read.return_value = ( + 
json.dumps(job_list) + ) + + with patch("builtins.print"): + test_agent = Multi( + test_config, job_data, MockTFClient("http://localhost") + ) + test_agent.reserve() + + # Verify copy_ssh_keys_to_devices was called with empty list + mock_copy_keys.assert_called_once_with([], ["192.168.1.1"], "testuser") + + # Verify time.sleep was called with timeout + mock_sleep.assert_called_once_with(1800) diff --git a/device-connectors/src/testflinger_device_connectors/devices/tests/test_devices.py b/device-connectors/src/testflinger_device_connectors/devices/tests/test_devices.py index 54dc4be59..c1d5e8883 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/tests/test_devices.py +++ b/device-connectors/src/testflinger_device_connectors/devices/tests/test_devices.py @@ -18,7 +18,11 @@ import unittest from unittest.mock import Mock, call, patch -from testflinger_device_connectors.devices import DefaultDevice +from testflinger_device_connectors.devices import ( + DefaultDevice, + copy_ssh_key, + copy_ssh_keys_to_devices, +) class DefaultDeviceTests(unittest.TestCase): @@ -30,8 +34,8 @@ def test_copy_ssh_id(self, mock_check): to the DUT. """ fake_config = {"device_ip": "10.10.10.10", "agent_name": "fake_agent"} - connector = DefaultDevice(fake_config) - connector.copy_ssh_key( + DefaultDevice(fake_config) + copy_ssh_key( "192.168.1.2", "username", "password", @@ -50,8 +54,8 @@ def test_copy_ssh_id_with_key(self, mock_check): to the DUT. """ fake_config = {"device_ip": "10.10.10.10", "agent_name": "fake_agent"} - connector = DefaultDevice(fake_config) - connector.copy_ssh_key( + DefaultDevice(fake_config) + copy_ssh_key( "192.168.1.2", "username", key="key.pub", @@ -72,12 +76,12 @@ def test_copy_ssh_id_raises(self, mock_check): exception after 3 failed attempts. 
""" fake_config = {"device_ip": "10.10.10.10", "agent_name": "fake_agent"} - connector = DefaultDevice(fake_config) + DefaultDevice(fake_config) mock_check.side_effect = subprocess.CalledProcessError(1, "") with self.assertRaises(RuntimeError): - connector.copy_ssh_key( + copy_ssh_key( "192.168.1.2", "username", "password", @@ -103,3 +107,267 @@ def test_write_device_info(self): assert all( device_info[key] == value for key, value in fake_config.items() ) + + +class CopySshKeysToDevicesTests(unittest.TestCase): + """Unit tests for copy_ssh_keys_to_devices function.""" + + @patch("testflinger_device_connectors.devices.copy_ssh_key") + @patch("testflinger_device_connectors.devices.import_ssh_key") + @patch("os.unlink") + def test_copy_ssh_keys_to_devices_success( + self, mock_unlink, mock_import, mock_copy + ): + """Test successful copying of SSH keys to multiple devices.""" + ssh_keys = ["key1", "key2"] + device_ips = ["192.168.1.1", "192.168.1.2"] + + copy_ssh_keys_to_devices(ssh_keys, device_ips, "testuser") + + # Should unlink key.pub twice (once per key) + assert mock_unlink.call_count == 2 + mock_unlink.assert_has_calls([call("key.pub"), call("key.pub")]) + + # Should import each key + mock_import.assert_has_calls( + [call("key1", keyfile="key.pub"), call("key2", keyfile="key.pub")] + ) + + # Should copy each key to each device (2 keys * 2 devices = 4 calls) + expected_copy_calls = [ + call("192.168.1.1", "testuser", key="key.pub"), + call("192.168.1.2", "testuser", key="key.pub"), + call("192.168.1.1", "testuser", key="key.pub"), + call("192.168.1.2", "testuser", key="key.pub"), + ] + mock_copy.assert_has_calls(expected_copy_calls) + + @patch("testflinger_device_connectors.devices.copy_ssh_key") + @patch("testflinger_device_connectors.devices.import_ssh_key") + @patch("os.unlink") + def test_copy_ssh_keys_to_devices_import_failure( + self, mock_unlink, mock_import, mock_copy + ): + """Test handling of import_ssh_key failure.""" + ssh_keys = ["key1"] + 
device_ips = ["192.168.1.1"] + + # Make import_ssh_key raise RuntimeError + mock_import.side_effect = RuntimeError("Import failed") + + copy_ssh_keys_to_devices(ssh_keys, device_ips, "testuser") + + # Should still attempt to unlink + mock_unlink.assert_called_once_with("key.pub") + + # Should attempt to import + mock_import.assert_called_once_with("key1", keyfile="key.pub") + + # Should not attempt to copy since import failed + mock_copy.assert_not_called() + + @patch("testflinger_device_connectors.devices.copy_ssh_key") + @patch("testflinger_device_connectors.devices.import_ssh_key") + @patch("os.unlink") + def test_copy_ssh_keys_to_devices_copy_failure( + self, mock_unlink, mock_import, mock_copy + ): + """Test handling of copy_ssh_key failure.""" + ssh_keys = ["key1"] + device_ips = ["192.168.1.1"] + + # Make copy_ssh_key raise RuntimeError + mock_copy.side_effect = RuntimeError("Copy failed") + + copy_ssh_keys_to_devices(ssh_keys, device_ips, "testuser") + + # Should unlink and import successfully + mock_unlink.assert_called_once_with("key.pub") + mock_import.assert_called_once_with("key1", keyfile="key.pub") + + # Should attempt to copy but fail gracefully + mock_copy.assert_called_once_with( + "192.168.1.1", "testuser", key="key.pub" + ) + + @patch("testflinger_device_connectors.devices.copy_ssh_key") + @patch("testflinger_device_connectors.devices.import_ssh_key") + @patch("os.unlink", side_effect=FileNotFoundError) + def test_copy_ssh_keys_to_devices_file_not_found( + self, mock_unlink, mock_import, mock_copy + ): + """Test handling when key.pub file doesn't exist.""" + ssh_keys = ["key1"] + device_ips = ["192.168.1.1"] + + copy_ssh_keys_to_devices(ssh_keys, device_ips, "testuser") + + # Should attempt to unlink but suppress FileNotFoundError + mock_unlink.assert_called_once_with("key.pub") + + # Should continue with import and copy + mock_import.assert_called_once_with("key1", keyfile="key.pub") + mock_copy.assert_called_once_with( + "192.168.1.1", 
"testuser", key="key.pub" + ) + + @patch("testflinger_device_connectors.devices.copy_ssh_key") + @patch("testflinger_device_connectors.devices.import_ssh_key") + @patch("os.unlink") + def test_copy_ssh_keys_to_devices_empty_lists( + self, mock_unlink, mock_import, mock_copy + ): + """Test with empty SSH keys and device lists.""" + copy_ssh_keys_to_devices([], []) + + # Should not call any functions + mock_unlink.assert_not_called() + mock_import.assert_not_called() + mock_copy.assert_not_called() + + +class DefaultDeviceReserveTests(unittest.TestCase): + """Unit tests for DefaultDevice.reserve method.""" + + @patch("testflinger_device_connectors.devices.copy_ssh_keys_to_devices") + @patch("time.sleep") + def test_reserve_with_ssh_keys(self, mock_sleep, mock_copy_keys): + """Test DefaultDevice.reserve method with SSH keys.""" + config_data = {"device_ip": "192.168.1.10", "agent_name": "test_agent"} + + job_data = { + "reserve_data": {"ssh_keys": ["key1", "key2"], "timeout": "1800"}, + "test_data": {"test_username": "testuser"}, + } + + # Create a mock args object + mock_args = Mock() + mock_args.config = "test_config.yaml" + + # Mock file operations + with ( + patch( + "testflinger_device_connectors.get_test_opportunity", + return_value=job_data, + ), + patch("builtins.open") as mock_open, + ): + # Mock the config file read + mock_open.return_value.__enter__.return_value = Mock() + mock_open.return_value.__enter__.return_value.read.return_value = ( + '{"device_ip": "192.168.1.10"}' + ) + + with ( + patch( + "yaml.safe_load", + return_value={"device_ip": "192.168.1.10"}, + ), + patch("builtins.print"), + ): + device = DefaultDevice(config_data) + device.reserve(mock_args) + + # Verify copy_ssh_keys_to_devices was called with correct parameters + mock_copy_keys.assert_called_once_with( + ["key1", "key2"], ["192.168.1.10"], "testuser" + ) + + # Verify sleep was called with timeout + mock_sleep.assert_called_once_with(1800) + + 
@patch("testflinger_device_connectors.devices.copy_ssh_keys_to_devices") + @patch("time.sleep") + def test_reserve_no_ssh_keys(self, mock_sleep, mock_copy_keys): + """Test DefaultDevice.reserve method with no SSH keys.""" + config_data = {"device_ip": "192.168.1.10", "agent_name": "test_agent"} + + job_data = { + "reserve_data": { + "timeout": "3600" + # No ssh_keys provided + }, + "test_data": {"test_username": "testuser"}, + } + + mock_args = Mock() + mock_args.config = "test_config.yaml" + + with ( + patch( + "testflinger_device_connectors.get_test_opportunity", + return_value=job_data, + ), + patch("builtins.open") as mock_open, + ): + # Mock the config file read + mock_open.return_value.__enter__.return_value = Mock() + mock_open.return_value.__enter__.return_value.read.return_value = ( + '{"device_ip": "192.168.1.10"}' + ) + + with ( + patch( + "yaml.safe_load", + return_value={"device_ip": "192.168.1.10"}, + ), + patch("builtins.print"), + ): + device = DefaultDevice(config_data) + device.reserve(mock_args) + + # Verify copy_ssh_keys_to_devices was called with empty list + mock_copy_keys.assert_called_once_with( + [], ["192.168.1.10"], "testuser" + ) + + # Verify sleep was called with timeout + mock_sleep.assert_called_once_with(3600) + + @patch("testflinger_device_connectors.devices.copy_ssh_keys_to_devices") + @patch("time.sleep") + def test_reserve_default_timeout(self, mock_sleep, mock_copy_keys): + """Test DefaultDevice.reserve method with default timeout.""" + config_data = {"device_ip": "192.168.1.10", "agent_name": "test_agent"} + + job_data = { + "reserve_data": { + "ssh_keys": ["key1"] + # No timeout provided - should default to 3600 + }, + "test_data": {"test_username": "testuser"}, + } + + mock_args = Mock() + mock_args.config = "test_config.yaml" + + with ( + patch( + "testflinger_device_connectors.get_test_opportunity", + return_value=job_data, + ), + patch("builtins.open") as mock_open, + ): + # Mock the config file read + 
mock_open.return_value.__enter__.return_value = Mock() + mock_open.return_value.__enter__.return_value.read.return_value = ( + '{"device_ip": "192.168.1.10"}' + ) + + with ( + patch( + "yaml.safe_load", + return_value={"device_ip": "192.168.1.10"}, + ), + patch("builtins.print"), + ): + device = DefaultDevice(config_data) + device.reserve(mock_args) + + # Verify copy_ssh_keys_to_devices was called + mock_copy_keys.assert_called_once_with( + ["key1"], ["192.168.1.10"], "testuser" + ) + + # Verify sleep was called with default timeout of 3600 + mock_sleep.assert_called_once_with(3600) From ccfde58e93a3e9251fc52197d89c294d1e1828ef Mon Sep 17 00:00:00 2001 From: Varun Valada Date: Wed, 27 Aug 2025 14:20:43 -0500 Subject: [PATCH 2/3] Create new endpoint for multi agent submission of child jobs --- .../devices/multi/multi.py | 3 +- .../devices/multi/tests/test_multi.py | 4 + .../devices/multi/tfclient.py | 13 ++ server/src/testflinger/api/v1.py | 70 +++++++++- server/src/testflinger/database.py | 16 +++ server/tests/test_v1_authorization.py | 124 ++++++++++++++++++ 6 files changed, 226 insertions(+), 4 deletions(-) diff --git a/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py b/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py index d088bc5e1..1170c318a 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py +++ b/device-connectors/src/testflinger_device_connectors/devices/multi/multi.py @@ -185,7 +185,8 @@ def create_jobs(self): updated_job = self.inject_parent_jobid(updated_job) try: - job_id = self.client.submit_job(updated_job) + # Use agent job submission for credential inheritance + job_id = self.client.submit_agent_job(updated_job) except requests.exceptions.HTTPError as exc: logger.error("Unable to create job: %s", exc.response.text) self.cancel_jobs(self.jobs) diff --git a/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py 
b/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py index 48825ab22..465cefbf6 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py +++ b/device-connectors/src/testflinger_device_connectors/devices/multi/tests/test_multi.py @@ -32,6 +32,10 @@ def submit_job(self, job_data): """Return a fake job id.""" return str(uuid4()) + def submit_agent_job(self, job_data): + """Return a fake agent job id.""" + return self.submit_job(job_data) + def test_bad_tfclient_url(): """Test that Multi raises an exception when TFClient URL is bad.""" diff --git a/device-connectors/src/testflinger_device_connectors/devices/multi/tfclient.py b/device-connectors/src/testflinger_device_connectors/devices/multi/tfclient.py index 8843b52fe..0f33a158d 100644 --- a/device-connectors/src/testflinger_device_connectors/devices/multi/tfclient.py +++ b/device-connectors/src/testflinger_device_connectors/devices/multi/tfclient.py @@ -145,6 +145,19 @@ def submit_job(self, job_data): response = self.post(endpoint, job_data) return json.loads(response).get("job_id") + def submit_agent_job(self, job_data): + """Submit a child job to the testflinger server with + credential inheritance. 
+ + :param job_data: + dict of data for the job to submit, must include parent_job_id + :return: + ID for the test job + """ + endpoint = "/v1/agent/jobs" + response = self.post(endpoint, job_data) + return json.loads(response).get("job_id") + def cancel_job(self, job_id): """Tell the server to cancel a specified job_id.""" try: diff --git a/server/src/testflinger/api/v1.py b/server/src/testflinger/api/v1.py index 4b3a3cbdb..0639f0f60 100644 --- a/server/src/testflinger/api/v1.py +++ b/server/src/testflinger/api/v1.py @@ -141,6 +141,47 @@ def validate_secrets(data: dict): data["client_id"] = client_id +@v1.post("/agent/jobs") +@authenticate +@v1.input(schemas.Job, location="json") +@v1.output(schemas.JobId) +def agent_job_post(json_data: dict): + """ + Add a child job to the queue with inherited credentials + from parent job. + """ + try: + job_queue = json_data.get("job_queue") + parent_job_id = json_data.get("parent_job_id") + except (AttributeError, BadRequest): + job_queue = "" + parent_job_id = None + + if not job_queue: + abort(422, message="Invalid data or no job_queue specified") + + if not parent_job_id: + abort(422, message="parent_job_id required for agent job submission") + + if not check_valid_uuid(parent_job_id): + abort(400, message="Invalid parent_job_id specified") + + # Retrieve parent job permissions for credential inheritance + inherited_permissions = database.retrieve_parent_permissions(parent_job_id) + + try: + job = job_builder(json_data, inherited_permissions) + except ValueError: + abort(400, message="Invalid job_id specified") + + jobs_metric.labels(queue=job_queue).inc() + if "reserve_data" in json_data: + reservations_metric.labels(queue=job_queue).inc() + + database.add_job(job) + return jsonify(job_id=job.get("job_id")) + + def has_attachments(data: dict) -> bool: """Predicate if the job described by `data` involves attachments.""" return any( @@ -151,8 +192,13 @@ def has_attachments(data: dict) -> bool: ) -def job_builder(data: dict): 
- """Build a job from a dictionary of data.""" +def job_builder(data: dict, inherited_permissions: dict = None): + """Build a job from a dictionary of data. + + :param data: Job data dictionary + :param inherited_permissions: Optional permissions inherited from + parent job + """ job = { "created_at": datetime.now(timezone.utc), "result_data": { @@ -175,12 +221,30 @@ def job_builder(data: dict): data["attachments_status"] = "waiting" priority_level = data.get("job_priority", 0) + + # Use inherited permissions if provided, otherwise use current + # user's permissions + if inherited_permissions: + permissions_to_check = inherited_permissions + else: + permissions_to_check = g.permissions auth.check_permissions( - g.permissions, + permissions_to_check, data, ) job["job_priority"] = priority_level + # Store authentication permissions for credential inheritance + if inherited_permissions: + job["auth_permissions"] = inherited_permissions + elif g.is_authenticated: + job["auth_permissions"] = g.permissions + + # Store parent job relationship if this is a child job + parent_job_id = data.get("parent_job_id") + if parent_job_id: + job["parent_job_id"] = parent_job_id + job["job_id"] = job_id job["job_data"] = data return job diff --git a/server/src/testflinger/database.py b/server/src/testflinger/database.py index a7066a36e..52b8ed568 100644 --- a/server/src/testflinger/database.py +++ b/server/src/testflinger/database.py @@ -574,3 +574,19 @@ def register_web_client(oidc_token: dict): "role": ServerRoles.CONTRIBUTOR, } ) + +def retrieve_parent_permissions(parent_job_id: str) -> dict: + """Retrieve auth permissions from parent job for credential inheritance. + + :param parent_job_id: UUID of the parent job to inherit permissions from. + :return: Dictionary with parent job's auth permissions, + or empty dict if none. 
+ """ + parent_job = mongo.db.jobs.find_one( + {"job_id": parent_job_id}, {"auth_permissions": True, "_id": False} + ) + + if not parent_job: + return {} + + return parent_job.get("auth_permissions", {}) diff --git a/server/tests/test_v1_authorization.py b/server/tests/test_v1_authorization.py index 3338af45d..e69019fb8 100644 --- a/server/tests/test_v1_authorization.py +++ b/server/tests/test_v1_authorization.py @@ -1349,3 +1349,127 @@ def test_refresh_token_last_accessed_update(mongo_app_with_permissions): last_accessed_after = token_entry_after["last_accessed"] assert last_accessed_after > last_accessed_before + + +def test_retrieve_parent_permissions_valid(mongo_app): + """Test retrieving auth permissions from valid parent job.""" + from testflinger import database + + app, mongo = mongo_app + parent_permissions = { + "client_id": "test_client", + "max_priority": {"queue1": 100}, + "allowed_queues": ["restricted_queue"], + "max_reservation_time": {"queue1": 7200}, + } + + # Insert parent job with permissions + parent_job = { + "job_id": "parent-123", + "auth_permissions": parent_permissions, + "job_data": {"job_queue": "queue1"}, + } + mongo.jobs.insert_one(parent_job) + + # Test retrieval + retrieved_permissions = database.retrieve_parent_permissions("parent-123") + assert retrieved_permissions == parent_permissions + + +def test_retrieve_parent_permissions_no_permissions(mongo_app): + """Test retrieving permissions from parent job with no auth_permissions.""" + from testflinger import database + + app, mongo = mongo_app + + # Insert parent job without permissions + parent_job = {"job_id": "parent-456", "job_data": {"job_queue": "queue1"}} + mongo.jobs.insert_one(parent_job) + + # Should return empty dict, not error + retrieved_permissions = database.retrieve_parent_permissions("parent-456") + assert retrieved_permissions == {} + + +def test_retrieve_parent_permissions_nonexistent(mongo_app): + """Test retrieving permissions from non-existent parent job.""" + 
from testflinger import database + + app, mongo = mongo_app + + # Should return empty dict, not error + retrieved_permissions = database.retrieve_parent_permissions( + "nonexistent-job" + ) + assert retrieved_permissions == {} + + +def test_agent_jobs_endpoint_with_credentials(mongo_app_with_permissions): + """Test agent endpoint submits child job with inherited credentials.""" + app, mongo, client_id, client_key, _ = mongo_app_with_permissions + + # Create parent job with permissions + token = get_access_token(app, client_id, client_key) + parent_job = {"job_queue": "myqueue2", "job_priority": 200} + parent_response = app.post( + "/v1/job", json=parent_job, headers={"Authorization": token} + ) + parent_job_id = parent_response.json.get("job_id") + + # Submit child job via agent endpoint + child_job = { + "job_queue": "myqueue2", + "job_priority": 200, + "parent_job_id": parent_job_id, + } + child_response = app.post("/v1/agent/jobs", json=child_job) + assert child_response.status_code == 200 + + child_job_id = child_response.json.get("job_id") + assert child_job_id is not None + + # Verify child job inherited parent permissions + child_job_data = mongo.jobs.find_one({"job_id": child_job_id}) + assert child_job_data["parent_job_id"] == parent_job_id + assert "auth_permissions" in child_job_data + + +def test_agent_jobs_endpoint_missing_parent_job_id(mongo_app): + """Test agent endpoint rejects jobs without parent_job_id.""" + app, _ = mongo_app + + child_job = {"job_queue": "myqueue"} + response = app.post("/v1/agent/jobs", json=child_job) + assert response.status_code == 422 + assert "parent_job_id required" in response.get_json()["message"] + + +def test_agent_jobs_endpoint_invalid_parent_job_id(mongo_app): + """Test agent endpoint rejects jobs with invalid parent_job_id.""" + app, _ = mongo_app + + child_job = {"job_queue": "myqueue", "parent_job_id": "invalid-uuid"} + response = app.post("/v1/agent/jobs", json=child_job) + assert response.status_code == 400 + 
assert "Invalid parent_job_id" in response.get_json()["message"] + + +def test_agent_jobs_endpoint_no_parent_permissions(mongo_app): + """Test agent endpoint works when parent has no permissions.""" + app, mongo = mongo_app + + # Create parent job without authentication (no permissions) + parent_job = {"job_queue": "myqueue"} + parent_response = app.post("/v1/job", json=parent_job) + parent_job_id = parent_response.json.get("job_id") + + # Submit child job via agent endpoint + child_job = {"job_queue": "myqueue", "parent_job_id": parent_job_id} + child_response = app.post("/v1/agent/jobs", json=child_job) + assert child_response.status_code == 200 + + # Verify child job was created successfully + child_job_id = child_response.json.get("job_id") + child_job_data = mongo.jobs.find_one({"job_id": child_job_id}) + assert child_job_data["parent_job_id"] == parent_job_id + assert "auth_permissions" not in child_job_data From 649274205929298e5243e2bbaf539266478c1ef1 Mon Sep 17 00:00:00 2001 From: Varun Valada Date: Wed, 22 Oct 2025 14:59:23 -0500 Subject: [PATCH 3/3] Added multi device agent documentation --- docs/how-to/index.rst | 1 + docs/how-to/multi-device-jobs.rst | 308 ++++++++++++++++++++++ docs/reference/device-connector-types.rst | 85 +++++- docs/reference/job-schema.rst | 11 +- server/API.md | 37 +++ 5 files changed, 439 insertions(+), 3 deletions(-) create mode 100644 docs/how-to/multi-device-jobs.rst diff --git a/docs/how-to/index.rst b/docs/how-to/index.rst index 56695a4d2..7f15528ad 100644 --- a/docs/how-to/index.rst +++ b/docs/how-to/index.rst @@ -15,6 +15,7 @@ Work with jobs via Testflinger CLI cancel-job reserve-job search-job + multi-device-jobs job-priority authentication manage-client-permissions diff --git a/docs/how-to/multi-device-jobs.rst b/docs/how-to/multi-device-jobs.rst new file mode 100644 index 000000000..28b3941c9 --- /dev/null +++ b/docs/how-to/multi-device-jobs.rst @@ -0,0 +1,308 @@ +Multi-device jobs +================== + +Multi-device 
jobs allow you to coordinate testing across multiple devices simultaneously. This is useful for scenarios like: + +- Network testing between multiple machines +- Client-server application testing +- Distributed system testing +- Multi-node cluster testing + +Overview +-------- + +The ``multi`` device connector orchestrates multiple child jobs, each running on a separate device. The parent job coordinates the provisioning, allocation, and optional reservation of all devices before running coordinated tests. + +Workflow +-------- + +Multi-device jobs follow this phase sequence: + +1. **Provision phase**: The multi connector creates and submits child jobs for each device specified in ``provision_data.jobs``. Each child job is submitted to its respective queue and automatically includes: + + - ``allocate_data: {allocate: true}`` - Instructs the child to enter allocated state after provisioning + - ``parent_job_id`` - Links the child job to the parent for audit trail and credential inheritance + +2. **Allocate phase**: The parent job waits for all child jobs to reach the ``allocated`` state. During this phase: + + - Child jobs provision their devices and gather device information (IP addresses) + - Child jobs post device information to the server + - Parent job polls child jobs every 10 seconds + - If ``allocation_timeout`` is reached before all devices are allocated, the job is cancelled + +3. **Reserve phase** (optional): If ``reserve_data`` is present, SSH keys are copied to all allocated devices simultaneously, allowing users to connect to all devices during the reservation period. + +4. **Test phase** (optional): Coordinated tests can be executed across all devices. Device information is available in ``job_list.json``. + +5. **Cleanup phase**: The parent job cancels all child jobs, which triggers cleanup on each device. 
+ +Credential inheritance +---------------------- + +When authenticated users submit multi-device jobs, child jobs automatically inherit authentication permissions from the parent job. This enables: + +- **Extended reservation times**: Child jobs can use the parent's ``max_reservation_time`` limits +- **Job priority**: Child jobs inherit the parent's ``max_priority`` settings +- **Restricted queue access**: Child jobs can access queues that require authorization + +This inheritance happens automatically through the ``/v1/agent/jobs`` API endpoint used by the multi connector. No additional configuration is required. + +Job schema +---------- + +Basic structure +~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + job_queue: multi + allocation_timeout: 7200 # Optional, defaults to 7200 seconds (2 hours) + provision_data: + jobs: + - job_queue: device-queue-1 + provision_data: + # Device-specific provisioning data + - job_queue: device-queue-2 + provision_data: + # Device-specific provisioning data + reserve_data: # Optional + ssh_keys: + - "gh:your-github-username" + - "lp:your-launchpad-username" + timeout: "3600" # Reservation duration in seconds + test_data: # Optional + test_cmds: | + # Commands to run across all devices + +Required fields +~~~~~~~~~~~~~~~ + +- ``job_queue``: Must be set to ``multi`` +- ``provision_data.jobs``: List of child job definitions, each containing: + + - ``job_queue``: Queue name for the child device + - ``provision_data``: Device-specific provisioning parameters (varies by device connector type) + +Optional fields +~~~~~~~~~~~~~~~ + +- ``allocation_timeout``: Maximum time (in seconds) to wait for all child jobs to reach allocated state. Default: 7200 (2 hours) +- ``reserve_data``: Configuration for device reservation + + - ``ssh_keys``: List of SSH key identifiers (format: ``provider:username``, e.g., ``gh:username`` or ``lp:username``) + - ``timeout``: Reservation duration in seconds. Default: 3600 (1 hour). 
Can also use duration format (e.g., ``2h30m``) + +- ``test_data``: Coordinated test commands to run after devices are allocated +- ``test_data.test_username``: SSH username for device connections. Default: ``ubuntu`` + +Examples +-------- + +Simple two-device job +~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + job_queue: multi + provision_data: + jobs: + - job_queue: rpi4b + provision_data: + url: https://cdimage.ubuntu.com/ubuntu-core/22/stable/current/ubuntu-core-22-arm64+raspi.img.xz + - job_queue: maas-x86 + provision_data: + distro: jammy + +Multi-device with reservation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: yaml + + job_queue: multi + provision_data: + jobs: + - job_queue: device-queue-1 + provision_data: + distro: jammy + - job_queue: device-queue-2 + provision_data: + distro: jammy + - job_queue: device-queue-3 + provision_data: + distro: jammy + reserve_data: + ssh_keys: + - "gh:github-username" + - "lp:launchpad-username" + timeout: "7200" # 2 hours + test_data: + test_username: ubuntu + +Multi-device with coordinated testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: yaml + + job_queue: multi + provision_data: + jobs: + - job_queue: server-queue + provision_data: + distro: jammy + - job_queue: client-queue-1 + provision_data: + distro: jammy + - job_queue: client-queue-2 + provision_data: + distro: jammy + test_data: + test_cmds: | + # Parse job_list.json to get device IPs + SERVER_IP=$(jq -r '.[0].device_info.device_ip' job_list.json) + CLIENT1_IP=$(jq -r '.[1].device_info.device_ip' job_list.json) + CLIENT2_IP=$(jq -r '.[2].device_info.device_ip' job_list.json) + + # Start server + ssh ubuntu@$SERVER_IP "iperf3 -s -D" + + # Run clients + ssh ubuntu@$CLIENT1_IP "iperf3 -c $SERVER_IP -t 30" & + ssh ubuntu@$CLIENT2_IP "iperf3 -c $SERVER_IP -t 30" & + wait + +Accessing device information +----------------------------- + +After the allocate phase completes, device information is stored in ``job_list.json`` in the job's working directory. The file contains an array of child job objects: + +.. code-block:: json + + [ + { + "job_id": "child-job-uuid-1", + "device_info": { + "device_ip": "192.168.1.100" + } + }, + { + "job_id": "child-job-uuid-2", + "device_info": { + "device_ip": "192.168.1.101" + } + } + ] + +You can parse this file in test commands to access device IPs: + +.. code-block:: bash + + # Get first device IP + DEVICE1_IP=$(jq -r '.[0].device_info.device_ip' job_list.json) + + # Get all device IPs + ALL_IPS=$(jq -r '.[].device_info.device_ip' job_list.json) + +Reservation workflow +-------------------- + +When ``reserve_data`` is specified, the reserve phase executes after all devices are allocated: + +1. The multi connector reads device IPs from ``job_list.json`` +2. For each SSH key in ``ssh_keys``: + + - Imports the key using ``ssh-import-id`` (supports GitHub and Launchpad) + - Copies the key to all device IPs using ``ssh-copy-id`` + +3. Displays reservation information to the user: + + .. 
code-block:: text + + *** TESTFLINGER SYSTEMS RESERVED *** + You can now connect to the following devices: + ubuntu@192.168.1.100 + ubuntu@192.168.1.101 + ubuntu@192.168.1.102 + Current time: 2025-10-22T10:30:00+00:00 + Reservation expires at: 2025-10-22T11:30:00+00:00 + Reservation will automatically timeout in 3600 seconds + To end the reservation sooner use: testflinger-cli cancel + +4. Waits for the reservation timeout duration + +**Important notes:** + +- The reserve phase uses a separate timeout independent from ``global_timeout`` +- Reservation timeout is controlled by ``reserve_data.timeout`` +- SSH keys must be accessible via ``ssh-import-id`` (GitHub or Launchpad) +- The ``test_username`` from ``test_data.test_username`` is used for SSH connections (defaults to ``ubuntu``) + +Error handling +-------------- + +Multi-device jobs can fail at various stages: + +Allocation failures +~~~~~~~~~~~~~~~~~~~ + +- **Child job allocation timeout**: If a child job doesn't reach ``allocated`` state within ``allocation_timeout``, the parent job cancels all child jobs and fails +- **Child job fails to allocate**: If a child job enters ``cancelled``, ``complete``, or ``completed`` state during allocation, the parent job cancels remaining child jobs and fails +- **Parent job cancelled**: If the parent job is cancelled during the allocate phase, all child jobs are cancelled + +Reservation failures +~~~~~~~~~~~~~~~~~~~~ + +- **SSH key import failure**: If ``ssh-import-id`` fails after retries, the reservation phase fails +- **SSH key copy failure**: Copy failures to individual devices are logged but don't fail the entire reservation (graceful degradation) +- **Missing job_list.json**: If the allocate phase didn't create device information, the reserve phase fails + +Best practices +-------------- + +1. **Allocation timeout**: Set ``allocation_timeout`` based on the slowest device provisioning time. Default is 2 hours, which should be sufficient for most cases. + +2. 
**Reservation duration**: For authenticated users with extended reservation permissions, you can request longer reservation times (up to the limit configured in your ``max_reservation_time`` permission). + +3. **Test username**: If your devices use a non-default username, specify it in ``test_data.test_username``. + +4. **Device order**: Child jobs are created in the order specified in ``provision_data.jobs``. This order is preserved in ``job_list.json``, so you can rely on it when parsing device information. + +5. **Error recovery**: If a multi-device job fails during allocation, all child jobs are automatically cancelled. You don't need to manually clean up child jobs. + +6. **Monitoring**: You can monitor child job status using the Testflinger CLI: + + .. code-block:: bash + + # Get parent job status + testflinger-cli status + + # Get child job IDs from job_list.json (if allocate phase completed) + testflinger-cli results + +Authentication requirements +--------------------------- + +For credential inheritance to work: + +1. Submit the parent job with authentication (JWT token via ``testflinger-cli`` login) +2. Ensure your client permissions include the queues used by child jobs +3. Child jobs automatically inherit permissions - no additional authentication needed + +If submitting without authentication, multi-device jobs still work but child jobs won't have extended privileges (priority, extended reservations, restricted queue access). 
+ +Limitations +----------- + +- Maximum number of child jobs: Limited by server capacity and allocation timeout +- Reservation timeout: Limited by ``max_reservation_time`` in your authentication permissions +- All child jobs must successfully allocate for the parent job to proceed +- Child jobs run independently after allocation - there's no built-in synchronization mechanism beyond the test commands you write + +See also +-------- + +- :doc:`../reference/job-schema` - Complete job schema reference +- :doc:`../reference/test-phases` - Detailed information about test phases +- :doc:`../reference/device-connector-types` - Device connector types and provisioning options +- :doc:`../explanation/extended-reservation` - Extended reservation permissions +- :doc:`../explanation/job-priority` - Job priority configuration +- :doc:`../explanation/restricted-queues` - Restricted queue access diff --git a/docs/reference/device-connector-types.rst b/docs/reference/device-connector-types.rst index 76532aa52..23ed7c424 100644 --- a/docs/reference/device-connector-types.rst +++ b/docs/reference/device-connector-types.rst @@ -16,8 +16,8 @@ To specify the commands to run by the device in each test phase, set the ``testf - Qualcomm Dragonboard 410c setup to boot from both a special image on a USB stick when the SD card is erased, as well as an SD card that can be provisioned by booting the stable image on a USB stick and then flashing the new image to the SD card. * - ``maas2`` - Uses `MAAS `_ to provision supported images on devices that are capable of being controlled by a MAAS server. - * - ``multi`` - - Experimental device type that is used for provisioning multiple other devices in order to coordinate a job across multiple devices at the same time. + * - ``multi`` + - Device connector for coordinating multi-device jobs. Provisions multiple devices simultaneously and optionally reserves them for SSH access. 
Supports credential inheritance, allowing child jobs to inherit authentication permissions from the parent job. See :doc:`../how-to/multi-device-jobs` for detailed usage. * - ``muxpi`` - MuxPi or SDWire device capable of multiplexing the SD card so that it can be written, then control can be switched to the DUT to boot the image, see :ref:`muxpi`. * - ``netboot`` @@ -78,6 +78,87 @@ fake_connector The ``fake_connector`` device connector doesn't actually provision any devices, but is useful for testing the Testflinger without needing to have any real devices connected. +.. _multi: + +multi +----- + +The ``multi`` device connector is used for coordinating multi-device jobs across multiple devices simultaneously. It creates child jobs for each device, waits for them to be allocated, and optionally reserves them for SSH access. + +**Workflow:** + +1. **Provision phase**: Creates and submits child jobs via the ``/v1/agent/jobs`` endpoint with credential inheritance +2. **Allocate phase**: Waits for all child jobs to reach ``allocated`` state +3. **Reserve phase** (optional): Copies SSH keys to all allocated devices +4. **Test phase** (optional): Executes coordinated tests across devices +5. **Cleanup phase**: Cancels all child jobs + +The ``multi`` device connector supports the following ``provision_data`` keys: + +.. list-table:: Supported ``provision_data`` keys for ``multi`` + :header-rows: 1 + + * - Key + - Description + * - ``jobs`` + - List of child job definitions. Each entry must contain: + + - ``job_queue``: Queue name for the child device + - ``provision_data``: Device-specific provisioning parameters (varies by device connector type) + +The ``multi`` device connector supports the following ``reserve_data`` keys: + +.. list-table:: Supported ``reserve_data`` keys for ``multi`` + :header-rows: 1 + + * - Key + - Description + * - ``ssh_keys`` + - List of SSH key identifiers to import and copy to all devices. 
Format: ``provider:username`` (e.g., ``gh:username`` for GitHub or ``lp:username`` for Launchpad). Uses ``ssh-import-id`` to retrieve keys. + * - ``timeout`` + - Reservation duration in seconds. Default: 3600 (1 hour). Can also use duration format (e.g., ``2h30m``). This timeout is independent from ``global_timeout``. + +**Additional fields:** + +- ``allocation_timeout``: Maximum time (in seconds) to wait for all child jobs to reach allocated state. Default: 7200 (2 hours) +- ``test_data.test_username``: SSH username for device connections. Default: ``ubuntu`` + +**Credential inheritance:** + +Child jobs automatically inherit authentication permissions (``auth_permissions``) from the parent job, including: + +- Extended reservation time limits (``max_reservation_time``) +- Job priority settings (``max_priority``) +- Restricted queue access (``allowed_queues``) + +This enables authenticated users to submit multi-device jobs with elevated privileges that apply to all child jobs. + +**Example:** + +.. code-block:: yaml + + job_queue: multi + allocation_timeout: 7200 + provision_data: + jobs: + - job_queue: rpi4b + provision_data: + url: https://cdimage.ubuntu.com/ubuntu-core/22/stable/current/ubuntu-core-22-arm64+raspi.img.xz + - job_queue: maas-x86 + provision_data: + distro: jammy + reserve_data: + ssh_keys: + - "gh:your-username" + timeout: "3600" + test_data: + test_username: ubuntu + test_cmds: | + # Access device IPs from job_list.json + jq '.[].device_info.device_ip' job_list.json + +For comprehensive documentation on multi-device jobs, see :doc:`../how-to/multi-device-jobs`. + .. 
_maas2: maas2 diff --git a/docs/reference/job-schema.rst b/docs/reference/job-schema.rst index f9687ff75..64fb70126 100644 --- a/docs/reference/job-schema.rst +++ b/docs/reference/job-schema.rst @@ -36,7 +36,16 @@ The following table lists the key elements that a job definition file should con - integer - | 7200 | (2 hours) - - (Optional) Maximum time (in seconds) Testflinger should wait in the ``allocate`` phase for multi-device jobs to reach the ``allocated`` state. If the timeout is reached before all devices are allocated, Testflinger will cancel the job. + - (Optional) Maximum time (in seconds) Testflinger should wait in the ``allocate`` phase for multi-device jobs to reach the ``allocated`` state. If the timeout is reached before all devices are allocated, Testflinger will cancel the job. This timeout only applies to multi-device jobs that use the ``multi`` device connector. + * - ``parent_job_id`` + - string (UUID) + - / + - | (Internal field) UUID of the parent job for child jobs created by multi-device workflows. This field is automatically injected by the multi device connector when creating child jobs and should not be manually specified by users. + | Child jobs automatically inherit authentication permissions (``auth_permissions``) from their parent job, including: + | - Extended reservation time limits + | - Job priority settings + | - Restricted queue access + | This field enables audit trail tracking and credential inheritance for multi-device workflows. * - ``_data`` - dictionary - / diff --git a/server/API.md b/server/API.md index 4eaa685f8..48677c415 100644 --- a/server/API.md +++ b/server/API.md @@ -31,6 +31,43 @@ curl http://localhost:8000/v1/job -X POST \ --data '{ "job_queue": "myqueue", "option":"foo" }' ``` +## `[POST] /v1/agent/jobs` + +Create a child job with credential inheritance from a parent job + +This endpoint is designed specifically for agents submitting child jobs on behalf of multi-device parent jobs. 
Child jobs automatically inherit authentication permissions from the parent job, including extended reservation times, job priority, and restricted queue access.
+
+Parameters:
+
+- `job_queue` (JSON): queue name to use for processing the job (required)
+- `parent_job_id` (JSON): UUID of the parent job from which to inherit credentials (required)
+
+Returns:
+
+```json
+{ "job_id": "<job_id>" }
+```
+
+Status Codes:
+
+- `HTTP 200 (OK)`
+- `HTTP 400 (Bad Request)`: Invalid UUID format for `parent_job_id`
+- `HTTP 422 (Unprocessable Content)`: Missing required fields (`job_queue` or `parent_job_id`)
+
+Notes:
+
+- Child jobs inherit `auth_permissions` from the parent job stored in the database
+- If the parent job has no authentication permissions, the child job is created without them (no error)
+- The `parent_job_id` field is stored in the child job document for audit trail purposes
+
+Examples:
+
+```shell
+curl http://localhost:8000/v1/agent/jobs -X POST \
+  --header "Content-Type: application/json" \
+  --data '{ "job_queue": "device-queue", "parent_job_id": "550e8400-e29b-41d4-a716-446655440000" }'
+```
+
 ## `[GET] /v1/job`
 
 Get a test job from the specified queue(s)