-
Notifications
You must be signed in to change notification settings - Fork 42
fix(agent): include GlobalTimeout during allocate phase #928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
98bf548
81c3dd2
7868cdf
58df8ea
8a3e71c
d104f1d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,11 +14,12 @@ | |
|
|
||
| import json | ||
| import logging | ||
| import signal | ||
| import time | ||
| from pathlib import Path | ||
| from typing import Optional | ||
|
|
||
| from testflinger_common.enums import TestPhase | ||
| from testflinger_common.enums import JobState, TestEvent, TestPhase | ||
|
|
||
| from testflinger_agent.errors import TFServerError | ||
| from testflinger_agent.handlers import ( | ||
|
|
@@ -197,8 +198,20 @@ def run_test_phase(self, phase, rundir): | |
| phase, | ||
| exitcode, | ||
| ) | ||
| if phase == "allocate": | ||
| self.allocate_phase(rundir) | ||
|
|
||
| # Allocate phase has special behavior where we wait for the parent job | ||
| if phase == TestPhase.ALLOCATE: | ||
| wait_exit_event, wait_exit_reason = self.allocate_phase( | ||
| rundir, global_timeout_checker | ||
| ) | ||
| if wait_exit_event is not None: | ||
| # If allocate_phase returns a non-None event | ||
| # it means the global timeout was reached | ||
| exit_event = wait_exit_event | ||
| exit_reason = wait_exit_reason | ||
| exitcode = -signal.SIGKILL.value % 256 | ||
| self._update_phase_results(results_file, phase, exitcode) | ||
|
|
||
| return exitcode, exit_event, exit_reason | ||
|
|
||
| def _update_phase_results( | ||
|
|
@@ -231,11 +244,19 @@ def _update_phase_results( | |
| exc, | ||
| ) | ||
|
|
||
| def allocate_phase(self, rundir): | ||
| """ | ||
| Read the json dict from "device-info.json" and send it to the server | ||
| def allocate_phase( | ||
| self, | ||
| rundir: str, | ||
| global_timeout_checker: GlobalTimeoutChecker, | ||
| ) -> tuple[TestEvent, str] | tuple[None, None]: | ||
| """Send device info to server and wait for the parent job completion. | ||
|
|
||
| This reads "device-info.json" and sends it to the server | ||
| so that the multi-device agent can find the IP addresses of all | ||
| subordinate jobs. | ||
|
|
||
| :param rundir: Directory in which to look for "device-info.json" | ||
| :param global_timeout_checker: timeout checker for the allocate phase. | ||
| """ | ||
| device_info = self.get_device_info(rundir) | ||
|
|
||
|
|
@@ -249,33 +270,56 @@ def allocate_phase(self, rundir): | |
| logger.warning("Failed to post device_info, retrying...") | ||
| time.sleep(60) | ||
|
|
||
| self.client.post_job_state(self.job_id, "allocated") | ||
| # Multi Device looks for a job that is `allocated` to know when | ||
| # a child job has already sent its device info and is ready. | ||
| self.client.post_job_state(self.job_id, JobState.ALLOCATED) | ||
|
|
||
| self.wait_for_completion() | ||
| # Child job is now allocated, wait for parent job to complete | ||
| # or for global timeout to be reached. | ||
| return self.wait_for_completion(global_timeout_checker) | ||
|
|
||
| def wait_for_completion(self): | ||
| """Monitor the parent job and exit when it completes.""" | ||
| def wait_for_completion( | ||
| self, | ||
| global_timeout_checker: GlobalTimeoutChecker, | ||
| ) -> tuple[TestEvent, str] | tuple[None, None]: | ||
| """Monitor the parent job and exit when it completes. | ||
|
|
||
| :param global_timeout_checker: timeout checker for the allocate phase. | ||
| """ | ||
| while True: | ||
| try: | ||
| this_job_state = self.client.check_job_state(self.job_id) | ||
| if this_job_state in ("complete", "completed", "cancelled"): | ||
| if this_job_state in (JobState.COMPLETED, JobState.CANCELLED): | ||
| logger.info("This job completed, exiting...") | ||
| break | ||
|
|
||
| parent_job_id = self.job_data.get("parent_job_id") | ||
| if not parent_job_id: | ||
| # TODO: Remove this path once Spread testing use cases | ||
| # are migrated to reserve phase instead of allocate phase. | ||
| logger.warning("No parent job ID found while allocated") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What will remedy this situation? We seem to get into this while loop, and if we don't have a parent_job_id, how will we get one? Will we just spin around in this loop and never leave? |
||
| continue | ||
| parent_job_state = self.client.check_job_state( | ||
| self.job_data.get("parent_job_id") | ||
| ) | ||
| if parent_job_state in ("complete", "completed", "cancelled"): | ||
| logger.info("Parent job completed, exiting...") | ||
| break | ||
| else: | ||
| parent_job_state = self.client.check_job_state( | ||
| parent_job_id | ||
| ) | ||
| if parent_job_state in ( | ||
| JobState.COMPLETED, | ||
| JobState.CANCELLED, | ||
| ): | ||
| logger.info("Parent job completed, exiting...") | ||
| break | ||
| except TFServerError: | ||
| logger.warning("Failed to get allocated job status, retrying") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are two jobs that could be responsible for this error -- parent and self.job_id; should we have two try statements to isolate them and provide a better error message and handling? |
||
| event, reason = global_timeout_checker() | ||
| if event is not None: | ||
| # If global timeout is reached, return event and reason | ||
| return event, reason | ||
| time.sleep(60) | ||
|
|
||
| # The job was completed or cancelled | ||
| # Returning a None tuple indicates normal allocate phase completion | ||
| return None, None | ||
|
|
||
| def get_global_timeout(self): | ||
| """Get the global timeout for the test run in seconds.""" | ||
| # Default timeout is 4 hours | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -265,7 +265,7 @@ def runtest(self, args): | |
| logger.info("END testrun") | ||
| return exitcode | ||
|
|
||
| def allocate(self): | ||
| def allocate(self, _): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why aren't we naming this new arg? Is allocate intended to be an abstract method for multi-device sub-classes? |
||
| """Allocate devices for multi-agent jobs (default method).""" | ||
| pass | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have any sort of future ticket we can reference in this TODO to make sure this happens?