diff --git a/docs/src/changelog.md b/docs/src/changelog.md
index 3e3dd46..478b59c 100644
--- a/docs/src/changelog.md
+++ b/docs/src/changelog.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### Added
 #### Commands
+- `runbms`: automatic detection and warning for rogue processes that consume high CPU resources (>=50% configurable threshold). Warnings appear in both log prologue output and Zulip notifications when enabled.
 - `runbms`: new `--exit-on-failure [CODE]` flag to exit with a specified code (default: 1) when any benchmark configuration fails, making it suitable for CI environments.
 - `runbms` gains an extra argument, `--randomize-configs`, to randomize the order of configs for each invocation to help distinguish between system-related noise and configuration-specific issues.
 
diff --git a/src/running/command/runbms.py b/src/running/command/runbms.py
index 76c8d07..0b630b3 100644
--- a/src/running/command/runbms.py
+++ b/src/running/command/runbms.py
@@ -21,6 +21,7 @@
     config_index_to_chr,
     config_str_encode,
     dont_emit_heapsize_modifier,
+    detect_rogue_processes,
 )
 import socket
 from datetime import datetime
@@ -291,6 +292,16 @@ def run_one_benchmark(
         logging.warning(
             "More than one user logged in: {}".format(" ".join(logged_in_users))
         )
+
+    # Check for rogue processes with high CPU usage
+    top_output = system("top -bcn 1 -w512 |head -n 12")
+    rogue_processes = detect_rogue_processes(top_output)
+    for pid, user, cpu_percent, command in rogue_processes:
+        logging.warning(
+            "High CPU usage process detected: {} (PID: {}, User: {}) using {:.1f}% CPU".format(
+                command, pid, user, cpu_percent
+            )
+        )
     ever_ran = [False] * len(configs)
     for i in range(0, invocations):
         for p in plugins.values():
diff --git a/src/running/plugin/runbms/zulip.py b/src/running/plugin/runbms/zulip.py
index 71d9cc8..ff15a16 100644
--- a/src/running/plugin/runbms/zulip.py
+++ b/src/running/plugin/runbms/zulip.py
@@ -7,6 +7,8 @@
     MomaReservationStatus,
     config_index_to_chr,
     get_logged_in_users,
+    system,
+    detect_rogue_processes,
 )
 import logging
 import copy
@@ -43,10 +45,11 @@ def __init__(self, **kwargs):
 
     def send_message(self, content):
         message_data = copy.deepcopy(self.request)
-        message_data["content"] = "{}\n{}{}{}\n".format(
+        message_data["content"] = "{}\n{}{}{}{}\n".format(
             self.run_id,
             self.get_reservation_message(),
             self.get_user_warnings(),
+            self.get_rogue_process_warnings(),
             content,
         )
         try:
@@ -189,3 +192,19 @@ def get_user_warnings(self) -> str:
                 " ".join(sorted(logged_in_users))
             )
         return ""
+
+    def get_rogue_process_warnings(self) -> str:
+        """Check for rogue processes with high CPU usage and generate warnings."""
+        top_output = system("top -bcn 1 -w512 |head -n 12")
+        rogue_processes = detect_rogue_processes(top_output)
+
+        if not rogue_processes:
+            return ""
+
+        warning = "# ** Warning: High CPU usage processes detected: **\n"
+        for pid, user, cpu_percent, command in rogue_processes:
+            warning += "- Process {} (PID: {}, User: {}) using {:.1f}% CPU\n".format(
+                command, pid, user, cpu_percent
+            )
+
+        return warning
diff --git a/src/running/util.py b/src/running/util.py
index 4153e47..872b2dd 100644
--- a/src/running/util.py
+++ b/src/running/util.py
@@ -180,3 +180,63 @@ def update_reservation(self):
     def get_reservation(self) -> Optional[MomaReservaton]:
         self.update_reservation()
         return self.reservation
+
+
+def detect_rogue_processes(
+    top_output: str, cpu_threshold: float = 50.0
+) -> List[Tuple[str, str, float, str]]:
+    """
+    Parse top output and detect processes with high CPU usage.
+
+    Column positions are taken from the header line rather than assumed,
+    so a non-default field layout is handled as long as COMMAND is the
+    last column (its value may contain spaces).
+
+    Args:
+        top_output: Raw output from top command
+        cpu_threshold: CPU percentage at or above which a process is
+            considered "rogue"
+
+    Returns:
+        List of tuples: (pid, user, cpu_percent, command); empty when no
+        header line is found
+    """
+    rogue_processes: List[Tuple[str, str, float, str]] = []
+    columns = ("PID", "USER", "%CPU", "COMMAND")
+
+    # Find the header line; without one there is nothing to parse
+    lines = top_output.splitlines()
+    for header_idx, line in enumerate(lines):
+        header = line.split()
+        if all(name in header for name in columns):
+            break
+    else:
+        return rogue_processes
+
+    # Read column positions from the header instead of hard-coding them
+    pid_idx, user_idx, cpu_idx, command_idx = (
+        header.index(name) for name in columns
+    )
+    # A usable row must reach every column we read
+    min_fields = max(pid_idx, user_idx, cpu_idx, command_idx) + 1
+
+    for line in lines[header_idx + 1 :]:
+        parts = line.split()
+        if len(parts) < min_fields:
+            continue
+
+        try:
+            # Tolerate a comma decimal separator (locale-dependent output)
+            cpu_percent = float(parts[cpu_idx].replace(",", "."))
+        except ValueError:
+            # Skip lines that don't match expected format
+            continue
+
+        if cpu_percent >= cpu_threshold:
+            # COMMAND column may contain spaces
+            command = " ".join(parts[command_idx:])
+            rogue_processes.append(
+                (parts[pid_idx], parts[user_idx], cpu_percent, command)
+            )
+
+    return rogue_processes
diff --git a/tests/test_util.py b/tests/test_util.py
index f83d4bc..d4d4c51 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,6 +1,11 @@
 from pathlib import Path
 from running.config import Configuration
-from running.util import parse_config_str, smart_quote, split_quoted
+from running.util import (
+    parse_config_str,
+    smart_quote,
+    split_quoted,
+    detect_rogue_processes,
+)
 
 
 def test_split_quoted():
@@ -36,3 +41,77 @@ def test_issue104():
     c.resolve_class()
     _, modifiers = parse_config_str(c, "jdk8|")
     assert len(modifiers) == 0
+
+
+def test_detect_rogue_processes():
+    # Test with no rogue processes
+    top_output_normal = """top - 03:18:43 up 4 min,  1 user,  load average: 0.55, 0.35, 0.15
+Tasks: 181 total,   1 running, 180 sleeping,   0 stopped,   0 zombie
+%Cpu(s):  0.0 us,  2.2 sy,  0.0 ni, 97.8 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
+MiB Mem :  15995.6 total,  13333.4 free,   1396.1 used,   1616.3 buff/cache
+MiB Swap:   4096.0 total,   4096.0 free,      0.0 used.  14599.5 avail Mem
+
+    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
+   3524 runner    20   0   12340   5384   3336 R  10.0   0.0   0:00.01 top -bcn 1 -w512
+      1 root      20   0   22876  13828   9476 S   0.0   0.1   0:03.23 /sbin/init"""
+
+    rogue_processes = detect_rogue_processes(top_output_normal)
+    assert len(rogue_processes) == 0
+
+    # Test with one rogue process
+    top_output_rogue = """top - 03:18:43 up 4 min,  1 user,  load average: 0.55, 0.35, 0.15
+Tasks: 181 total,   1 running, 180 sleeping,   0 stopped,   0 zombie
+%Cpu(s):  0.0 us,  2.2 sy,  0.0 ni, 97.8 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
+MiB Mem :  15995.6 total,  13333.4 free,   1396.1 used,   1616.3 buff/cache
+MiB Swap:   4096.0 total,   4096.0 free,      0.0 used.  14599.5 avail Mem
+
+    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
+   1234 user      20   0  123456   7890   4567 S  85.3   0.1   0:12.34 rust-analyzer
+   3524 runner    20   0   12340   5384   3336 R  10.0   0.0   0:00.01 top -bcn 1 -w512
+      1 root      20   0   22876  13828   9476 S   0.0   0.1   0:03.23 /sbin/init"""
+
+    rogue_processes = detect_rogue_processes(top_output_rogue)
+    assert len(rogue_processes) == 1
+    pid, user, cpu_percent, command = rogue_processes[0]
+    assert pid == "1234"
+    assert user == "user"
+    assert cpu_percent == 85.3
+    assert command == "rust-analyzer"
+
+    # Test with custom threshold
+    rogue_processes_low_threshold = detect_rogue_processes(
+        top_output_rogue, cpu_threshold=5.0
+    )
+    assert (
+        len(rogue_processes_low_threshold) == 2
+    )  # Both rust-analyzer (85.3%) and top (10.0%)
+
+    # Test with high threshold
+    rogue_processes_high_threshold = detect_rogue_processes(
+        top_output_rogue, cpu_threshold=90.0
+    )
+    assert len(rogue_processes_high_threshold) == 0  # No processes above 90%
+
+
+def test_detect_rogue_processes_malformed():
+    # Test with malformed top output (no header)
+    malformed_output = """some random text
+not a top output
+no PID header"""
+
+    rogue_processes = detect_rogue_processes(malformed_output)
+    assert len(rogue_processes) == 0
+
+    # Test with empty output
+    rogue_processes = detect_rogue_processes("")
+    assert len(rogue_processes) == 0
+
+
+def test_detect_rogue_processes_header_driven():
+    # Column positions come from the header; a comma decimal separator is accepted
+    top_output = """  PID USER      %CPU COMMAND
+ 4321 alice     75,5 java -jar app.jar"""
+
+    assert detect_rogue_processes(top_output) == [
+        ("4321", "alice", 75.5, "java -jar app.jar")
+    ]