From 7a257b73932a81c2c4ea2a2047a23385868d799b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 03:14:22 +0000 Subject: [PATCH 1/6] Initial plan From 76cd936f725a4aeb75ccce611f80245a066e60cc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 03:26:36 +0000 Subject: [PATCH 2/6] Implement rogue process detection for high CPU usage Co-authored-by: caizixian <2891235+caizixian@users.noreply.github.com> --- src/running/command/runbms.py | 16 +++++++- src/running/plugin/runbms/zulip.py | 21 ++++++++++- src/running/util.py | 50 +++++++++++++++++++++++++ tests/test_util.py | 60 +++++++++++++++++++++++++++++- 4 files changed, 144 insertions(+), 3 deletions(-) diff --git a/src/running/command/runbms.py b/src/running/command/runbms.py index 35dbf03..bb3f849 100644 --- a/src/running/command/runbms.py +++ b/src/running/command/runbms.py @@ -21,6 +21,7 @@ config_index_to_chr, config_str_encode, dont_emit_heapsize_modifier, + detect_rogue_processes, ) import socket from datetime import datetime @@ -204,7 +205,20 @@ def get_log_prologue(runtime: Runtime, bm: Benchmark) -> str: output += system("date") + "\n" output += system("w") + "\n" output += system("vmstat 1 2") + "\n" - output += system("top -bcn 1 -w512 |head -n 12") + "\n" + + # Get top output and check for rogue processes + top_output = system("top -bcn 1 -w512 |head -n 12") + output += top_output + "\n" + + # Check for rogue processes with high CPU usage + rogue_processes = detect_rogue_processes(top_output) + if rogue_processes: + output += "# ** Warning: High CPU usage processes detected: **\n" + for pid, user, cpu_percent, command in rogue_processes: + output += "# Process {} (PID: {}, User: {}) using {:.1f}% CPU: {}\n".format( + command, pid, user, cpu_percent, command + ) + output += "\n" output += "Environment variables: \n" for k, v in sorted(os.environ.items()): output 
+= "\t{}={}\n".format(k, v) diff --git a/src/running/plugin/runbms/zulip.py b/src/running/plugin/runbms/zulip.py index 71d9cc8..7a2575f 100644 --- a/src/running/plugin/runbms/zulip.py +++ b/src/running/plugin/runbms/zulip.py @@ -7,6 +7,8 @@ MomaReservationStatus, config_index_to_chr, get_logged_in_users, + system, + detect_rogue_processes, ) import logging import copy @@ -43,10 +45,11 @@ def __init__(self, **kwargs): def send_message(self, content): message_data = copy.deepcopy(self.request) - message_data["content"] = "{}\n{}{}{}\n".format( + message_data["content"] = "{}\n{}{}{}{}\n".format( self.run_id, self.get_reservation_message(), self.get_user_warnings(), + self.get_rogue_process_warnings(), content, ) try: @@ -189,3 +192,19 @@ def get_user_warnings(self) -> str: " ".join(sorted(logged_in_users)) ) return "" + + def get_rogue_process_warnings(self) -> str: + """Check for rogue processes with high CPU usage and generate warnings.""" + top_output = system("top -bcn 1 -w512 |head -n 12") + rogue_processes = detect_rogue_processes(top_output) + + if not rogue_processes: + return "" + + warning = "# ** Warning: High CPU usage processes detected: **\n" + for pid, user, cpu_percent, command in rogue_processes: + warning += "# Process {} (PID: {}, User: {}) using {:.1f}% CPU\n".format( + command, pid, user, cpu_percent + ) + + return warning diff --git a/src/running/util.py b/src/running/util.py index 4153e47..3d98033 100644 --- a/src/running/util.py +++ b/src/running/util.py @@ -180,3 +180,53 @@ def update_reservation(self): def get_reservation(self) -> Optional[MomaReservaton]: self.update_reservation() return self.reservation + + +def detect_rogue_processes(top_output: str, cpu_threshold: float = 50.0) -> List[Tuple[str, str, float, str]]: + """ + Parse top output and detect processes with high CPU usage. 
+ + Args: + top_output: Raw output from top command + cpu_threshold: CPU percentage threshold for considering a process "rogue" + + Returns: + List of tuples: (pid, user, cpu_percent, command) + """ + rogue_processes: List[Tuple[str, str, float, str]] = [] + lines = top_output.splitlines() + + # Find the start of the process list (after the header line with PID USER PR NI...) + process_start_idx = -1 + for i, line in enumerate(lines): + if line.strip().startswith("PID") and "USER" in line and "%CPU" in line: + process_start_idx = i + 1 + break + + if process_start_idx == -1: + return rogue_processes + + # Parse each process line + for line in lines[process_start_idx:]: + if not line.strip(): + continue + + # Split the line and extract relevant fields + # Format: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + parts = line.split() + if len(parts) < 12: + continue + + try: + pid = parts[0] + user = parts[1] + cpu_percent = float(parts[8]) # %CPU column + command = " ".join(parts[11:]) # COMMAND column (may contain spaces) + + if cpu_percent >= cpu_threshold: + rogue_processes.append((pid, user, cpu_percent, command)) + except (ValueError, IndexError): + # Skip lines that don't match expected format + continue + + return rogue_processes diff --git a/tests/test_util.py b/tests/test_util.py index f83d4bc..289988e 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,6 @@ from pathlib import Path from running.config import Configuration -from running.util import parse_config_str, smart_quote, split_quoted +from running.util import parse_config_str, smart_quote, split_quoted, detect_rogue_processes def test_split_quoted(): @@ -36,3 +36,61 @@ def test_issue104(): c.resolve_class() _, modifiers = parse_config_str(c, "jdk8|") assert len(modifiers) == 0 + + +def test_detect_rogue_processes(): + # Test with no rogue processes + top_output_normal = """top - 03:18:43 up 4 min, 1 user, load average: 0.55, 0.35, 0.15 +Tasks: 181 total, 1 running, 180 
sleeping, 0 stopped, 0 zombie +%Cpu(s): 0.0 us, 2.2 sy, 0.0 ni, 97.8 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st +MiB Mem : 15995.6 total, 13333.4 free, 1396.1 used, 1616.3 buff/cache +MiB Swap: 4096.0 total, 4096.0 free, 0.0 used. 14599.5 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512 + 1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init""" + + rogue_processes = detect_rogue_processes(top_output_normal) + assert len(rogue_processes) == 0 + + # Test with one rogue process + top_output_rogue = """top - 03:18:43 up 4 min, 1 user, load average: 0.55, 0.35, 0.15 +Tasks: 181 total, 1 running, 180 sleeping, 0 stopped, 0 zombie +%Cpu(s): 0.0 us, 2.2 sy, 0.0 ni, 97.8 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st +MiB Mem : 15995.6 total, 13333.4 free, 1396.1 used, 1616.3 buff/cache +MiB Swap: 4096.0 total, 4096.0 free, 0.0 used. 14599.5 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 1234 user 20 0 123456 7890 4567 S 85.3 0.1 0:12.34 rust-analyzer + 3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512 + 1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init""" + + rogue_processes = detect_rogue_processes(top_output_rogue) + assert len(rogue_processes) == 1 + pid, user, cpu_percent, command = rogue_processes[0] + assert pid == "1234" + assert user == "user" + assert cpu_percent == 85.3 + assert command == "rust-analyzer" + + # Test with custom threshold + rogue_processes_low_threshold = detect_rogue_processes(top_output_rogue, cpu_threshold=5.0) + assert len(rogue_processes_low_threshold) == 2 # Both rust-analyzer (85.3%) and top (10.0%) + + # Test with high threshold + rogue_processes_high_threshold = detect_rogue_processes(top_output_rogue, cpu_threshold=90.0) + assert len(rogue_processes_high_threshold) == 0 # No processes above 90% + + +def test_detect_rogue_processes_malformed(): + # Test with malformed top output (no header) + malformed_output = 
"""some random text +not a top output +no PID header""" + + rogue_processes = detect_rogue_processes(malformed_output) + assert len(rogue_processes) == 0 + + # Test with empty output + rogue_processes = detect_rogue_processes("") + assert len(rogue_processes) == 0 From 8c45a04a8f55269061dbb7aec01f5a3000b3de04 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 03:38:38 +0000 Subject: [PATCH 3/6] Use logging library for rogue process warnings instead of markdown format in output Co-authored-by: caizixian <2891235+caizixian@users.noreply.github.com> --- src/running/command/runbms.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/running/command/runbms.py b/src/running/command/runbms.py index bb3f849..929ff1a 100644 --- a/src/running/command/runbms.py +++ b/src/running/command/runbms.py @@ -205,20 +205,7 @@ def get_log_prologue(runtime: Runtime, bm: Benchmark) -> str: output += system("date") + "\n" output += system("w") + "\n" output += system("vmstat 1 2") + "\n" - - # Get top output and check for rogue processes - top_output = system("top -bcn 1 -w512 |head -n 12") - output += top_output + "\n" - - # Check for rogue processes with high CPU usage - rogue_processes = detect_rogue_processes(top_output) - if rogue_processes: - output += "# ** Warning: High CPU usage processes detected: **\n" - for pid, user, cpu_percent, command in rogue_processes: - output += "# Process {} (PID: {}, User: {}) using {:.1f}% CPU: {}\n".format( - command, pid, user, cpu_percent, command - ) - output += "\n" + output += system("top -bcn 1 -w512 |head -n 12") + "\n" output += "Environment variables: \n" for k, v in sorted(os.environ.items()): output += "\t{}={}\n".format(k, v) @@ -288,6 +275,16 @@ def run_one_benchmark( logging.warning( "More than one user logged in: {}".format(" ".join(logged_in_users)) ) + + # Check for rogue processes with high CPU usage + 
top_output = system("top -bcn 1 -w512 |head -n 12") + rogue_processes = detect_rogue_processes(top_output) + for pid, user, cpu_percent, command in rogue_processes: + logging.warning( + "High CPU usage process detected: {} (PID: {}, User: {}) using {:.1f}% CPU".format( + command, pid, user, cpu_percent + ) + ) ever_ran = [False] * len(configs) for i in range(0, invocations): for p in plugins.values(): From 5a6f57246a4b31bd5f305c6e2b882576c9f38988 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 03:46:05 +0000 Subject: [PATCH 4/6] Change zulip rogue process warnings to use markdown dash enumeration format Co-authored-by: caizixian <2891235+caizixian@users.noreply.github.com> --- src/running/plugin/runbms/zulip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/running/plugin/runbms/zulip.py b/src/running/plugin/runbms/zulip.py index 7a2575f..007419b 100644 --- a/src/running/plugin/runbms/zulip.py +++ b/src/running/plugin/runbms/zulip.py @@ -203,7 +203,7 @@ def get_rogue_process_warnings(self) -> str: warning = "# ** Warning: High CPU usage processes detected: **\n" for pid, user, cpu_percent, command in rogue_processes: - warning += "# Process {} (PID: {}, User: {}) using {:.1f}% CPU\n".format( + warning += "- Process {} (PID: {}, User: {}) using {:.1f}% CPU\n".format( command, pid, user, cpu_percent ) From 22ac418d30f9f420ee5eefab160b411a6bb0f13f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 03:54:22 +0000 Subject: [PATCH 5/6] Fix black formatting issues in rogue process detection code Co-authored-by: caizixian <2891235+caizixian@users.noreply.github.com> --- src/running/command/runbms.py | 2 +- src/running/plugin/runbms/zulip.py | 6 +++--- src/running/util.py | 22 +++++++++++--------- tests/test_util.py | 33 ++++++++++++++++++++---------- 4 files changed, 38 insertions(+), 25 deletions(-) 
diff --git a/src/running/command/runbms.py b/src/running/command/runbms.py index 929ff1a..40d4eae 100644 --- a/src/running/command/runbms.py +++ b/src/running/command/runbms.py @@ -275,7 +275,7 @@ def run_one_benchmark( logging.warning( "More than one user logged in: {}".format(" ".join(logged_in_users)) ) - + # Check for rogue processes with high CPU usage top_output = system("top -bcn 1 -w512 |head -n 12") rogue_processes = detect_rogue_processes(top_output) diff --git a/src/running/plugin/runbms/zulip.py b/src/running/plugin/runbms/zulip.py index 007419b..ff15a16 100644 --- a/src/running/plugin/runbms/zulip.py +++ b/src/running/plugin/runbms/zulip.py @@ -197,14 +197,14 @@ def get_rogue_process_warnings(self) -> str: """Check for rogue processes with high CPU usage and generate warnings.""" top_output = system("top -bcn 1 -w512 |head -n 12") rogue_processes = detect_rogue_processes(top_output) - + if not rogue_processes: return "" - + warning = "# ** Warning: High CPU usage processes detected: **\n" for pid, user, cpu_percent, command in rogue_processes: warning += "- Process {} (PID: {}, User: {}) using {:.1f}% CPU\n".format( command, pid, user, cpu_percent ) - + return warning diff --git a/src/running/util.py b/src/running/util.py index 3d98033..872b2dd 100644 --- a/src/running/util.py +++ b/src/running/util.py @@ -182,51 +182,53 @@ def get_reservation(self) -> Optional[MomaReservaton]: return self.reservation -def detect_rogue_processes(top_output: str, cpu_threshold: float = 50.0) -> List[Tuple[str, str, float, str]]: +def detect_rogue_processes( + top_output: str, cpu_threshold: float = 50.0 +) -> List[Tuple[str, str, float, str]]: """ Parse top output and detect processes with high CPU usage. 
- + Args: top_output: Raw output from top command cpu_threshold: CPU percentage threshold for considering a process "rogue" - + Returns: List of tuples: (pid, user, cpu_percent, command) """ rogue_processes: List[Tuple[str, str, float, str]] = [] lines = top_output.splitlines() - + # Find the start of the process list (after the header line with PID USER PR NI...) process_start_idx = -1 for i, line in enumerate(lines): if line.strip().startswith("PID") and "USER" in line and "%CPU" in line: process_start_idx = i + 1 break - + if process_start_idx == -1: return rogue_processes - + # Parse each process line for line in lines[process_start_idx:]: if not line.strip(): continue - + # Split the line and extract relevant fields # Format: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND parts = line.split() if len(parts) < 12: continue - + try: pid = parts[0] user = parts[1] cpu_percent = float(parts[8]) # %CPU column command = " ".join(parts[11:]) # COMMAND column (may contain spaces) - + if cpu_percent >= cpu_threshold: rogue_processes.append((pid, user, cpu_percent, command)) except (ValueError, IndexError): # Skip lines that don't match expected format continue - + return rogue_processes diff --git a/tests/test_util.py b/tests/test_util.py index 289988e..d4d4c51 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,11 @@ from pathlib import Path from running.config import Configuration -from running.util import parse_config_str, smart_quote, split_quoted, detect_rogue_processes +from running.util import ( + parse_config_str, + smart_quote, + split_quoted, + detect_rogue_processes, +) def test_split_quoted(): @@ -49,10 +54,10 @@ def test_detect_rogue_processes(): PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512 1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init""" - + rogue_processes = detect_rogue_processes(top_output_normal) assert len(rogue_processes) == 0 - + # Test 
with one rogue process top_output_rogue = """top - 03:18:43 up 4 min, 1 user, load average: 0.55, 0.35, 0.15 Tasks: 181 total, 1 running, 180 sleeping, 0 stopped, 0 zombie @@ -64,7 +69,7 @@ def test_detect_rogue_processes(): 1234 user 20 0 123456 7890 4567 S 85.3 0.1 0:12.34 rust-analyzer 3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512 1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init""" - + rogue_processes = detect_rogue_processes(top_output_rogue) assert len(rogue_processes) == 1 pid, user, cpu_percent, command = rogue_processes[0] @@ -72,13 +77,19 @@ def test_detect_rogue_processes(): assert user == "user" assert cpu_percent == 85.3 assert command == "rust-analyzer" - + # Test with custom threshold - rogue_processes_low_threshold = detect_rogue_processes(top_output_rogue, cpu_threshold=5.0) - assert len(rogue_processes_low_threshold) == 2 # Both rust-analyzer (85.3%) and top (10.0%) - + rogue_processes_low_threshold = detect_rogue_processes( + top_output_rogue, cpu_threshold=5.0 + ) + assert ( + len(rogue_processes_low_threshold) == 2 + ) # Both rust-analyzer (85.3%) and top (10.0%) + # Test with high threshold - rogue_processes_high_threshold = detect_rogue_processes(top_output_rogue, cpu_threshold=90.0) + rogue_processes_high_threshold = detect_rogue_processes( + top_output_rogue, cpu_threshold=90.0 + ) assert len(rogue_processes_high_threshold) == 0 # No processes above 90% @@ -87,10 +98,10 @@ def test_detect_rogue_processes_malformed(): malformed_output = """some random text not a top output no PID header""" - + rogue_processes = detect_rogue_processes(malformed_output) assert len(rogue_processes) == 0 - + # Test with empty output rogue_processes = detect_rogue_processes("") assert len(rogue_processes) == 0 From fea3bb06d49a413fb70a0faafa64834e4805492d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 27 Jul 2025 04:08:16 +0000 Subject: [PATCH 6/6] Update changelog for 
rogue process detection feature Co-authored-by: caizixian <2891235+caizixian@users.noreply.github.com> --- docs/src/changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/changelog.md b/docs/src/changelog.md index 2e7db56..7c12c1f 100644 --- a/docs/src/changelog.md +++ b/docs/src/changelog.md @@ -1,6 +1,8 @@ # Changelog ## Unreleased ### Added +#### Commands +- `runbms`: automatic detection and warning for rogue processes that consume high CPU resources (at or above a configurable threshold, 50% by default). Warnings are emitted through the logging library before a benchmark is run, and are included in Zulip notifications when the Zulip plugin is enabled. ### Changed