Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/src/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
## Unreleased
### Added
#### Commands
- `runbms`: automatic detection and warning for rogue processes that consume high CPU resources (at or above a configurable threshold, 50% by default). Warnings appear in both log prologue output and Zulip notifications when enabled.
- `runbms`: new `--exit-on-failure [CODE]` flag to exit with a specified code (default: 1) when any benchmark configuration fails, making it suitable for CI environments.
- `runbms` gains an extra argument, `--randomize-configs`, to randomize the order of configs for each invocation to help distinguish between system-related noise and configuration-specific issues.

Expand Down
11 changes: 11 additions & 0 deletions src/running/command/runbms.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
config_index_to_chr,
config_str_encode,
dont_emit_heapsize_modifier,
detect_rogue_processes,
)
import socket
from datetime import datetime
Expand Down Expand Up @@ -291,6 +292,16 @@ def run_one_benchmark(
logging.warning(
"More than one user logged in: {}".format(" ".join(logged_in_users))
)

# Check for rogue processes with high CPU usage
top_output = system("top -bcn 1 -w512 |head -n 12")
rogue_processes = detect_rogue_processes(top_output)
for pid, user, cpu_percent, command in rogue_processes:
logging.warning(
"High CPU usage process detected: {} (PID: {}, User: {}) using {:.1f}% CPU".format(
command, pid, user, cpu_percent
)
)
ever_ran = [False] * len(configs)
for i in range(0, invocations):
for p in plugins.values():
Expand Down
21 changes: 20 additions & 1 deletion src/running/plugin/runbms/zulip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
MomaReservationStatus,
config_index_to_chr,
get_logged_in_users,
system,
detect_rogue_processes,
)
import logging
import copy
Expand Down Expand Up @@ -43,10 +45,11 @@ def __init__(self, **kwargs):

def send_message(self, content):
message_data = copy.deepcopy(self.request)
message_data["content"] = "{}\n{}{}{}\n".format(
message_data["content"] = "{}\n{}{}{}{}\n".format(
self.run_id,
self.get_reservation_message(),
self.get_user_warnings(),
self.get_rogue_process_warnings(),
content,
)
try:
Expand Down Expand Up @@ -189,3 +192,19 @@ def get_user_warnings(self) -> str:
" ".join(sorted(logged_in_users))
)
return ""

def get_rogue_process_warnings(self) -> str:
    """Build the warning section listing CPU-hungry processes.

    Runs ``top`` once in batch mode through ``system`` and hands the
    captured text to ``detect_rogue_processes``.

    Returns:
        An empty string when nothing is flagged; otherwise a heading line
        followed by one bullet line per flagged process.
    """
    offenders = detect_rogue_processes(system("top -bcn 1 -w512 |head -n 12"))

    # Nothing flagged: contribute nothing to the outgoing message.
    if not offenders:
        return ""

    heading = "# ** Warning: High CPU usage processes detected: **\n"
    bullet = "- Process {} (PID: {}, User: {}) using {:.1f}% CPU\n"
    return heading + "".join(
        bullet.format(cmd, proc_id, owner, cpu)
        for proc_id, owner, cpu, cmd in offenders
    )
52 changes: 52 additions & 0 deletions src/running/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,55 @@ def update_reservation(self):
def get_reservation(self) -> Optional[MomaReservaton]:
self.update_reservation()
return self.reservation


def detect_rogue_processes(
    top_output: str, cpu_threshold: float = 50.0
) -> List[Tuple[str, str, float, str]]:
    """
    Parse top output and detect processes with high CPU usage.

    The process table is located by its header row (a line starting with
    ``PID`` that also mentions ``USER`` and ``%CPU``). Column positions are
    read from that header, so a customised field layout is parsed correctly;
    when a column name is missing from the header, the position from top's
    stock layout (``PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND``)
    is used instead. COMMAND is taken to run to the end of the line because
    it may contain spaces.

    Args:
        top_output: Raw output from top command
        cpu_threshold: CPU percentage threshold for considering a process
            "rogue"; a process is reported when its %CPU is at or above it

    Returns:
        List of tuples: (pid, user, cpu_percent, command), in the order the
        processes appear in ``top_output``. Empty when no header row is found
        or no process reaches the threshold.
    """
    rogue_processes: List[Tuple[str, str, float, str]] = []
    lines = top_output.splitlines()

    # Find the header row; the process list starts on the following line.
    header_fields: List[str] = []
    process_start_idx = -1
    for i, line in enumerate(lines):
        if line.strip().startswith("PID") and "USER" in line and "%CPU" in line:
            header_fields = line.split()
            process_start_idx = i + 1
            break

    if process_start_idx == -1:
        return rogue_processes

    def column_index(name: str, default: int) -> int:
        # Fall back to the stock layout when the header lacks this column.
        try:
            return header_fields.index(name)
        except ValueError:
            return default

    user_idx = column_index("USER", 1)
    cpu_idx = column_index("%CPU", 8)
    command_idx = column_index("COMMAND", 11)
    min_fields = max(user_idx, cpu_idx, command_idx) + 1

    for line in lines[process_start_idx:]:
        parts = line.split()
        # Blank or truncated rows cannot be interpreted; skip them.
        if len(parts) < min_fields:
            continue

        try:
            # Some locales print a decimal comma ("85,3"); normalise it so
            # float() accepts it instead of the row being silently dropped.
            cpu_percent = float(parts[cpu_idx].replace(",", "."))
        except ValueError:
            # Not a numeric %CPU field, so this is not a process row.
            continue

        if cpu_percent >= cpu_threshold:
            rogue_processes.append(
                (
                    parts[0],  # PID is first: the header starts with "PID"
                    parts[user_idx],
                    cpu_percent,
                    " ".join(parts[command_idx:]),
                )
            )

    return rogue_processes
71 changes: 70 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from pathlib import Path
from running.config import Configuration
from running.util import parse_config_str, smart_quote, split_quoted
from running.util import (
parse_config_str,
smart_quote,
split_quoted,
detect_rogue_processes,
)


def test_split_quoted():
Expand Down Expand Up @@ -36,3 +41,67 @@ def test_issue104():
c.resolve_class()
_, modifiers = parse_config_str(c, "jdk8|")
assert len(modifiers) == 0


def test_detect_rogue_processes():
    """Check threshold handling of detect_rogue_processes on top snapshots."""
    # Snapshot in which every process is below the default threshold.
    quiet_snapshot = """top - 03:18:43 up 4 min, 1 user, load average: 0.55, 0.35, 0.15
Tasks: 181 total, 1 running, 180 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.0 us, 2.2 sy, 0.0 ni, 97.8 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
MiB Mem : 15995.6 total, 13333.4 free, 1396.1 used, 1616.3 buff/cache
MiB Swap: 4096.0 total, 4096.0 free, 0.0 used. 14599.5 avail Mem

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512
1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init"""

    assert len(detect_rogue_processes(quiet_snapshot)) == 0

    # Snapshot with a single process above the default threshold.
    busy_snapshot = """top - 03:18:43 up 4 min, 1 user, load average: 0.55, 0.35, 0.15
Tasks: 181 total, 1 running, 180 sleeping, 0 stopped, 0 zombie
%Cpu(s): 0.0 us, 2.2 sy, 0.0 ni, 97.8 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
MiB Mem : 15995.6 total, 13333.4 free, 1396.1 used, 1616.3 buff/cache
MiB Swap: 4096.0 total, 4096.0 free, 0.0 used. 14599.5 avail Mem

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
1234 user 20 0 123456 7890 4567 S 85.3 0.1 0:12.34 rust-analyzer
3524 runner 20 0 12340 5384 3336 R 10.0 0.0 0:00.01 top -bcn 1 -w512
1 root 20 0 22876 13828 9476 S 0.0 0.1 0:03.23 /sbin/init"""

    flagged = detect_rogue_processes(busy_snapshot)
    assert len(flagged) == 1
    found_pid, found_user, found_cpu, found_command = flagged[0]
    assert found_pid == "1234"
    assert found_user == "user"
    assert found_cpu == 85.3
    assert found_command == "rust-analyzer"

    # Lowering the threshold also catches top itself (10.0%) next to
    # rust-analyzer (85.3%).
    flagged_at_low = detect_rogue_processes(busy_snapshot, cpu_threshold=5.0)
    assert len(flagged_at_low) == 2

    # Raising the threshold above every process yields nothing.
    flagged_at_high = detect_rogue_processes(busy_snapshot, cpu_threshold=90.0)
    assert len(flagged_at_high) == 0


def test_detect_rogue_processes_malformed():
    """Input without a process-table header must produce no detections."""
    # Text that never contains the PID/USER/%CPU header row.
    headerless_text = """some random text
not a top output
no PID header"""

    from_headerless = detect_rogue_processes(headerless_text)
    assert len(from_headerless) == 0

    # Completely empty input.
    from_empty = detect_rogue_processes("")
    assert len(from_empty) == 0