Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions benchpress/cli/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
import os
import subprocess
import sys
from datetime import datetime, timezone
from os import path

Expand Down Expand Up @@ -260,6 +261,31 @@ def run(self, args, jobs) -> None:
if not args.disable_hooks:
job.stop_hooks()

# Merge diagnosis records (failures, auto-fixes) into metrics.
# Works for all benchmarks that use DiagnosisRecorder.
diagnosis_path = os.environ.get("DIAGNOSIS_FILE_PATH", "")
if diagnosis_path and os.path.exists(diagnosis_path):
try:
sys.path.insert(
0,
os.path.join(
os.path.dirname(__file__),
"..",
"..",
"..",
"packages",
"common",
),
)
from diagnosis_utils import DiagnosisRecorder

recorder = DiagnosisRecorder.get_instance(
shared_file_path=diagnosis_path
)
recorder.merge_failure_to_results(metrics)
except Exception as e:
logger.warning("Failed to merge diagnosis records: %s", e)

final_metrics["metrics"] = metrics
stdout_reporter = ReporterFactory.create("stdout")
click.echo("Results Report:", err=True)
Expand Down
10 changes: 10 additions & 0 deletions benchpress/config/jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- '-nnginx'
- '-L {load_generator}'
- '-s {lg_path}'
- '-U {auto_fix_ulimit}'
- '--'
- '--mediawiki'
- '--client-duration={duration}'
Expand All @@ -22,6 +23,7 @@
- 'lg_path=benchmarks/oss_performance_mediawiki/wrk/wrk'
- 'duration=10m'
- 'timeout=11m'
- 'auto_fix_ulimit=1'
- 'extra_args='
hooks:
- hook: copymove
Expand All @@ -40,6 +42,7 @@
- '-L {load_generator}'
- '-s {lg_path}'
- '-T {temp_dir}'
- '-U {auto_fix_ulimit}'
- '-p'
- '--'
- '--mediawiki'
Expand Down Expand Up @@ -68,6 +71,7 @@
- 'temp_dir=default_no_temp_dir'
- 'num_multi_req_warmups=-1' # -1 means the warmup status will define the number of iterations
- 'load_generator_seed=1000'
- 'auto_fix_ulimit=1'
- 'extra_args='
hooks:
- hook: copymove
Expand All @@ -86,6 +90,7 @@
- '-s {lg_path}'
- '-R{scale_out}'
- '-c{client_threads}'
- '-U {auto_fix_ulimit}'
- '--'
- '--mediawiki-mlp'
- '--client-duration={duration}'
Expand All @@ -100,6 +105,7 @@
- 'client_threads=0'
- 'duration=10m'
- 'timeout=11m'
- 'auto_fix_ulimit=1'
- 'extra_args='
hooks:
- hook: copymove
Expand All @@ -118,6 +124,7 @@
- '-s {lg_path}'
- '-R{scale_out}'
- '-c{client_threads}'
- '-U {auto_fix_ulimit}'
- '--'
- '--mediawiki-mlp'
- '--client-duration={duration}'
Expand All @@ -133,6 +140,7 @@
- 'client_threads=0'
- 'duration=10m'
- 'timeout=11m'
- 'auto_fix_ulimit=1'
- 'extra_args='
hooks:
- hook: copymove
Expand All @@ -151,6 +159,7 @@
- '-s {lg_path}'
- '-R{scale_out}'
- '-c{client_threads}'
- '-U {auto_fix_ulimit}'
- '--'
- '--mediawiki-mem'
- '--client-duration={duration}'
Expand All @@ -165,6 +174,7 @@
- 'client_threads=0'
- 'duration=10m'
- 'timeout=11m'
- 'auto_fix_ulimit=1'
- 'extra_args='
hooks:
- hook: copymove
Expand Down
197 changes: 195 additions & 2 deletions packages/common/diagnosis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
import fcntl
import json
import os
import resource
import socket
import subprocess
import sys
from datetime import datetime
from typing import Any, Dict, List, Optional
Expand Down Expand Up @@ -212,8 +214,8 @@ def record_auto_fix(
Record a notable auto-fix that was applied and may affect the benchmark score.

Args:
benchmark: Name of the benchmark (e.g., "tao_bench")
fix_type: Type of fix (e.g., "ephemeral_port_cap")
benchmark: Name of the benchmark (e.g., "tao_bench", "mediawiki")
fix_type: Type of fix (e.g., "ephemeral_port_cap", "file_descriptor_limit")
description: Human-readable description of what was fixed and why
original_value: The original value before the fix
fixed_value: The value after the fix was applied
Expand Down Expand Up @@ -539,3 +541,194 @@ def record_port_unavailable_error(
"""
recorder = DiagnosisRecorder.get_instance(root_dir=root_dir)
recorder.record_port_unavailable_error(port=port, benchmark=benchmark, errno=errno)


def check_file_descriptor_limit(
    benchmark: str,
    required_fds: int,
    auto_fix: bool = False,
    root_dir: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> bool:
    """
    Verify that the RLIMIT_NOFILE soft limit covers *required_fds*, optionally
    raising it in place.

    Benchmarks that open many sockets, files, or database connections
    (e.g., mediawiki with HHVM + nginx + wrk, or tao_bench with many
    memtier client connections) need a generous file-descriptor budget.

    Args:
        benchmark: Name of the benchmark (for diagnosis recording)
        required_fds: Minimum number of file descriptors needed
        auto_fix: If True, attempt to raise the soft limit automatically
        root_dir: Root directory for diagnosis file
        metadata: Additional metadata to include in diagnosis records

    Returns:
        True if the FD limit is sufficient (or was auto-fixed), False otherwise
    """
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

    # Fast path: the current soft limit is already large enough.
    if soft >= required_fds:
        print(f"File descriptor limit OK: soft={soft}, need={required_fds}")
        return True

    recorder = DiagnosisRecorder.get_instance(root_dir=root_dir)
    extra = metadata or {}

    if auto_fix:
        try:
            # If the hard limit is below the target we must raise it too,
            # which the kernel only allows for privileged processes; the
            # except branch handles the resulting ValueError/OSError.
            resource.setrlimit(
                resource.RLIMIT_NOFILE,
                (required_fds, max(hard, required_fds)),
            )
            new_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
            recorder.record_auto_fix(
                benchmark=benchmark,
                fix_type="file_descriptor_limit",
                description=(
                    f"Raised file descriptor soft limit from {soft} to "
                    f"{new_soft} (need at least {required_fds})."
                ),
                original_value=soft,
                fixed_value=new_soft,
                score_impact=(
                    "None \u2014 raising the file descriptor limit has no "
                    "performance impact."
                ),
                metadata={
                    "original_soft_limit": soft,
                    "original_hard_limit": hard,
                    "new_soft_limit": new_soft,
                    "required_fds": required_fds,
                    **extra,
                },
            )
            return True
        except (ValueError, OSError) as exc:
            failure_reason = (
                f"File descriptor soft limit ({soft}) is too low "
                f"(need at least {required_fds}). Auto-fix failed: {exc}"
            )
            print(f"\nWARNING: {failure_reason}\n", file=sys.stderr)
            recorder.record_failure(
                benchmark=benchmark,
                error_type="file_descriptor_limit_low",
                reason=failure_reason,
                solutions=[
                    f"Run with higher ulimit: sudo bash -c 'ulimit -n "
                    f"{required_fds} && ./benchpress_cli.py run ...'",
                    "Set system-wide limit in /etc/security/limits.conf and reboot",
                ],
                metadata={
                    "soft_limit": soft,
                    "hard_limit": hard,
                    "required_fds": required_fds,
                    "auto_fix_error": str(exc),
                    **extra,
                },
            )
            return False

    # Auto-fix disabled: warn loudly, record the failure so the diagnosis
    # still shows up in the output JSON, and report insufficiency.
    failure_reason = (
        f"File descriptor soft limit ({soft}) is too low "
        f"(need at least {required_fds}). "
        f"Processes may fail with 'Too many open files'."
    )
    bar = "=" * 80
    print(
        f"\n{bar}\n"
        f"ERROR: File descriptor limit too low\n"
        f"{bar}\n\n"
        f"{failure_reason}\n"
        f" Current: soft={soft}, hard={hard}\n\n"
        f"SOLUTIONS:\n"
        f" 1. Enable auto-fix: add auto_fix_ulimit=1 to job input\n"
        f" 2. Run with higher ulimit: sudo bash -c 'ulimit -n "
        f"{required_fds} && ./benchpress_cli.py run ...'\n"
        f" 3. Set in /etc/security/limits.conf and reboot\n"
        f"\n{bar}\n",
        file=sys.stderr,
    )
    recorder.record_failure(
        benchmark=benchmark,
        error_type="file_descriptor_limit_low",
        reason=failure_reason,
        solutions=[
            "Enable auto-fix: add auto_fix_ulimit=1 to job input",
            f"Run with higher ulimit: sudo bash -c 'ulimit -n "
            f"{required_fds} && ./benchpress_cli.py run ...'",
            "Set system-wide limit in /etc/security/limits.conf and reboot",
        ],
        metadata={
            "soft_limit": soft,
            "hard_limit": hard,
            "required_fds": required_fds,
            **extra,
        },
    )
    return False


def check_selinux(
    benchmark: str,
    root_dir: Optional[str] = None,
) -> bool:
    """
    Check if SELinux is disabled or permissive.

    SELinux in Enforcing mode can cause segfaults or permission denials for
    benchmarks that use JIT compilation, custom memory allocators, or bind
    to non-standard ports.

    Args:
        benchmark: Name of the benchmark (for diagnosis recording)
        root_dir: Root directory for diagnosis file

    Returns:
        True if SELinux is disabled, permissive, or its state cannot be
        determined; False if Enforcing
    """
    try:
        result = subprocess.run(
            ["getenforce"], capture_output=True, text=True, timeout=5
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # getenforce not found (or hung) -- SELinux likely not installed
        print("SELinux check: getenforce not found, assuming SELinux is not installed")
        return True

    status = result.stdout.strip()

    # BUGFIX: previously a failing getenforce (non-zero exit) or empty stdout
    # fell through to the Enforcing branch, printing the misleading message
    # "SELinux is set to ''" and recording a false failure. Treat an
    # indeterminate result the same as "not installed" instead.
    if result.returncode != 0 or not status:
        print("SELinux check: could not determine SELinux status, skipping")
        return True

    if status in ("Disabled", "Permissive"):
        print(f"SELinux check OK: {status}")
        return True

    error_msg = (
        f"SELinux is set to '{status}'. This may cause segfaults or "
        f"permission errors during the {benchmark} benchmark."
    )
    print(
        f"\n{'=' * 80}\n"
        f"ERROR: SELinux is Enforcing\n"
        f"{'=' * 80}\n\n"
        f"{error_msg}\n\n"
        f"SOLUTIONS:\n"
        f" 1. Disable SELinux temporarily: sudo setenforce 0\n"
        f" 2. Disable SELinux permanently: edit /etc/selinux/config\n"
        f"\n{'=' * 80}\n",
        file=sys.stderr,
    )
    recorder = DiagnosisRecorder.get_instance(root_dir=root_dir)
    recorder.record_failure(
        benchmark=benchmark,
        error_type="selinux_enforcing",
        reason=error_msg,
        solutions=[
            "Disable SELinux temporarily: sudo setenforce 0",
            "Disable SELinux permanently: set SELINUX=disabled in "
            "/etc/selinux/config and reboot",
        ],
        metadata={"selinux_status": status},
    )
    return False
Loading
Loading