diff --git a/benchpress/config/jobs.yml b/benchpress/config/jobs.yml index ed6f884b..99481d41 100644 --- a/benchpress/config/jobs.yml +++ b/benchpress/config/jobs.yml @@ -1046,6 +1046,7 @@ - '--disable-tls={disable_tls}' - '--smart-nanosleep={smart_nanosleep}' - '--memory-file={memory_file}' + - '--ipv4={ipv4}' - '--real' vars: - 'interface_name=eth0' @@ -1058,6 +1059,7 @@ - 'disable_tls=0' - 'smart_nanosleep=0' - 'memory_file=' + - 'ipv4=0' client: args: - 'client' @@ -1074,6 +1076,7 @@ - '--sanity={sanity}' - '--wait-after-warmup={wait_after_warmup}' - '--disable-tls={disable_tls}' + - '--ipv4={ipv4}' - '--client-id={client_id}' - '--control-port={control_port}' - '--real' @@ -1089,6 +1092,7 @@ - 'wait_after_warmup=5' - 'sanity=0' - 'disable_tls=0' + - 'ipv4=0' - 'client_id=0' - 'control_port=0' hooks: @@ -1129,6 +1133,7 @@ - '--smart-nanosleep={smart_nanosleep}' - '--memory-file={memory_file}' - '--auto-warmup={auto_warmup}' + - '--ipv4={ipv4}' - '--real' vars: - 'num_servers=0' @@ -1152,6 +1157,7 @@ - 'smart_nanosleep=0' - 'memory_file=' - 'auto_warmup=0' + - 'ipv4=0' hooks: - hook: tao_instruction - hook: copymove @@ -1184,11 +1190,16 @@ - '--stats-interval={stats_interval}' - '--port-number-start={port_number_start}' - '--disable-tls={disable_tls}' + - '--timeout-buffer={timeout_buffer}' - '--num-slow-threads={num_slow_threads}' - '--num-fast-threads={num_fast_threads}' - '--num-client-threads={num_client_threads}' - '--memory-file={memory_file}' - '--auto-warmup={auto_warmup}' + - '--auto-fix-ports={auto_fix_ports}' + - '--auto-fix-ulimit={auto_fix_ulimit}' + - '--skip-hit-rate-check={skip_hit_rate_check}' + - '--ipv4={ipv4}' vars: - 'num_servers=0' - 'memsize=0' @@ -1203,11 +1214,16 @@ - 'stats_interval=5000' - 'port_number_start=11211' - 'disable_tls=0' + - 'timeout_buffer=0' - 'num_slow_threads=0' - 'num_fast_threads=0' - 'num_client_threads=0' - 'memory_file=' - 'auto_warmup=0' + - 'auto_fix_ports=0' + - 'auto_fix_ulimit=0' + - 'skip_hit_rate_check=0' + 
- 'ipv4=0' hooks: - hook: copymove options: @@ -1238,6 +1254,7 @@ - '--stats-interval={stats_interval}' - '--port-number-start={port_number_start}' - '--disable-tls={disable_tls}' + - '--timeout-buffer={timeout_buffer}' - '--num-slow-threads={num_slow_threads}' - '--num-fast-threads={num_fast_threads}' - '--num-client-threads={num_client_threads}' @@ -1246,6 +1263,10 @@ - '--postprocessing-timeout-buffer={postprocessing_timeout_buffer}' - '--poll-interval={poll_interval}' - '--memory-file={memory_file}' + - '--auto-fix-ports={auto_fix_ports}' + - '--auto-fix-ulimit={auto_fix_ulimit}' + - '--skip-hit-rate-check={skip_hit_rate_check}' + - '--ipv4={ipv4}' vars: - 'num_servers=0' - 'memsize=0.5' @@ -1260,6 +1281,7 @@ - 'stats_interval=5000' - 'port_number_start=11211' - 'disable_tls=0' + - 'timeout_buffer=30' - 'num_slow_threads=0' - 'num_fast_threads=0' - 'num_client_threads=0' @@ -1268,6 +1290,10 @@ - 'postprocessing_timeout_buffer=60' - 'poll_interval=0.2' - 'memory_file=' + - 'auto_fix_ports=1' + - 'auto_fix_ulimit=1' + - 'skip_hit_rate_check=0' + - 'ipv4=0' hooks: - hook: copymove options: diff --git a/benchpress/plugins/parsers/tao_bench_autoscale.py b/benchpress/plugins/parsers/tao_bench_autoscale.py index 26b0a8c6..b9fe92d2 100644 --- a/benchpress/plugins/parsers/tao_bench_autoscale.py +++ b/benchpress/plugins/parsers/tao_bench_autoscale.py @@ -20,13 +20,15 @@ def parse(self, stdout, stderr, returncode): metrics = {} jsontext = "" met_json = False + brace_depth = 0 for line in stdout: - if line.strip() == "{": + if line.strip() == "{" and not met_json: met_json = True if met_json: jsontext += line - if line.strip() == "}": - break + brace_depth += line.count("{") - line.count("}") + if brace_depth == 0: + break try: metrics = json.loads(jsontext) if "total_qps" in metrics: diff --git a/packages/common/diagnosis_utils.py b/packages/common/diagnosis_utils.py index e2b6910e..4c61ed20 100644 --- a/packages/common/diagnosis_utils.py +++ 
def record_auto_fix(
    self,
    benchmark: str,
    fix_type: str,
    description: str,
    original_value: Any = None,
    fixed_value: Any = None,
    score_impact: str = "",
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Log an automatically applied fix that may influence the benchmark score.

    The fix is announced on stdout and then appended to the diagnosis file
    as an entry tagged ``"category": "auto_fix"``.

    Args:
        benchmark: Name of the benchmark (e.g., "tao_bench")
        fix_type: Short identifier of the fix (e.g., "ephemeral_port_cap")
        description: Human-readable explanation of what was changed and why
        original_value: Value in effect before the fix
        fixed_value: Value in effect after the fix
        score_impact: Free-text note on how the fix may change the score
        metadata: Extra key/value details; stored as ``{}`` when omitted/empty
    """
    # Surface the fix in the run log before persisting it.
    print(f"AUTO-FIX ({fix_type}): {description}")

    stamp = datetime.utcnow().isoformat() + "Z"
    extra = metadata if metadata else {}

    # Keyword order below fixes the key order of the stored record.
    record = dict(
        timestamp=stamp,
        category="auto_fix",
        benchmark=benchmark,
        fix_type=fix_type,
        description=description,
        original_value=original_value,
        fixed_value=fixed_value,
        score_impact=score_impact,
        metadata=extra,
    )

    self._append_to_file(record)
""" try: # Read from the singleton instance's diagnosis file if not os.path.exists(self.diagnosis_file_path): results_dict["failures"] = [] + results_dict["notable_auto_fixes"] = [] return with open(self.diagnosis_file_path, "r") as f: @@ -320,13 +361,108 @@ def merge_failure_to_results( if not isinstance(records, list): records = [records] - # Add all failure records to results - results_dict["failures"] = records + # Separate failures from auto-fixes + results_dict["failures"] = [ + r for r in records if r.get("category") != "auto_fix" + ] + results_dict["notable_auto_fixes"] = [ + r for r in records if r.get("category") == "auto_fix" + ] except Exception as e: results_dict["failures"] = [] + results_dict["notable_auto_fixes"] = [] results_dict["failure_read_error"] = f"Failed to read diagnosis file: {e}" +def check_ipv6_hostname( + hostname: str, + benchmark: str = "unknown", + root_dir: Optional[str] = None, +) -> bool: + """ + Check if a hostname resolves to an IPv6 address and warn if --ipv4 may be needed. + + When a hostname resolves to IPv6 but IPv6 connectivity is broken or misconfigured, + clients will fail to connect to the server. This is a common issue on systems where + the hostname is registered with an IPv6 address (e.g., in /etc/hosts or DNS) but + IPv6 networking is not fully functional. + + Args: + hostname: The server hostname to check + benchmark: Name of the benchmark calling this function + root_dir: Root directory for diagnosis file + + Returns: + True if the hostname resolves to IPv6 (meaning --ipv4 may be needed), + False if it resolves to IPv4 only. + """ + try: + addr_infos = socket.getaddrinfo(hostname, None) + except socket.gaierror: + return False + + has_ipv6 = any(info[0] == socket.AF_INET6 for info in addr_infos) + has_ipv4 = any(info[0] == socket.AF_INET for info in addr_infos) + + if not has_ipv6: + return False + + # Hostname resolves to IPv6. Check if IPv6 connectivity actually works. 
+ ipv6_works = False + try: + sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) + sock.settimeout(2) + # Try binding to the IPv6 loopback to verify basic IPv6 support + sock.bind(("::1", 0)) + sock.close() + ipv6_works = True + except OSError as e: + ipv6_works = False + ipv6_errno = e.errno + ipv6_error_msg = str(e) + + # Only record a failure if IPv6 is actually broken. + # If IPv6 works (even without IPv4 fallback), there is no problem. + if not ipv6_works: + first_ipv6 = next( + (info[4][0] for info in addr_infos if info[0] == socket.AF_INET6), + None, + ) + recorder = DiagnosisRecorder.get_instance(root_dir=root_dir) + reason = ( + f"Hostname '{hostname}' resolves to IPv6 address ({first_ipv6}) " + f"but IPv6 networking is not functional on this system " + f"(errno={ipv6_errno}: {ipv6_error_msg}). " + f"Clients will fail to connect to the server." + ) + recorder.record_failure( + benchmark=benchmark, + error_type="ipv6_hostname_resolution", + reason=reason, + solutions=[ + "Force IPv4: --ipv4=1 (or add ipv4=1 to job input)", + f"Add IPv4 entry for hostname: " + f"echo '127.0.0.1 {hostname}' >> /etc/hosts", + "Ensure IPv6 networking is properly configured on this system", + ], + metadata={ + "hostname": hostname, + "ipv6_address": first_ipv6, + "has_ipv4": has_ipv4, + "ipv6_works": ipv6_works, + "resolved_addresses": [ + { + "family": ("IPv6" if info[0] == socket.AF_INET6 else "IPv4"), + "address": info[4][0], + } + for info in addr_infos + ], + }, + ) + + return True + + def check_port_available( port: int, interface: str = "0.0.0.0", diff --git a/packages/tao_bench/args_utils.py b/packages/tao_bench/args_utils.py index 1a71f780..385fd266 100644 --- a/packages/tao_bench/args_utils.py +++ b/packages/tao_bench/args_utils.py @@ -192,6 +192,12 @@ def add_common_server_args(server_parser: ArgumentParser) -> List[Tuple[str, str help="target hit ratio for auto-warmup detection. 
Warmup is considered " + "complete when hit ratio reaches 95%% of this value.", ) + server_parser.add_argument( + "--ipv4", + type=int, + default=0, + help="set to 1 to force IPv4 protocol instead of IPv6", + ) server_parser.add_argument("--real", action="store_true", help="for real") return get_opt_strings(server_parser) @@ -284,6 +290,12 @@ def add_common_client_args(client_parser: ArgumentParser) -> List[Tuple[str, str + "When > 0, client polls this port during warmup and stops early " + "when server reports warmed up. 0 = disabled (use full warmup time).", ) + client_parser.add_argument( + "--ipv4", + type=int, + default=0, + help="set to 1 to force IPv4 protocol instead of IPv6", + ) client_parser.add_argument("--real", action="store_true", help="for real") return get_opt_strings(client_parser) diff --git a/packages/tao_bench/parser.py b/packages/tao_bench/parser.py index c8339ab1..b27aaf23 100644 --- a/packages/tao_bench/parser.py +++ b/packages/tao_bench/parser.py @@ -45,8 +45,9 @@ class TaoBenchParser: # result qps calculation MIN_HIT_RATE = 0.88 - def __init__(self, server_csv_name="server.csv"): + def __init__(self, server_csv_name="server.csv", skip_hit_rate_check=False): self.server_csv_name = server_csv_name + self.skip_hit_rate_check = skip_hit_rate_check def parse(self, stdout, stderr, returncode): """Extracts TAO bench results from stdout.""" @@ -109,9 +110,9 @@ def process_server_snapshots(self, metrics, server_snapshots): if not snapshot.valid: continue # Also filter out data points with low hit rate - if ( - snapshot.get("fast_qps") > 1 - and snapshot.get("hit_rate") >= self.MIN_HIT_RATE + if snapshot.get("fast_qps") > 1 and ( + self.skip_hit_rate_check + or snapshot.get("hit_rate") >= self.MIN_HIT_RATE ): counter += 1 else: diff --git a/packages/tao_bench/run.py b/packages/tao_bench/run.py index 3b1b9102..19a2b0c7 100755 --- a/packages/tao_bench/run.py +++ b/packages/tao_bench/run.py @@ -9,6 +9,7 @@ import pathlib import re import shlex +import 
shutil import signal import subprocess import sys @@ -16,6 +17,13 @@ import time from typing import List +# Force line-buffered stdout so output is written to log files immediately, +# even when stdout is redirected to a file. Without this, output stays in +# a 4096-byte C buffer and is lost if the process is killed (SIGKILL). +if not sys.stdout.isatty(): + sys.stdout.reconfigure(line_buffering=True) + sys.stderr.reconfigure(line_buffering=True) + import args_utils from warmup_monitor import poll_control_port @@ -240,6 +248,8 @@ def run_server(args): "-I", "16m", ] + if args.ipv4: + server_cmd += ["-l", "0.0.0.0"] if not args.disable_tls: server_cmd.append("-Z") server_cmd += [ @@ -263,6 +273,13 @@ def run_server(args): if is_ubuntu(): os.environ["LD_LIBRARY_PATH"] = os.path.join(TAO_BENCH_DIR, "build-deps/lib") + # Force line-buffered stdout on the C server binary so stats output is + # flushed to the log file immediately. Without this, output stays in a + # 4096-byte buffer and is lost if the process is killed before flushing, + # which causes empty log files and zero scores on low core counts. 
+ if shutil.which("stdbuf"): + server_cmd = ["stdbuf", "-oL"] + server_cmd + timeout = args.warmup_time + args.test_time + args.timeout_buffer graceful_sig = signal.SIGUSR1 if args.memory_file else None run_cmd(server_cmd, timeout, args.real, graceful_signal=graceful_sig) @@ -328,6 +345,8 @@ def get_client_cmd(args, n_seconds): "--key-bytes=220", f"--test-time={n_seconds}", ] + if args.ipv4: + client_cmd.append("-4") if not args.disable_tls: client_cmd += [ f"--cert={s_cert}", diff --git a/packages/tao_bench/run_autoscale.py b/packages/tao_bench/run_autoscale.py index 4431e4b6..d8f9173e 100755 --- a/packages/tao_bench/run_autoscale.py +++ b/packages/tao_bench/run_autoscale.py @@ -23,7 +23,7 @@ # Add parent directory to path to import diagnosis_utils sys.path.insert(0, str(pathlib.Path(__file__).parents[1] / "common")) -from diagnosis_utils import DiagnosisRecorder +from diagnosis_utils import check_ipv6_hostname, DiagnosisRecorder BENCHPRESS_ROOT = pathlib.Path(os.path.abspath(__file__)).parents[2] @@ -185,6 +185,8 @@ def gen_client_instructions(args, to_file=True): client_args["num_threads"] = args.num_client_threads if hasattr(args, "auto_warmup") and args.auto_warmup > 0: client_args["control_port"] = args.port_number_start + 1000 + if hasattr(args, "ipv4") and args.ipv4 != 0: + client_args["ipv4"] = 1 clients[c] += ( " ".join( [ @@ -223,6 +225,8 @@ def gen_client_instructions(args, to_file=True): client_args["num_threads"] = args.num_client_threads if hasattr(args, "auto_warmup") and args.auto_warmup > 0: client_args["control_port"] = args.port_number_start + 1000 + if hasattr(args, "ipv4") and args.ipv4 != 0: + client_args["ipv4"] = 1 clients[i] += ( " ".join( [ @@ -359,6 +363,22 @@ def run_server(args): required_gb = float(args.memsize) * args_utils.MEM_USAGE_FACTOR ensure_shm_capacity(required_gb) + # Check if hostname resolves to IPv6 and warn if --ipv4 may be needed + if not args.ipv4: + server_hostname = ( + args.server_hostname if args.server_hostname 
else socket.gethostname() + ) + resolves_ipv6 = check_ipv6_hostname( + hostname=server_hostname, + benchmark="tao_bench", + root_dir=str(BENCHPRESS_ROOT), + ) + if resolves_ipv6: + print( + f"WARNING: Hostname '{server_hostname}' resolves to an IPv6 address. " + f"If clients fail to connect, re-run with --ipv4=1" + ) + core_ranges = distribute_cores(args.num_servers) # memory size - split evenly for each server n_mem = float(args.memsize) @@ -672,7 +692,8 @@ def run_server(args): for line in preview_lines: print(f" {line}") - parser = TaoBenchParser(f"server_{i}.csv") + skip_check = getattr(args, "skip_hit_rate_check", 0) != 0 + parser = TaoBenchParser(f"server_{i}.csv", skip_hit_rate_check=skip_check) res = parser.parse(log, None, returncode) # Diagnose parser results @@ -828,6 +849,15 @@ def init_parser(): help="number of slow threads for the server. If not specified, will use default calculation (fast_threads * slow_to_fast_ratio).", ) + parser.add_argument( + "--skip-hit-rate-check", + type=int, + default=0, + help="set to 1 to skip the hit rate threshold check when computing " + + "server QPS. 
Useful on low core counts where the cache cannot reach " + + "the 88%% hit rate within a short test.", + ) + # Add custom thread parameters to SERVER_CMD_OPTIONS SERVER_CMD_OPTIONS.append(("--num-fast-threads", "num_fast_threads")) SERVER_CMD_OPTIONS.append(("--num-slow-threads", "num_slow_threads")) diff --git a/packages/tao_bench/run_standalone.py b/packages/tao_bench/run_standalone.py index e250fb57..2c2ae24c 100755 --- a/packages/tao_bench/run_standalone.py +++ b/packages/tao_bench/run_standalone.py @@ -8,10 +8,15 @@ import os import pathlib import re +import resource import subprocess +import sys import threading import args_utils + +sys.path.insert(0, str(pathlib.Path(__file__).parents[1] / "common")) +from diagnosis_utils import DiagnosisRecorder from run_autoscale import gen_client_instructions BENCHPRESS_ROOT = pathlib.Path(os.path.abspath(__file__)).parents[2] @@ -148,6 +153,28 @@ def init_parser(): default=0, help="number of slow threads for the server. If not specified, will use default calculation (fast_threads * slow_to_fast_ratio).", ) + parser.add_argument( + "--auto-fix-ports", + type=int, + default=0, + help="automatically reduce clients_per_thread if total connections would exceed " + + "the ephemeral port range. Set to non-zero to enable.", + ) + parser.add_argument( + "--skip-hit-rate-check", + type=int, + default=0, + help="set to 1 to skip the hit rate threshold check when computing " + + "server QPS. Useful on low core counts where the cache cannot reach " + + "the 88%% hit rate within a short test.", + ) + parser.add_argument( + "--auto-fix-ulimit", + type=int, + default=0, + help="automatically raise the file descriptor soft limit if it is too " + + "low for the number of connections. 
Set to non-zero to enable.", + ) return parser @@ -170,7 +197,6 @@ def launch_server(port_number_start=11211, bind_cpu=1, bind_mem=1): } script_args["--interface-name"] = "lo" script_args["--client-wait-after-warmup"] = 0 - script_args["--timeout-buffer"] = 0 if port_number_start > 0: script_args["--port-number-start"] = port_number_start script_args["--bind-cpu"] = bind_cpu @@ -196,6 +222,10 @@ def launch_server(port_number_start=11211, bind_cpu=1, bind_mem=1): if hasattr(args, "target_hit_ratio") and args.target_hit_ratio != 0.9: script_args["--target-hit-ratio"] = args.target_hit_ratio + # Pass skip-hit-rate-check if specified + if hasattr(args, "skip_hit_rate_check") and args.skip_hit_rate_check: + script_args["--skip-hit-rate-check"] = args.skip_hit_rate_check + cmd = [f"{TAO_BENCH_DIR}/run_autoscale.py --real"] for argname, argval in script_args.items(): @@ -240,7 +270,176 @@ def launch_client(cmd, n=1, client_id=0): args.memsize = args_utils.get_system_memsize_gb() * 0.75 args.warmup_time = args_utils.get_warmup_time(args) args.server_memsize = args.memsize - args.server_hostname = "localhost" + args.server_hostname = "127.0.0.1" if args.ipv4 else "localhost" + + # Initialize DiagnosisRecorder so subprocesses share the same diagnosis file + recorder = DiagnosisRecorder.get_instance(root_dir=str(BENCHPRESS_ROOT)) + + # In standalone mode, all client connections go through loopback and share + # the same ephemeral port pool. Check if total connections would exceed + # the available ephemeral ports. 
+ n_cores = len(os.sched_getaffinity(0)) + if args.num_client_threads > 0: + threads_per_client = args.num_client_threads + else: + threads_per_client = max(1, n_cores - 6, int(n_cores * 0.8)) + if args.clients_per_thread <= 0: + args.clients_per_thread = args_utils.sanitize_clients_per_thread(380) + + port_low, port_high = "32768", "60999" + available_ports = 28231 # default: 60999 - 32768 + try: + with open("/proc/sys/net/ipv4/ip_local_port_range") as f: + port_low, port_high = f.read().split() + available_ports = int(port_high) - int(port_low) + except (OSError, ValueError): + pass + + max_conns = int(available_ports * 0.8) # leave 20% margin + total_conns = args.num_clients * threads_per_client * args.clients_per_thread + + if total_conns > max_conns: + if args.auto_fix_ports: + original_cpt = args.clients_per_thread + args.clients_per_thread = max( + 1, max_conns // (args.num_clients * threads_per_client) + ) + new_total = args.num_clients * threads_per_client * args.clients_per_thread + recorder.record_auto_fix( + benchmark="tao_bench", + fix_type="ephemeral_port_cap", + description=( + f"Reduced clients_per_thread from {original_cpt} to " + f"{args.clients_per_thread} because total connections " + f"({total_conns}) would exceed the ephemeral port range " + f"({available_ports} ports). All connections in standalone " + f"mode go through loopback and share the same port pool." + ), + original_value=original_cpt, + fixed_value=args.clients_per_thread, + score_impact=( + "Fewer client connections may reduce load on the server, " + "potentially resulting in a lower QPS score compared to " + "systems with a wider ephemeral port range." 
+ ), + metadata={ + "num_clients": args.num_clients, + "threads_per_client": threads_per_client, + "original_total_conns": total_conns, + "fixed_total_conns": new_total, + "available_ports": available_ports, + "port_range": f"{port_low}-{port_high}", + }, + ) + total_conns = new_total + else: + error_msg = ( + f"Total client connections ({total_conns}) exceeds the available " + f"ephemeral port range ({available_ports} ports, " + f"net.ipv4.ip_local_port_range = {port_low} {port_high}). " + f"Clients will fail with 'Cannot assign requested address'." + ) + recorder.record_failure( + benchmark="tao_bench", + error_type="ephemeral_port_exhaustion", + reason=error_msg, + solutions=[ + "Expand port range: sudo sysctl -w net.ipv4.ip_local_port_range='1024 65535'", + f"Reduce clients_per_thread: --clients-per-thread=" + f"{max(1, max_conns // (args.num_clients * threads_per_client))}", + "Enable auto-fix: --auto-fix-ports=1", + ], + metadata={ + "num_clients": args.num_clients, + "threads_per_client": threads_per_client, + "clients_per_thread": args.clients_per_thread, + "total_connections": total_conns, + "available_ports": available_ports, + "port_range": f"{port_low}-{port_high}", + }, + ) + + # Check if the file descriptor limit is high enough for all connections. + # Each connection uses a socket (file descriptor). The server uses -c 180000 + # max connections, and each client opens threads_per_client * clients_per_thread + # connections. Add overhead for log files, threads, etc. 
+ fd_overhead = 1000 + required_fds = total_conns + fd_overhead + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) + + if soft_limit < required_fds: + if args.auto_fix_ulimit: + new_limit = max(required_fds, 100000) + try: + if hard_limit < new_limit: + # Raising hard limit requires root + resource.setrlimit(resource.RLIMIT_NOFILE, (new_limit, new_limit)) + else: + resource.setrlimit(resource.RLIMIT_NOFILE, (new_limit, hard_limit)) + actual_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE) + recorder.record_auto_fix( + benchmark="tao_bench", + fix_type="file_descriptor_limit", + description=( + f"Raised file descriptor soft limit from {soft_limit} to " + f"{actual_soft} because {total_conns} connections plus " + f"overhead require at least {required_fds} file descriptors." + ), + original_value=soft_limit, + fixed_value=actual_soft, + score_impact="None — raising the file descriptor limit has no performance impact.", + metadata={ + "original_soft_limit": soft_limit, + "original_hard_limit": hard_limit, + "new_soft_limit": actual_soft, + "required_fds": required_fds, + "total_connections": total_conns, + }, + ) + except (ValueError, OSError) as e: + error_msg = ( + f"File descriptor soft limit ({soft_limit}) is too low for " + f"{total_conns} connections (need at least {required_fds}). " + f"Auto-fix failed: {e}" + ) + recorder.record_failure( + benchmark="tao_bench", + error_type="file_descriptor_limit_low", + reason=error_msg, + solutions=[ + f"Run with higher ulimit: sudo bash -c 'ulimit -n {max(required_fds, 100000)} && ./benchpress_cli.py run ...'", + "Set system-wide limit in /etc/security/limits.conf", + ], + metadata={ + "soft_limit": soft_limit, + "hard_limit": hard_limit, + "required_fds": required_fds, + "total_connections": total_conns, + "auto_fix_error": str(e), + }, + ) + else: + error_msg = ( + f"File descriptor soft limit ({soft_limit}) is too low for " + f"{total_conns} connections (need at least {required_fds}). 
" + f"Server and clients may fail with 'Too many open files'." + ) + recorder.record_failure( + benchmark="tao_bench", + error_type="file_descriptor_limit_low", + reason=error_msg, + solutions=[ + "Enable auto-fix: --auto-fix-ulimit=1", + f"Run with higher ulimit: sudo bash -c 'ulimit -n {max(required_fds, 100000)} && ./benchpress_cli.py run ...'", + "Set system-wide limit in /etc/security/limits.conf", + ], + metadata={ + "soft_limit": soft_limit, + "hard_limit": hard_limit, + "required_fds": required_fds, + "total_connections": total_conns, + }, + ) t_server = threading.Thread( target=launch_server,