Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions benchpress/config/jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1046,6 +1046,7 @@
- '--disable-tls={disable_tls}'
- '--smart-nanosleep={smart_nanosleep}'
- '--memory-file={memory_file}'
- '--ipv4={ipv4}'
- '--real'
vars:
- 'interface_name=eth0'
Expand All @@ -1058,6 +1059,7 @@
- 'disable_tls=0'
- 'smart_nanosleep=0'
- 'memory_file='
- 'ipv4=0'
client:
args:
- 'client'
Expand All @@ -1074,6 +1076,7 @@
- '--sanity={sanity}'
- '--wait-after-warmup={wait_after_warmup}'
- '--disable-tls={disable_tls}'
- '--ipv4={ipv4}'
- '--client-id={client_id}'
- '--control-port={control_port}'
- '--real'
Expand All @@ -1089,6 +1092,7 @@
- 'wait_after_warmup=5'
- 'sanity=0'
- 'disable_tls=0'
- 'ipv4=0'
- 'client_id=0'
- 'control_port=0'
hooks:
Expand Down Expand Up @@ -1129,6 +1133,7 @@
- '--smart-nanosleep={smart_nanosleep}'
- '--memory-file={memory_file}'
- '--auto-warmup={auto_warmup}'
- '--ipv4={ipv4}'
- '--real'
vars:
- 'num_servers=0'
Expand All @@ -1152,6 +1157,7 @@
- 'smart_nanosleep=0'
- 'memory_file='
- 'auto_warmup=0'
- 'ipv4=0'
hooks:
- hook: tao_instruction
- hook: copymove
Expand Down Expand Up @@ -1184,11 +1190,16 @@
- '--stats-interval={stats_interval}'
- '--port-number-start={port_number_start}'
- '--disable-tls={disable_tls}'
- '--timeout-buffer={timeout_buffer}'
- '--num-slow-threads={num_slow_threads}'
- '--num-fast-threads={num_fast_threads}'
- '--num-client-threads={num_client_threads}'
- '--memory-file={memory_file}'
- '--auto-warmup={auto_warmup}'
- '--auto-fix-ports={auto_fix_ports}'
- '--auto-fix-ulimit={auto_fix_ulimit}'
- '--skip-hit-rate-check={skip_hit_rate_check}'
- '--ipv4={ipv4}'
vars:
- 'num_servers=0'
- 'memsize=0'
Expand All @@ -1203,11 +1214,16 @@
- 'stats_interval=5000'
- 'port_number_start=11211'
- 'disable_tls=0'
- 'timeout_buffer=0'
- 'num_slow_threads=0'
- 'num_fast_threads=0'
- 'num_client_threads=0'
- 'memory_file='
- 'auto_warmup=0'
- 'auto_fix_ports=0'
- 'auto_fix_ulimit=0'
- 'skip_hit_rate_check=0'
- 'ipv4=0'
hooks:
- hook: copymove
options:
Expand Down Expand Up @@ -1238,6 +1254,7 @@
- '--stats-interval={stats_interval}'
- '--port-number-start={port_number_start}'
- '--disable-tls={disable_tls}'
- '--timeout-buffer={timeout_buffer}'
- '--num-slow-threads={num_slow_threads}'
- '--num-fast-threads={num_fast_threads}'
- '--num-client-threads={num_client_threads}'
Expand All @@ -1246,6 +1263,10 @@
- '--postprocessing-timeout-buffer={postprocessing_timeout_buffer}'
- '--poll-interval={poll_interval}'
- '--memory-file={memory_file}'
- '--auto-fix-ports={auto_fix_ports}'
- '--auto-fix-ulimit={auto_fix_ulimit}'
- '--skip-hit-rate-check={skip_hit_rate_check}'
- '--ipv4={ipv4}'
vars:
- 'num_servers=0'
- 'memsize=0.5'
Expand All @@ -1260,6 +1281,7 @@
- 'stats_interval=5000'
- 'port_number_start=11211'
- 'disable_tls=0'
- 'timeout_buffer=30'
- 'num_slow_threads=0'
- 'num_fast_threads=0'
- 'num_client_threads=0'
Expand All @@ -1268,6 +1290,10 @@
- 'postprocessing_timeout_buffer=60'
- 'poll_interval=0.2'
- 'memory_file='
- 'auto_fix_ports=1'
- 'auto_fix_ulimit=1'
- 'skip_hit_rate_check=0'
- 'ipv4=0'
hooks:
- hook: copymove
options:
Expand Down
8 changes: 5 additions & 3 deletions benchpress/plugins/parsers/tao_bench_autoscale.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ def parse(self, stdout, stderr, returncode):
metrics = {}
jsontext = ""
met_json = False
brace_depth = 0
for line in stdout:
if line.strip() == "{":
if line.strip() == "{" and not met_json:
met_json = True
if met_json:
jsontext += line
if line.strip() == "}":
break
brace_depth += line.count("{") - line.count("}")
if brace_depth == 0:
break
try:
metrics = json.loads(jsontext)
if "total_qps" in metrics:
Expand Down
144 changes: 140 additions & 4 deletions packages/common/diagnosis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ def record_failure(
"""
entry = {
"timestamp": datetime.utcnow().isoformat() + "Z",
"category": "failure",
"benchmark": benchmark,
"error_type": error_type,
"reason": reason,
Expand All @@ -197,6 +198,44 @@ def record_failure(

self._append_to_file(entry)

def record_auto_fix(
    self,
    benchmark: str,
    fix_type: str,
    description: str,
    original_value: Any = None,
    fixed_value: Any = None,
    score_impact: str = "",
    metadata: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Announce an applied auto-fix and persist it as an "auto_fix" diagnosis entry.

    The fix is printed to stdout and then appended to the diagnosis file via
    ``self._append_to_file``. Entries written here carry
    ``"category": "auto_fix"``.

    Args:
        benchmark: Name of the benchmark (e.g., "tao_bench")
        fix_type: Short identifier of the fix (e.g., "ephemeral_port_cap")
        description: Human-readable explanation of what was changed and why
        original_value: Value in effect before the fix
        fixed_value: Value in effect after the fix
        score_impact: Free-text note on how the fix may affect the score
        metadata: Optional extra key/value data; stored as ``{}`` when omitted
    """
    # Echo the fix to stdout before anything is written to the file.
    print(f"AUTO-FIX ({fix_type}): {description}")

    stamp = datetime.utcnow().isoformat() + "Z"
    extra = metadata if metadata else {}

    record = {
        "timestamp": stamp,
        "category": "auto_fix",
        "benchmark": benchmark,
        "fix_type": fix_type,
        "description": description,
        "original_value": original_value,
        "fixed_value": fixed_value,
        "score_impact": score_impact,
        "metadata": extra,
    }
    self._append_to_file(record)

def _append_to_file(self, entry: Dict[str, Any]) -> None:
"""
Append an entry to the diagnosis file with file locking.
Expand Down Expand Up @@ -305,13 +344,15 @@ def merge_failure_to_results(
results_dict: Dictionary to merge failure information into

Modifies:
results_dict is updated with a "failures" key containing a list of all
diagnosis records, or an empty list if no records are found.
results_dict is updated with a "failures" key containing a list of
failure records, and a "notable_auto_fixes" key containing a list of
auto-fix records, or empty lists if no records are found.
"""
try:
# Read from the singleton instance's diagnosis file
if not os.path.exists(self.diagnosis_file_path):
results_dict["failures"] = []
results_dict["notable_auto_fixes"] = []
return

with open(self.diagnosis_file_path, "r") as f:
Expand All @@ -320,13 +361,108 @@ def merge_failure_to_results(
if not isinstance(records, list):
records = [records]

# Add all failure records to results
results_dict["failures"] = records
# Separate failures from auto-fixes
results_dict["failures"] = [
r for r in records if r.get("category") != "auto_fix"
]
results_dict["notable_auto_fixes"] = [
r for r in records if r.get("category") == "auto_fix"
]
except Exception as e:
results_dict["failures"] = []
results_dict["notable_auto_fixes"] = []
results_dict["failure_read_error"] = f"Failed to read diagnosis file: {e}"


def check_ipv6_hostname(
    hostname: str,
    benchmark: str = "unknown",
    root_dir: Optional[str] = None,
) -> bool:
    """
    Check if a hostname resolves to an IPv6 address and warn if --ipv4 may be needed.

    When a hostname resolves to IPv6 but IPv6 connectivity is broken or misconfigured,
    clients will fail to connect to the server. This is a common issue on systems where
    the hostname is registered with an IPv6 address (e.g., in /etc/hosts or DNS) but
    IPv6 networking is not fully functional.

    IPv6 support is probed by binding a TCP socket to the IPv6 loopback
    address ("::1"). If that probe fails, a failure entry with
    error_type "ipv6_hostname_resolution" is recorded through
    ``DiagnosisRecorder.get_instance(root_dir=root_dir)``.

    Args:
        hostname: The server hostname to check
        benchmark: Name of the benchmark calling this function
        root_dir: Root directory for diagnosis file

    Returns:
        True if the hostname resolves to IPv6 (meaning --ipv4 may be needed),
        False if it resolves to IPv4 only, cannot be resolved, or is not a
        valid (IDNA-encodable) host name.
    """
    try:
        addr_infos = socket.getaddrinfo(hostname, None)
    except (socket.gaierror, UnicodeError):
        # gaierror: the name does not resolve. UnicodeError: the name cannot
        # be IDNA-encoded (e.g. an empty or over-long label). Either way there
        # is nothing to diagnose, and this best-effort check must not raise.
        return False

    ipv6_addresses = [
        info[4][0] for info in addr_infos if info[0] == socket.AF_INET6
    ]
    if not ipv6_addresses:
        return False
    has_ipv4 = any(info[0] == socket.AF_INET for info in addr_infos)

    # Hostname resolves to IPv6. Check if IPv6 connectivity actually works.
    ipv6_error: Optional[OSError] = None
    try:
        # The context manager closes the socket even when bind() raises, so a
        # failed probe does not leak a file descriptor.
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as sock:
            sock.settimeout(2)
            # Try binding to the IPv6 loopback to verify basic IPv6 support
            sock.bind(("::1", 0))
    except OSError as e:
        ipv6_error = e
    ipv6_works = ipv6_error is None

    # Only record a failure if IPv6 is actually broken.
    # If IPv6 works (even without IPv4 fallback), there is no problem.
    if ipv6_error is not None:
        first_ipv6 = ipv6_addresses[0]
        recorder = DiagnosisRecorder.get_instance(root_dir=root_dir)
        reason = (
            f"Hostname '{hostname}' resolves to IPv6 address ({first_ipv6}) "
            f"but IPv6 networking is not functional on this system "
            f"(errno={ipv6_error.errno}: {ipv6_error}). "
            f"Clients will fail to connect to the server."
        )
        recorder.record_failure(
            benchmark=benchmark,
            error_type="ipv6_hostname_resolution",
            reason=reason,
            solutions=[
                "Force IPv4: --ipv4=1 (or add ipv4=1 to job input)",
                f"Add IPv4 entry for hostname: "
                f"echo '127.0.0.1 {hostname}' >> /etc/hosts",
                "Ensure IPv6 networking is properly configured on this system",
            ],
            metadata={
                "hostname": hostname,
                "ipv6_address": first_ipv6,
                "has_ipv4": has_ipv4,
                "ipv6_works": ipv6_works,
                "resolved_addresses": [
                    {
                        "family": ("IPv6" if info[0] == socket.AF_INET6 else "IPv4"),
                        "address": info[4][0],
                    }
                    for info in addr_infos
                ],
            },
        )

    return True


def check_port_available(
port: int,
interface: str = "0.0.0.0",
Expand Down
12 changes: 12 additions & 0 deletions packages/tao_bench/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ def add_common_server_args(server_parser: ArgumentParser) -> List[Tuple[str, str
help="target hit ratio for auto-warmup detection. Warmup is considered "
+ "complete when hit ratio reaches 95%% of this value.",
)
server_parser.add_argument(
"--ipv4",
type=int,
default=0,
help="set to 1 to force IPv4 protocol instead of IPv6",
)
server_parser.add_argument("--real", action="store_true", help="for real")

return get_opt_strings(server_parser)
Expand Down Expand Up @@ -284,6 +290,12 @@ def add_common_client_args(client_parser: ArgumentParser) -> List[Tuple[str, str
+ "When > 0, client polls this port during warmup and stops early "
+ "when server reports warmed up. 0 = disabled (use full warmup time).",
)
client_parser.add_argument(
"--ipv4",
type=int,
default=0,
help="set to 1 to force IPv4 protocol instead of IPv6",
)
client_parser.add_argument("--real", action="store_true", help="for real")

return get_opt_strings(client_parser)
9 changes: 5 additions & 4 deletions packages/tao_bench/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ class TaoBenchParser:
# result qps calculation
MIN_HIT_RATE = 0.88

def __init__(self, server_csv_name="server.csv"):
def __init__(self, server_csv_name="server.csv", skip_hit_rate_check=False):
self.server_csv_name = server_csv_name
self.skip_hit_rate_check = skip_hit_rate_check

def parse(self, stdout, stderr, returncode):
"""Extracts TAO bench results from stdout."""
Expand Down Expand Up @@ -109,9 +110,9 @@ def process_server_snapshots(self, metrics, server_snapshots):
if not snapshot.valid:
continue
# Also filter out data points with low hit rate
if (
snapshot.get("fast_qps") > 1
and snapshot.get("hit_rate") >= self.MIN_HIT_RATE
if snapshot.get("fast_qps") > 1 and (
self.skip_hit_rate_check
or snapshot.get("hit_rate") >= self.MIN_HIT_RATE
):
counter += 1
else:
Expand Down
Loading
Loading