diff --git a/.gitignore b/.gitignore index e037c38a..3f20c2de 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,7 @@ funnai_accounts/ # claude code CLAUDE.md +.claude/ scripts/burned_tokens_cache.json @@ -165,4 +166,4 @@ scripts/upgrade_mainers.logs scripts/logs-mainer-analysis/ # lp_holders list (generated output) -scripts/lp_holders/ \ No newline at end of file +scripts/lp_holders/ diff --git a/scripts/analyze_active_mainer_logs.py b/scripts/analyze_active_mainer_logs.py new file mode 100644 index 00000000..9f8e09ca --- /dev/null +++ b/scripts/analyze_active_mainer_logs.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 +"""Fetch and analyze logs for all active mAIners using status data from check_mAIner_status. + +Reads the latest status JSON to identify active mAIners, fetches their logs in parallel, +and produces a health analysis report. +""" + +import argparse +import os +import json +import subprocess +import glob +from datetime import datetime +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Expected log patterns for a healthy mAIner flow +FLOW_STEPS = [ + "Recurring action 1 was triggered", + "pullNextChallenge - entered", + "pullNextChallenge - calling getChallengeFromGameStateCanister", + "calling getRandomOpenChallenge", +] + +# Patterns that indicate successful progression beyond challenge pull +SUCCESS_INDICATORS = [ + "got a challenge", + "sendToShareService", + "ShareService", + "submission", + "submitResponse", + "Inference", + "inference", + "response received", + "submitToGameState", +] + +# Patterns that indicate errors +ERROR_INDICATORS = [ + "ERROR", + "Error", + "error", + "trap", + "Trap", + "reject", + "Reject", + "failed", + "Failed", + "timeout", + "Timeout", +] + + +def get_logs(canister_id, network): + """Fetch logs for a canister with retry.""" + for attempt in range(3): + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True, + timeout=30, + ) + return output.strip().splitlines() + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + if attempt == 2: + return None + return None + + +def analyze_canister_logs(canister_id, lines): + """Analyze log lines for a single canister and return a health assessment.""" + if lines is None: + return { + "canister_id": canister_id, + "verdict": "UNREACHABLE", + "detail": "Could not fetch logs", + "total_lines": 0, + "recurring_triggers": 0, + "pull_challenge_count": 0, + "success_indicators": [], + "error_lines": [], + "last_activity": "", + "timer_interval_hours": None, + } + + total_lines = len(lines) + recurring_count = 0 + pull_count = 0 + success_found = [] + error_lines = [] + timestamps = [] + + for line in lines: + # Extract timestamp + if "Z]:" in line: + try: + ts_start = line.index(". 
") + 2 + ts_end = line.index("Z]:") + 1 + ts_str = line[ts_start:ts_end] + timestamps.append(ts_str) + except (ValueError, IndexError): + pass + + if "Recurring action 1 was triggered" in line: + recurring_count += 1 + + if "pullNextChallenge - entered" in line: + pull_count += 1 + + for indicator in SUCCESS_INDICATORS: + if indicator in line: + # Keep unique indicators only + if indicator not in success_found: + success_found.append(indicator) + break + + for indicator in ERROR_INDICATORS: + if indicator in line: + # Store the actual error line (truncated) + error_lines.append(line.strip()[-150:]) + break + + # Determine last activity timestamp + last_activity = timestamps[-1] if timestamps else "" + + # Calculate average timer interval + timer_interval = None + if len(timestamps) >= 2: + try: + first = datetime.fromisoformat(timestamps[0].replace("Z", "+00:00")) + last = datetime.fromisoformat(timestamps[-1].replace("Z", "+00:00")) + span_hours = (last - first).total_seconds() / 3600 + if recurring_count > 1: + timer_interval = round(span_hours / (recurring_count - 1), 1) + except (ValueError, IndexError): + pass + + # Determine verdict + if recurring_count == 0: + verdict = "NO_TIMERS" + detail = "No recurring timer triggers found in logs" + elif success_found: + verdict = "OK" + detail = f"Timers firing, challenge flow progressing ({', '.join(success_found)})" + elif pull_count > 0 and not success_found: + verdict = "STUCK_AT_CHALLENGE_PULL" + detail = "Timers fire, pullNextChallenge runs, but no evidence of challenge completion" + elif recurring_count > 0 and pull_count == 0: + verdict = "TIMER_ONLY" + detail = "Timers fire but no pullNextChallenge steps visible (possible log buffer overflow or silent failure)" + else: + verdict = "UNKNOWN" + detail = "Could not determine health from logs" + + return { + "canister_id": canister_id, + "verdict": verdict, + "detail": detail, + "total_lines": total_lines, + "recurring_triggers": recurring_count, + "pull_challenge_count": pull_count, + "success_indicators": success_found, + "error_lines": error_lines[:5], # keep top 5 + "last_activity": last_activity, + "timer_interval_hours": timer_interval, + } + + +def load_status_data(network): + """Load the latest check_mAIner_status JSON for the given network.""" + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + pattern = os.path.join(logs_dir, f"*-check_mAIner_status-{network}.json") + files = sorted(glob.glob(pattern)) + if not files: + return None + # Use the latest file + with open(files[-1]) as f: + return json.load(f) + + +def write_markdown_report(analysis_results, network, md_path): + """Write analysis report as markdown.""" + lines = [] + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + lines.append(f"# Active mAIner Log Analysis — {network} — {timestamp}") + lines.append("") + + # Group by verdict + by_verdict = defaultdict(list) + for r in analysis_results: + by_verdict[r["verdict"]].append(r) + + # Summary + lines.append("## Summary") + lines.append("") + lines.append("| Verdict | Count |") + lines.append("|-------------------------|-------|") + for verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY", "NO_TIMERS", "UNREACHABLE", "UNKNOWN"]: + count = len(by_verdict.get(verdict, [])) + if count > 0 or verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY"]: + lines.append(f"| {verdict:<23} | {count:>5} |") + lines.append(f"| **Total** | **{len(analysis_results):>3}** |") + lines.append("") + + # Verdict explanations + lines.append("### Verdict Legend") + 
lines.append("") + lines.append("- **OK**: Timers firing and evidence of challenge processing beyond `getRandomOpenChallenge`") + lines.append("- **STUCK_AT_CHALLENGE_PULL**: Timers fire, `pullNextChallenge` runs, but no evidence of receiving a challenge or completing inference") + lines.append("- **TIMER_ONLY**: Timers fire but `pullNextChallenge` steps not visible in log buffer (may be log overflow)") + lines.append("- **NO_TIMERS**: No recurring timer triggers found") + lines.append("- **UNREACHABLE**: Could not fetch logs from canister") + lines.append("") + + # Detail table + cid_width = 27 + verdict_width = 23 + lines.append("## All Active mAIners") + lines.append("") + lines.append(f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Burn Rate':<9} | {'Verdict':<{verdict_width}} | {'Triggers':<8} | {'Pulls':<5} | {'Interval':<8} | {'Days Left':<9} | {'Last Activity':<25} |") + lines.append(f"|{'-'*6}|{'-'*(cid_width+2)}|{'-'*11}|{'-'*(verdict_width+2)}|{'-'*10}|{'-'*7}|{'-'*10}|{'-'*11}|{'-'*27}|") + + # Sort: problems first, then by days_until_freeze ascending + verdict_order = {"UNREACHABLE": 0, "NO_TIMERS": 1, "STUCK_AT_CHALLENGE_PULL": 2, "TIMER_ONLY": 3, "UNKNOWN": 4, "OK": 5} + sorted_results = sorted(analysis_results, key=lambda r: (verdict_order.get(r["verdict"], 99), r.get("days_until_freeze") if r.get("days_until_freeze") is not None else 9999)) + + for idx, r in enumerate(sorted_results, 1): + interval = f"{r['timer_interval_hours']}h" if r["timer_interval_hours"] else "" + days_left = f"{r['days_until_freeze']:.1f}" if r.get("days_until_freeze") is not None else "" + lines.append( + f"| {idx:<4} " + f"| {r['canister_id']:<{cid_width}} " + f"| {r.get('burn_rate', ''):<9} " + f"| {r['verdict']:<{verdict_width}} " + f"| {r['recurring_triggers']:<8} " + f"| {r['pull_challenge_count']:<5} " + f"| {interval:<8} " + f"| {days_left:>9} " + f"| {r['last_activity']:<25} |" + ) + lines.append("") + + # Problem canisters detail + problems = [r for r in sorted_results if r["verdict"] != "OK"] + if problems: + lines.append("## Problem mAIners — Details") + lines.append("") + for r in problems: + lines.append(f"### `{r['canister_id']}` — {r['verdict']}") + lines.append("") + lines.append(f"- **Detail**: {r['detail']}") + lines.append(f"- **Log lines**: {r['total_lines']}, Triggers: {r['recurring_triggers']}, Pulls: {r['pull_challenge_count']}") + if r["error_lines"]: + lines.append(f"- **Errors found**:") + for err in r["error_lines"]: + lines.append(f" - `{err}`") + lines.append("") + + # OK canisters with success indicators + ok_results = by_verdict.get("OK", []) + if ok_results: + lines.append("## OK mAIners — Success Indicators") + lines.append("") + for r in ok_results: + indicators = ", ".join(r["success_indicators"]) if r["success_indicators"] else "none" + lines.append(f"- `{r['canister_id']}`: {indicators}") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10): + # Step 1: Load status data + status_data = load_status_data(network) + if not status_data: + print(f"ERROR: No check_mAIner_status JSON found for network '{network}'.") + print(f"Run check_mAIner_status.sh --network {network} first.") + return + + print(f"Loaded status data from {status_data['timestamp']} ({status_data['summary']['total_share_agents']} total mAIners)") + + # Step 2: Extract active mAIners with burn rates and freeze prediction + active_mainers = [] + for m in status_data["mainers"]: + if m.get("active") is True and m.get("burn_rate", ""): 
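+            # Note: this also skips mAIners whose "active" flag is missing,
+            # and active ones with an empty burn_rate string; only fully
+            # classified active mAIners enter the analysis set.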
+ active_mainers.append({ + "canister_id": m["canister_id"], + "burn_rate": m["burn_rate"], + "days_until_freeze": m.get("days_until_freeze"), + }) + + if not active_mainers: + print("No active mAIners found in status data.") + return + + print(f"Found {len(active_mainers)} active mAIners to analyze.") + + # Step 3: Fetch logs in parallel + print(f"Fetching logs with {workers} parallel workers...") + logs_map = {} # canister_id -> lines + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_id = { + executor.submit(get_logs, m["canister_id"], network): m["canister_id"] + for m in active_mainers + } + done = 0 + for future in as_completed(future_to_id): + done += 1 + cid = future_to_id[future] + try: + logs_map[cid] = future.result() + except Exception: + logs_map[cid] = None + if done % 25 == 0: + print(f" Fetched {done}/{len(active_mainers)} logs...") + + print(f" Fetched all {len(active_mainers)} logs.") + + # Step 4: Analyze each canister + print("Analyzing logs...") + analysis_results = [] + for m in active_mainers: + cid = m["canister_id"] + result = analyze_canister_logs(cid, logs_map.get(cid)) + result["burn_rate"] = m["burn_rate"] + result["days_until_freeze"] = m.get("days_until_freeze") + analysis_results.append(result) + + # Step 5: Print summary + by_verdict = defaultdict(int) + for r in analysis_results: + by_verdict[r["verdict"]] += 1 + + print(f"\n{'='*60}") + print("LOG ANALYSIS SUMMARY") + print(f"{'='*60}") + for verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY", "NO_TIMERS", "UNREACHABLE", "UNKNOWN"]: + count = by_verdict.get(verdict, 0) + if count > 0: + print(f" {verdict:<25}: {count}") + print(f" {'Total':<25}: {len(analysis_results)}") + print() + + # Step 6: Write output files + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + json_path = os.path.join(logs_dir, f"{date_prefix}-analyze_active_mainer_logs-{network}.json") + with open(json_path, 'w') as f: + json.dump({ + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "network": network, + "total_active": len(active_mainers), + "summary": dict(by_verdict), + "results": analysis_results, + }, f, indent=2) + print(f"JSON report: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-analyze_active_mainer_logs-{network}.md") + write_markdown_report(analysis_results, network, md_path) + print(f"Markdown report: {os.path.abspath(md_path)}") + print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Fetch and analyze logs for all active mAIners." 
+ ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers for log fetching (default: 10)", + ) + args = parser.parse_args() + main(args.network, args.workers) diff --git a/scripts/analyze_active_mainer_logs.sh b/scripts/analyze_active_mainer_logs.sh new file mode 100755 index 00000000..02bc4cd1 --- /dev/null +++ b/scripts/analyze_active_mainer_logs.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" + +# Parse command line arguments +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." + exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.analyze_active_mainer_logs --network $NETWORK_TYPE $WORKERS diff --git a/scripts/analyze_frozen_mainer_logs.py b/scripts/analyze_frozen_mainer_logs.py new file mode 100644 index 00000000..b9cda535 --- /dev/null +++ b/scripts/analyze_frozen_mainer_logs.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 +"""Fetch and analyze logs for all frozen mAIners to determine why they froze. + +Reads the latest status JSON to identify frozen mAIners, fetches their logs +(dfx canister logs works on frozen canisters), and analyzes for freeze causes. +""" + +import argparse +import os +import json +import subprocess +import glob +import re +from datetime import datetime, timezone +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Patterns to detect in logs +TIMER_TRIGGER = "Recurring action 1 was triggered" +TIMER_2_TRIGGER = "Recurring action 2 was triggered" +PULL_CHALLENGE = "pullNextChallenge - entered" +GOT_CHALLENGE = "pullNextChallenge - challenge =" +SEND_TO_SHARE = "addChallengeToShareServiceQueue" +STORE_RESPONSE = "storeAndSubmitResponse" +SUBMIT_TO_GS = "submitChallengeResponse" +TIMER_START = "startTimerExecution" +TIMER_SET = "setTimer" +CYCLES_ADD = "Cycles.add for" +UNOFFICIAL_TOPUP = "Unofficial top ups" + +# Error patterns +ERROR_PATTERNS = [ + "trap", "Trap", "ERROR", "Error", "reject", "Reject", + "failed", "Failed", "timeout", "Timeout", "out of cycles", + "canister_error", "IC0503", "IC0502", +] + + +def get_logs(canister_id, network): + """Fetch logs for a canister with retry.""" + for attempt in range(3): + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True, + timeout=30, + ) + return output.strip().splitlines() + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + if attempt == 2: + return None + return None + + +def parse_timestamp(line): + """Extract timestamp from a log line like '[123. 
2026-03-22T23:47:22.895Z]: ...'""" + match = re.search(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)', line) + if match: + ts_str = match.group(1) + # Truncate nanoseconds to microseconds for parsing + ts_str = re.sub(r'(\.\d{6})\d*Z', r'\1Z', ts_str) + try: + return datetime.fromisoformat(ts_str.replace("Z", "+00:00")) + except ValueError: + pass + return None + + +def calculate_timer_intervals(timestamps): + """Calculate timer intervals from a list of timestamps. + + Returns list of intervals in seconds. + """ + if len(timestamps) < 2: + return [] + intervals = [] + for i in range(1, len(timestamps)): + delta = (timestamps[i] - timestamps[i - 1]).total_seconds() + if delta > 0: + intervals.append(delta) + return intervals + + +def analyze_canister_logs(canister_id, lines): + """Analyze log lines for a frozen canister and determine freeze cause.""" + if lines is None: + return { + "canister_id": canister_id, + "freeze_cause": "UNREACHABLE", + "detail": "Could not fetch logs", + "total_lines": 0, + } + + total_lines = len(lines) + + # Counters + timer1_count = 0 + timer2_count = 0 + pull_count = 0 + got_challenge_count = 0 + send_share_count = 0 + store_response_count = 0 + submit_gs_count = 0 + timer_start_count = 0 + cycles_add_count = 0 + unofficial_topup_count = 0 + error_lines = [] + + # Timestamps for timer interval analysis + timer1_timestamps = [] + timer2_timestamps = [] + all_timestamps = [] + + # First and last activity + first_ts = None + last_ts = None + + for line in lines: + ts = parse_timestamp(line) + if ts: + all_timestamps.append(ts) + if first_ts is None: + first_ts = ts + last_ts = ts + + if TIMER_TRIGGER in line: + timer1_count += 1 + if ts: + timer1_timestamps.append(ts) + if TIMER_2_TRIGGER in line: + timer2_count += 1 + if ts: + timer2_timestamps.append(ts) + if PULL_CHALLENGE in line: + pull_count += 1 + if GOT_CHALLENGE in line: + got_challenge_count += 1 + if SEND_TO_SHARE in line: + send_share_count += 1 + if STORE_RESPONSE in line: + store_response_count += 1 + if SUBMIT_TO_GS in line: + submit_gs_count += 1 + if TIMER_START in line: + timer_start_count += 1 + if CYCLES_ADD in line: + cycles_add_count += 1 + if UNOFFICIAL_TOPUP in line: + unofficial_topup_count += 1 + + for pattern in ERROR_PATTERNS: + if pattern in line and "storeAndSubmitResponse" not in line: + error_lines.append(line.strip()[-200:]) + break + + # Calculate timer intervals + timer1_intervals = calculate_timer_intervals(timer1_timestamps) + timer2_intervals = calculate_timer_intervals(timer2_timestamps) + + avg_timer1_interval_h = None + min_timer1_interval_s = None + if timer1_intervals: + avg_timer1_interval_h = round(sum(timer1_intervals) / len(timer1_intervals) / 3600, 2) + min_timer1_interval_s = round(min(timer1_intervals), 1) + + avg_timer2_interval_s = None + if timer2_intervals: + avg_timer2_interval_s = round(sum(timer2_intervals) / len(timer2_intervals), 1) + + # Log span + log_span_days = None + if first_ts and last_ts: + log_span_days = round((last_ts - first_ts).total_seconds() / 86400, 1) + + # Determine freeze cause + freeze_cause = "UNKNOWN" + detail = "" + + if timer2_count > 0 and avg_timer2_interval_s and avg_timer2_interval_s < 30: + freeze_cause = "RUNAWAY_TIMER_2" + detail = f"Timer 2 fired {timer2_count} times at ~{avg_timer2_interval_s}s interval (should not run for ShareAgent)" + elif min_timer1_interval_s is not None and min_timer1_interval_s < 60: + freeze_cause = "RUNAWAY_TIMER_1" + detail = f"Timer 1 min interval was {min_timer1_interval_s}s (expected hours). 
Possible 5-second fallback or orphaned timers" + elif timer1_count > 0 and got_challenge_count > 0 and store_response_count > 0: + freeze_cause = "NORMAL_OPERATION" + detail = f"Processed {got_challenge_count} challenges, {store_response_count} submissions. Memory growth from stored responses likely caused freeze" + elif timer1_count > 0 and pull_count > 0 and got_challenge_count == 0: + freeze_cause = "IDLE_WITH_TIMERS" + detail = f"Timers firing ({timer1_count}x) and pulling challenges ({pull_count}x) but no challenges received. Idle cycle burn from timers + memory" + elif timer1_count > 0 and pull_count == 0: + freeze_cause = "TIMER_ONLY_NO_WORK" + detail = f"Timers firing ({timer1_count}x) but no challenge processing visible in logs. Possible log overflow or silent failure" + elif timer1_count == 0 and total_lines > 0: + freeze_cause = "NO_TIMERS_IN_LOG" + detail = f"No timer triggers found in {total_lines} log lines. Logs may predate timer start or timers never started" + elif total_lines == 0: + freeze_cause = "NO_LOGS" + detail = "No log entries found" + + if error_lines: + detail += f". {len(error_lines)} error(s) found" + + return { + "canister_id": canister_id, + "freeze_cause": freeze_cause, + "detail": detail, + "total_lines": total_lines, + "log_span_days": log_span_days, + "first_activity": first_ts.isoformat() if first_ts else "", + "last_activity": last_ts.isoformat() if last_ts else "", + "timer1_count": timer1_count, + "timer2_count": timer2_count, + "avg_timer1_interval_h": avg_timer1_interval_h, + "min_timer1_interval_s": min_timer1_interval_s, + "avg_timer2_interval_s": avg_timer2_interval_s, + "pull_count": pull_count, + "got_challenge_count": got_challenge_count, + "send_share_count": send_share_count, + "store_response_count": store_response_count, + "submit_gs_count": submit_gs_count, + "timer_start_count": timer_start_count, + "cycles_add_count": cycles_add_count, + "unofficial_topup_count": unofficial_topup_count, + "error_count": len(error_lines), + "error_samples": error_lines[:5], + } + + +def load_status_data(network): + """Load the latest check_mAIner_status JSON for the given network.""" + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + pattern = os.path.join(logs_dir, f"*-check_mAIner_status-{network}.json") + files = sorted(glob.glob(pattern)) + if not files: + return None + with open(files[-1]) as f: + return json.load(f) + + +def write_markdown_report(analysis_results, network, md_path): + """Write analysis report as markdown.""" + lines = [] + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + lines.append(f"# Frozen mAIner Log Analysis — {network} — {timestamp}") + lines.append("") + + # Group by freeze cause + by_cause = defaultdict(list) + for r in analysis_results: + by_cause[r["freeze_cause"]].append(r) + + # Summary + lines.append("## Summary") + lines.append("") + lines.append("| Freeze Cause | Count |") + lines.append("|------------------------|-------|") + cause_order = [ + "RUNAWAY_TIMER_2", "RUNAWAY_TIMER_1", "NORMAL_OPERATION", + "IDLE_WITH_TIMERS", "TIMER_ONLY_NO_WORK", "NO_TIMERS_IN_LOG", + "NO_LOGS", "UNREACHABLE", "UNKNOWN", + ] + for cause in cause_order: + count = len(by_cause.get(cause, [])) + if count > 0: + lines.append(f"| {cause:<22} | {count:>5} |") + lines.append(f"| **Total** | **{len(analysis_results):>3}** |") + lines.append("") + + # Legend + lines.append("### Freeze Cause Legend") + lines.append("") + lines.append("- **RUNAWAY_TIMER_2**: Timer 2 firing at ~5s interval (should not run for ShareAgent) — 
rapid cycle drain") + lines.append("- **RUNAWAY_TIMER_1**: Timer 1 firing at sub-minute intervals instead of hours — possible 5s fallback") + lines.append("- **NORMAL_OPERATION**: Was processing challenges normally — froze from memory growth + idle cost") + lines.append("- **IDLE_WITH_TIMERS**: Timers firing, pulling challenges, but no challenges received — idle cycle burn") + lines.append("- **TIMER_ONLY_NO_WORK**: Timers firing but no challenge steps visible (log buffer overflow)") + lines.append("- **NO_TIMERS_IN_LOG**: No timer triggers in log — timers may have never started") + lines.append("- **NO_LOGS**: No log entries at all") + lines.append("- **UNREACHABLE**: Could not fetch logs") + lines.append("") + + # Detail table + cid_width = 27 + cause_width = 22 + lines.append("## All Frozen mAIners") + lines.append("") + lines.append( + f"| {'#':<4} " + f"| {'Canister ID':<{cid_width}} " + f"| {'Freeze Cause':<{cause_width}} " + f"| {'T1':>4} " + f"| {'T2':>4} " + f"| {'T1 Avg':>7} " + f"| {'T1 Min':>7} " + f"| {'Pulls':>5} " + f"| {'Got':>4} " + f"| {'Submit':>6} " + f"| {'Errs':>4} " + f"| {'Span':>6} " + f"| {'Last Activity':<25} |" + ) + lines.append( + f"|{'-' * 6}" + f"|{'-' * (cid_width + 2)}" + f"|{'-' * (cause_width + 2)}" + f"|{'-' * 6}" + f"|{'-' * 6}" + f"|{'-' * 9}" + f"|{'-' * 9}" + f"|{'-' * 7}" + f"|{'-' * 6}" + f"|{'-' * 8}" + f"|{'-' * 6}" + f"|{'-' * 8}" + f"|{'-' * 27}|" + ) + + # Sort by cause (runaway first) + cause_sort = {c: i for i, c in enumerate(cause_order)} + sorted_results = sorted(analysis_results, key=lambda r: cause_sort.get(r["freeze_cause"], 99)) + + for idx, r in enumerate(sorted_results, 1): + t1_avg = f"{r['avg_timer1_interval_h']}h" if r["avg_timer1_interval_h"] else "" + t1_min = f"{r['min_timer1_interval_s']}s" if r["min_timer1_interval_s"] else "" + span = f"{r['log_span_days']}d" if r["log_span_days"] else "" + last = r.get("last_activity", "")[:25] + lines.append( + f"| {idx:<4} " + f"| {r['canister_id']:<{cid_width}} " + f"| {r['freeze_cause']:<{cause_width}} " + f"| {r['timer1_count']:>4} " + f"| {r['timer2_count']:>4} " + f"| {t1_avg:>7} " + f"| {t1_min:>7} " + f"| {r['pull_count']:>5} " + f"| {r['got_challenge_count']:>4} " + f"| {r['submit_gs_count']:>6} " + f"| {r['error_count']:>4} " + f"| {span:>6} " + f"| {last:<25} |" + ) + lines.append("") + + # Detailed sections per cause + for cause in cause_order: + group = by_cause.get(cause, []) + if not group: + continue + lines.append(f"## {cause} ({len(group)} mAIners)") + lines.append("") + for r in group: + lines.append(f"### `{r['canister_id']}`") + lines.append("") + lines.append(f"- **Detail**: {r['detail']}") + lines.append(f"- **Log span**: {r['log_span_days']}d, {r['total_lines']} lines") + lines.append(f"- **Last activity**: {r.get('last_activity', 'N/A')}") + lines.append(f"- **Timer 1**: {r['timer1_count']}x, avg {r['avg_timer1_interval_h']}h, min {r['min_timer1_interval_s']}s") + if r["timer2_count"] > 0: + lines.append(f"- **Timer 2**: {r['timer2_count']}x, avg {r['avg_timer2_interval_s']}s") + lines.append(f"- **Flow**: pulls={r['pull_count']}, got={r['got_challenge_count']}, share={r['send_share_count']}, submit={r['submit_gs_count']}") + if r["error_samples"]: + lines.append(f"- **Errors**:") + for err in r["error_samples"]: + lines.append(f" - `{err}`") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10): + # Step 1: Load status data + status_data = load_status_data(network) + if not status_data: + 
print(f"ERROR: No check_mAIner_status JSON found for network '{network}'.") + print(f"Run check_mAIner_status.sh --network {network} first.") + return + + print(f"Loaded status data from {status_data['timestamp']} ({status_data['summary']['total_share_agents']} total mAIners)") + + # Step 2: Extract frozen mAIners + frozen_mainers = [m["canister_id"] for m in status_data["mainers"] if m["status"] == "Frozen"] + + if not frozen_mainers: + print("No frozen mAIners found in status data.") + return + + print(f"Found {len(frozen_mainers)} frozen mAIners to analyze.") + + # Step 3: Fetch logs in parallel + print(f"Fetching logs with {workers} parallel workers...") + logs_map = {} + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_id = { + executor.submit(get_logs, cid, network): cid + for cid in frozen_mainers + } + done = 0 + for future in as_completed(future_to_id): + done += 1 + cid = future_to_id[future] + try: + logs_map[cid] = future.result() + except Exception: + logs_map[cid] = None + if done % 25 == 0: + print(f" Fetched {done}/{len(frozen_mainers)} logs...") + + print(f" Fetched all {len(frozen_mainers)} logs.") + + # Step 4: Analyze each canister + print("Analyzing logs...") + analysis_results = [] + for cid in frozen_mainers: + result = analyze_canister_logs(cid, logs_map.get(cid)) + analysis_results.append(result) + + # Step 5: Print summary + by_cause = defaultdict(int) + for r in analysis_results: + by_cause[r["freeze_cause"]] += 1 + + print(f"\n{'='*60}") + print("FROZEN MAINER LOG ANALYSIS SUMMARY") + print(f"{'='*60}") + for cause in ["RUNAWAY_TIMER_2", "RUNAWAY_TIMER_1", "NORMAL_OPERATION", + "IDLE_WITH_TIMERS", "TIMER_ONLY_NO_WORK", "NO_TIMERS_IN_LOG", + "NO_LOGS", "UNREACHABLE", "UNKNOWN"]: + count = by_cause.get(cause, 0) + if count > 0: + print(f" {cause:<25}: {count}") + print(f" {'Total':<25}: {len(analysis_results)}") + print() + + # Step 6: Write output files + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + json_path = os.path.join(logs_dir, f"{date_prefix}-analyze_frozen_mainer_logs-{network}.json") + with open(json_path, 'w') as f: + json.dump({ + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "network": network, + "total_frozen": len(frozen_mainers), + "summary": dict(by_cause), + "results": analysis_results, + }, f, indent=2) + print(f"JSON report: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-analyze_frozen_mainer_logs-{network}.md") + write_markdown_report(analysis_results, network, md_path) + print(f"Markdown report: {os.path.abspath(md_path)}") + print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Fetch and analyze logs for all frozen mAIners to determine freeze cause." 
+ ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers for log fetching (default: 10)", + ) + args = parser.parse_args() + main(args.network, args.workers) diff --git a/scripts/analyze_frozen_mainer_logs.sh b/scripts/analyze_frozen_mainer_logs.sh new file mode 100755 index 00000000..dd21771e --- /dev/null +++ b/scripts/analyze_frozen_mainer_logs.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" + +# Parse command line arguments +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." + exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.analyze_frozen_mainer_logs --network $NETWORK_TYPE $WORKERS diff --git a/scripts/check_mAIner_status.py b/scripts/check_mAIner_status.py new file mode 100644 index 00000000..a44bca70 --- /dev/null +++ b/scripts/check_mAIner_status.py @@ -0,0 +1,719 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import subprocess +import time +import threading +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Optional + +from .get_mainers import get_mainers, get_mainer_is_active, get_mainer_setting +from .get_mainers_health import ( + is_transient_error, + run_command_with_retry, + log_message, +) + +# Get the directory of this script +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Status constants +STATUS_HEALTHY = "Healthy" +STATUS_MAINTENANCE = "Maintenance" +STATUS_STOPPED = "Stopped" +STATUS_FROZEN = "Frozen" +STATUS_UNINSTALLED = "Uninstalled" +STATUS_UNAVAILABLE = "Unavailable" + +# Error indicators +FROZEN_INDICATORS = ["IC0207", "is out of cycles", "frozen"] +STOPPED_INDICATORS = ["IC0508"] + +# Health success patterns (from get_mainers_health.py) +HEALTH_OK_PATTERNS = [ + "(variant { Ok = record { status_code = 200 : nat16 } })", + "(variant { 17_724 = record { 3_475_804_314 = 200 : nat16 } })", +] + + +def is_frozen_error(error_text: str) -> bool: + """Check if an error indicates the canister is frozen.""" + if not error_text: + return False + return any(indicator in error_text for indicator in FROZEN_INDICATORS) + + +def is_stopped_error(error_text: str) -> bool: + """Check if an error indicates the canister is stopped.""" + if not error_text: + return False + return any(indicator in error_text for indicator in STOPPED_INDICATORS) + + +def collect_error_text(e) -> str: + """Collect meaningful error text from a subprocess exception. + + Prefers stderr/stdout over the generic str(e) which contains the raw command. 
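+
+    Illustrative example (hypothetical values): a CalledProcessError whose
+    stderr reads "Canister xyz is out of cycles" yields that text back,
+    stripped, with any embedded newlines replaced by spaces.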
+ """ + parts = [] + if hasattr(e, 'stderr') and e.stderr: + parts.append(e.stderr.strip()) + if hasattr(e, 'stdout') and e.stdout: + parts.append(e.stdout.strip()) + if parts: + return " ".join(parts).replace("\n", " ") + return str(e).replace("\n", " ") + + +def get_module_hash(network: str, canister_id: str) -> tuple[Optional[str], Optional[str]]: + """Get the module hash of a canister via dfx canister info. + + Retries on transient errors (timeouts, connection issues). + + Returns: + (module_hash, error_message) + - ("0xabc...", None) if hash found + - ("None", None) if canister is uninstalled + - (None, "error text") if call failed + """ + cmd = ["dfx", "canister", "--network", network, "info", canister_id] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + for line in result.stdout.split('\n'): + if 'Module hash:' in line: + hash_value = line.split(':', 1)[1].strip() + return (hash_value, None) + return (None, "Module hash line not found in output") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + + # Don't retry deterministic errors (frozen, out of cycles) + if is_frozen_error(error_text): + return (None, error_text) + + # Retry on transient errors + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + + return (None, error_text) + + return (None, "Max retries exceeded") + + +def check_health(network: str, canister_id: str) -> tuple[str, Optional[str]]: + """Call the health endpoint on a canister with retries for transient errors. + + Returns: + (status, error_message) + """ + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "health"] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + output = result.stdout.strip() + if any(pattern in output for pattern in HEALTH_OK_PATTERNS): + return (STATUS_HEALTHY, None) + else: + return (STATUS_MAINTENANCE, f"health returned: {output[:150]}") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + + if is_frozen_error(error_text): + return (STATUS_FROZEN, error_text) + + if is_stopped_error(error_text): + return (STATUS_STOPPED, error_text) + + # Retry on transient errors + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + + return (STATUS_UNAVAILABLE, error_text) + + return (STATUS_UNAVAILABLE, "Max retries exceeded") + + +def calculate_freeze_prediction(memory_bytes, cycles_balance): + """Calculate freeze prediction based on memory idle cost. 
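+
+    Worked example (at the 317,500 cycles/GiB/s rate used below, numbers
+    illustrative): a 416 MB canister is ~0.39 GiB, so daily_drain is about
+    10.6B cycles/day and freeze_threshold = 30 * daily_drain, about 0.32T
+    cycles; with a 2T balance, days_until_freeze is roughly
+    (2T - 0.32T) / 10.6B, i.e. ~158 days.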
+ + Returns: + (daily_drain, freeze_threshold, days_until_freeze) or (None, None, None) + """ + if memory_bytes is None or cycles_balance is None or memory_bytes == 0: + return (None, None, None) + # 317,500 cycles per GiB per second on 13-node subnets + # (was 127,000, increased 2.5x by NNS Proposal 140538 / Mission70) + # Verified: dfx canister status shows idle_cycles_burned_per_day = 10.6B for 416MB, + # which matches 317,500 rate (127,000 rate would give only 4.3B) + daily_drain = int(memory_bytes * 317_500 / (1024 ** 3) * 86_400) + freeze_threshold = daily_drain * 30 # 30 days = default freezing threshold + if daily_drain > 0: + days_until_freeze = round((cycles_balance - freeze_threshold) / daily_drain, 1) + else: + days_until_freeze = None + return (daily_drain, freeze_threshold, days_until_freeze) + + +def get_canister_resources(network: str, canister_id: str) -> tuple[Optional[int], Optional[int]]: + """Get memory size and cycle balance via dfx canister status. + + Retries on transient errors. Returns (None, None) for frozen/unreachable. + + Returns: + (memory_bytes, cycles_balance) + """ + cmd = ["dfx", "canister", "--network", network, "status", canister_id] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + memory = None + balance = None + for line in result.stdout.split('\n'): + if line.strip().startswith('Memory Size:'): + memory = int(line.split(':')[1].strip().split()[0]) + elif line.strip().startswith('Balance:'): + balance = int(line.split(':')[1].strip().split()[0]) + return (memory, balance) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + if is_frozen_error(error_text): + return (None, None) + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + return (None, None) + + return (None, None) + + +def check_canister_status(network: str, canister_id: str, owner: str, + index: int, total: int) -> dict: + """Check the full status of a single mAIner canister. + + Detection order: + 1. dfx canister info → check Module hash (uninstalled/frozen) + 2. 
health endpoint → healthy/maintenance/stopped/frozen/unavailable + """ + result = { + "canister_id": canister_id, + "owner": owner, + "status": STATUS_UNAVAILABLE, + "module_hash": "", + "error_message": "", + "active": None, + "burn_rate": "", + "memory_bytes": None, + "cycles_balance": None, + "daily_drain": None, + "freeze_threshold": None, + "days_until_freeze": None, + } + + # Step 1: Check module hash via dfx canister info + module_hash, info_error = get_module_hash(network, canister_id) + + if info_error: + # info call failed — check if frozen + if is_frozen_error(info_error): + result["status"] = STATUS_FROZEN + result["error_message"] = info_error + log_message(f"{canister_id} — {STATUS_FROZEN}", "ERROR", index, total) + return result + # Unknown error on info call — mark unavailable + result["status"] = STATUS_UNAVAILABLE + result["error_message"] = info_error + log_message(f"{canister_id} — {STATUS_UNAVAILABLE}", "ERROR", index, total) + return result + + result["module_hash"] = module_hash + + if module_hash == "None": + result["status"] = STATUS_UNINSTALLED + log_message(f"{canister_id} — {STATUS_UNINSTALLED}", "ERROR", index, total) + return result + + # Step 2: Check health endpoint + status, health_error = check_health(network, canister_id) + result["status"] = status + if health_error: + result["error_message"] = health_error + + if status != STATUS_HEALTHY: + log_message(f"{canister_id} — {status}", "ERROR", index, total) + return result + + # Step 3: For Healthy canisters, check active/paused and burn rate + is_active = get_mainer_is_active(network, canister_id) + result["active"] = is_active + + if is_active: + result["burn_rate"] = get_mainer_setting(network, canister_id) + + # Step 4: Get memory and cycle balance for Healthy canisters + memory, balance = get_canister_resources(network, canister_id) + result["memory_bytes"] = memory + result["cycles_balance"] = balance + + # Step 5: Calculate freeze prediction + daily_drain, freeze_thresh, days_left = calculate_freeze_prediction(memory, balance) + result["daily_drain"] = daily_drain + result["freeze_threshold"] = freeze_thresh + result["days_until_freeze"] = days_left + + mem_mb = f"{memory / 1_000_000:.1f}MB" if memory else "?" + bal_t = f"{balance / 1_000_000_000_000:.2f}T" if balance else "?" + days_str = f"{days_left}d" if days_left is not None else "?" 
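+    # Display-only note: mem_mb and bal_t use decimal units (MB = 1e6 bytes,
+    # T = 1e12 cycles), while the freeze math above uses binary GiB (1024**3).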
+ + if is_active: + log_message(f"{canister_id} — {STATUS_HEALTHY} (Active, {result['burn_rate']}, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) + elif is_active is False: + log_message(f"{canister_id} — {STATUS_HEALTHY} (Paused, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) + else: + log_message(f"{canister_id} — {STATUS_HEALTHY} (unknown, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) + + return result + + +def filter_share_agents(mainers: list) -> list: + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + share_agents = [] + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else '' + + if canister_type == "ShareAgent" and address != "": + share_agents.append({ + "address": address, + "owner": mainer.get('ownedBy', 'Unknown'), + }) + return share_agents + + +def write_markdown_report(results: dict, md_path: str) -> None: + """Write a human-readable markdown report with aligned tables.""" + summary = results["summary"] + mainers = results["mainers"] + timestamp = results["timestamp"] + network = results["network"] + + lines = [] + lines.append(f"# mAIner Status Report — {network} — {timestamp}") + lines.append("") + lines.append("## Summary") + lines.append("") + lines.append("| Status | Count |") + lines.append("|--------------------|-------|") + for status in [STATUS_HEALTHY, STATUS_MAINTENANCE, STATUS_STOPPED, + STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_UNAVAILABLE]: + key = status.lower() + count = summary.get(key, 0) + lines.append(f"| {status:<18} | {count:>5} |") + lines.append(f"| **Total** | **{summary['total_share_agents']:>3}** |") + lines.append("") + if summary.get("healthy", 0) > 0: + lines.append("### Healthy Breakdown") + lines.append("") + lines.append("| Detail | Count |") + lines.append("|--------------------|-------|") + lines.append(f"| Active | {summary.get('active', 0):>5} |") + lines.append(f"| Paused | {summary.get('paused', 0):>5} |") + lines.append(f"| Burn Rate: Low | {summary.get('burn_rate_low', 0):>5} |") + lines.append(f"| Burn Rate: Medium | {summary.get('burn_rate_medium', 0):>5} |") + lines.append(f"| Burn Rate: High | {summary.get('burn_rate_high', 0):>5} |") + lines.append(f"| Burn Rate: VeryHigh| {summary.get('burn_rate_very_high', 0):>5} |") + if summary.get("burn_rate_other", 0) > 0: + lines.append(f"| Burn Rate: Other | {summary.get('burn_rate_other', 0):>5} |") + lines.append("") + + # Memory and cycles stats for healthy canisters + healthy_with_mem = [m for m in mainers if m["status"] == STATUS_HEALTHY and m.get("memory_bytes") is not None] + if healthy_with_mem: + mem_values = [m["memory_bytes"] for m in healthy_with_mem] + avg_mem = sum(mem_values) / len(mem_values) / 1_000_000 + max_mem = max(mem_values) / 1_000_000 + min_mem = min(mem_values) / 1_000_000 + lines.append("### Memory & Cycles (Healthy mAIners)") + lines.append("") + lines.append(f"| Metric | Value |") + lines.append(f"|---------------------|-------------|") + lines.append(f"| Avg Memory | {avg_mem:>8.1f} MB |") + lines.append(f"| Min Memory | {min_mem:>8.1f} MB |") + lines.append(f"| Max Memory | {max_mem:>8.1f} MB |") + healthy_with_bal = [m for m in healthy_with_mem if m.get("cycles_balance") is not None] + if healthy_with_bal: + bal_values = [m["cycles_balance"] for m in healthy_with_bal] + avg_bal = sum(bal_values) / len(bal_values) / 1_000_000_000_000 + 
min_bal = min(bal_values) / 1_000_000_000_000 + max_bal = max(bal_values) / 1_000_000_000_000 + lines.append(f"| Avg Cycles Balance | {avg_bal:>8.2f} T |") + lines.append(f"| Min Cycles Balance | {min_bal:>8.2f} T |") + lines.append(f"| Max Cycles Balance | {max_bal:>8.2f} T |") + lines.append("") + # At-risk section + lines.append("### Freeze Risk (Healthy mAIners)") + lines.append("") + lines.append("| Risk Window | Count |") + lines.append("|----------------------|-------|") + lines.append(f"| Freeze in < 7 days | {summary.get('at_risk_7_days', 0):>5} |") + lines.append(f"| Freeze in < 14 days | {summary.get('at_risk_14_days', 0):>5} |") + lines.append(f"| Freeze in < 30 days | {summary.get('at_risk_30_days', 0):>5} |") + lines.append("") + + # Determine the majority module hash + hash_counts = {} + for m in mainers: + h = m.get("module_hash", "") + if h and h != "None": + hash_counts[h] = hash_counts.get(h, 0) + 1 + majority_hash = max(hash_counts, key=hash_counts.get) if hash_counts else "" + + def module_hash_status(m): + h = m.get("module_hash", "") + if not h or h == "None": + return "None" + if h == majority_hash: + return "Ok" + return "Not Ok" + + # Determine column widths for the all-mainers table + cid_width = max((len(m["canister_id"]) for m in mainers), default=11) + status_width = max(len(STATUS_UNINSTALLED), 6) # widest status label + hash_col_width = 11 # "Module Hash" header width + + active_col_width = 6 + burn_col_width = 9 + mem_col_width = 9 + cyc_col_width = 10 + drain_col_width = 10 + freeze_col_width = 10 + days_col_width = 9 + hash_col_width = 11 + + def table_header(): + return [ + f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Status':<{status_width}} | {'Active':<{active_col_width}} | {'Burn Rate':<{burn_col_width}} | {'Mem (MB)':<{mem_col_width}} | {'Cycles (T)':<{cyc_col_width}} | {'Drain/D(B)':<{drain_col_width}} | {'FreezeAt(T)':<{freeze_col_width}} | {'Days Left':<{days_col_width}} | {'Hash':<{hash_col_width}} |", + f"|{'-' * 6}|{'-' * (cid_width + 2)}|{'-' * (status_width + 2)}|{'-' * (active_col_width + 2)}|{'-' * (burn_col_width + 2)}|{'-' * (mem_col_width + 2)}|{'-' * (cyc_col_width + 2)}|{'-' * (drain_col_width + 2)}|{'-' * (freeze_col_width + 2)}|{'-' * (days_col_width + 2)}|{'-' * (hash_col_width + 2)}|", + ] + + def active_display(m): + a = m.get("active") + if a is True: + return "Yes" + elif a is False: + return "No" + return "" + + def mem_display(m): + mem = m.get("memory_bytes") + if mem is not None: + return f"{mem / 1_000_000:.1f}" + return "" + + def cycles_display(m): + bal = m.get("cycles_balance") + if bal is not None: + return f"{bal / 1_000_000_000_000:.2f}" + return "" + + def drain_display(m): + d = m.get("daily_drain") + if d is not None: + return f"{d / 1_000_000_000:.1f}" + return "" + + def freeze_at_display(m): + ft = m.get("freeze_threshold") + if ft is not None: + return f"{ft / 1_000_000_000_000:.2f}" + return "" + + def days_display(m): + d = m.get("days_until_freeze") + if d is not None: + return f"{d:.1f}" + return "" + + def table_row(idx, m): + return ( + f"| {idx:<4} " + f"| {m['canister_id']:<{cid_width}} " + f"| {m['status']:<{status_width}} " + f"| {active_display(m):<{active_col_width}} " + f"| {m.get('burn_rate', ''):<{burn_col_width}} " + f"| {mem_display(m):>{mem_col_width}} " + f"| {cycles_display(m):>{cyc_col_width}} " + f"| {drain_display(m):>{drain_col_width}} " + f"| {freeze_at_display(m):>{freeze_col_width}} " + f"| {days_display(m):>{days_col_width}} " + f"| {module_hash_status(m):<{hash_col_width}} |" + 
) + + # Sort: non-healthy first (Frozen, Uninstalled, Stopped, Maintenance, Unavailable, Healthy) + status_order = { + STATUS_FROZEN: 0, STATUS_UNINSTALLED: 1, STATUS_STOPPED: 2, + STATUS_MAINTENANCE: 3, STATUS_UNAVAILABLE: 4, STATUS_HEALTHY: 5, + } + sorted_mainers = sorted(mainers, key=lambda m: status_order.get(m["status"], 99)) + + lines.append("## All mAIners") + lines.append("") + lines.append(f"Ok Module Hash: `{majority_hash}`") + lines.append("") + lines.extend(table_header()) + for idx, m in enumerate(sorted_mainers, 1): + lines.append(table_row(idx, m)) + lines.append("") + + # Filtered tables for problem statuses + for filter_status in [STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_STOPPED, + STATUS_MAINTENANCE, STATUS_UNAVAILABLE]: + filtered = [m for m in mainers if m["status"] == filter_status] + if filtered: + lines.append(f"## {filter_status} mAIners") + lines.append("") + lines.extend(table_header()) + for idx, m in enumerate(filtered, 1): + lines.append(table_row(idx, m)) + lines.append("") + + # Notes section with full error messages for non-healthy canisters + non_healthy = [m for m in sorted_mainers if m.get("error_message")] + if non_healthy: + lines.append("## Notes") + lines.append("") + for m in non_healthy: + lines.append(f"**{m['canister_id']}** ({m['status']}):") + lines.append(f"> {m['error_message']}") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10, limit=None): + log_message("=" * 80) + log_message(f"Checking status of all ShareAgent mAIners on network '{network}'") + log_message(f"Using {workers} parallel workers") + log_message("=" * 80) + + # Step 1: Fetch all mAIners from GameState + mainers = get_mainers(network) + if not mainers: + log_message(f"No mainers found on network '{network}'", "ERROR") + return + + # Step 2: Filter to ShareAgent only + share_agents = filter_share_agents(mainers) + log_message(f"Found {len(share_agents)} ShareAgent mAIners (out of {len(mainers)} total)") + + if not share_agents: + log_message("No ShareAgent mAIners to check", "ERROR") + return + + # Step 3: Apply limit if specified + if limit is not None and limit > 0: + share_agents = share_agents[:limit] + log_message(f"Limiting to first {limit} mAIners (for testing)") + + total = len(share_agents) + log_message(f"Starting status check for {total} mAIners...") + log_message("") + + # Step 4: Check all canisters in parallel + all_results = [] + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_agent = { + executor.submit( + check_canister_status, network, agent["address"], + agent["owner"], idx + 1, total + ): agent + for idx, agent in enumerate(share_agents) + } + + for future in as_completed(future_to_agent): + try: + result = future.result() + all_results.append(result) + except Exception as e: + agent = future_to_agent[future] + log_message(f"Exception checking {agent['address']}: {e}", "ERROR") + all_results.append({ + "canister_id": agent["address"], + "owner": agent["owner"], + "status": STATUS_UNAVAILABLE, + "module_hash": "", + "error_message": str(e)[:200], + "active": None, + "burn_rate": "", + "memory_bytes": None, + "cycles_balance": None, + "daily_drain": None, + "freeze_threshold": None, + "days_until_freeze": None, + }) + + # Step 5: Aggregate results + counts = { + STATUS_HEALTHY: 0, + STATUS_MAINTENANCE: 0, + STATUS_STOPPED: 0, + STATUS_FROZEN: 0, + STATUS_UNINSTALLED: 0, + STATUS_UNAVAILABLE: 0, + } + active_count = 0 + paused_count = 0 + burn_rate_counts = {"Low": 0, 
"Medium": 0, "High": 0, "VeryHigh": 0, "Other": 0} + + at_risk_7 = 0 + at_risk_14 = 0 + at_risk_30 = 0 + + for r in all_results: + counts[r["status"]] = counts.get(r["status"], 0) + 1 + if r["status"] == STATUS_HEALTHY: + if r.get("active") is True: + active_count += 1 + br = r.get("burn_rate", "") + if br in burn_rate_counts: + burn_rate_counts[br] += 1 + elif br: + burn_rate_counts["Other"] += 1 + elif r.get("active") is False: + paused_count += 1 + # At-risk tracking + d = r.get("days_until_freeze") + if d is not None: + if d < 7: + at_risk_7 += 1 + if d < 14: + at_risk_14 += 1 + if d < 30: + at_risk_30 += 1 + + # Step 6: Print summary + log_message("") + log_message("=" * 80) + log_message("STATUS CHECK SUMMARY") + log_message("=" * 80) + log_message(f"Total ShareAgent mAIners : {total}") + for status_name in [STATUS_HEALTHY, STATUS_MAINTENANCE, STATUS_STOPPED, + STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_UNAVAILABLE]: + count = counts[status_name] + level = "SUCCESS" if status_name == STATUS_HEALTHY and count > 0 else \ + "ERROR" if count > 0 and status_name != STATUS_HEALTHY else "INFO" + log_message(f" {status_name:<13}: {count}", level) + log_message("") + if counts[STATUS_HEALTHY] > 0: + log_message(f"Healthy breakdown:") + log_message(f" Active : {active_count}", "SUCCESS" if active_count > 0 else "INFO") + log_message(f" Paused : {paused_count}", "ERROR" if paused_count > 0 else "INFO") + log_message(f" Active burn rates:") + for br_name in ["Low", "Medium", "High", "VeryHigh", "Other"]: + if burn_rate_counts[br_name] > 0: + log_message(f" {br_name:<10}: {burn_rate_counts[br_name]}") + log_message("") + log_message(f"Freeze risk (healthy mAIners):") + log_message(f" Freeze in < 7 days : {at_risk_7}", "ERROR" if at_risk_7 > 0 else "INFO") + log_message(f" Freeze in < 14 days: {at_risk_14}", "ERROR" if at_risk_14 > 0 else "INFO") + log_message(f" Freeze in < 30 days: {at_risk_30}", "ERROR" if at_risk_30 > 0 else "INFO") + log_message("") + + # Print non-healthy canisters + non_healthy = [r for r in all_results if r["status"] != STATUS_HEALTHY] + if non_healthy: + log_message("NON-HEALTHY MAINERS:", "ERROR") + for r in non_healthy: + error_info = f" — {r['error_message'][:80]}" if r.get("error_message") else "" + log_message(f" {r['canister_id']} [{r['status']}]{error_info}", "ERROR") + log_message("") + + if not non_healthy: + log_message("All mAIners are healthy!", "SUCCESS") + log_message("") + + # Step 7: Write output files + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + output_data = { + "timestamp": timestamp, + "network": network, + "summary": { + "total_share_agents": total, + "healthy": counts[STATUS_HEALTHY], + "active": active_count, + "paused": paused_count, + "burn_rate_low": burn_rate_counts["Low"], + "burn_rate_medium": burn_rate_counts["Medium"], + "burn_rate_high": burn_rate_counts["High"], + "burn_rate_very_high": burn_rate_counts["VeryHigh"], + "burn_rate_other": burn_rate_counts["Other"], + "maintenance": counts[STATUS_MAINTENANCE], + "stopped": counts[STATUS_STOPPED], + "frozen": counts[STATUS_FROZEN], + "uninstalled": counts[STATUS_UNINSTALLED], + "unavailable": counts[STATUS_UNAVAILABLE], + "at_risk_7_days": at_risk_7, + "at_risk_14_days": at_risk_14, + "at_risk_30_days": at_risk_30, + }, + "mainers": sorted(all_results, key=lambda r: r["canister_id"]), + } + + json_path = os.path.join(logs_dir, 
f"{date_prefix}-check_mAIner_status-{network}.json") + with open(json_path, 'w') as f: + json.dump(output_data, f, indent=2) + log_message(f"JSON report saved to: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-check_mAIner_status-{network}.md") + write_markdown_report(output_data, md_path) + log_message(f"Markdown report saved to: {os.path.abspath(md_path)}") + log_message("") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check status of all ShareAgent mAIners (Healthy/Maintenance/Stopped/Frozen/Uninstalled/Unavailable)." + ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers (default: 10)", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit number of mainers to check (for testing)", + ) + args = parser.parse_args() + main(args.network, args.workers, args.limit) diff --git a/scripts/check_mAIner_status.sh b/scripts/check_mAIner_status.sh new file mode 100755 index 00000000..adee5eaf --- /dev/null +++ b/scripts/check_mAIner_status.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" +LIMIT="" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." + exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + --limit) + shift + LIMIT="--limit $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N] [--limit N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.check_mAIner_status --network $NETWORK_TYPE $WORKERS $LIMIT diff --git a/scripts/monitor_logs_active_mainers.py b/scripts/monitor_logs_active_mainers.py new file mode 100644 index 00000000..ba37b530 --- /dev/null +++ b/scripts/monitor_logs_active_mainers.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 + +import subprocess +import time +import argparse +import os +import json +from collections import defaultdict +from datetime import datetime + +from .get_mainers import get_mainers, get_mainer_is_active, get_mainer_setting +from .monitor_common import ensure_log_dir + +# Get the directory of this script +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +BURN_RATE_GROUPS = ["Low", "Medium", "High", "VeryHigh"] + +# Color codes for burn rate groups +BURN_RATE_COLORS = { + "Low": "\033[38;5;82m", # green + "Medium": "\033[38;5;226m", # yellow + "High": "\033[38;5;208m", # orange + "VeryHigh": "\033[38;5;196m", # red +} +RESET_COLOR = "\033[0m" + + +def get_logs(canister_id, network): + """Fetch logs using dfx for a given canister.""" + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True + ) + return output.strip().splitlines() + except subprocess.CalledProcessError: + return [] + + +def filter_share_agents(mainers): + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + 
+    share_agents = []
+    for mainer in mainers:
+        address = mainer.get('address', '')
+        canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {})
+        canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else ''
+        if canister_type == "ShareAgent" and address != "":
+            share_agents.append(address)
+    return share_agents
+
+
+def discover_active_mainers(network, limit=None):
+    """Fetch all ShareAgent mAIners and return only active ones grouped by burn rate."""
+    print(f"Fetching all mAIners from GameState on network '{network}'...")
+    mainers = get_mainers(network)
+    if not mainers:
+        print("ERROR: No mainers found.")
+        return {}
+
+    share_agents = filter_share_agents(mainers)
+    print(f"Found {len(share_agents)} ShareAgent mAIners.")
+
+    if limit is not None and limit > 0:
+        share_agents = share_agents[:limit]
+        print(f"Limiting to first {limit} mAIners (for testing).")
+
+    print(f"Checking active status and burn rate for {len(share_agents)} mAIners...")
+    grouped = {br: [] for br in BURN_RATE_GROUPS}
+    active_count = 0
+    paused_count = 0
+    skipped_count = 0
+
+    for idx, address in enumerate(share_agents, 1):
+        is_active = get_mainer_is_active(network, address)
+        if is_active is not True:
+            if is_active is False:
+                paused_count += 1
+            else:
+                skipped_count += 1
+            continue
+
+        active_count += 1
+        setting = get_mainer_setting(network, address)
+        if setting in grouped:
+            grouped[setting].append(address)
+        else:
+            # Custom/Unknown/Unable to query — skip
+            skipped_count += 1
+
+        if idx % 50 == 0:
+            print(f"  Checked {idx}/{len(share_agents)} mAIners...")
+
+    print("\nDiscovery complete:")
+    print(f"  Active: {active_count}, Paused: {paused_count}, Skipped: {skipped_count}")
+    for br in BURN_RATE_GROUPS:
+        print(f"  {br}: {len(grouped[br])} active mAIners")
+
+    return grouped
+
+
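+# main() consumes the grouping returned above; shape (addresses hypothetical):
+#   {"Low": ["aaaaa-aa"], "Medium": [], "High": ["bbbbb-bb"], "VeryHigh": []}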
+def main(network, delay=3, limit=None):
+    # Step 1: Discover active mAIners grouped by burn rate
+    grouped = discover_active_mainers(network, limit)
+
+    total_active = sum(len(ids) for ids in grouped.values())
+    if total_active == 0:
+        print("\nNo active mAIners found. Nothing to monitor.")
+        return
+
+    # Step 2: Set up log directories and files
+    log_dir = os.path.join(SCRIPT_DIR, f"logs-active-mainers-{network}")
+    ensure_log_dir(log_dir)
+
+    log_files = {}
+    for br in BURN_RATE_GROUPS:
+        if grouped[br]:
+            log_path = os.path.join(log_dir, f"burn_rate_{br.lower()}.log")
+            # Clear at start
+            with open(log_path, "w"):
+                pass
+            log_files[br] = log_path
+
+    combined_log_path = os.path.join(log_dir, "combined_active.log")
+    with open(combined_log_path, "w"):
+        pass
+
+    # Write a manifest of which canisters are in each group
+    manifest_path = os.path.join(log_dir, "manifest.json")
+    manifest = {
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "network": network,
+        "total_active": total_active,
+        "groups": {br: grouped[br] for br in BURN_RATE_GROUPS},
+    }
+    with open(manifest_path, 'w') as f:
+        json.dump(manifest, f, indent=2)
+
+    print(f"\nMonitoring {total_active} active mAIners on '{network}'.")
+    print(f"Log directory: {os.path.abspath(log_dir)}")
+    print("  Combined log: combined_active.log")
+    for br in BURN_RATE_GROUPS:
+        if grouped[br]:
+            print(f"  {br} ({len(grouped[br])} mAIners): burn_rate_{br.lower()}.log")
+    print(f"\nChecking every {delay} seconds. Press Ctrl+C to stop.\n")
+
+    # Step 3: Monitor loop
+    previous_logs = defaultdict(set)
+    first = True
+
+    while True:
+        for br in BURN_RATE_GROUPS:
+            for canister_id in grouped[br]:
+                log_lines = get_logs(canister_id, network)
+                new_lines = []
+                # Note: dedup is by exact line; dfx log lines carry an
+                # index/timestamp prefix, so repeats remain distinct.
+                for line in log_lines:
+                    if line not in previous_logs[canister_id]:
+                        previous_logs[canister_id].add(line)
+                        new_lines.append(line)
+
+                if new_lines:
+                    color = BURN_RATE_COLORS.get(br, "")
+                    br_log_path = log_files.get(br)
+
+                    with open(combined_log_path, "a") as f_combined:
+                        br_file = open(br_log_path, "a") if br_log_path else None
+                        try:
+                            for line in new_lines:
+                                tagged_line = f"[{br}][{canister_id}] {line}"
+                                f_combined.write(tagged_line + "\n")
+                                if br_file:
+                                    br_file.write(f"[{canister_id}] {line}\n")
+                                print(f"{color}[{br}]{RESET_COLOR}({canister_id}) {line}")
+                        finally:
+                            if br_file:
+                                br_file.close()
+
+        if first:
+            first = False
+            print(f"\nInitial log retrieval completed for {total_active} active mAIners.")
+            print(f"Will report changes in logs. Checking every {delay} seconds...")
+
+        time.sleep(delay)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Monitor logs for active ShareAgent mAIners, grouped by burn rate."
+    )
+    parser.add_argument(
+        "--network",
+        choices=["local", "ic", "testing", "demo", "development", "prd"],
+        default="local",
+        help="Specify the network to use (default: local)",
+    )
+    parser.add_argument(
+        "--delay",
+        type=int,
+        default=3,
+        help="Seconds between log checks (default: 3)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limit number of mainers to check (for testing)",
+    )
+    args = parser.parse_args()
+    main(args.network, args.delay, args.limit)
diff --git a/scripts/monitor_logs_active_mainers.sh b/scripts/monitor_logs_active_mainers.sh
new file mode 100755
index 00000000..bffb8824
--- /dev/null
+++ b/scripts/monitor_logs_active_mainers.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Default network type is local
+NETWORK_TYPE="local"
+DELAY=""
+LIMIT=""
+
+# Parse command line arguments
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --network)
+      shift
+      if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then
+        NETWORK_TYPE=$1
+      else
+        echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'."
+        exit 1
+      fi
+      shift
+      ;;
+    --delay)
+      shift
+      DELAY="--delay $1"
+      shift
+      ;;
+    --limit)
+      shift
+      LIMIT="--limit $1"
+      shift
+      ;;
+    *)
+      echo "Unknown argument: $1"
+      echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--delay N] [--limit N]"
+      exit 1
+      ;;
+  esac
+done
+
+echo "Using network type: $NETWORK_TYPE"
+
+python -m scripts.monitor_logs_active_mainers --network $NETWORK_TYPE $DELAY $LIMIT
diff --git a/scripts/start_timers.py b/scripts/start_timers.py
index c1b499cc..ef38c183 100644
--- a/scripts/start_timers.py
+++ b/scripts/start_timers.py
@@ -5,16 +5,25 @@
 import argparse
 import os
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from dotenv import dotenv_values
 
 from .monitor_common import get_canisters, ensure_log_dir
+from .get_mainers import get_mainers
+from .check_mAIner_status import (
+    is_frozen_error,
+    is_transient_error,
+    collect_error_text,
+    HEALTH_OK_PATTERNS,
+)
 
 # Get the directory of this script
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 
+
 def start_timer(canister_id, network):
     """Start timer using dfx for a given canister."""
-    try:
+    try:
         print(f"Starting timer for canister {canister_id} on network {network}...")
         subprocess.run(
             ["dfx", "canister", "--network", network, "call", canister_id, "startTimerExecutionAdmin"],
@@ -24,15 +33,190 @@ def start_timer(canister_id, network):
 
     except subprocess.CalledProcessError:
         print(f"ERROR: Unable to start timer for canister {canister_id} on network {network}")
 
-def main(network, canister_types):
-    (CANISTERS, CANISTER_COLORS, RESET_COLOR) = get_canisters(network, canister_types)
-    for name, canister_id in CANISTERS.items():
-        if ("GAMESTATE" in name or "CREATOR" in name or "LLM" in name):
-            continue
-        print("-------------------------------")
-        print(f"Canister {name} ({canister_id})")
-        start_timer(canister_id, network)
+def check_health_and_start_timer(canister_id, network, index, total):
+    """Check if a mAIner is responsive, then start its timer.
+
+    Returns dict with canister_id, status ('started', 'frozen', 'unhealthy', 'error').
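+
+    Example return value (canister id hypothetical):
+        {"canister_id": "aaaaa-aa", "status": "started", "logs": ["..."]}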
+ """ + # Step 1: Check health + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "health"] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + output = result.stdout.strip() + if any(pattern in output for pattern in HEALTH_OK_PATTERNS): + break # Healthy — proceed to start timer + else: + print(f" ({index}/{total}) {canister_id} — unhealthy, skipping") + return {"canister_id": canister_id, "status": "unhealthy"} + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + if is_frozen_error(error_text): + print(f" ({index}/{total}) {canister_id} — FROZEN, skipping") + return {"canister_id": canister_id, "status": "frozen"} + if attempt < max_retries and is_transient_error(error_text): + time.sleep(retry_delay * (2 ** (attempt - 1))) + continue + print(f" ({index}/{total}) {canister_id} — error: {error_text[:100]}, skipping") + return {"canister_id": canister_id, "status": "error"} + else: + print(f" ({index}/{total}) {canister_id} — max retries exceeded, skipping") + return {"canister_id": canister_id, "status": "error"} + + # Step 2: Start timer + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "startTimerExecutionAdmin"] + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + print(f" ({index}/{total}) {canister_id} — timer started") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + print(f" ({index}/{total}) {canister_id} — failed to start timer: {error_text[:100]}") + return {"canister_id": canister_id, "status": "error", "logs": []} + + # Step 3: Capture logs after starting timer + cmd = ["dfx", "canister", "logs", canister_id, "--network", network] + try: + output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=30) + log_lines = output.strip().splitlines()[-20:] # last 20 lines + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + log_lines = ["(could not fetch logs)"] + + return {"canister_id": canister_id, "status": "started", "logs": log_lines} + + +def filter_share_agents(mainers): + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + share_agents = [] + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else '' + if canister_type == "ShareAgent" and address != "": + share_agents.append(address) + return share_agents + + +def main(network, canister_types, workers=10, limit=None, dry_run=False): + if dry_run: + print("*** DRY RUN — no timers will be started ***\n") + + # Protocol canisters — use existing .env-based flow + if canister_types == "protocol": + (CANISTERS, CANISTER_COLORS, RESET_COLOR) = get_canisters(network, canister_types) + for name, canister_id in CANISTERS.items(): + if ("GAMESTATE" in name or "CREATOR" in name or "LLM" in name): + continue + print("-------------------------------") + print(f"Canister {name} ({canister_id})") + if dry_run: + print(f" DRY RUN: would start timer for {canister_id}") + else: + start_timer(canister_id, network) + return + + # Mainers — fetch from GameState and check health before starting + print(f"Fetching mAIners from GameState on network '{network}'...") + mainers = 
+
+
+def main(network, canister_types, workers=10, limit=None, dry_run=False):
+    if dry_run:
+        print("*** DRY RUN — no timers will be started ***\n")
+
+    # Protocol canisters — use existing .env-based flow
+    if canister_types == "protocol":
+        (CANISTERS, CANISTER_COLORS, RESET_COLOR) = get_canisters(network, canister_types)
+        for name, canister_id in CANISTERS.items():
+            if ("GAMESTATE" in name or "CREATOR" in name or "LLM" in name):
+                continue
+            print("-------------------------------")
+            print(f"Canister {name} ({canister_id})")
+            if dry_run:
+                print(f"  DRY RUN: would start timer for {canister_id}")
+            else:
+                start_timer(canister_id, network)
+        return
+
+    # Mainers — fetch from GameState and check health before starting
+    print(f"Fetching mAIners from GameState on network '{network}'...")
+    mainers = get_mainers(network)
+    if not mainers:
+        print(f"ERROR: No mainers found on network '{network}'")
+        return
+
+    # Show canister type breakdown for verification
+    type_counts = defaultdict(int)
+    for mainer in mainers:
+        address = mainer.get('address', '')
+        canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {})
+        canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else 'Unknown'
+        if address:
+            type_counts[canister_type] += 1
+    print("Canister type breakdown:")
+    for ct, count in sorted(type_counts.items()):
+        selected = " <-- SELECTED" if ct == "ShareAgent" else " (skipped)"
+        print(f"  {ct}: {count}{selected}")
+
+    share_agents = filter_share_agents(mainers)
+    print(f"\nFiltered to {len(share_agents)} ShareAgent mAIners.")
+
+    if limit is not None and limit > 0:
+        share_agents = share_agents[:limit]
+        print(f"Limiting to first {limit} mAIners (for testing).")
+
+    if dry_run:
+        print(f"\nDRY RUN: Would check health and start timers for {len(share_agents)} ShareAgent mAIners:")
+        for idx, cid in enumerate(share_agents, 1):
+            print(f"  {idx}. {cid}")
+        print("\nDRY RUN complete. Re-run without --dry-run to execute.")
+        return
+
+    total = len(share_agents)
+    print(f"\nStarting timers for {total} mAIners (checking health first)...")
+    print(f"Using {workers} parallel workers.\n")
+
+    # Process in parallel
+    results = []
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        future_to_id = {
+            executor.submit(
+                check_health_and_start_timer, cid, network, idx + 1, total
+            ): cid
+            for idx, cid in enumerate(share_agents)
+        }
+        for future in as_completed(future_to_id):
+            try:
+                results.append(future.result())
+            except Exception as e:
+                cid = future_to_id[future]
+                print(f"  {cid} — exception: {e}")
+                results.append({"canister_id": cid, "status": "error", "logs": []})
+
+    # Summary
+    counts = defaultdict(int)
+    for r in results:
+        counts[r["status"]] += 1
+
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+    print(f"  Started  : {counts['started']}")
+    print(f"  Frozen   : {counts['frozen']} (skipped)")
+    print(f"  Unhealthy: {counts['unhealthy']} (skipped)")
+    print(f"  Errors   : {counts['error']} (skipped)")
+    print(f"  Total    : {total}")
+    print()
+
+    # Write log files
+    from datetime import datetime
+    date_prefix = datetime.now().strftime("%Y-%m-%d")
+    log_dir = os.path.join(SCRIPT_DIR, f"logs-start-timers-{network}")
+    ensure_log_dir(log_dir)
+
+    combined_path = os.path.join(log_dir, f"{date_prefix}-start_timers-{network}.log")
+    with open(combined_path, "w") as f_combined:
+        for r in sorted(results, key=lambda x: x["canister_id"]):
+            cid = r["canister_id"]
+            logs = r.get("logs", [])
+
+            # Individual log file
+            individual_path = os.path.join(log_dir, f"{cid}.log")
+            with open(individual_path, "w") as f_ind:
+                f_ind.write(f"# Status: {r['status']}\n")
+                for line in logs:
+                    f_ind.write(line + "\n")
+
+            # Combined log
+            f_combined.write(f"=== {cid} ({r['status']}) ===\n")
+            for line in logs:
+                f_combined.write(f"  {line}\n")
+            f_combined.write("\n")
+
+    print(f"Logs saved to: {os.path.abspath(log_dir)}")
+    print(f"  Combined: {os.path.abspath(combined_path)}")
+    print(f"  Per-canister: {len(results)} individual .log files")
+    print()
 
 
 if __name__ == "__main__":
@@ -49,5 +233,22 @@ def main(network, canister_types):
         default="protocol",
         help="Specify the canister type (default: protocol)",
     )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=10,
+        help="Number of parallel workers (default: 10)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limit number of mainers to process (for testing)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be done without actually starting timers",
+    )
     args = parser.parse_args()
-    main(args.network, args.canister_types)
+    main(args.network, args.canister_types, args.workers, args.limit, args.dry_run)
diff --git a/scripts/start_timers.sh b/scripts/start_timers.sh
index 1f23eb4a..8c5dc610 100755
--- a/scripts/start_timers.sh
+++ b/scripts/start_timers.sh
@@ -3,6 +3,9 @@
 # Default network type is local
 NETWORK_TYPE="local"
 CANISTER_TYPES="protocol"
+WORKERS=""
+LIMIT=""
+DRY_RUN=""
 
 # Parse command line arguments for network type
 while [ $# -gt 0 ]; do
@@ -22,14 +25,28 @@
       if [ "$1" = "all" ] || [ "$1" = "protocol" ] || [ "$1" = "mainers" ]; then
         CANISTER_TYPES=$1
       else
-        echo "Invalid network type: $1. Use 'all' or 'protocol' or 'mainers'."
+        echo "Invalid canister type: $1. Use 'all' or 'protocol' or 'mainers'."
         exit 1
       fi
       shift
       ;;
+    --workers)
+      shift
+      WORKERS="--workers $1"
+      shift
+      ;;
+    --limit)
+      shift
+      LIMIT="--limit $1"
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN="--dry-run"
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
-      echo "Usage: $0 --network [local|ic|testing|development|demo|prd]"
+      echo "Usage: $0 --network [local|ic|testing|development|demo|prd] --canister-types [all|protocol|mainers] [--workers N] [--limit N] [--dry-run]"
       exit 1
       ;;
   esac
@@ -37,4 +54,4 @@ done
 
 echo "Using network type: $NETWORK_TYPE"
 
-python -m scripts.start_timers --network $NETWORK_TYPE --canister-types $CANISTER_TYPES
+python -m scripts.start_timers --network $NETWORK_TYPE --canister-types $CANISTER_TYPES $WORKERS $LIMIT $DRY_RUN
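
# Suggested end-to-end flow with the scripts above (network value illustrative;
# flags as defined in this diff):
#   ./scripts/check_mAIner_status.sh --network ic --workers 20
#   ./scripts/start_timers.sh --network ic --canister-types mainers --dry-run
#   ./scripts/start_timers.sh --network ic --canister-types mainers
#   ./scripts/monitor_logs_active_mainers.sh --network ic --delay 5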