From 30d7874ca5d0e85540b89f8a99bcbf5c28fbd454 Mon Sep 17 00:00:00 2001 From: icpp Date: Fri, 17 Apr 2026 07:29:59 -0400 Subject: [PATCH 1/2] Add scripts for mAIner status monitoring and log analysis Introduces a suite of Python and shell scripts to track the health, activity, and resource usage of mAIner canisters. Includes dedicated tools for analyzing logs of both active and frozen mAIners to diagnose issues like runaway timers or execution failures. --- .gitignore | 3 +- scripts/analyze_active_mainer_logs.py | 384 +++++++++++++++ scripts/analyze_active_mainer_logs.sh | 35 ++ scripts/analyze_frozen_mainer_logs.py | 479 +++++++++++++++++++ scripts/analyze_frozen_mainer_logs.sh | 35 ++ scripts/check_mAIner_status.py | 621 +++++++++++++++++++++++++ scripts/check_mAIner_status.sh | 41 ++ scripts/monitor_logs_active_mainers.py | 210 +++++++++ scripts/monitor_logs_active_mainers.sh | 41 ++ 9 files changed, 1848 insertions(+), 1 deletion(-) create mode 100644 scripts/analyze_active_mainer_logs.py create mode 100755 scripts/analyze_active_mainer_logs.sh create mode 100644 scripts/analyze_frozen_mainer_logs.py create mode 100755 scripts/analyze_frozen_mainer_logs.sh create mode 100644 scripts/check_mAIner_status.py create mode 100755 scripts/check_mAIner_status.sh create mode 100644 scripts/monitor_logs_active_mainers.py create mode 100755 scripts/monitor_logs_active_mainers.sh diff --git a/.gitignore b/.gitignore index e037c38a..3f20c2de 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,7 @@ funnai_accounts/ # claude code CLAUDE.md +.claude/ scripts/burned_tokens_cache.json @@ -165,4 +166,4 @@ scripts/upgrade_mainers.logs scripts/logs-mainer-analysis/ # lp_holders list (generated output) -scripts/lp_holders/ \ No newline at end of file +scripts/lp_holders/ diff --git a/scripts/analyze_active_mainer_logs.py b/scripts/analyze_active_mainer_logs.py new file mode 100644 index 00000000..9a2f8f94 --- /dev/null +++ b/scripts/analyze_active_mainer_logs.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python3 +"""Fetch and analyze logs for all active mAIners using status data from check_mAIner_status. + +Reads the latest status JSON to identify active mAIners, fetches their logs in parallel, +and produces a health analysis report. 
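+
+Requires a prior run of check_mAIner_status.sh for the same network: the latest
+status JSON is read from scripts/logs-mainer-analysis/, and the JSON and markdown
+reports produced here are written to the same directory.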
+""" + +import argparse +import os +import json +import subprocess +import glob +from datetime import datetime +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Expected log patterns for a healthy mAIner flow +FLOW_STEPS = [ + "Recurring action 1 was triggered", + "pullNextChallenge - entered", + "pullNextChallenge - calling getChallengeFromGameStateCanister", + "calling getRandomOpenChallenge", +] + +# Patterns that indicate successful progression beyond challenge pull +SUCCESS_INDICATORS = [ + "got a challenge", + "sendToShareService", + "ShareService", + "submission", + "submitResponse", + "Inference", + "inference", + "response received", + "submitToGameState", +] + +# Patterns that indicate errors +ERROR_INDICATORS = [ + "ERROR", + "Error", + "error", + "trap", + "Trap", + "reject", + "Reject", + "failed", + "Failed", + "timeout", + "Timeout", +] + + +def get_logs(canister_id, network): + """Fetch logs for a canister with retry.""" + for attempt in range(3): + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True, + timeout=30, + ) + return output.strip().splitlines() + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + if attempt == 2: + return None + return None + + +def analyze_canister_logs(canister_id, lines): + """Analyze log lines for a single canister and return a health assessment.""" + if lines is None: + return { + "canister_id": canister_id, + "verdict": "UNREACHABLE", + "detail": "Could not fetch logs", + "total_lines": 0, + "recurring_triggers": 0, + "pull_challenge_count": 0, + "success_indicators": [], + "error_lines": [], + "last_activity": "", + "timer_interval_hours": None, + } + + total_lines = len(lines) + recurring_count = 0 + pull_count = 0 + success_found = [] + error_lines = [] + timestamps = [] + + for line in lines: + # Extract timestamp + if "Z]:" in line: + try: + ts_start = line.index(". 
") + 2 + ts_end = line.index("Z]:") + 1 + ts_str = line[ts_start:ts_end] + timestamps.append(ts_str) + except (ValueError, IndexError): + pass + + if "Recurring action 1 was triggered" in line: + recurring_count += 1 + + if "pullNextChallenge - entered" in line: + pull_count += 1 + + for indicator in SUCCESS_INDICATORS: + if indicator in line: + # Keep unique indicators only + if indicator not in success_found: + success_found.append(indicator) + break + + for indicator in ERROR_INDICATORS: + if indicator in line: + # Store the actual error line (truncated) + error_lines.append(line.strip()[-150:]) + break + + # Determine last activity timestamp + last_activity = timestamps[-1] if timestamps else "" + + # Calculate average timer interval + timer_interval = None + if len(timestamps) >= 2: + try: + first = datetime.fromisoformat(timestamps[0].replace("Z", "+00:00")) + last = datetime.fromisoformat(timestamps[-1].replace("Z", "+00:00")) + span_hours = (last - first).total_seconds() / 3600 + if recurring_count > 1: + timer_interval = round(span_hours / (recurring_count - 1), 1) + except (ValueError, IndexError): + pass + + # Determine verdict + if recurring_count == 0: + verdict = "NO_TIMERS" + detail = "No recurring timer triggers found in logs" + elif success_found: + verdict = "OK" + detail = f"Timers firing, challenge flow progressing ({', '.join(success_found)})" + elif pull_count > 0 and not success_found: + verdict = "STUCK_AT_CHALLENGE_PULL" + detail = "Timers fire, pullNextChallenge runs, but no evidence of challenge completion" + elif recurring_count > 0 and pull_count == 0: + verdict = "TIMER_ONLY" + detail = "Timers fire but no pullNextChallenge steps visible (possible log buffer overflow or silent failure)" + else: + verdict = "UNKNOWN" + detail = "Could not determine health from logs" + + return { + "canister_id": canister_id, + "verdict": verdict, + "detail": detail, + "total_lines": total_lines, + "recurring_triggers": recurring_count, + "pull_challenge_count": pull_count, + "success_indicators": success_found, + "error_lines": error_lines[:5], # keep top 5 + "last_activity": last_activity, + "timer_interval_hours": timer_interval, + } + + +def load_status_data(network): + """Load the latest check_mAIner_status JSON for the given network.""" + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + pattern = os.path.join(logs_dir, f"*-check_mAIner_status-{network}.json") + files = sorted(glob.glob(pattern)) + if not files: + return None + # Use the latest file + with open(files[-1]) as f: + return json.load(f) + + +def write_markdown_report(analysis_results, network, md_path): + """Write analysis report as markdown.""" + lines = [] + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + lines.append(f"# Active mAIner Log Analysis — {network} — {timestamp}") + lines.append("") + + # Group by verdict + by_verdict = defaultdict(list) + for r in analysis_results: + by_verdict[r["verdict"]].append(r) + + # Summary + lines.append("## Summary") + lines.append("") + lines.append("| Verdict | Count |") + lines.append("|-------------------------|-------|") + for verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY", "NO_TIMERS", "UNREACHABLE", "UNKNOWN"]: + count = len(by_verdict.get(verdict, [])) + if count > 0 or verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY"]: + lines.append(f"| {verdict:<23} | {count:>5} |") + lines.append(f"| **Total** | **{len(analysis_results):>3}** |") + lines.append("") + + # Verdict explanations + lines.append("### Verdict Legend") + 
lines.append("") + lines.append("- **OK**: Timers firing and evidence of challenge processing beyond `getRandomOpenChallenge`") + lines.append("- **STUCK_AT_CHALLENGE_PULL**: Timers fire, `pullNextChallenge` runs, but no evidence of receiving a challenge or completing inference") + lines.append("- **TIMER_ONLY**: Timers fire but `pullNextChallenge` steps not visible in log buffer (may be log overflow)") + lines.append("- **NO_TIMERS**: No recurring timer triggers found") + lines.append("- **UNREACHABLE**: Could not fetch logs from canister") + lines.append("") + + # Detail table + cid_width = 27 + verdict_width = 23 + lines.append("## All Active mAIners") + lines.append("") + lines.append(f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Burn Rate':<9} | {'Verdict':<{verdict_width}} | {'Triggers':<8} | {'Pulls':<5} | {'Interval':<8} | {'Last Activity':<25} |") + lines.append(f"|{'-'*6}|{'-'*(cid_width+2)}|{'-'*11}|{'-'*(verdict_width+2)}|{'-'*10}|{'-'*7}|{'-'*10}|{'-'*27}|") + + # Sort: problems first + verdict_order = {"UNREACHABLE": 0, "NO_TIMERS": 1, "STUCK_AT_CHALLENGE_PULL": 2, "TIMER_ONLY": 3, "UNKNOWN": 4, "OK": 5} + sorted_results = sorted(analysis_results, key=lambda r: verdict_order.get(r["verdict"], 99)) + + for idx, r in enumerate(sorted_results, 1): + interval = f"{r['timer_interval_hours']}h" if r["timer_interval_hours"] else "" + lines.append( + f"| {idx:<4} " + f"| {r['canister_id']:<{cid_width}} " + f"| {r.get('burn_rate', ''):<9} " + f"| {r['verdict']:<{verdict_width}} " + f"| {r['recurring_triggers']:<8} " + f"| {r['pull_challenge_count']:<5} " + f"| {interval:<8} " + f"| {r['last_activity']:<25} |" + ) + lines.append("") + + # Problem canisters detail + problems = [r for r in sorted_results if r["verdict"] != "OK"] + if problems: + lines.append("## Problem mAIners — Details") + lines.append("") + for r in problems: + lines.append(f"### `{r['canister_id']}` — {r['verdict']}") + lines.append("") + lines.append(f"- **Detail**: {r['detail']}") + lines.append(f"- **Log lines**: {r['total_lines']}, Triggers: {r['recurring_triggers']}, Pulls: {r['pull_challenge_count']}") + if r["error_lines"]: + lines.append(f"- **Errors found**:") + for err in r["error_lines"]: + lines.append(f" - `{err}`") + lines.append("") + + # OK canisters with success indicators + ok_results = by_verdict.get("OK", []) + if ok_results: + lines.append("## OK mAIners — Success Indicators") + lines.append("") + for r in ok_results: + indicators = ", ".join(r["success_indicators"]) if r["success_indicators"] else "none" + lines.append(f"- `{r['canister_id']}`: {indicators}") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10): + # Step 1: Load status data + status_data = load_status_data(network) + if not status_data: + print(f"ERROR: No check_mAIner_status JSON found for network '{network}'.") + print(f"Run check_mAIner_status.sh --network {network} first.") + return + + print(f"Loaded status data from {status_data['timestamp']} ({status_data['summary']['total_share_agents']} total mAIners)") + + # Step 2: Extract active mAIners with burn rates + active_mainers = [] + for m in status_data["mainers"]: + if m.get("active") is True and m.get("burn_rate", ""): + active_mainers.append({ + "canister_id": m["canister_id"], + "burn_rate": m["burn_rate"], + }) + + if not active_mainers: + print("No active mAIners found in status data.") + return + + print(f"Found {len(active_mainers)} active mAIners to analyze.") + + # Step 3: Fetch logs in 
parallel + print(f"Fetching logs with {workers} parallel workers...") + logs_map = {} # canister_id -> lines + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_id = { + executor.submit(get_logs, m["canister_id"], network): m["canister_id"] + for m in active_mainers + } + done = 0 + for future in as_completed(future_to_id): + done += 1 + cid = future_to_id[future] + try: + logs_map[cid] = future.result() + except Exception: + logs_map[cid] = None + if done % 25 == 0: + print(f" Fetched {done}/{len(active_mainers)} logs...") + + print(f" Fetched all {len(active_mainers)} logs.") + + # Step 4: Analyze each canister + print("Analyzing logs...") + analysis_results = [] + for m in active_mainers: + cid = m["canister_id"] + result = analyze_canister_logs(cid, logs_map.get(cid)) + result["burn_rate"] = m["burn_rate"] + analysis_results.append(result) + + # Step 5: Print summary + by_verdict = defaultdict(int) + for r in analysis_results: + by_verdict[r["verdict"]] += 1 + + print(f"\n{'='*60}") + print("LOG ANALYSIS SUMMARY") + print(f"{'='*60}") + for verdict in ["OK", "STUCK_AT_CHALLENGE_PULL", "TIMER_ONLY", "NO_TIMERS", "UNREACHABLE", "UNKNOWN"]: + count = by_verdict.get(verdict, 0) + if count > 0: + print(f" {verdict:<25}: {count}") + print(f" {'Total':<25}: {len(analysis_results)}") + print() + + # Step 6: Write output files + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + json_path = os.path.join(logs_dir, f"{date_prefix}-analyze_active_mainer_logs-{network}.json") + with open(json_path, 'w') as f: + json.dump({ + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "network": network, + "total_active": len(active_mainers), + "summary": dict(by_verdict), + "results": analysis_results, + }, f, indent=2) + print(f"JSON report: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-analyze_active_mainer_logs-{network}.md") + write_markdown_report(analysis_results, network, md_path) + print(f"Markdown report: {os.path.abspath(md_path)}") + print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Fetch and analyze logs for all active mAIners." + ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers for log fetching (default: 10)", + ) + args = parser.parse_args() + main(args.network, args.workers) diff --git a/scripts/analyze_active_mainer_logs.sh b/scripts/analyze_active_mainer_logs.sh new file mode 100755 index 00000000..02bc4cd1 --- /dev/null +++ b/scripts/analyze_active_mainer_logs.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" + +# Parse command line arguments +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." 
+ exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.analyze_active_mainer_logs --network $NETWORK_TYPE $WORKERS diff --git a/scripts/analyze_frozen_mainer_logs.py b/scripts/analyze_frozen_mainer_logs.py new file mode 100644 index 00000000..b9cda535 --- /dev/null +++ b/scripts/analyze_frozen_mainer_logs.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 +"""Fetch and analyze logs for all frozen mAIners to determine why they froze. + +Reads the latest status JSON to identify frozen mAIners, fetches their logs +(dfx canister logs works on frozen canisters), and analyzes for freeze causes. +""" + +import argparse +import os +import json +import subprocess +import glob +import re +from datetime import datetime, timezone +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Patterns to detect in logs +TIMER_TRIGGER = "Recurring action 1 was triggered" +TIMER_2_TRIGGER = "Recurring action 2 was triggered" +PULL_CHALLENGE = "pullNextChallenge - entered" +GOT_CHALLENGE = "pullNextChallenge - challenge =" +SEND_TO_SHARE = "addChallengeToShareServiceQueue" +STORE_RESPONSE = "storeAndSubmitResponse" +SUBMIT_TO_GS = "submitChallengeResponse" +TIMER_START = "startTimerExecution" +TIMER_SET = "setTimer" +CYCLES_ADD = "Cycles.add for" +UNOFFICIAL_TOPUP = "Unofficial top ups" + +# Error patterns +ERROR_PATTERNS = [ + "trap", "Trap", "ERROR", "Error", "reject", "Reject", + "failed", "Failed", "timeout", "Timeout", "out of cycles", + "canister_error", "IC0503", "IC0502", +] + + +def get_logs(canister_id, network): + """Fetch logs for a canister with retry.""" + for attempt in range(3): + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True, + timeout=30, + ) + return output.strip().splitlines() + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + if attempt == 2: + return None + return None + + +def parse_timestamp(line): + """Extract timestamp from a log line like '[123. 2026-03-22T23:47:22.895Z]: ...'""" + match = re.search(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)', line) + if match: + ts_str = match.group(1) + # Truncate nanoseconds to microseconds for parsing + ts_str = re.sub(r'(\.\d{6})\d*Z', r'\1Z', ts_str) + try: + return datetime.fromisoformat(ts_str.replace("Z", "+00:00")) + except ValueError: + pass + return None + + +def calculate_timer_intervals(timestamps): + """Calculate timer intervals from a list of timestamps. + + Returns list of intervals in seconds. 
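+    Non-positive deltas (duplicate or out-of-order timestamps) are skipped.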
+ """ + if len(timestamps) < 2: + return [] + intervals = [] + for i in range(1, len(timestamps)): + delta = (timestamps[i] - timestamps[i - 1]).total_seconds() + if delta > 0: + intervals.append(delta) + return intervals + + +def analyze_canister_logs(canister_id, lines): + """Analyze log lines for a frozen canister and determine freeze cause.""" + if lines is None: + return { + "canister_id": canister_id, + "freeze_cause": "UNREACHABLE", + "detail": "Could not fetch logs", + "total_lines": 0, + } + + total_lines = len(lines) + + # Counters + timer1_count = 0 + timer2_count = 0 + pull_count = 0 + got_challenge_count = 0 + send_share_count = 0 + store_response_count = 0 + submit_gs_count = 0 + timer_start_count = 0 + cycles_add_count = 0 + unofficial_topup_count = 0 + error_lines = [] + + # Timestamps for timer interval analysis + timer1_timestamps = [] + timer2_timestamps = [] + all_timestamps = [] + + # First and last activity + first_ts = None + last_ts = None + + for line in lines: + ts = parse_timestamp(line) + if ts: + all_timestamps.append(ts) + if first_ts is None: + first_ts = ts + last_ts = ts + + if TIMER_TRIGGER in line: + timer1_count += 1 + if ts: + timer1_timestamps.append(ts) + if TIMER_2_TRIGGER in line: + timer2_count += 1 + if ts: + timer2_timestamps.append(ts) + if PULL_CHALLENGE in line: + pull_count += 1 + if GOT_CHALLENGE in line: + got_challenge_count += 1 + if SEND_TO_SHARE in line: + send_share_count += 1 + if STORE_RESPONSE in line: + store_response_count += 1 + if SUBMIT_TO_GS in line: + submit_gs_count += 1 + if TIMER_START in line: + timer_start_count += 1 + if CYCLES_ADD in line: + cycles_add_count += 1 + if UNOFFICIAL_TOPUP in line: + unofficial_topup_count += 1 + + for pattern in ERROR_PATTERNS: + if pattern in line and "storeAndSubmitResponse" not in line: + error_lines.append(line.strip()[-200:]) + break + + # Calculate timer intervals + timer1_intervals = calculate_timer_intervals(timer1_timestamps) + timer2_intervals = calculate_timer_intervals(timer2_timestamps) + + avg_timer1_interval_h = None + min_timer1_interval_s = None + if timer1_intervals: + avg_timer1_interval_h = round(sum(timer1_intervals) / len(timer1_intervals) / 3600, 2) + min_timer1_interval_s = round(min(timer1_intervals), 1) + + avg_timer2_interval_s = None + if timer2_intervals: + avg_timer2_interval_s = round(sum(timer2_intervals) / len(timer2_intervals), 1) + + # Log span + log_span_days = None + if first_ts and last_ts: + log_span_days = round((last_ts - first_ts).total_seconds() / 86400, 1) + + # Determine freeze cause + freeze_cause = "UNKNOWN" + detail = "" + + if timer2_count > 0 and avg_timer2_interval_s and avg_timer2_interval_s < 30: + freeze_cause = "RUNAWAY_TIMER_2" + detail = f"Timer 2 fired {timer2_count} times at ~{avg_timer2_interval_s}s interval (should not run for ShareAgent)" + elif min_timer1_interval_s is not None and min_timer1_interval_s < 60: + freeze_cause = "RUNAWAY_TIMER_1" + detail = f"Timer 1 min interval was {min_timer1_interval_s}s (expected hours). Possible 5-second fallback or orphaned timers" + elif timer1_count > 0 and got_challenge_count > 0 and store_response_count > 0: + freeze_cause = "NORMAL_OPERATION" + detail = f"Processed {got_challenge_count} challenges, {store_response_count} submissions. 
Memory growth from stored responses likely caused freeze" + elif timer1_count > 0 and pull_count > 0 and got_challenge_count == 0: + freeze_cause = "IDLE_WITH_TIMERS" + detail = f"Timers firing ({timer1_count}x) and pulling challenges ({pull_count}x) but no challenges received. Idle cycle burn from timers + memory" + elif timer1_count > 0 and pull_count == 0: + freeze_cause = "TIMER_ONLY_NO_WORK" + detail = f"Timers firing ({timer1_count}x) but no challenge processing visible in logs. Possible log overflow or silent failure" + elif timer1_count == 0 and total_lines > 0: + freeze_cause = "NO_TIMERS_IN_LOG" + detail = f"No timer triggers found in {total_lines} log lines. Logs may predate timer start or timers never started" + elif total_lines == 0: + freeze_cause = "NO_LOGS" + detail = "No log entries found" + + if error_lines: + detail += f". {len(error_lines)} error(s) found" + + return { + "canister_id": canister_id, + "freeze_cause": freeze_cause, + "detail": detail, + "total_lines": total_lines, + "log_span_days": log_span_days, + "first_activity": first_ts.isoformat() if first_ts else "", + "last_activity": last_ts.isoformat() if last_ts else "", + "timer1_count": timer1_count, + "timer2_count": timer2_count, + "avg_timer1_interval_h": avg_timer1_interval_h, + "min_timer1_interval_s": min_timer1_interval_s, + "avg_timer2_interval_s": avg_timer2_interval_s, + "pull_count": pull_count, + "got_challenge_count": got_challenge_count, + "send_share_count": send_share_count, + "store_response_count": store_response_count, + "submit_gs_count": submit_gs_count, + "timer_start_count": timer_start_count, + "cycles_add_count": cycles_add_count, + "unofficial_topup_count": unofficial_topup_count, + "error_count": len(error_lines), + "error_samples": error_lines[:5], + } + + +def load_status_data(network): + """Load the latest check_mAIner_status JSON for the given network.""" + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + pattern = os.path.join(logs_dir, f"*-check_mAIner_status-{network}.json") + files = sorted(glob.glob(pattern)) + if not files: + return None + with open(files[-1]) as f: + return json.load(f) + + +def write_markdown_report(analysis_results, network, md_path): + """Write analysis report as markdown.""" + lines = [] + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + lines.append(f"# Frozen mAIner Log Analysis — {network} — {timestamp}") + lines.append("") + + # Group by freeze cause + by_cause = defaultdict(list) + for r in analysis_results: + by_cause[r["freeze_cause"]].append(r) + + # Summary + lines.append("## Summary") + lines.append("") + lines.append("| Freeze Cause | Count |") + lines.append("|------------------------|-------|") + cause_order = [ + "RUNAWAY_TIMER_2", "RUNAWAY_TIMER_1", "NORMAL_OPERATION", + "IDLE_WITH_TIMERS", "TIMER_ONLY_NO_WORK", "NO_TIMERS_IN_LOG", + "NO_LOGS", "UNREACHABLE", "UNKNOWN", + ] + for cause in cause_order: + count = len(by_cause.get(cause, [])) + if count > 0: + lines.append(f"| {cause:<22} | {count:>5} |") + lines.append(f"| **Total** | **{len(analysis_results):>3}** |") + lines.append("") + + # Legend + lines.append("### Freeze Cause Legend") + lines.append("") + lines.append("- **RUNAWAY_TIMER_2**: Timer 2 firing at ~5s interval (should not run for ShareAgent) — rapid cycle drain") + lines.append("- **RUNAWAY_TIMER_1**: Timer 1 firing at sub-minute intervals instead of hours — possible 5s fallback") + lines.append("- **NORMAL_OPERATION**: Was processing challenges normally — froze from memory growth + idle cost") + 
lines.append("- **IDLE_WITH_TIMERS**: Timers firing, pulling challenges, but no challenges received — idle cycle burn") + lines.append("- **TIMER_ONLY_NO_WORK**: Timers firing but no challenge steps visible (log buffer overflow)") + lines.append("- **NO_TIMERS_IN_LOG**: No timer triggers in log — timers may have never started") + lines.append("- **NO_LOGS**: No log entries at all") + lines.append("- **UNREACHABLE**: Could not fetch logs") + lines.append("") + + # Detail table + cid_width = 27 + cause_width = 22 + lines.append("## All Frozen mAIners") + lines.append("") + lines.append( + f"| {'#':<4} " + f"| {'Canister ID':<{cid_width}} " + f"| {'Freeze Cause':<{cause_width}} " + f"| {'T1':>4} " + f"| {'T2':>4} " + f"| {'T1 Avg':>7} " + f"| {'T1 Min':>7} " + f"| {'Pulls':>5} " + f"| {'Got':>4} " + f"| {'Submit':>6} " + f"| {'Errs':>4} " + f"| {'Span':>6} " + f"| {'Last Activity':<25} |" + ) + lines.append( + f"|{'-' * 6}" + f"|{'-' * (cid_width + 2)}" + f"|{'-' * (cause_width + 2)}" + f"|{'-' * 6}" + f"|{'-' * 6}" + f"|{'-' * 9}" + f"|{'-' * 9}" + f"|{'-' * 7}" + f"|{'-' * 6}" + f"|{'-' * 8}" + f"|{'-' * 6}" + f"|{'-' * 8}" + f"|{'-' * 27}|" + ) + + # Sort by cause (runaway first) + cause_sort = {c: i for i, c in enumerate(cause_order)} + sorted_results = sorted(analysis_results, key=lambda r: cause_sort.get(r["freeze_cause"], 99)) + + for idx, r in enumerate(sorted_results, 1): + t1_avg = f"{r['avg_timer1_interval_h']}h" if r["avg_timer1_interval_h"] else "" + t1_min = f"{r['min_timer1_interval_s']}s" if r["min_timer1_interval_s"] else "" + span = f"{r['log_span_days']}d" if r["log_span_days"] else "" + last = r.get("last_activity", "")[:25] + lines.append( + f"| {idx:<4} " + f"| {r['canister_id']:<{cid_width}} " + f"| {r['freeze_cause']:<{cause_width}} " + f"| {r['timer1_count']:>4} " + f"| {r['timer2_count']:>4} " + f"| {t1_avg:>7} " + f"| {t1_min:>7} " + f"| {r['pull_count']:>5} " + f"| {r['got_challenge_count']:>4} " + f"| {r['submit_gs_count']:>6} " + f"| {r['error_count']:>4} " + f"| {span:>6} " + f"| {last:<25} |" + ) + lines.append("") + + # Detailed sections per cause + for cause in cause_order: + group = by_cause.get(cause, []) + if not group: + continue + lines.append(f"## {cause} ({len(group)} mAIners)") + lines.append("") + for r in group: + lines.append(f"### `{r['canister_id']}`") + lines.append("") + lines.append(f"- **Detail**: {r['detail']}") + lines.append(f"- **Log span**: {r['log_span_days']}d, {r['total_lines']} lines") + lines.append(f"- **Last activity**: {r.get('last_activity', 'N/A')}") + lines.append(f"- **Timer 1**: {r['timer1_count']}x, avg {r['avg_timer1_interval_h']}h, min {r['min_timer1_interval_s']}s") + if r["timer2_count"] > 0: + lines.append(f"- **Timer 2**: {r['timer2_count']}x, avg {r['avg_timer2_interval_s']}s") + lines.append(f"- **Flow**: pulls={r['pull_count']}, got={r['got_challenge_count']}, share={r['send_share_count']}, submit={r['submit_gs_count']}") + if r["error_samples"]: + lines.append(f"- **Errors**:") + for err in r["error_samples"]: + lines.append(f" - `{err}`") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10): + # Step 1: Load status data + status_data = load_status_data(network) + if not status_data: + print(f"ERROR: No check_mAIner_status JSON found for network '{network}'.") + print(f"Run check_mAIner_status.sh --network {network} first.") + return + + print(f"Loaded status data from {status_data['timestamp']} ({status_data['summary']['total_share_agents']} 
total mAIners)") + + # Step 2: Extract frozen mAIners + frozen_mainers = [m["canister_id"] for m in status_data["mainers"] if m["status"] == "Frozen"] + + if not frozen_mainers: + print("No frozen mAIners found in status data.") + return + + print(f"Found {len(frozen_mainers)} frozen mAIners to analyze.") + + # Step 3: Fetch logs in parallel + print(f"Fetching logs with {workers} parallel workers...") + logs_map = {} + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_id = { + executor.submit(get_logs, cid, network): cid + for cid in frozen_mainers + } + done = 0 + for future in as_completed(future_to_id): + done += 1 + cid = future_to_id[future] + try: + logs_map[cid] = future.result() + except Exception: + logs_map[cid] = None + if done % 25 == 0: + print(f" Fetched {done}/{len(frozen_mainers)} logs...") + + print(f" Fetched all {len(frozen_mainers)} logs.") + + # Step 4: Analyze each canister + print("Analyzing logs...") + analysis_results = [] + for cid in frozen_mainers: + result = analyze_canister_logs(cid, logs_map.get(cid)) + analysis_results.append(result) + + # Step 5: Print summary + by_cause = defaultdict(int) + for r in analysis_results: + by_cause[r["freeze_cause"]] += 1 + + print(f"\n{'='*60}") + print("FROZEN MAINER LOG ANALYSIS SUMMARY") + print(f"{'='*60}") + for cause in ["RUNAWAY_TIMER_2", "RUNAWAY_TIMER_1", "NORMAL_OPERATION", + "IDLE_WITH_TIMERS", "TIMER_ONLY_NO_WORK", "NO_TIMERS_IN_LOG", + "NO_LOGS", "UNREACHABLE", "UNKNOWN"]: + count = by_cause.get(cause, 0) + if count > 0: + print(f" {cause:<25}: {count}") + print(f" {'Total':<25}: {len(analysis_results)}") + print() + + # Step 6: Write output files + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + json_path = os.path.join(logs_dir, f"{date_prefix}-analyze_frozen_mainer_logs-{network}.json") + with open(json_path, 'w') as f: + json.dump({ + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "network": network, + "total_frozen": len(frozen_mainers), + "summary": dict(by_cause), + "results": analysis_results, + }, f, indent=2) + print(f"JSON report: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-analyze_frozen_mainer_logs-{network}.md") + write_markdown_report(analysis_results, network, md_path) + print(f"Markdown report: {os.path.abspath(md_path)}") + print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Fetch and analyze logs for all frozen mAIners to determine freeze cause." 
+ ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers for log fetching (default: 10)", + ) + args = parser.parse_args() + main(args.network, args.workers) diff --git a/scripts/analyze_frozen_mainer_logs.sh b/scripts/analyze_frozen_mainer_logs.sh new file mode 100755 index 00000000..dd21771e --- /dev/null +++ b/scripts/analyze_frozen_mainer_logs.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" + +# Parse command line arguments +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." + exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.analyze_frozen_mainer_logs --network $NETWORK_TYPE $WORKERS diff --git a/scripts/check_mAIner_status.py b/scripts/check_mAIner_status.py new file mode 100644 index 00000000..2cc4afe2 --- /dev/null +++ b/scripts/check_mAIner_status.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import subprocess +import time +import threading +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Optional + +from .get_mainers import get_mainers, get_mainer_is_active, get_mainer_setting +from .get_mainers_health import ( + is_transient_error, + run_command_with_retry, + log_message, +) + +# Get the directory of this script +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Status constants +STATUS_HEALTHY = "Healthy" +STATUS_MAINTENANCE = "Maintenance" +STATUS_STOPPED = "Stopped" +STATUS_FROZEN = "Frozen" +STATUS_UNINSTALLED = "Uninstalled" +STATUS_UNAVAILABLE = "Unavailable" + +# Error indicators +FROZEN_INDICATORS = ["IC0207", "is out of cycles", "frozen"] +STOPPED_INDICATORS = ["IC0508"] + +# Health success patterns (from get_mainers_health.py) +HEALTH_OK_PATTERNS = [ + "(variant { Ok = record { status_code = 200 : nat16 } })", + "(variant { 17_724 = record { 3_475_804_314 = 200 : nat16 } })", +] + + +def is_frozen_error(error_text: str) -> bool: + """Check if an error indicates the canister is frozen.""" + if not error_text: + return False + return any(indicator in error_text for indicator in FROZEN_INDICATORS) + + +def is_stopped_error(error_text: str) -> bool: + """Check if an error indicates the canister is stopped.""" + if not error_text: + return False + return any(indicator in error_text for indicator in STOPPED_INDICATORS) + + +def collect_error_text(e) -> str: + """Collect meaningful error text from a subprocess exception. + + Prefers stderr/stdout over the generic str(e) which contains the raw command. 
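+    Newlines in the collected text are collapsed to spaces so the message
+    fits on a single report line.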
+ """ + parts = [] + if hasattr(e, 'stderr') and e.stderr: + parts.append(e.stderr.strip()) + if hasattr(e, 'stdout') and e.stdout: + parts.append(e.stdout.strip()) + if parts: + return " ".join(parts).replace("\n", " ") + return str(e).replace("\n", " ") + + +def get_module_hash(network: str, canister_id: str) -> tuple[Optional[str], Optional[str]]: + """Get the module hash of a canister via dfx canister info. + + Retries on transient errors (timeouts, connection issues). + + Returns: + (module_hash, error_message) + - ("0xabc...", None) if hash found + - ("None", None) if canister is uninstalled + - (None, "error text") if call failed + """ + cmd = ["dfx", "canister", "--network", network, "info", canister_id] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + for line in result.stdout.split('\n'): + if 'Module hash:' in line: + hash_value = line.split(':', 1)[1].strip() + return (hash_value, None) + return (None, "Module hash line not found in output") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + + # Don't retry deterministic errors (frozen, out of cycles) + if is_frozen_error(error_text): + return (None, error_text) + + # Retry on transient errors + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + + return (None, error_text) + + return (None, "Max retries exceeded") + + +def check_health(network: str, canister_id: str) -> tuple[str, Optional[str]]: + """Call the health endpoint on a canister with retries for transient errors. + + Returns: + (status, error_message) + """ + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "health"] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + output = result.stdout.strip() + if any(pattern in output for pattern in HEALTH_OK_PATTERNS): + return (STATUS_HEALTHY, None) + else: + return (STATUS_MAINTENANCE, f"health returned: {output[:150]}") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + + if is_frozen_error(error_text): + return (STATUS_FROZEN, error_text) + + if is_stopped_error(error_text): + return (STATUS_STOPPED, error_text) + + # Retry on transient errors + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + + return (STATUS_UNAVAILABLE, error_text) + + return (STATUS_UNAVAILABLE, "Max retries exceeded") + + +def get_canister_resources(network: str, canister_id: str) -> tuple[Optional[int], Optional[int]]: + """Get memory size and cycle balance via dfx canister status. + + Retries on transient errors. Returns (None, None) for frozen/unreachable. 
+ + Returns: + (memory_bytes, cycles_balance) + """ + cmd = ["dfx", "canister", "--network", network, "status", canister_id] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + memory = None + balance = None + for line in result.stdout.split('\n'): + if line.strip().startswith('Memory Size:'): + memory = int(line.split(':')[1].strip().split()[0]) + elif line.strip().startswith('Balance:'): + balance = int(line.split(':')[1].strip().split()[0]) + return (memory, balance) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + if is_frozen_error(error_text): + return (None, None) + if attempt < max_retries and is_transient_error(error_text): + delay = retry_delay * (2 ** (attempt - 1)) + time.sleep(delay) + continue + return (None, None) + + return (None, None) + + +def check_canister_status(network: str, canister_id: str, owner: str, + index: int, total: int) -> dict: + """Check the full status of a single mAIner canister. + + Detection order: + 1. dfx canister info → check Module hash (uninstalled/frozen) + 2. health endpoint → healthy/maintenance/stopped/frozen/unavailable + """ + result = { + "canister_id": canister_id, + "owner": owner, + "status": STATUS_UNAVAILABLE, + "module_hash": "", + "error_message": "", + "active": None, + "burn_rate": "", + "memory_bytes": None, + "cycles_balance": None, + } + + # Step 1: Check module hash via dfx canister info + module_hash, info_error = get_module_hash(network, canister_id) + + if info_error: + # info call failed — check if frozen + if is_frozen_error(info_error): + result["status"] = STATUS_FROZEN + result["error_message"] = info_error + log_message(f"{canister_id} — {STATUS_FROZEN}", "ERROR", index, total) + return result + # Unknown error on info call — mark unavailable + result["status"] = STATUS_UNAVAILABLE + result["error_message"] = info_error + log_message(f"{canister_id} — {STATUS_UNAVAILABLE}", "ERROR", index, total) + return result + + result["module_hash"] = module_hash + + if module_hash == "None": + result["status"] = STATUS_UNINSTALLED + log_message(f"{canister_id} — {STATUS_UNINSTALLED}", "ERROR", index, total) + return result + + # Step 2: Check health endpoint + status, health_error = check_health(network, canister_id) + result["status"] = status + if health_error: + result["error_message"] = health_error + + if status != STATUS_HEALTHY: + log_message(f"{canister_id} — {status}", "ERROR", index, total) + return result + + # Step 3: For Healthy canisters, check active/paused and burn rate + is_active = get_mainer_is_active(network, canister_id) + result["active"] = is_active + + if is_active: + result["burn_rate"] = get_mainer_setting(network, canister_id) + + # Step 4: Get memory and cycle balance for Healthy canisters + memory, balance = get_canister_resources(network, canister_id) + result["memory_bytes"] = memory + result["cycles_balance"] = balance + + mem_mb = f"{memory / 1_000_000:.1f}MB" if memory else "?" + bal_t = f"{balance / 1_000_000_000_000:.2f}T" if balance else "?" 
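+    # Human-readable values for the status log line: memory in MB and cycles
+    # in trillions; a missing (or zero) value renders as "?".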
+ + if is_active: + log_message(f"{canister_id} — {STATUS_HEALTHY} (Active, {result['burn_rate']}, {mem_mb}, {bal_t})", "SUCCESS", index, total) + elif is_active is False: + log_message(f"{canister_id} — {STATUS_HEALTHY} (Paused, {mem_mb}, {bal_t})", "SUCCESS", index, total) + else: + log_message(f"{canister_id} — {STATUS_HEALTHY} (unknown, {mem_mb}, {bal_t})", "SUCCESS", index, total) + + return result + + +def filter_share_agents(mainers: list) -> list: + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + share_agents = [] + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else '' + + if canister_type == "ShareAgent" and address != "": + share_agents.append({ + "address": address, + "owner": mainer.get('ownedBy', 'Unknown'), + }) + return share_agents + + +def write_markdown_report(results: dict, md_path: str) -> None: + """Write a human-readable markdown report with aligned tables.""" + summary = results["summary"] + mainers = results["mainers"] + timestamp = results["timestamp"] + network = results["network"] + + lines = [] + lines.append(f"# mAIner Status Report — {network} — {timestamp}") + lines.append("") + lines.append("## Summary") + lines.append("") + lines.append("| Status | Count |") + lines.append("|--------------------|-------|") + for status in [STATUS_HEALTHY, STATUS_MAINTENANCE, STATUS_STOPPED, + STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_UNAVAILABLE]: + key = status.lower() + count = summary.get(key, 0) + lines.append(f"| {status:<18} | {count:>5} |") + lines.append(f"| **Total** | **{summary['total_share_agents']:>3}** |") + lines.append("") + if summary.get("healthy", 0) > 0: + lines.append("### Healthy Breakdown") + lines.append("") + lines.append("| Detail | Count |") + lines.append("|--------------------|-------|") + lines.append(f"| Active | {summary.get('active', 0):>5} |") + lines.append(f"| Paused | {summary.get('paused', 0):>5} |") + lines.append(f"| Burn Rate: Low | {summary.get('burn_rate_low', 0):>5} |") + lines.append(f"| Burn Rate: Medium | {summary.get('burn_rate_medium', 0):>5} |") + lines.append(f"| Burn Rate: High | {summary.get('burn_rate_high', 0):>5} |") + lines.append(f"| Burn Rate: VeryHigh| {summary.get('burn_rate_very_high', 0):>5} |") + if summary.get("burn_rate_other", 0) > 0: + lines.append(f"| Burn Rate: Other | {summary.get('burn_rate_other', 0):>5} |") + lines.append("") + + # Memory and cycles stats for healthy canisters + healthy_with_mem = [m for m in mainers if m["status"] == STATUS_HEALTHY and m.get("memory_bytes") is not None] + if healthy_with_mem: + mem_values = [m["memory_bytes"] for m in healthy_with_mem] + avg_mem = sum(mem_values) / len(mem_values) / 1_000_000 + max_mem = max(mem_values) / 1_000_000 + min_mem = min(mem_values) / 1_000_000 + lines.append("### Memory & Cycles (Healthy mAIners)") + lines.append("") + lines.append(f"| Metric | Value |") + lines.append(f"|---------------------|-------------|") + lines.append(f"| Avg Memory | {avg_mem:>8.1f} MB |") + lines.append(f"| Min Memory | {min_mem:>8.1f} MB |") + lines.append(f"| Max Memory | {max_mem:>8.1f} MB |") + healthy_with_bal = [m for m in healthy_with_mem if m.get("cycles_balance") is not None] + if healthy_with_bal: + bal_values = [m["cycles_balance"] for m in healthy_with_bal] + avg_bal = sum(bal_values) / len(bal_values) / 1_000_000_000_000 + min_bal = min(bal_values) / 
1_000_000_000_000 + max_bal = max(bal_values) / 1_000_000_000_000 + lines.append(f"| Avg Cycles Balance | {avg_bal:>8.2f} T |") + lines.append(f"| Min Cycles Balance | {min_bal:>8.2f} T |") + lines.append(f"| Max Cycles Balance | {max_bal:>8.2f} T |") + lines.append("") + + # Determine the majority module hash + hash_counts = {} + for m in mainers: + h = m.get("module_hash", "") + if h and h != "None": + hash_counts[h] = hash_counts.get(h, 0) + 1 + majority_hash = max(hash_counts, key=hash_counts.get) if hash_counts else "" + + def module_hash_status(m): + h = m.get("module_hash", "") + if not h or h == "None": + return "None" + if h == majority_hash: + return "Ok" + return "Not Ok" + + # Determine column widths for the all-mainers table + cid_width = max((len(m["canister_id"]) for m in mainers), default=11) + status_width = max(len(STATUS_UNINSTALLED), 6) # widest status label + hash_col_width = 11 # "Module Hash" header width + + active_col_width = 7 # "Active" + padding + burn_col_width = 9 # "Burn Rate" header + mem_col_width = 10 # "Memory MB" header + cyc_col_width = 10 # "Cycles (T)" header + + def table_header(): + return [ + f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Status':<{status_width}} | {'Active':<{active_col_width}} | {'Burn Rate':<{burn_col_width}} | {'Memory MB':<{mem_col_width}} | {'Cycles (T)':<{cyc_col_width}} | {'Module Hash':<{hash_col_width}} |", + f"|{'-' * 6}|{'-' * (cid_width + 2)}|{'-' * (status_width + 2)}|{'-' * (active_col_width + 2)}|{'-' * (burn_col_width + 2)}|{'-' * (mem_col_width + 2)}|{'-' * (cyc_col_width + 2)}|{'-' * (hash_col_width + 2)}|", + ] + + def active_display(m): + a = m.get("active") + if a is True: + return "Yes" + elif a is False: + return "No" + return "" + + def mem_display(m): + mem = m.get("memory_bytes") + if mem is not None: + return f"{mem / 1_000_000:.1f}" + return "" + + def cycles_display(m): + bal = m.get("cycles_balance") + if bal is not None: + return f"{bal / 1_000_000_000_000:.2f}" + return "" + + def table_row(idx, m): + return f"| {idx:<4} | {m['canister_id']:<{cid_width}} | {m['status']:<{status_width}} | {active_display(m):<{active_col_width}} | {m.get('burn_rate', ''):<{burn_col_width}} | {mem_display(m):>{mem_col_width}} | {cycles_display(m):>{cyc_col_width}} | {module_hash_status(m):<{hash_col_width}} |" + + # Sort: non-healthy first (Frozen, Uninstalled, Stopped, Maintenance, Unavailable, Healthy) + status_order = { + STATUS_FROZEN: 0, STATUS_UNINSTALLED: 1, STATUS_STOPPED: 2, + STATUS_MAINTENANCE: 3, STATUS_UNAVAILABLE: 4, STATUS_HEALTHY: 5, + } + sorted_mainers = sorted(mainers, key=lambda m: status_order.get(m["status"], 99)) + + lines.append("## All mAIners") + lines.append("") + lines.append(f"Ok Module Hash: `{majority_hash}`") + lines.append("") + lines.extend(table_header()) + for idx, m in enumerate(sorted_mainers, 1): + lines.append(table_row(idx, m)) + lines.append("") + + # Filtered tables for problem statuses + for filter_status in [STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_STOPPED, + STATUS_MAINTENANCE, STATUS_UNAVAILABLE]: + filtered = [m for m in mainers if m["status"] == filter_status] + if filtered: + lines.append(f"## {filter_status} mAIners") + lines.append("") + lines.extend(table_header()) + for idx, m in enumerate(filtered, 1): + lines.append(table_row(idx, m)) + lines.append("") + + # Notes section with full error messages for non-healthy canisters + non_healthy = [m for m in sorted_mainers if m.get("error_message")] + if non_healthy: + lines.append("## Notes") + lines.append("") + for m 
in non_healthy: + lines.append(f"**{m['canister_id']}** ({m['status']}):") + lines.append(f"> {m['error_message']}") + lines.append("") + + with open(md_path, 'w') as f: + f.write('\n'.join(lines) + '\n') + + +def main(network, workers=10, limit=None): + log_message("=" * 80) + log_message(f"Checking status of all ShareAgent mAIners on network '{network}'") + log_message(f"Using {workers} parallel workers") + log_message("=" * 80) + + # Step 1: Fetch all mAIners from GameState + mainers = get_mainers(network) + if not mainers: + log_message(f"No mainers found on network '{network}'", "ERROR") + return + + # Step 2: Filter to ShareAgent only + share_agents = filter_share_agents(mainers) + log_message(f"Found {len(share_agents)} ShareAgent mAIners (out of {len(mainers)} total)") + + if not share_agents: + log_message("No ShareAgent mAIners to check", "ERROR") + return + + # Step 3: Apply limit if specified + if limit is not None and limit > 0: + share_agents = share_agents[:limit] + log_message(f"Limiting to first {limit} mAIners (for testing)") + + total = len(share_agents) + log_message(f"Starting status check for {total} mAIners...") + log_message("") + + # Step 4: Check all canisters in parallel + all_results = [] + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_agent = { + executor.submit( + check_canister_status, network, agent["address"], + agent["owner"], idx + 1, total + ): agent + for idx, agent in enumerate(share_agents) + } + + for future in as_completed(future_to_agent): + try: + result = future.result() + all_results.append(result) + except Exception as e: + agent = future_to_agent[future] + log_message(f"Exception checking {agent['address']}: {e}", "ERROR") + all_results.append({ + "canister_id": agent["address"], + "owner": agent["owner"], + "status": STATUS_UNAVAILABLE, + "module_hash": "", + "error_message": str(e)[:200], + "active": None, + "burn_rate": "", + "memory_bytes": None, + "cycles_balance": None, + }) + + # Step 5: Aggregate results + counts = { + STATUS_HEALTHY: 0, + STATUS_MAINTENANCE: 0, + STATUS_STOPPED: 0, + STATUS_FROZEN: 0, + STATUS_UNINSTALLED: 0, + STATUS_UNAVAILABLE: 0, + } + active_count = 0 + paused_count = 0 + burn_rate_counts = {"Low": 0, "Medium": 0, "High": 0, "VeryHigh": 0, "Other": 0} + + for r in all_results: + counts[r["status"]] = counts.get(r["status"], 0) + 1 + if r["status"] == STATUS_HEALTHY: + if r.get("active") is True: + active_count += 1 + br = r.get("burn_rate", "") + if br in burn_rate_counts: + burn_rate_counts[br] += 1 + elif br: + burn_rate_counts["Other"] += 1 + elif r.get("active") is False: + paused_count += 1 + + # Step 6: Print summary + log_message("") + log_message("=" * 80) + log_message("STATUS CHECK SUMMARY") + log_message("=" * 80) + log_message(f"Total ShareAgent mAIners : {total}") + for status_name in [STATUS_HEALTHY, STATUS_MAINTENANCE, STATUS_STOPPED, + STATUS_FROZEN, STATUS_UNINSTALLED, STATUS_UNAVAILABLE]: + count = counts[status_name] + level = "SUCCESS" if status_name == STATUS_HEALTHY and count > 0 else \ + "ERROR" if count > 0 and status_name != STATUS_HEALTHY else "INFO" + log_message(f" {status_name:<13}: {count}", level) + log_message("") + if counts[STATUS_HEALTHY] > 0: + log_message(f"Healthy breakdown:") + log_message(f" Active : {active_count}", "SUCCESS" if active_count > 0 else "INFO") + log_message(f" Paused : {paused_count}", "ERROR" if paused_count > 0 else "INFO") + log_message(f" Active burn rates:") + for br_name in ["Low", "Medium", "High", "VeryHigh", "Other"]: + if 
burn_rate_counts[br_name] > 0: + log_message(f" {br_name:<10}: {burn_rate_counts[br_name]}") + log_message("") + + # Print non-healthy canisters + non_healthy = [r for r in all_results if r["status"] != STATUS_HEALTHY] + if non_healthy: + log_message("NON-HEALTHY MAINERS:", "ERROR") + for r in non_healthy: + error_info = f" — {r['error_message'][:80]}" if r.get("error_message") else "" + log_message(f" {r['canister_id']} [{r['status']}]{error_info}", "ERROR") + log_message("") + + if not non_healthy: + log_message("All mAIners are healthy!", "SUCCESS") + log_message("") + + # Step 7: Write output files + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + date_prefix = datetime.now().strftime("%Y-%m-%d") + logs_dir = os.path.join(SCRIPT_DIR, "logs-mainer-analysis") + os.makedirs(logs_dir, exist_ok=True) + + output_data = { + "timestamp": timestamp, + "network": network, + "summary": { + "total_share_agents": total, + "healthy": counts[STATUS_HEALTHY], + "active": active_count, + "paused": paused_count, + "burn_rate_low": burn_rate_counts["Low"], + "burn_rate_medium": burn_rate_counts["Medium"], + "burn_rate_high": burn_rate_counts["High"], + "burn_rate_very_high": burn_rate_counts["VeryHigh"], + "burn_rate_other": burn_rate_counts["Other"], + "maintenance": counts[STATUS_MAINTENANCE], + "stopped": counts[STATUS_STOPPED], + "frozen": counts[STATUS_FROZEN], + "uninstalled": counts[STATUS_UNINSTALLED], + "unavailable": counts[STATUS_UNAVAILABLE], + }, + "mainers": sorted(all_results, key=lambda r: r["canister_id"]), + } + + json_path = os.path.join(logs_dir, f"{date_prefix}-check_mAIner_status-{network}.json") + with open(json_path, 'w') as f: + json.dump(output_data, f, indent=2) + log_message(f"JSON report saved to: {os.path.abspath(json_path)}") + + md_path = os.path.join(logs_dir, f"{date_prefix}-check_mAIner_status-{network}.md") + write_markdown_report(output_data, md_path) + log_message(f"Markdown report saved to: {os.path.abspath(md_path)}") + log_message("") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check status of all ShareAgent mAIners (Healthy/Maintenance/Stopped/Frozen/Uninstalled/Unavailable)." + ) + parser.add_argument( + "--network", + choices=["local", "ic", "testing", "demo", "development", "prd"], + default="local", + help="Specify the network to use (default: local)", + ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers (default: 10)", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit number of mainers to check (for testing)", + ) + args = parser.parse_args() + main(args.network, args.workers, args.limit) diff --git a/scripts/check_mAIner_status.sh b/scripts/check_mAIner_status.sh new file mode 100755 index 00000000..adee5eaf --- /dev/null +++ b/scripts/check_mAIner_status.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Default network type is local +NETWORK_TYPE="local" +WORKERS="" +LIMIT="" + +# Parse command line arguments for network type +while [ $# -gt 0 ]; do + case "$1" in + --network) + shift + if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then + NETWORK_TYPE=$1 + else + echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'." 
+ exit 1 + fi + shift + ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + --limit) + shift + LIMIT="--limit $1" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--workers N] [--limit N]" + exit 1 + ;; + esac +done + +echo "Using network type: $NETWORK_TYPE" + +python -m scripts.check_mAIner_status --network $NETWORK_TYPE $WORKERS $LIMIT diff --git a/scripts/monitor_logs_active_mainers.py b/scripts/monitor_logs_active_mainers.py new file mode 100644 index 00000000..ba37b530 --- /dev/null +++ b/scripts/monitor_logs_active_mainers.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 + +import subprocess +import time +import argparse +import os +import json +from collections import defaultdict +from datetime import datetime + +from .get_mainers import get_mainers, get_mainer_is_active, get_mainer_setting +from .monitor_common import ensure_log_dir + +# Get the directory of this script +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +BURN_RATE_GROUPS = ["Low", "Medium", "High", "VeryHigh"] + +# Color codes for burn rate groups +BURN_RATE_COLORS = { + "Low": "\033[38;5;82m", # green + "Medium": "\033[38;5;226m", # yellow + "High": "\033[38;5;208m", # orange + "VeryHigh": "\033[38;5;196m", # red +} +RESET_COLOR = "\033[0m" + + +def get_logs(canister_id, network): + """Fetch logs using dfx for a given canister.""" + try: + output = subprocess.check_output( + ["dfx", "canister", "logs", canister_id, "--network", network], + stderr=subprocess.DEVNULL, + text=True + ) + return output.strip().splitlines() + except subprocess.CalledProcessError: + return [] + + +def filter_share_agents(mainers): + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + share_agents = [] + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else '' + if canister_type == "ShareAgent" and address != "": + share_agents.append(address) + return share_agents + + +def discover_active_mainers(network, limit=None): + """Fetch all ShareAgent mAIners and return only active ones grouped by burn rate.""" + print(f"Fetching all mAIners from GameState on network '{network}'...") + mainers = get_mainers(network) + if not mainers: + print("ERROR: No mainers found.") + return {} + + share_agents = filter_share_agents(mainers) + print(f"Found {len(share_agents)} ShareAgent mAIners.") + + if limit is not None and limit > 0: + share_agents = share_agents[:limit] + print(f"Limiting to first {limit} mAIners (for testing).") + + print(f"Checking active status and burn rate for {len(share_agents)} mAIners...") + grouped = {br: [] for br in BURN_RATE_GROUPS} + active_count = 0 + paused_count = 0 + skipped_count = 0 + + for idx, address in enumerate(share_agents, 1): + is_active = get_mainer_is_active(network, address) + if is_active is not True: + if is_active is False: + paused_count += 1 + else: + skipped_count += 1 + continue + + active_count += 1 + setting = get_mainer_setting(network, address) + if setting in grouped: + grouped[setting].append(address) + else: + # Custom/Unknown/Unable to query — skip + skipped_count += 1 + + if idx % 50 == 0: + print(f" Checked {idx}/{len(share_agents)} mAIners...") + + print(f"\nDiscovery complete:") + print(f" Active: {active_count}, Paused: {paused_count}, Skipped: {skipped_count}") + for br in BURN_RATE_GROUPS: + print(f" {br}: 
{len(grouped[br])} active mAIners") + + return grouped + + +def main(network, delay=3, limit=None): + # Step 1: Discover active mAIners grouped by burn rate + grouped = discover_active_mainers(network, limit) + + total_active = sum(len(ids) for ids in grouped.values()) + if total_active == 0: + print("\nNo active mAIners found. Nothing to monitor.") + return + + # Step 2: Set up log directories and files + log_dir = os.path.join(SCRIPT_DIR, f"logs-active-mainers-{network}") + ensure_log_dir(log_dir) + + log_files = {} + for br in BURN_RATE_GROUPS: + if grouped[br]: + log_path = os.path.join(log_dir, f"burn_rate_{br.lower()}.log") + # Clear at start + with open(log_path, "w"): + pass + log_files[br] = log_path + + combined_log_path = os.path.join(log_dir, "combined_active.log") + with open(combined_log_path, "w"): + pass + + # Write a manifest of which canisters are in each group + manifest_path = os.path.join(log_dir, "manifest.json") + manifest = { + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "network": network, + "total_active": total_active, + "groups": {br: grouped[br] for br in BURN_RATE_GROUPS}, + } + with open(manifest_path, 'w') as f: + json.dump(manifest, f, indent=2) + + print(f"\nMonitoring {total_active} active mAIners on '{network}'.") + print(f"Log directory: {os.path.abspath(log_dir)}") + print(f" Combined log: combined_active.log") + for br in BURN_RATE_GROUPS: + if grouped[br]: + print(f" {br} ({len(grouped[br])} mAIners): burn_rate_{br.lower()}.log") + print(f"\nChecking every {delay} seconds. Press Ctrl+C to stop.\n") + + # Step 3: Monitor loop + previous_logs = defaultdict(set) + first = True + + while True: + for br in BURN_RATE_GROUPS: + for canister_id in grouped[br]: + log_lines = get_logs(canister_id, network) + new_lines = [] + for line in log_lines: + if line not in previous_logs[canister_id]: + previous_logs[canister_id].add(line) + new_lines.append(line) + + if new_lines: + color = BURN_RATE_COLORS.get(br, "") + br_log_path = log_files.get(br) + + with open(combined_log_path, "a") as f_combined: + br_file = open(br_log_path, "a") if br_log_path else None + try: + for line in new_lines: + tagged_line = f"[{br}][{canister_id}] {line}" + f_combined.write(tagged_line + "\n") + if br_file: + br_file.write(f"[{canister_id}] {line}\n") + print(f"{color}[{br}]{RESET_COLOR}({canister_id}) {line}") + finally: + if br_file: + br_file.close() + + if first: + first = False + print(f"\nInitial log retrieval completed for {total_active} active mAIners.") + print(f"Will report changes in logs. Checking every {delay} seconds...") + + time.sleep(delay) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Monitor logs for active ShareAgent mAIners, grouped by burn rate." 
+    )
+    parser.add_argument(
+        "--network",
+        choices=["local", "ic", "testing", "demo", "development", "prd"],
+        default="local",
+        help="Specify the network to use (default: local)",
+    )
+    parser.add_argument(
+        "--delay",
+        type=int,
+        default=3,
+        help="Seconds between log checks (default: 3)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limit number of mainers to check (for testing)",
+    )
+    args = parser.parse_args()
+    main(args.network, args.delay, args.limit)
diff --git a/scripts/monitor_logs_active_mainers.sh b/scripts/monitor_logs_active_mainers.sh
new file mode 100755
index 00000000..bffb8824
--- /dev/null
+++ b/scripts/monitor_logs_active_mainers.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Default network type is local
+NETWORK_TYPE="local"
+DELAY=""
+LIMIT=""
+
+# Parse command line arguments
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --network)
+            shift
+            if [ "$1" = "local" ] || [ "$1" = "ic" ] || [ "$1" = "testing" ] || [ "$1" = "development" ] || [ "$1" = "demo" ] || [ "$1" = "prd" ]; then
+                NETWORK_TYPE=$1
+            else
+                echo "Invalid network type: $1. Use 'local' or 'ic' or 'testing' or 'development' or 'demo' or 'prd'."
+                exit 1
+            fi
+            shift
+            ;;
+        --delay)
+            shift
+            DELAY="--delay $1"
+            shift
+            ;;
+        --limit)
+            shift
+            LIMIT="--limit $1"
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            echo "Usage: $0 --network [local|ic|testing|development|demo|prd] [--delay N] [--limit N]"
+            exit 1
+            ;;
+    esac
+done
+
+echo "Using network type: $NETWORK_TYPE"
+
+python -m scripts.monitor_logs_active_mainers --network $NETWORK_TYPE $DELAY $LIMIT

From 3b67f4032520f93f5841e0d17e54d8d9806f15d7 Mon Sep 17 00:00:00 2001
From: icpp
Date: Fri, 17 Apr 2026 12:02:13 -0400
Subject: [PATCH 2/2] Add freeze prediction and parallelize timer management

- Added cycle depletion forecasting based on memory usage to identify canisters at risk of freezing (see the worked example below).
- Updated status reports to display drain rates, freeze thresholds, and estimated days remaining.
- Rewrote the timer startup script to support parallel execution, health checks, and dry-run mode.
- Improved report sorting to prioritize canisters near their freezing threshold.
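A worked example of the forecasting math (a hedged sketch; the constants mirror
the calculate_freeze_prediction helper below, the 416 MB figure is the one
quoted in its verification comment, and the 2T balance is illustrative):

    memory_bytes = 416_000_000            # ~416 MB of canister memory
    cycles_balance = 2_000_000_000_000    # 2T cycles on hand (assumed)

    daily_drain = int(memory_bytes * 317_500 / (1024 ** 3) * 86_400)
    # -> about 10.6B cycles/day, matching dfx's idle_cycles_burned_per_day

    freeze_threshold = daily_drain * 30   # 30-day default freezing threshold, ~0.32T
    days_until_freeze = round((cycles_balance - freeze_threshold) / daily_drain, 1)
    # -> about 158.2 days of runway before this canister would freeze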
--- scripts/analyze_active_mainer_logs.py | 14 +- scripts/check_mAIner_status.py | 118 ++++++++++++-- scripts/start_timers.py | 221 ++++++++++++++++++++++++-- scripts/start_timers.sh | 23 ++- 4 files changed, 348 insertions(+), 28 deletions(-) diff --git a/scripts/analyze_active_mainer_logs.py b/scripts/analyze_active_mainer_logs.py index 9a2f8f94..9f8e09ca 100644 --- a/scripts/analyze_active_mainer_logs.py +++ b/scripts/analyze_active_mainer_logs.py @@ -220,15 +220,16 @@ def write_markdown_report(analysis_results, network, md_path): verdict_width = 23 lines.append("## All Active mAIners") lines.append("") - lines.append(f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Burn Rate':<9} | {'Verdict':<{verdict_width}} | {'Triggers':<8} | {'Pulls':<5} | {'Interval':<8} | {'Last Activity':<25} |") - lines.append(f"|{'-'*6}|{'-'*(cid_width+2)}|{'-'*11}|{'-'*(verdict_width+2)}|{'-'*10}|{'-'*7}|{'-'*10}|{'-'*27}|") + lines.append(f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Burn Rate':<9} | {'Verdict':<{verdict_width}} | {'Triggers':<8} | {'Pulls':<5} | {'Interval':<8} | {'Days Left':<9} | {'Last Activity':<25} |") + lines.append(f"|{'-'*6}|{'-'*(cid_width+2)}|{'-'*11}|{'-'*(verdict_width+2)}|{'-'*10}|{'-'*7}|{'-'*10}|{'-'*11}|{'-'*27}|") - # Sort: problems first + # Sort: problems first, then by days_until_freeze ascending verdict_order = {"UNREACHABLE": 0, "NO_TIMERS": 1, "STUCK_AT_CHALLENGE_PULL": 2, "TIMER_ONLY": 3, "UNKNOWN": 4, "OK": 5} - sorted_results = sorted(analysis_results, key=lambda r: verdict_order.get(r["verdict"], 99)) + sorted_results = sorted(analysis_results, key=lambda r: (verdict_order.get(r["verdict"], 99), r.get("days_until_freeze") if r.get("days_until_freeze") is not None else 9999)) for idx, r in enumerate(sorted_results, 1): interval = f"{r['timer_interval_hours']}h" if r["timer_interval_hours"] else "" + days_left = f"{r['days_until_freeze']:.1f}" if r.get("days_until_freeze") is not None else "" lines.append( f"| {idx:<4} " f"| {r['canister_id']:<{cid_width}} " @@ -237,6 +238,7 @@ def write_markdown_report(analysis_results, network, md_path): f"| {r['recurring_triggers']:<8} " f"| {r['pull_challenge_count']:<5} " f"| {interval:<8} " + f"| {days_left:>9} " f"| {r['last_activity']:<25} |" ) lines.append("") @@ -281,13 +283,14 @@ def main(network, workers=10): print(f"Loaded status data from {status_data['timestamp']} ({status_data['summary']['total_share_agents']} total mAIners)") - # Step 2: Extract active mAIners with burn rates + # Step 2: Extract active mAIners with burn rates and freeze prediction active_mainers = [] for m in status_data["mainers"]: if m.get("active") is True and m.get("burn_rate", ""): active_mainers.append({ "canister_id": m["canister_id"], "burn_rate": m["burn_rate"], + "days_until_freeze": m.get("days_until_freeze"), }) if not active_mainers: @@ -325,6 +328,7 @@ def main(network, workers=10): cid = m["canister_id"] result = analyze_canister_logs(cid, logs_map.get(cid)) result["burn_rate"] = m["burn_rate"] + result["days_until_freeze"] = m.get("days_until_freeze") analysis_results.append(result) # Step 5: Print summary diff --git a/scripts/check_mAIner_status.py b/scripts/check_mAIner_status.py index 2cc4afe2..a44bca70 100644 --- a/scripts/check_mAIner_status.py +++ b/scripts/check_mAIner_status.py @@ -147,6 +147,27 @@ def check_health(network: str, canister_id: str) -> tuple[str, Optional[str]]: return (STATUS_UNAVAILABLE, "Max retries exceeded") +def calculate_freeze_prediction(memory_bytes, cycles_balance): + """Calculate freeze prediction 
based on memory idle cost. + + Returns: + (daily_drain, freeze_threshold, days_until_freeze) or (None, None, None) + """ + if memory_bytes is None or cycles_balance is None or memory_bytes == 0: + return (None, None, None) + # 317,500 cycles per GiB per second on 13-node subnets + # (was 127,000, increased 2.5x by NNS Proposal 140538 / Mission70) + # Verified: dfx canister status shows idle_cycles_burned_per_day = 10.6B for 416MB, + # which matches 317,500 rate (127,000 rate would give only 4.3B) + daily_drain = int(memory_bytes * 317_500 / (1024 ** 3) * 86_400) + freeze_threshold = daily_drain * 30 # 30 days = default freezing threshold + if daily_drain > 0: + days_until_freeze = round((cycles_balance - freeze_threshold) / daily_drain, 1) + else: + days_until_freeze = None + return (daily_drain, freeze_threshold, days_until_freeze) + + def get_canister_resources(network: str, canister_id: str) -> tuple[Optional[int], Optional[int]]: """Get memory size and cycle balance via dfx canister status. @@ -201,6 +222,9 @@ def check_canister_status(network: str, canister_id: str, owner: str, "burn_rate": "", "memory_bytes": None, "cycles_balance": None, + "daily_drain": None, + "freeze_threshold": None, + "days_until_freeze": None, } # Step 1: Check module hash via dfx canister info @@ -248,15 +272,22 @@ def check_canister_status(network: str, canister_id: str, owner: str, result["memory_bytes"] = memory result["cycles_balance"] = balance + # Step 5: Calculate freeze prediction + daily_drain, freeze_thresh, days_left = calculate_freeze_prediction(memory, balance) + result["daily_drain"] = daily_drain + result["freeze_threshold"] = freeze_thresh + result["days_until_freeze"] = days_left + mem_mb = f"{memory / 1_000_000:.1f}MB" if memory else "?" bal_t = f"{balance / 1_000_000_000_000:.2f}T" if balance else "?" + days_str = f"{days_left}d" if days_left is not None else "?" 
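+        # days_str carries the freeze forecast into the status lines below;
+        # "?" means no prediction (memory or cycles balance was unavailable).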
if is_active: - log_message(f"{canister_id} — {STATUS_HEALTHY} (Active, {result['burn_rate']}, {mem_mb}, {bal_t})", "SUCCESS", index, total) + log_message(f"{canister_id} — {STATUS_HEALTHY} (Active, {result['burn_rate']}, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) elif is_active is False: - log_message(f"{canister_id} — {STATUS_HEALTHY} (Paused, {mem_mb}, {bal_t})", "SUCCESS", index, total) + log_message(f"{canister_id} — {STATUS_HEALTHY} (Paused, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) else: - log_message(f"{canister_id} — {STATUS_HEALTHY} (unknown, {mem_mb}, {bal_t})", "SUCCESS", index, total) + log_message(f"{canister_id} — {STATUS_HEALTHY} (unknown, {mem_mb}, {bal_t}, {days_str})", "SUCCESS", index, total) return result @@ -337,6 +368,15 @@ def write_markdown_report(results: dict, md_path: str) -> None: lines.append(f"| Min Cycles Balance | {min_bal:>8.2f} T |") lines.append(f"| Max Cycles Balance | {max_bal:>8.2f} T |") lines.append("") + # At-risk section + lines.append("### Freeze Risk (Healthy mAIners)") + lines.append("") + lines.append("| Risk Window | Count |") + lines.append("|----------------------|-------|") + lines.append(f"| Freeze in < 7 days | {summary.get('at_risk_7_days', 0):>5} |") + lines.append(f"| Freeze in < 14 days | {summary.get('at_risk_14_days', 0):>5} |") + lines.append(f"| Freeze in < 30 days | {summary.get('at_risk_30_days', 0):>5} |") + lines.append("") # Determine the majority module hash hash_counts = {} @@ -359,15 +399,19 @@ def module_hash_status(m): status_width = max(len(STATUS_UNINSTALLED), 6) # widest status label hash_col_width = 11 # "Module Hash" header width - active_col_width = 7 # "Active" + padding - burn_col_width = 9 # "Burn Rate" header - mem_col_width = 10 # "Memory MB" header - cyc_col_width = 10 # "Cycles (T)" header + active_col_width = 6 + burn_col_width = 9 + mem_col_width = 9 + cyc_col_width = 10 + drain_col_width = 10 + freeze_col_width = 10 + days_col_width = 9 + hash_col_width = 11 def table_header(): return [ - f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Status':<{status_width}} | {'Active':<{active_col_width}} | {'Burn Rate':<{burn_col_width}} | {'Memory MB':<{mem_col_width}} | {'Cycles (T)':<{cyc_col_width}} | {'Module Hash':<{hash_col_width}} |", - f"|{'-' * 6}|{'-' * (cid_width + 2)}|{'-' * (status_width + 2)}|{'-' * (active_col_width + 2)}|{'-' * (burn_col_width + 2)}|{'-' * (mem_col_width + 2)}|{'-' * (cyc_col_width + 2)}|{'-' * (hash_col_width + 2)}|", + f"| {'#':<4} | {'Canister ID':<{cid_width}} | {'Status':<{status_width}} | {'Active':<{active_col_width}} | {'Burn Rate':<{burn_col_width}} | {'Mem (MB)':<{mem_col_width}} | {'Cycles (T)':<{cyc_col_width}} | {'Drain/D(B)':<{drain_col_width}} | {'FreezeAt(T)':<{freeze_col_width}} | {'Days Left':<{days_col_width}} | {'Hash':<{hash_col_width}} |", + f"|{'-' * 6}|{'-' * (cid_width + 2)}|{'-' * (status_width + 2)}|{'-' * (active_col_width + 2)}|{'-' * (burn_col_width + 2)}|{'-' * (mem_col_width + 2)}|{'-' * (cyc_col_width + 2)}|{'-' * (drain_col_width + 2)}|{'-' * (freeze_col_width + 2)}|{'-' * (days_col_width + 2)}|{'-' * (hash_col_width + 2)}|", ] def active_display(m): @@ -390,8 +434,38 @@ def cycles_display(m): return f"{bal / 1_000_000_000_000:.2f}" return "" + def drain_display(m): + d = m.get("daily_drain") + if d is not None: + return f"{d / 1_000_000_000:.1f}" + return "" + + def freeze_at_display(m): + ft = m.get("freeze_threshold") + if ft is not None: + return f"{ft / 1_000_000_000_000:.2f}" + return "" + + def days_display(m): 
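+        """Format days-until-freeze to one decimal place ("" when unknown)."""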
+ d = m.get("days_until_freeze") + if d is not None: + return f"{d:.1f}" + return "" + def table_row(idx, m): - return f"| {idx:<4} | {m['canister_id']:<{cid_width}} | {m['status']:<{status_width}} | {active_display(m):<{active_col_width}} | {m.get('burn_rate', ''):<{burn_col_width}} | {mem_display(m):>{mem_col_width}} | {cycles_display(m):>{cyc_col_width}} | {module_hash_status(m):<{hash_col_width}} |" + return ( + f"| {idx:<4} " + f"| {m['canister_id']:<{cid_width}} " + f"| {m['status']:<{status_width}} " + f"| {active_display(m):<{active_col_width}} " + f"| {m.get('burn_rate', ''):<{burn_col_width}} " + f"| {mem_display(m):>{mem_col_width}} " + f"| {cycles_display(m):>{cyc_col_width}} " + f"| {drain_display(m):>{drain_col_width}} " + f"| {freeze_at_display(m):>{freeze_col_width}} " + f"| {days_display(m):>{days_col_width}} " + f"| {module_hash_status(m):<{hash_col_width}} |" + ) # Sort: non-healthy first (Frozen, Uninstalled, Stopped, Maintenance, Unavailable, Healthy) status_order = { @@ -492,6 +566,9 @@ def main(network, workers=10, limit=None): "burn_rate": "", "memory_bytes": None, "cycles_balance": None, + "daily_drain": None, + "freeze_threshold": None, + "days_until_freeze": None, }) # Step 5: Aggregate results @@ -507,6 +584,10 @@ def main(network, workers=10, limit=None): paused_count = 0 burn_rate_counts = {"Low": 0, "Medium": 0, "High": 0, "VeryHigh": 0, "Other": 0} + at_risk_7 = 0 + at_risk_14 = 0 + at_risk_30 = 0 + for r in all_results: counts[r["status"]] = counts.get(r["status"], 0) + 1 if r["status"] == STATUS_HEALTHY: @@ -519,6 +600,15 @@ def main(network, workers=10, limit=None): burn_rate_counts["Other"] += 1 elif r.get("active") is False: paused_count += 1 + # At-risk tracking + d = r.get("days_until_freeze") + if d is not None: + if d < 7: + at_risk_7 += 1 + if d < 14: + at_risk_14 += 1 + if d < 30: + at_risk_30 += 1 # Step 6: Print summary log_message("") @@ -542,6 +632,11 @@ def main(network, workers=10, limit=None): if burn_rate_counts[br_name] > 0: log_message(f" {br_name:<10}: {burn_rate_counts[br_name]}") log_message("") + log_message(f"Freeze risk (healthy mAIners):") + log_message(f" Freeze in < 7 days : {at_risk_7}", "ERROR" if at_risk_7 > 0 else "INFO") + log_message(f" Freeze in < 14 days: {at_risk_14}", "ERROR" if at_risk_14 > 0 else "INFO") + log_message(f" Freeze in < 30 days: {at_risk_30}", "ERROR" if at_risk_30 > 0 else "INFO") + log_message("") # Print non-healthy canisters non_healthy = [r for r in all_results if r["status"] != STATUS_HEALTHY] @@ -580,6 +675,9 @@ def main(network, workers=10, limit=None): "frozen": counts[STATUS_FROZEN], "uninstalled": counts[STATUS_UNINSTALLED], "unavailable": counts[STATUS_UNAVAILABLE], + "at_risk_7_days": at_risk_7, + "at_risk_14_days": at_risk_14, + "at_risk_30_days": at_risk_30, }, "mainers": sorted(all_results, key=lambda r: r["canister_id"]), } diff --git a/scripts/start_timers.py b/scripts/start_timers.py index c1b499cc..ef38c183 100644 --- a/scripts/start_timers.py +++ b/scripts/start_timers.py @@ -5,16 +5,25 @@ import argparse import os from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed from dotenv import dotenv_values from .monitor_common import get_canisters, ensure_log_dir +from .get_mainers import get_mainers +from .check_mAIner_status import ( + is_frozen_error, + is_transient_error, + collect_error_text, + HEALTH_OK_PATTERNS, +) # Get the directory of this script SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + def start_timer(canister_id, 
network): """Start timer using dfx for a given canister.""" - try: + try: print(f"Starting timer for canister {canister_id} on network {network}...") subprocess.run( ["dfx", "canister", "--network", network, "call", canister_id, "startTimerExecutionAdmin"], @@ -24,15 +33,190 @@ def start_timer(canister_id, network): except subprocess.CalledProcessError: print(f"ERROR: Unable to start timer for canister {canister_id} on network {network}") -def main(network, canister_types): - (CANISTERS, CANISTER_COLORS, RESET_COLOR) = get_canisters(network, canister_types) - for name, canister_id in CANISTERS.items(): - if ("GAMESTATE" in name or "CREATOR" in name or "LLM" in name): - continue - print("-------------------------------") - print(f"Canister {name} ({canister_id})") - start_timer(canister_id, network) +def check_health_and_start_timer(canister_id, network, index, total): + """Check if a mAIner is responsive, then start its timer. + + Returns dict with canister_id, status ('started', 'frozen', 'unhealthy', 'error'). + """ + # Step 1: Check health + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "health"] + max_retries = 3 + retry_delay = 5.0 + + for attempt in range(1, max_retries + 1): + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + output = result.stdout.strip() + if any(pattern in output for pattern in HEALTH_OK_PATTERNS): + break # Healthy — proceed to start timer + else: + print(f" ({index}/{total}) {canister_id} — unhealthy, skipping") + return {"canister_id": canister_id, "status": "unhealthy"} + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + if is_frozen_error(error_text): + print(f" ({index}/{total}) {canister_id} — FROZEN, skipping") + return {"canister_id": canister_id, "status": "frozen"} + if attempt < max_retries and is_transient_error(error_text): + time.sleep(retry_delay * (2 ** (attempt - 1))) + continue + print(f" ({index}/{total}) {canister_id} — error: {error_text[:100]}, skipping") + return {"canister_id": canister_id, "status": "error"} + else: + print(f" ({index}/{total}) {canister_id} — max retries exceeded, skipping") + return {"canister_id": canister_id, "status": "error"} + + # Step 2: Start timer + cmd = ["dfx", "canister", "--network", network, "call", canister_id, "startTimerExecutionAdmin"] + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=True) + print(f" ({index}/{total}) {canister_id} — timer started") + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + error_text = collect_error_text(e) + print(f" ({index}/{total}) {canister_id} — failed to start timer: {error_text[:100]}") + return {"canister_id": canister_id, "status": "error", "logs": []} + + # Step 3: Capture logs after starting timer + cmd = ["dfx", "canister", "logs", canister_id, "--network", network] + try: + output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=30) + log_lines = output.strip().splitlines()[-20:] # last 20 lines + except (subprocess.CalledProcessError, subprocess.TimeoutExpired): + log_lines = ["(could not fetch logs)"] + + return {"canister_id": canister_id, "status": "started", "logs": log_lines} + + +def filter_share_agents(mainers): + """Filter mAIner list to only ShareAgent type with non-empty addresses.""" + share_agents = [] + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', 
{}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else '' + if canister_type == "ShareAgent" and address != "": + share_agents.append(address) + return share_agents + + +def main(network, canister_types, workers=10, limit=None, dry_run=False): + if dry_run: + print("*** DRY RUN — no timers will be started ***\n") + + # Protocol canisters — use existing .env-based flow + if canister_types == "protocol": + (CANISTERS, CANISTER_COLORS, RESET_COLOR) = get_canisters(network, canister_types) + for name, canister_id in CANISTERS.items(): + if ("GAMESTATE" in name or "CREATOR" in name or "LLM" in name): + continue + print("-------------------------------") + print(f"Canister {name} ({canister_id})") + if dry_run: + print(f" DRY RUN: would start timer for {canister_id}") + else: + start_timer(canister_id, network) + return + + # Mainers — fetch from GameState and check health before starting + print(f"Fetching mAIners from GameState on network '{network}'...") + mainers = get_mainers(network) + if not mainers: + print(f"ERROR: No mainers found on network '{network}'") + return + + # Show canister type breakdown for verification + type_counts = defaultdict(int) + for mainer in mainers: + address = mainer.get('address', '') + canister_type_dict = mainer.get('canisterType', {}).get("MainerAgent", {}) + canister_type = list(canister_type_dict.keys())[0] if canister_type_dict else 'Unknown' + if address: + type_counts[canister_type] += 1 + print(f"Canister type breakdown:") + for ct, count in sorted(type_counts.items()): + selected = " <-- SELECTED" if ct == "ShareAgent" else " (skipped)" + print(f" {ct}: {count}{selected}") + + share_agents = filter_share_agents(mainers) + print(f"\nFiltered to {len(share_agents)} ShareAgent mAIners.") + + if limit is not None and limit > 0: + share_agents = share_agents[:limit] + print(f"Limiting to first {limit} mAIners (for testing).") + + if dry_run: + print(f"\nDRY RUN: Would check health and start timers for {len(share_agents)} ShareAgent mAIners:") + for idx, cid in enumerate(share_agents, 1): + print(f" {idx}. {cid}") + print(f"\nDRY RUN complete. 
Re-run without --dry-run to execute.") + return + + total = len(share_agents) + print(f"\nStarting timers for {total} mAIners (checking health first)...") + print(f"Using {workers} parallel workers.\n") + + # Process in parallel + results = [] + with ThreadPoolExecutor(max_workers=workers) as executor: + future_to_id = { + executor.submit( + check_health_and_start_timer, cid, network, idx + 1, total + ): cid + for idx, cid in enumerate(share_agents) + } + for future in as_completed(future_to_id): + try: + results.append(future.result()) + except Exception as e: + cid = future_to_id[future] + print(f" {cid} — exception: {e}") + results.append({"canister_id": cid, "status": "error", "logs": []}) + + # Summary + counts = defaultdict(int) + for r in results: + counts[r["status"]] += 1 + + print(f"\n{'='*60}") + print("SUMMARY") + print(f"{'='*60}") + print(f" Started : {counts['started']}") + print(f" Frozen : {counts['frozen']} (skipped)") + print(f" Unhealthy: {counts['unhealthy']} (skipped)") + print(f" Errors : {counts['error']} (skipped)") + print(f" Total : {total}") + print() + + # Write log files + from datetime import datetime + date_prefix = datetime.now().strftime("%Y-%m-%d") + log_dir = os.path.join(SCRIPT_DIR, f"logs-start-timers-{network}") + ensure_log_dir(log_dir) + + combined_path = os.path.join(log_dir, f"{date_prefix}-start_timers-{network}.log") + with open(combined_path, "w") as f_combined: + for r in sorted(results, key=lambda x: x["canister_id"]): + cid = r["canister_id"] + logs = r.get("logs", []) + + # Individual log file + individual_path = os.path.join(log_dir, f"{cid}.log") + with open(individual_path, "w") as f_ind: + f_ind.write(f"# Status: {r['status']}\n") + for line in logs: + f_ind.write(line + "\n") + + # Combined log + f_combined.write(f"=== {cid} ({r['status']}) ===\n") + for line in logs: + f_combined.write(f" {line}\n") + f_combined.write("\n") + + print(f"Logs saved to: {os.path.abspath(log_dir)}") + print(f" Combined: {os.path.abspath(combined_path)}") + print(f" Per-canister: {len(results)} individual .log files") + print() if __name__ == "__main__": @@ -49,5 +233,22 @@ def main(network, canister_types): default="protocol", help="Specify the canister type (default: protocol)", ) + parser.add_argument( + "--workers", + type=int, + default=10, + help="Number of parallel workers (default: 10)", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit number of mainers to process (for testing)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be done without actually starting timers", + ) args = parser.parse_args() - main(args.network, args.canister_types) + main(args.network, args.canister_types, args.workers, args.limit, args.dry_run) diff --git a/scripts/start_timers.sh b/scripts/start_timers.sh index 1f23eb4a..8c5dc610 100755 --- a/scripts/start_timers.sh +++ b/scripts/start_timers.sh @@ -3,6 +3,9 @@ # Default network type is local NETWORK_TYPE="local" CANISTER_TYPES="protocol" +WORKERS="" +LIMIT="" +DRY_RUN="" # Parse command line arguments for network type while [ $# -gt 0 ]; do @@ -22,14 +25,28 @@ while [ $# -gt 0 ]; do if [ "$1" = "all" ] || [ "$1" = "protocol" ] || [ "$1" = "mainers" ]; then CANISTER_TYPES=$1 else - echo "Invalid network type: $1. Use 'all' or 'protocol' or 'mainers'." + echo "Invalid canister type: $1. Use 'all' or 'protocol' or 'mainers'." 
exit 1 fi shift ;; + --workers) + shift + WORKERS="--workers $1" + shift + ;; + --limit) + shift + LIMIT="--limit $1" + shift + ;; + --dry-run) + DRY_RUN="--dry-run" + shift + ;; *) echo "Unknown argument: $1" - echo "Usage: $0 --network [local|ic|testing|development|demo|prd]" + echo "Usage: $0 --network [local|ic|testing|development|demo|prd] --canister-types [all|protocol|mainers] [--workers N] [--limit N] [--dry-run]" exit 1 ;; esac @@ -37,4 +54,4 @@ done echo "Using network type: $NETWORK_TYPE" -python -m scripts.start_timers --network $NETWORK_TYPE --canister-types $CANISTER_TYPES +python -m scripts.start_timers --network $NETWORK_TYPE --canister-types $CANISTER_TYPES $WORKERS $LIMIT $DRY_RUN
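
A minimal sketch of one way to consume the new freeze fields in the JSON report
that check_mAIner_status.py writes (field names come from its output_data dict
above; the path is illustrative, following the
{date}-check_mAIner_status-{network}.json naming):

    #!/usr/bin/env python3
    # Flag healthy mAIners predicted to hit their freezing threshold within a week.
    import json

    # Illustrative path; the script writes <date>-check_mAIner_status-<network>.json
    path = "scripts/logs-mainer-analysis/2026-04-17-check_mAIner_status-ic.json"
    with open(path) as f:
        report = json.load(f)

    print("At risk (< 7 days):", report["summary"]["at_risk_7_days"])
    for m in report["mainers"]:
        days = m.get("days_until_freeze")
        if days is not None and days < 7:
            balance_t = m["cycles_balance"] / 1_000_000_000_000
            print(f"  {m['canister_id']}: {days}d left, {balance_t:.2f}T cycles")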