Skip to content
Open
18 changes: 16 additions & 2 deletions src/madengine/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,28 @@
import os
import json
import logging
import sys


# Utility function for optional verbose logging of configuration
def _log_config_info(message: str, force_print: bool = False) -> None:
    """Log configuration information either to the logger or stdout.

    Args:
        message: The text to emit.
        force_print: When True, always print to stdout regardless of the
            ``MAD_VERBOSE_CONFIG`` environment variable.
    """
    # Keep --version/--help output clean even if MAD_VERBOSE_CONFIG=true.
    # Reuse the shared helper so the lightweight-flag set is defined in one place.
    if _is_lightweight_cli_invocation():
        logging.debug(message)
        return
    if force_print or os.environ.get("MAD_VERBOSE_CONFIG", "").lower() == "true":
        print(message)
    else:
        logging.debug(message)


def _is_lightweight_cli_invocation() -> bool:
"""Return True for metadata/help invocations that should avoid side effects."""
lightweight_flags = {"--version", "-V", "--help", "-h"}
return any(arg in lightweight_flags for arg in sys.argv[1:])


# third-party modules
from madengine.core.console import Console

Expand Down Expand Up @@ -65,9 +76,12 @@ def _setup_model_dir():
_log_config_info(f"Model dir: {MODEL_DIR} copied to current dir: {cwd_abs}")


# Only setup model directory if explicitly requested and invocation is not
# metadata-only. Importing this module for constants alone must stay side-effect
# free; the copy only runs when MAD_SETUP_MODEL_DIR=true AND the CLI call is not
# a --version/--help probe.
if os.environ.get("MAD_SETUP_MODEL_DIR", "").lower() == "true":
    if _is_lightweight_cli_invocation():
        _log_config_info("Skipping MODEL_DIR setup for lightweight CLI invocation (--version/--help).")
    else:
        _setup_model_dir()

# madengine credentials configuration
CRED_FILE = "credential.json"
Expand Down
85 changes: 77 additions & 8 deletions src/madengine/deployment/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1270,18 +1270,24 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
model_key, {}
) if model_key else {}

# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory
# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory.
# In multi-node runs, different nodes may produce the CSV with different levels
# of completeness (e.g. only one node observes the final throughput numbers and
# populates the "performance" column). Prefer the candidate with the most
# non-empty "performance" rows so aggregation does not silently pick an empty one.
mult_res = model_info_for_entry.get("multiple_results")
if mult_res:
resolved_csv: Optional[Path] = None
candidates: List[Path] = []
if (job_dir / mult_res).is_file():
resolved_csv = job_dir / mult_res
else:
for i in range(self.nodes):
candidate = job_dir / f"node_{i}" / mult_res
if candidate.is_file():
resolved_csv = candidate
break
candidates.append(job_dir / mult_res)
for i in range(self.nodes):
per_node_candidate = job_dir / f"node_{i}" / mult_res
if per_node_candidate.is_file():
candidates.append(per_node_candidate)

if candidates:
resolved_csv = self._select_best_multiple_results_csv(candidates)
if not resolved_csv and Path(mult_res).is_file():
resolved_csv = Path(mult_res)
if not resolved_csv and Path("run_directory", mult_res).is_file():
Expand Down Expand Up @@ -1492,6 +1498,69 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
)
return results

def _select_best_multiple_results_csv(
self, candidates: List[Path]
) -> Optional[Path]:
"""Pick the CSV with the most non-empty ``performance`` entries.

In multi-node SLURM runs, every node copies its local copy of the
workload's multi-results CSV into ``job_dir/node_<rank>/``. Only
some nodes will observe the final throughput numbers and therefore
populate the ``performance`` column; others may have the file but
with empty values. Ranking candidates by the number of non-empty
``performance`` rows lets downstream aggregation use the richest
available data without depending on node-0 winning every race.

Falls back to the first candidate when none has a ``performance``
column or when counting fails, preserving previous behavior.
"""
if not candidates:
return None
if len(candidates) == 1:
return candidates[0]

import csv as _csv

best_candidate: Optional[Path] = None
best_score = -1
best_rows = -1
for candidate in candidates:
non_empty_perf = 0
total_rows = 0
has_perf_column = False
try:
with open(candidate, "r", encoding="utf-8", errors="ignore") as f:
reader = _csv.DictReader(f)
fieldnames = reader.fieldnames or []
stripped_fields = [fn.strip() for fn in fieldnames]
has_perf_column = "performance" in stripped_fields
for row in reader:
total_rows += 1
if has_perf_column:
value = (row.get("performance") or "").strip()
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_select_best_multiple_results_csv() strips whitespace from fieldnames to detect the presence of a "performance" column, but then reads values with row.get("performance"). If the actual CSV header is " performance" (or otherwise differs), this will incorrectly treat all rows as empty. Consider normalizing row keys (similar to the later {k.strip(): v for k,v in row.items()} logic in collect_results) or tracking the exact header name that matched after stripping.

Suggested change
value = (row.get("performance") or "").strip()
normalized_row = {
(k.strip() if isinstance(k, str) else k): v
for k, v in row.items()
}
value = (normalized_row.get("performance") or "").strip()

Copilot uses AI. Check for mistakes.
if value:
non_empty_perf += 1
except Exception:
continue

score = non_empty_perf if has_perf_column else 0
if (
score > best_score
or (score == best_score and total_rows > best_rows)
):
best_score = score
best_rows = total_rows
best_candidate = candidate

if best_candidate is None:
return candidates[0]
if best_score > 0:
self.console.print(
f"[dim] Selected multiple_results CSV with {best_score} "
f"non-empty performance rows: {best_candidate}[/dim]"
)
return best_candidate

def _collect_results_parse_perf_csv(
self, results: Dict[str, Any], session_start_row: Optional[int]
) -> None:
Expand Down
18 changes: 14 additions & 4 deletions src/madengine/deployment/templates/slurm/job.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,16 @@ fi
echo ""
echo "Verifying madengine availability..."
if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; unset it for preflight probes only.
MAD_CLI_VERSION=$(env -u MODEL_DIR madengine --version 2>&1 | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo " ✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
if env -u MODEL_DIR madengine --help >/dev/null 2>&1; then
export MAD_CLI_COMMAND="madengine"
else
echo " ❌ ERROR: madengine found but not functional!"
Expand Down Expand Up @@ -488,15 +489,24 @@ trap 'ec=$?; echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)):
echo "Verifying madengine availability..."

if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; isolate it for preflight probes.
set +e
MAD_VERSION_RAW_SANITIZED=$(env -u MODEL_DIR madengine --version 2>&1)
set -e
MAD_CLI_VERSION=$(printf "%s" "$MAD_VERSION_RAW_SANITIZED" | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo "✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
set +e
MAD_HELP_RAW_SANITIZED=$(env -u MODEL_DIR madengine --help 2>&1)
MAD_HELP_EXIT_SANITIZED=$?
set -e

if [ "${MAD_HELP_EXIT_SANITIZED}" -eq 0 ]; then
echo " ✓ Verified: madengine is functional"
MAD_CLI_COMMAND="madengine"
else
Expand Down
Loading