Skip to content
Open
18 changes: 16 additions & 2 deletions src/madengine/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,28 @@
import os
import json
import logging
import sys


# Utility function for optional verbose logging of configuration
def _log_config_info(message: str, force_print: bool = False) -> None:
    """Log configuration information either to the logger or stdout.

    Args:
        message: The text to emit.
        force_print: When True, always print to stdout regardless of the
            ``MAD_VERBOSE_CONFIG`` environment variable.
    """
    # Keep --version/--help output clean even if MAD_VERBOSE_CONFIG=true.
    # Reuse the shared helper so the lightweight-flag set is defined in one place.
    if _is_lightweight_cli_invocation():
        logging.debug(message)
        return
    if force_print or os.environ.get("MAD_VERBOSE_CONFIG", "").lower() == "true":
        print(message)
    else:
        logging.debug(message)


def _is_lightweight_cli_invocation() -> bool:
"""Return True for metadata/help invocations that should avoid side effects."""
lightweight_flags = {"--version", "-V", "--help", "-h"}
return any(arg in lightweight_flags for arg in sys.argv[1:])


# third-party modules
from madengine.core.console import Console

Expand Down Expand Up @@ -65,9 +76,12 @@ def _setup_model_dir():
_log_config_info(f"Model dir: {MODEL_DIR} copied to current dir: {cwd_abs}")


# Only setup model directory if explicitly requested and invocation is not
# metadata-only. Importing this module for constants alone must stay side-effect
# free; the copy only runs when MAD_SETUP_MODEL_DIR=true AND the CLI call is not
# a --version/--help probe.
if os.environ.get("MAD_SETUP_MODEL_DIR", "").lower() == "true":
    if _is_lightweight_cli_invocation():
        _log_config_info("Skipping MODEL_DIR setup for lightweight CLI invocation (--version/--help).")
    else:
        _setup_model_dir()

# madengine credentials configuration
CRED_FILE = "credential.json"
Expand Down
85 changes: 77 additions & 8 deletions src/madengine/deployment/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1270,18 +1270,24 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
model_key, {}
) if model_key else {}

# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory
# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory.
# In multi-node runs, different nodes may produce the CSV with different levels
# of completeness (e.g. only one node observes the final throughput numbers and
# populates the "performance" column). Prefer the candidate with the most
# non-empty "performance" rows so aggregation does not silently pick an empty one.
mult_res = model_info_for_entry.get("multiple_results")
if mult_res:
resolved_csv: Optional[Path] = None
candidates: List[Path] = []
if (job_dir / mult_res).is_file():
resolved_csv = job_dir / mult_res
else:
for i in range(self.nodes):
candidate = job_dir / f"node_{i}" / mult_res
if candidate.is_file():
resolved_csv = candidate
break
candidates.append(job_dir / mult_res)
for i in range(self.nodes):
per_node_candidate = job_dir / f"node_{i}" / mult_res
if per_node_candidate.is_file():
candidates.append(per_node_candidate)

if candidates:
resolved_csv = self._select_best_multiple_results_csv(candidates)
if not resolved_csv and Path(mult_res).is_file():
resolved_csv = Path(mult_res)
if not resolved_csv and Path("run_directory", mult_res).is_file():
Expand Down Expand Up @@ -1492,6 +1498,69 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
)
return results

def _select_best_multiple_results_csv(
self, candidates: List[Path]
) -> Optional[Path]:
"""Pick the CSV with the most non-empty ``performance`` entries.

In multi-node SLURM runs, every node copies its local copy of the
workload's multi-results CSV into ``job_dir/node_<rank>/``. Only
some nodes will observe the final throughput numbers and therefore
populate the ``performance`` column; others may have the file but
with empty values. Ranking candidates by the number of non-empty
``performance`` rows lets downstream aggregation use the richest
available data without depending on node-0 winning every race.

Falls back to the first candidate when none has a ``performance``
column or when counting fails, preserving previous behavior.
"""
if not candidates:
return None
if len(candidates) == 1:
return candidates[0]

import csv as _csv

best_candidate: Optional[Path] = None
best_score = -1
best_rows = -1
for candidate in candidates:
non_empty_perf = 0
total_rows = 0
has_perf_column = False
try:
with open(candidate, "r", encoding="utf-8", errors="ignore") as f:
reader = _csv.DictReader(f)
fieldnames = reader.fieldnames or []
stripped_fields = [fn.strip() for fn in fieldnames]
has_perf_column = "performance" in stripped_fields
for row in reader:
total_rows += 1
if has_perf_column:
value = (row.get("performance") or "").strip()
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_select_best_multiple_results_csv() strips whitespace from fieldnames to detect the presence of a "performance" column, but then reads values with row.get("performance"). If the actual CSV header is " performance" (or otherwise differs), this will incorrectly treat all rows as empty. Consider normalizing row keys (similar to the later {k.strip(): v for k,v in row.items()} logic in collect_results) or tracking the exact header name that matched after stripping.

Suggested change
value = (row.get("performance") or "").strip()
normalized_row = {
(k.strip() if isinstance(k, str) else k): v
for k, v in row.items()
}
value = (normalized_row.get("performance") or "").strip()

Copilot uses AI. Check for mistakes.
if value:
non_empty_perf += 1
except Exception:
continue

score = non_empty_perf if has_perf_column else 0
if (
score > best_score
or (score == best_score and total_rows > best_rows)
):
best_score = score
best_rows = total_rows
best_candidate = candidate

if best_candidate is None:
return candidates[0]
if best_score > 0:
self.console.print(
f"[dim] Selected multiple_results CSV with {best_score} "
f"non-empty performance rows: {best_candidate}[/dim]"
)
return best_candidate

def _collect_results_parse_perf_csv(
self, results: Dict[str, Any], session_start_row: Optional[int]
) -> None:
Expand Down
18 changes: 14 additions & 4 deletions src/madengine/deployment/templates/slurm/job.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,16 @@ fi
echo ""
echo "Verifying madengine availability..."
if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; unset it for preflight probes only.
MAD_CLI_VERSION=$(env -u MODEL_DIR madengine --version 2>&1 | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo " ✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
if env -u MODEL_DIR madengine --help >/dev/null 2>&1; then
export MAD_CLI_COMMAND="madengine"
else
echo " ❌ ERROR: madengine found but not functional!"
Expand Down Expand Up @@ -488,15 +489,24 @@ trap 'ec=$?; echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)):
echo "Verifying madengine availability..."

if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; isolate it for preflight probes.
set +e
MAD_VERSION_RAW_SANITIZED=$(env -u MODEL_DIR madengine --version 2>&1)
set -e
MAD_CLI_VERSION=$(printf "%s" "$MAD_VERSION_RAW_SANITIZED" | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo "✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
set +e
MAD_HELP_RAW_SANITIZED=$(env -u MODEL_DIR madengine --help 2>&1)
MAD_HELP_EXIT_SANITIZED=$?
set -e

if [ "${MAD_HELP_EXIT_SANITIZED}" -eq 0 ]; then
echo " ✓ Verified: madengine is functional"
MAD_CLI_COMMAND="madengine"
else
Expand Down
Loading