Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/madengine/core/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
"""
# built-in modules
import os
import subprocess
import typing
# third-party modules
Expand Down Expand Up @@ -62,7 +63,9 @@ def sh(
if self.shellVerbose and not secret:
print("> " + command, flush=True)

# Run the shell command
# Merge partial env with inherited environment (subprocess replaces env entirely if set).
run_env = None if env is None else {**os.environ, **env}

proc = subprocess.Popen(
command,
stdin=subprocess.PIPE,
Expand All @@ -71,7 +74,7 @@ def sh(
shell=True,
universal_newlines=True,
bufsize=1,
env=env,
env=run_env,
)

# Get the output of the shell command, and check for failure, and return the output.
Expand Down
18 changes: 18 additions & 0 deletions src/madengine/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,21 @@
}
else:
PUBLIC_GITHUB_ROCM_KEY = json.loads(os.environ["PUBLIC_GITHUB_ROCM_KEY"])


def get_rocm_path(override=None):
"""Return ROCm installation root directory.

Resolution order: override (e.g. from CLI) -> ROCM_PATH env -> default /opt/rocm.
Path is normalized with ``os.path.normpath`` (preserves ``/`` as root).

Args:
override: Optional path overriding env and default.

Returns:
str: Absolute ROCm root path.
"""
raw = override if override else os.environ.get("ROCM_PATH", "/opt/rocm")
path = os.path.abspath(os.path.expanduser(str(raw).strip()))
# normpath preserves "/" as install root; rstrip(os.sep) would turn "/" into "".
return os.path.normpath(path)
93 changes: 77 additions & 16 deletions src/madengine/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
import collections.abc
import os
import re
import shlex
import subprocess
import typing
# third-party modules
from madengine.core.console import Console
from madengine.core.constants import get_rocm_path
from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError


Expand Down Expand Up @@ -65,18 +68,23 @@ class Context:
def __init__(
self,
additional_context: str=None,
additional_context_file: str=None
additional_context_file: str=None,
rocm_path: str=None
) -> None:
"""Constructor of the Context class.

Args:
additional_context: The additional context.
additional_context_file: The additional context file.
rocm_path: Optional ROCm installation path (overrides ROCM_PATH env; default /opt/rocm).

Raises:
RuntimeError: If the GPU vendor is not detected.
RuntimeError: If the GPU architecture is not detected.
"""
# Resolve ROCm path first (used by get_gpu_vendor and others)
self._rocm_path = get_rocm_path(rocm_path)

# Initialize the console
self.console = Console()

Expand All @@ -99,7 +107,7 @@ def __init__(
# Validate ROCm installation if AMD GPU is detected
if self.ctx["gpu_vendor"] == "AMD":
try:
validate_rocm_installation(verbose=False, raise_on_error=True)
validate_rocm_installation(verbose=False, raise_on_error=True, rocm_path=self._rocm_path)
except GPUInstallationError as e:
print("\n" + "="*70)
print("ERROR: ROCm Installation Validation Failed")
Expand All @@ -110,6 +118,8 @@ def __init__(

# Initialize the docker context
self.ctx["docker_env_vars"] = {}
self.ctx["rocm_path"] = self._rocm_path
self.ctx["docker_env_vars"]["ROCM_PATH"] = self._rocm_path
self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"]
self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = self.get_system_ngpus()
self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.get_system_gpu_architecture()
Expand Down Expand Up @@ -161,6 +171,10 @@ def __init__(
# Set multi-node runner after context update
self.ctx['docker_env_vars']['MAD_MULTI_NODE_RUNNER'] = self.set_multi_node_runner()

def _quoted_rocm_bin(self, name: str) -> str:
"""Shell-safe path to a binary under ``{rocm_path}/bin``."""
return shlex.quote(os.path.join(self._rocm_path, "bin", name))

def get_ctx_test(self) -> str:
"""Get context test.

Expand Down Expand Up @@ -189,9 +203,15 @@ def get_gpu_vendor(self) -> str:
- NVIDIA
- AMD
"""
# Check if the GPU vendor is NVIDIA or AMD, and if it is unable to detect the GPU vendor.
# ROCM_PATH via subprocess env avoids embedding user-controlled paths in shell strings.
vendor_env = {**os.environ, "ROCM_PATH": self._rocm_path}
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Console.sh() now merges any provided env with os.environ, so building vendor_env by copying os.environ here is redundant. You can pass just {"ROCM_PATH": self._rocm_path} (and/or any overrides) to keep the intent clear and avoid an extra full env copy.

Suggested change
vendor_env = {**os.environ, "ROCM_PATH": self._rocm_path}
vendor_env = {"ROCM_PATH": self._rocm_path}

Copilot uses AI. Check for mistakes.
return self.console.sh(
'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /opt/rocm/bin/rocm-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\''
'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; '
'elif [[ -f "${ROCM_PATH}/bin/amd-smi" ]]; then echo "AMD"; '
'elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; '
'elif [[ -f "${ROCM_PATH}/bin/rocm-smi" ]]; then echo "AMD"; '
'else echo "Unable to detect GPU vendor"; fi || true\'',
env=vendor_env,
)

def get_host_os(self) -> str:
Expand Down Expand Up @@ -254,11 +274,19 @@ def get_system_ngpus(self) -> int:
number_gpus = 0
if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD":
try:
number_gpus = int(self.console.sh("amd-smi list --csv | tail -n +3 | wc -l"))
amd_smi = self._quoted_rocm_bin("amd-smi")
number_gpus = int(
self.console.sh(f"{amd_smi} list --csv | tail -n +3 | wc -l")
)
except Exception as e:
# Try fallback to rocm-smi
try:
number_gpus = int(self.console.sh("rocm-smi --showid --csv | tail -n +2 | wc -l"))
rocm_smi = self._quoted_rocm_bin("rocm-smi")
number_gpus = int(
self.console.sh(
f"{rocm_smi} --showid --csv | tail -n +2 | wc -l"
)
)
except Exception:
raise RuntimeError(
f"Unable to determine number of AMD GPUs. "
Expand Down Expand Up @@ -289,14 +317,31 @@ def get_system_gpu_architecture(self) -> str:
"""
if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD":
try:
arch = self.console.sh("/opt/rocm/bin/rocminfo |grep -o -m 1 'gfx.*'")
if not arch or arch.strip() == "":
rocminfo_path = os.path.join(self._rocm_path, "bin", "rocminfo")
try:
proc = subprocess.run(
[rocminfo_path],
capture_output=True,
text=True,
timeout=60,
check=False,
)
except FileNotFoundError as fnf:
raise RuntimeError(
f"rocminfo not found at {rocminfo_path}"
) from fnf
out = (proc.stdout or "") + (proc.stderr or "")
match = re.search(r"gfx\S+", out)
if not match:
raise RuntimeError("rocminfo returned empty architecture")
arch = match.group(0)
if not arch.strip():
raise RuntimeError("rocminfo returned empty architecture")
return arch
except Exception as e:
raise RuntimeError(
f"Unable to determine AMD GPU architecture. "
f"Ensure ROCm is installed and rocminfo is accessible at /opt/rocm/bin/rocminfo. "
f"Ensure ROCm is installed and rocminfo is accessible (ROCM_PATH={self._rocm_path}). "
f"Error: {e}"
)
elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA":
Expand All @@ -323,11 +368,15 @@ def get_system_gpu_product_name(self) -> str:
"""
if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD":
try:
return self.console.sh("amd-smi static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2")
amd_smi = self._quoted_rocm_bin("amd-smi")
return self.console.sh(
f"{amd_smi} static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2"
)
except Exception as e:
# Try fallback to rocm-smi
try:
output = self.console.sh("rocm-smi -i")
rocm_smi = self._quoted_rocm_bin("rocm-smi")
output = self.console.sh(f"{rocm_smi} -i")
# Parse output to extract product name from brackets
# Example: "GPU[0] : Device Name: Arcturus GL-XL [Instinct MI100]"
# Extract: "Instinct MI100"
Expand All @@ -352,7 +401,8 @@ def get_system_gpu_product_name(self) -> str:
def get_system_hip_version(self):
if self.ctx['docker_env_vars']['MAD_GPU_VENDOR']=='AMD':
try:
version = self.console.sh("hipconfig --version | cut -d'.' -f1,2")
hipconfig = self._quoted_rocm_bin("hipconfig")
version = self.console.sh(f"{hipconfig} --version | cut -d'.' -f1,2")
if not version or version.strip() == "":
raise RuntimeError("hipconfig returned empty version")
return version
Expand Down Expand Up @@ -408,9 +458,16 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]:

try:
# Get ROCm version
rocm_version_str = self.console.sh("cat /opt/rocm/.info/version | cut -d'-' -f1")
version_file = os.path.join(self._rocm_path, ".info", "version")
try:
with open(version_file, "r", encoding="utf-8") as vf:
rocm_version_str = vf.read().strip().split("-")[0].split("\n")[0]
except OSError as io_err:
raise RuntimeError(
f"Failed to read ROCm version file {version_file}: {io_err}"
) from io_err
if not rocm_version_str or rocm_version_str.strip() == "":
raise RuntimeError("Failed to retrieve ROCm version from /opt/rocm/.info/version")
raise RuntimeError(f"Failed to retrieve ROCm version from {version_file}")

# Parse version safely
try:
Expand Down Expand Up @@ -467,7 +524,8 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]:
}

# Get list of GPUs from amd-smi
output = self.console.sh("amd-smi list -e --json")
amd_smi = self._quoted_rocm_bin("amd-smi")
output = self.console.sh(f"{amd_smi} list -e --json")
if not output or output.strip() == "":
raise ValueError("Failed to retrieve AMD GPU data from amd-smi")

Expand Down Expand Up @@ -523,7 +581,10 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]:
}

# Get GPU ID to unique ID mapping from rocm-smi
rsmi_output = self.console.sh("rocm-smi --showuniqueid | grep 'Unique.*:'")
rocm_smi = self._quoted_rocm_bin("rocm-smi")
rsmi_output = self.console.sh(
f"{rocm_smi} --showuniqueid | grep 'Unique.*:'"
)
if not rsmi_output or rsmi_output.strip() == "":
raise RuntimeError("Failed to retrieve unique IDs from rocm-smi")

Expand Down
1 change: 1 addition & 0 deletions src/madengine/mad.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def main():
parser_run.add_argument('--additional-context-file', default=None, help="additonal context, as json file, to filter behavior of workloads. Overrides detected contexts.")
parser_run.add_argument('--additional-context', default='{}', help="additional context, as string representation of python dict, to filter behavior of workloads. " +
" Overrides detected contexts and additional-context-file.")
parser_run.add_argument('--rocm-path', default=None, help='ROCm installation path (overrides ROCM_PATH env; default: /opt/rocm). Use when ROCm is not under /opt/rocm (e.g. Rock tar/whl).')
parser_run.add_argument('--data-config-file-name', default="data.json", help="custom data configuration file.")
parser_run.add_argument('--tools-json-file-name', default="./scripts/common/tools.json", help="custom tools json configuration file.")
parser_run.add_argument('--generate-sys-env-details', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='generate system config env details by default (accepts: true/false, yes/no, 1/0)')
Expand Down
14 changes: 12 additions & 2 deletions src/madengine/tools/run_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import sys
import os
import json
import shlex
import time
import re
import traceback
Expand All @@ -48,6 +49,7 @@
from madengine.utils.ops import PythonicTee, file_print, substring_found, find_and_replace_pattern
from madengine.core.constants import MAD_MINIO, MAD_AWS_S3
from madengine.core.constants import MODEL_DIR, PUBLIC_GITHUB_ROCM_KEY
from madengine.core.constants import get_rocm_path
from madengine.core.timeout import Timeout
from madengine.tools.update_perf_csv import update_perf_csv
from madengine.tools.csv_to_html import convert_csv_to_html
Expand Down Expand Up @@ -154,9 +156,11 @@ def __init__(self, args):
self.return_status = True
self.args = args
self.console = Console(live_output=True)
rocm_path = get_rocm_path(getattr(args, 'rocm_path', None))
self.context = Context(
additional_context=args.additional_context,
additional_context_file=args.additional_context_file,
rocm_path=rocm_path,
)
# check the data.json file exists
data_json_file = args.data_config_file_name
Expand Down Expand Up @@ -202,7 +206,10 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None:
gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"]
# show gpu info
if gpu_vendor.find("AMD") != -1:
self.console.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true")
ro = self.context.ctx["rocm_path"]
amd_smi = shlex.quote(os.path.join(ro, "bin", "amd-smi"))
rocm_smi = shlex.quote(os.path.join(ro, "bin", "rocm-smi"))
self.console.sh(f"{amd_smi} || {rocm_smi} || true")
elif gpu_vendor.find("NVIDIA") != -1:
self.console.sh("nvidia-smi -L || true")

Expand Down Expand Up @@ -789,7 +796,10 @@ def run_model_impl(

# echo gpu smi info
if gpu_vendor.find("AMD") != -1:
smi = model_docker.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true")
ro = self.context.ctx["rocm_path"]
amd_smi = shlex.quote(os.path.join(ro, "bin", "amd-smi"))
rocm_smi = shlex.quote(os.path.join(ro, "bin", "rocm-smi"))
Comment on lines +799 to +801
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shlex.quote() is safe for POSIX shells, but Docker.sh() embeds the entire command inside a double-quoted bash -c "..." string. If rocm_path contains a single quote, shlex.quote() will emit a "'" sequence (includes double quotes) which can break the surrounding quoting and potentially allow command injection. Consider avoiding shlex.quote() here (or escaping for a double-quoted context), or updating Docker.sh() to pass the command without wrapping it in unescaped double quotes (e.g., pass via stdin / use an argv form).

Suggested change
ro = self.context.ctx["rocm_path"]
amd_smi = shlex.quote(os.path.join(ro, "bin", "amd-smi"))
rocm_smi = shlex.quote(os.path.join(ro, "bin", "rocm-smi"))
def shell_double_quoted(value):
return '"' + value.replace("\\", "\\\\").replace('"', '\\"').replace("$", "\\$").replace("`", "\\`") + '"'
ro = self.context.ctx["rocm_path"]
amd_smi = shell_double_quoted(os.path.join(ro, "bin", "amd-smi"))
rocm_smi = shell_double_quoted(os.path.join(ro, "bin", "rocm-smi"))

Copilot uses AI. Check for mistakes.
smi = model_docker.sh(f"{amd_smi} || {rocm_smi} || true")
elif gpu_vendor.find("NVIDIA") != -1:
smi = model_docker.sh("/usr/bin/nvidia-smi || true")
else:
Expand Down
Loading