def get_rocm_path(override=None):
    """Return the ROCm installation root directory.

    Resolution order: ``override`` (e.g. from the CLI) -> ``ROCM_PATH``
    environment variable -> default ``/opt/rocm``. A candidate that is unset,
    empty, or contains only whitespace is skipped, so ``ROCM_PATH=""`` falls
    back to the default instead of resolving to the current working directory.

    Args:
        override: Optional path overriding the environment and the default.

    Returns:
        str: Absolute, normalized ROCm root path (``~`` expanded).
    """
    chosen = "/opt/rocm"
    for candidate in (override, os.environ.get("ROCM_PATH")):
        text = str(candidate).strip() if candidate else ""
        if text:
            chosen = text
            break
    # abspath() already applies normpath(): it collapses "..", drops trailing
    # separators and keeps "/" as "/", so no extra normalization is required.
    return os.path.abspath(os.path.expanduser(chosen))
""" + # Resolve ROCm path first (used by get_gpu_vendor and others) + self._rocm_path = get_rocm_path(rocm_path) + # Initialize the console self.console = Console() @@ -99,7 +107,7 @@ def __init__( # Validate ROCm installation if AMD GPU is detected if self.ctx["gpu_vendor"] == "AMD": try: - validate_rocm_installation(verbose=False, raise_on_error=True) + validate_rocm_installation(verbose=False, raise_on_error=True, rocm_path=self._rocm_path) except GPUInstallationError as e: print("\n" + "="*70) print("ERROR: ROCm Installation Validation Failed") @@ -110,6 +118,8 @@ def __init__( # Initialize the docker context self.ctx["docker_env_vars"] = {} + self.ctx["rocm_path"] = self._rocm_path + self.ctx["docker_env_vars"]["ROCM_PATH"] = self._rocm_path self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = self.get_system_ngpus() self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.get_system_gpu_architecture() @@ -161,6 +171,10 @@ def __init__( # Set multi-node runner after context update self.ctx['docker_env_vars']['MAD_MULTI_NODE_RUNNER'] = self.set_multi_node_runner() + def _quoted_rocm_bin(self, name: str) -> str: + """Shell-safe path to a binary under ``{rocm_path}/bin``.""" + return shlex.quote(os.path.join(self._rocm_path, "bin", name)) + def get_ctx_test(self) -> str: """Get context test. @@ -189,9 +203,15 @@ def get_gpu_vendor(self) -> str: - NVIDIA - AMD """ - # Check if the GPU vendor is NVIDIA or AMD, and if it is unable to detect the GPU vendor. + # ROCM_PATH via subprocess env avoids embedding user-controlled paths in shell strings. 
+ vendor_env = {**os.environ, "ROCM_PATH": self._rocm_path} return self.console.sh( - 'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /opt/rocm/bin/rocm-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\'' + 'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; ' + 'elif [[ -f "${ROCM_PATH}/bin/amd-smi" ]]; then echo "AMD"; ' + 'elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; ' + 'elif [[ -f "${ROCM_PATH}/bin/rocm-smi" ]]; then echo "AMD"; ' + 'else echo "Unable to detect GPU vendor"; fi || true\'', + env=vendor_env, ) def get_host_os(self) -> str: @@ -254,11 +274,19 @@ def get_system_ngpus(self) -> int: number_gpus = 0 if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": try: - number_gpus = int(self.console.sh("amd-smi list --csv | tail -n +3 | wc -l")) + amd_smi = self._quoted_rocm_bin("amd-smi") + number_gpus = int( + self.console.sh(f"{amd_smi} list --csv | tail -n +3 | wc -l") + ) except Exception as e: # Try fallback to rocm-smi try: - number_gpus = int(self.console.sh("rocm-smi --showid --csv | tail -n +2 | wc -l")) + rocm_smi = self._quoted_rocm_bin("rocm-smi") + number_gpus = int( + self.console.sh( + f"{rocm_smi} --showid --csv | tail -n +2 | wc -l" + ) + ) except Exception: raise RuntimeError( f"Unable to determine number of AMD GPUs. 
" @@ -289,14 +317,31 @@ def get_system_gpu_architecture(self) -> str: """ if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": try: - arch = self.console.sh("/opt/rocm/bin/rocminfo |grep -o -m 1 'gfx.*'") - if not arch or arch.strip() == "": + rocminfo_path = os.path.join(self._rocm_path, "bin", "rocminfo") + try: + proc = subprocess.run( + [rocminfo_path], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + except FileNotFoundError as fnf: + raise RuntimeError( + f"rocminfo not found at {rocminfo_path}" + ) from fnf + out = (proc.stdout or "") + (proc.stderr or "") + match = re.search(r"gfx\S+", out) + if not match: + raise RuntimeError("rocminfo returned empty architecture") + arch = match.group(0) + if not arch.strip(): raise RuntimeError("rocminfo returned empty architecture") return arch except Exception as e: raise RuntimeError( f"Unable to determine AMD GPU architecture. " - f"Ensure ROCm is installed and rocminfo is accessible at /opt/rocm/bin/rocminfo. " + f"Ensure ROCm is installed and rocminfo is accessible (ROCM_PATH={self._rocm_path}). 
" f"Error: {e}" ) elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA": @@ -323,11 +368,15 @@ def get_system_gpu_product_name(self) -> str: """ if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": try: - return self.console.sh("amd-smi static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2") + amd_smi = self._quoted_rocm_bin("amd-smi") + return self.console.sh( + f"{amd_smi} static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2" + ) except Exception as e: # Try fallback to rocm-smi try: - output = self.console.sh("rocm-smi -i") + rocm_smi = self._quoted_rocm_bin("rocm-smi") + output = self.console.sh(f"{rocm_smi} -i") # Parse output to extract product name from brackets # Example: "GPU[0] : Device Name: Arcturus GL-XL [Instinct MI100]" # Extract: "Instinct MI100" @@ -352,7 +401,8 @@ def get_system_gpu_product_name(self) -> str: def get_system_hip_version(self): if self.ctx['docker_env_vars']['MAD_GPU_VENDOR']=='AMD': try: - version = self.console.sh("hipconfig --version | cut -d'.' -f1,2") + hipconfig = self._quoted_rocm_bin("hipconfig") + version = self.console.sh(f"{hipconfig} --version | cut -d'.' 
-f1,2") if not version or version.strip() == "": raise RuntimeError("hipconfig returned empty version") return version @@ -408,9 +458,16 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: try: # Get ROCm version - rocm_version_str = self.console.sh("cat /opt/rocm/.info/version | cut -d'-' -f1") + version_file = os.path.join(self._rocm_path, ".info", "version") + try: + with open(version_file, "r", encoding="utf-8") as vf: + rocm_version_str = vf.read().strip().split("-")[0].split("\n")[0] + except OSError as io_err: + raise RuntimeError( + f"Failed to read ROCm version file {version_file}: {io_err}" + ) from io_err if not rocm_version_str or rocm_version_str.strip() == "": - raise RuntimeError("Failed to retrieve ROCm version from /opt/rocm/.info/version") + raise RuntimeError(f"Failed to retrieve ROCm version from {version_file}") # Parse version safely try: @@ -467,7 +524,8 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: } # Get list of GPUs from amd-smi - output = self.console.sh("amd-smi list -e --json") + amd_smi = self._quoted_rocm_bin("amd-smi") + output = self.console.sh(f"{amd_smi} list -e --json") if not output or output.strip() == "": raise ValueError("Failed to retrieve AMD GPU data from amd-smi") @@ -523,7 +581,10 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: } # Get GPU ID to unique ID mapping from rocm-smi - rsmi_output = self.console.sh("rocm-smi --showuniqueid | grep 'Unique.*:'") + rocm_smi = self._quoted_rocm_bin("rocm-smi") + rsmi_output = self.console.sh( + f"{rocm_smi} --showuniqueid | grep 'Unique.*:'" + ) if not rsmi_output or rsmi_output.strip() == "": raise RuntimeError("Failed to retrieve unique IDs from rocm-smi") diff --git a/src/madengine/mad.py b/src/madengine/mad.py index e4df7143..02a3e76b 100644 --- a/src/madengine/mad.py +++ b/src/madengine/mad.py @@ -215,6 +215,7 @@ def main(): parser_run.add_argument('--additional-context-file', default=None, 
help="additonal context, as json file, to filter behavior of workloads. Overrides detected contexts.") parser_run.add_argument('--additional-context', default='{}', help="additional context, as string representation of python dict, to filter behavior of workloads. " + " Overrides detected contexts and additional-context-file.") + parser_run.add_argument('--rocm-path', default=None, help='ROCm installation path (overrides ROCM_PATH env; default: /opt/rocm). Use when ROCm is not under /opt/rocm (e.g. Rock tar/whl).') parser_run.add_argument('--data-config-file-name', default="data.json", help="custom data configuration file.") parser_run.add_argument('--tools-json-file-name', default="./scripts/common/tools.json", help="custom tools json configuration file.") parser_run.add_argument('--generate-sys-env-details', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='generate system config env details by default (accepts: true/false, yes/no, 1/0)') diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 0a4d2083..9b62184f 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -33,6 +33,7 @@ import sys import os import json +import shlex import time import re import traceback @@ -48,6 +49,7 @@ from madengine.utils.ops import PythonicTee, file_print, substring_found, find_and_replace_pattern from madengine.core.constants import MAD_MINIO, MAD_AWS_S3 from madengine.core.constants import MODEL_DIR, PUBLIC_GITHUB_ROCM_KEY +from madengine.core.constants import get_rocm_path from madengine.core.timeout import Timeout from madengine.tools.update_perf_csv import update_perf_csv from madengine.tools.csv_to_html import convert_csv_to_html @@ -154,9 +156,11 @@ def __init__(self, args): self.return_status = True self.args = args self.console = Console(live_output=True) + rocm_path = get_rocm_path(getattr(args, 'rocm_path', None)) self.context = Context( 
additional_context=args.additional_context, additional_context_file=args.additional_context_file, + rocm_path=rocm_path, ) # check the data.json file exists data_json_file = args.data_config_file_name @@ -202,7 +206,10 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None: gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] # show gpu info if gpu_vendor.find("AMD") != -1: - self.console.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true") + ro = self.context.ctx["rocm_path"] + amd_smi = shlex.quote(os.path.join(ro, "bin", "amd-smi")) + rocm_smi = shlex.quote(os.path.join(ro, "bin", "rocm-smi")) + self.console.sh(f"{amd_smi} || {rocm_smi} || true") elif gpu_vendor.find("NVIDIA") != -1: self.console.sh("nvidia-smi -L || true") @@ -789,7 +796,10 @@ def run_model_impl( # echo gpu smi info if gpu_vendor.find("AMD") != -1: - smi = model_docker.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true") + ro = self.context.ctx["rocm_path"] + amd_smi = shlex.quote(os.path.join(ro, "bin", "amd-smi")) + rocm_smi = shlex.quote(os.path.join(ro, "bin", "rocm-smi")) + smi = model_docker.sh(f"{amd_smi} || {rocm_smi} || true") elif gpu_vendor.find("NVIDIA") != -1: smi = model_docker.sh("/usr/bin/nvidia-smi || true") else: diff --git a/src/madengine/utils/gpu_validator.py b/src/madengine/utils/gpu_validator.py index 5715db67..b90f214b 100644 --- a/src/madengine/utils/gpu_validator.py +++ b/src/madengine/utils/gpu_validator.py @@ -14,6 +14,8 @@ from dataclasses import dataclass from enum import Enum +from madengine.core.constants import get_rocm_path + class GPUVendor(Enum): """Supported GPU vendors""" @@ -43,34 +45,39 @@ def __post_init__(self): class ROCmValidator: """Validator for AMD ROCm installation""" - - # Essential ROCm components to check - ESSENTIAL_PATHS = { - 'rocm_root': '/opt/rocm', - 'hip_path': '/opt/rocm/bin/hipconfig', - 'rocminfo': '/opt/rocm/bin/rocminfo', - } - - # Optional but recommended components - 
def _rocm_tool_cmd(self, name: str, args: List[str]) -> List[str]:
    """Assemble the argv list for a ROCm command-line tool.

    The executable is ``{self.rocm_path}/bin/{name}`` when that path is an
    existing file; otherwise the bare *name* is used.

    Args:
        name: Tool file name, e.g. ``"rocminfo"``.
        args: Arguments appended after the executable.

    Returns:
        A new list: the chosen executable followed by *args*.
    """
    candidate = os.path.join(self.rocm_path, "bin", name)
    executable = candidate if os.path.isfile(candidate) else name
    return [executable] + args
os.path.join(self.rocm_path, '.info', 'version') if os.path.exists(version_file): try: with open(version_file, 'r') as f: @@ -130,7 +139,7 @@ def _check_gpu_accessible(self) -> Tuple[bool, str]: Tuple of (accessible, message) """ # Try rocminfo first - success, stdout, stderr = self._run_command(['rocminfo']) + success, stdout, stderr = self._run_command(self._rocm_tool_cmd("rocminfo", [])) if success: # Check if any GPU agents are listed if 'Agent' in stdout and 'gfx' in stdout.lower(): @@ -139,12 +148,14 @@ def _check_gpu_accessible(self) -> Tuple[bool, str]: return False, "rocminfo ran but no GPU agents detected" # Try amd-smi - success, stdout, stderr = self._run_command(['amd-smi', 'list']) + success, stdout, stderr = self._run_command( + self._rocm_tool_cmd("amd-smi", ["list"]) + ) if success and stdout: return True, "GPUs accessible via amd-smi" # Try rocm-smi - success, stdout, stderr = self._run_command(['rocm-smi']) + success, stdout, stderr = self._run_command(self._rocm_tool_cmd("rocm-smi", [])) if success and stdout: return True, "GPUs accessible via rocm-smi" @@ -305,9 +316,10 @@ def validate(self) -> GPUValidationResult: # Generate suggestions based on issues if result.issues: - if not self._check_path_exists('/opt/rocm'): + if not self._check_path_exists(self.rocm_path): result.suggestions.append( - "ROCm does not appear to be installed. Install ROCm: " + f"ROCm does not appear to be installed at {self.rocm_path}. 
" + "Set ROCM_PATH if using a non-default install, or install ROCm: " "https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html" ) @@ -552,39 +564,45 @@ def validate(self) -> GPUValidationResult: return result -def detect_gpu_vendor() -> GPUVendor: +def detect_gpu_vendor(rocm_path: Optional[str] = None) -> GPUVendor: """Detect which GPU vendor is present on the system - + + Args: + rocm_path: Optional ROCm root path (default: ROCM_PATH env or /opt/rocm) + Returns: GPUVendor enum value """ if os.path.exists("/usr/bin/nvidia-smi"): return GPUVendor.NVIDIA - elif os.path.exists("/opt/rocm/bin/rocm-smi") or os.path.exists("/opt/rocm/bin/amd-smi"): + rocm = get_rocm_path(rocm_path) + if os.path.exists(os.path.join(rocm, "bin", "rocm-smi")) or os.path.exists(os.path.join(rocm, "bin", "amd-smi")): return GPUVendor.AMD - else: - return GPUVendor.UNKNOWN + if os.path.exists("/usr/local/bin/amd-smi"): + return GPUVendor.AMD + return GPUVendor.UNKNOWN -def validate_gpu_installation(vendor: Optional[GPUVendor] = None, verbose: bool = False, raise_on_error: bool = True) -> GPUValidationResult: +def validate_gpu_installation(vendor: Optional[GPUVendor] = None, verbose: bool = False, raise_on_error: bool = True, rocm_path: Optional[str] = None) -> GPUValidationResult: """Validate GPU installation on the current node - + Args: vendor: GPU vendor to validate (auto-detected if None) verbose: Print detailed validation progress raise_on_error: Raise GPUInstallationError if validation fails - + rocm_path: Optional ROCm root path for AMD (default: ROCM_PATH env or /opt/rocm) + Returns: GPUValidationResult - + Raises: GPUInstallationError: If validation fails and raise_on_error is True """ if vendor is None: - vendor = detect_gpu_vendor() - + vendor = detect_gpu_vendor(rocm_path=rocm_path) + if vendor == GPUVendor.AMD: - validator = ROCmValidator(verbose=verbose) + validator = ROCmValidator(verbose=verbose, rocm_path=rocm_path) rocm_result = validator.validate() # Convert 
def validate_rocm_installation(verbose: bool = False, raise_on_error: bool = True, rocm_path: Optional[str] = None) -> GPUValidationResult:
    """Run GPU validation with the vendor fixed to AMD (backwards compatibility wrapper).

    Args:
        verbose: Print detailed validation progress.
        raise_on_error: Raise GPUInstallationError if validation fails.
        rocm_path: Optional ROCm root path (default: ROCM_PATH env or /opt/rocm).

    Returns:
        The GPUValidationResult produced by ``validate_gpu_installation``.

    Raises:
        GPUInstallationError: If validation fails and raise_on_error is True.
    """
    forwarded = {
        "verbose": verbose,
        "raise_on_error": raise_on_error,
        "rocm_path": rocm_path,
    }
    return validate_gpu_installation(vendor=GPUVendor.AMD, **forwarded)
fi || true\'', + env=vendor_env, ) return vendor.strip() == "AMD" except Exception: