Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions src/madengine/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def __init__(
additional_context_file: str = None,
build_only_mode: bool = False,
rocm_path: str = None,
detect_local_gpu_arch: bool = False,
) -> None:
"""Constructor of the Context class.

Expand All @@ -91,6 +92,9 @@ def __init__(
additional_context_file: The additional context file.
build_only_mode: Whether running in build-only mode (no GPU detection).
rocm_path: Optional ROCm installation path (overrides ROCM_PATH env; default /opt/rocm).
detect_local_gpu_arch: When True and in build_only_mode, attempt to auto-detect
MAD_SYSTEM_GPU_ARCHITECTURE from the local node and inject it into docker_build_arg.
Has no effect when build_only_mode=False (runtime mode detects it via init_gpu_context).

Raises:
RuntimeError: If GPU detection fails and not in build-only mode.
Expand All @@ -100,6 +104,7 @@ def __init__(
self.console = Console()
self._gpu_context_initialized = False
self._build_only_mode = build_only_mode
self._detect_local_gpu_arch = detect_local_gpu_arch
self._system_context_initialized = False
self._gpu_tool_manager = None # Lazy initialization

Expand Down Expand Up @@ -137,17 +142,22 @@ def __init__(
self.init_runtime_context()
else:
# For build-only mode, only initialize what's needed for building
self.init_build_context()
self.init_build_context(detect_gpu_arch=self._detect_local_gpu_arch)

## ADD MORE CONTEXTS HERE ##

def init_build_context(self) -> None:
def init_build_context(self, detect_gpu_arch: bool = False) -> None:
"""Initialize build-specific context.

This method sets up only the context needed for Docker builds,
avoiding GPU detection that would fail on build-only nodes.
System-specific contexts (host_os, numa_balancing, etc.) should be
provided via --additional-context for build-only nodes if needed.

Args:
detect_gpu_arch: When True, attempt to auto-detect MAD_SYSTEM_GPU_ARCHITECTURE
from the local node and inject it into docker_build_arg. Fails gracefully
if no GPU is present (e.g., on a pure CI build node).
"""
print("Initializing build-only context...")

Expand All @@ -168,9 +178,26 @@ def init_build_context(self) -> None:
"Consider providing host_os via --additional-context if needed for build"
)

# Don't detect GPU-specific contexts in build-only mode
# These should be provided via additional_context if needed for build args
# (GPU arch guidance is emitted in BuildOrchestrator after model/Dockerfile discovery.)
# Optionally auto-detect GPU architecture for local full-workflow builds (build+run).
# Skipped for standalone `madengine build` on non-GPU/CI nodes (detect_gpu_arch=False).
if detect_gpu_arch and "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get("docker_build_arg", {}):
try:
from madengine.utils.gpu_validator import detect_gpu_vendor
from madengine.execution.dockerfile_utils import normalize_architecture_name

vendor = detect_gpu_vendor(self._rocm_path)
if vendor in (GPUVendor.AMD, GPUVendor.NVIDIA):
manager = get_gpu_tool_manager(vendor, self._rocm_path)
raw_arch = manager.get_gpu_architecture()
arch = normalize_architecture_name(raw_arch) or raw_arch.strip()
self.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = arch
print(f"Auto-detected GPU architecture for build: {arch}")
else:
print("Warning: No supported GPU detected; MAD_SYSTEM_GPU_ARCHITECTURE will not be set automatically.")
print("Consider providing it via --additional-context if needed for build args.")
except Exception as e:
print(f"Warning: Could not auto-detect GPU architecture for build: {e}")
print("Consider providing MAD_SYSTEM_GPU_ARCHITECTURE via --additional-context if needed for build args.")
Comment on lines +181 to +200
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new build-only auto-detection path (detect_gpu_arch=True) is currently untested. Since Context already has unit coverage, please add a unit test that patches detect_gpu_vendor() / get_gpu_tool_manager().get_gpu_architecture() and asserts that ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] is injected only when absent, and that failures are handled without raising in build-only mode.

Copilot uses AI. Check for mistakes.

# Don't initialize NUMA balancing check for build-only nodes
# This is runtime-specific and should be handled on execution nodes
Expand Down
17 changes: 15 additions & 2 deletions src/madengine/orchestration/build_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,17 @@ class BuildOrchestrator:
- Save deployment_config from --additional-context
"""

def __init__(self, args, additional_context: Optional[Dict] = None):
def __init__(self, args, additional_context: Optional[Dict] = None, detect_local_gpu_arch: bool = False):
"""
Initialize build orchestrator.

Args:
args: CLI arguments namespace
additional_context: Dict from --additional-context (merged with args if present)
detect_local_gpu_arch: When True, auto-detect MAD_SYSTEM_GPU_ARCHITECTURE from the
local node before building. Intended for full workflow (build+run) on a local
single node. Has no effect if the user already provided the value via
--additional-context. Default False preserves existing standalone-build behavior.
"""
self.args = args
self.console = Console(live_output=getattr(args, "live_output", True))
Expand Down Expand Up @@ -120,14 +124,17 @@ def __init__(self, args, additional_context: Optional[Dict] = None):
))
self.rich_console.print()

# Initialize context in build-only mode (no GPU detection)
# Initialize context in build-only mode (no GPU detection by default).
# Pass detect_local_gpu_arch so Context.init_build_context() can optionally
# auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for full workflow (build+run) runs.
# Context expects additional_context as a string representation of Python dict
# Use repr() instead of json.dumps() because Context uses ast.literal_eval()
# Use self.additional_context (post-ConfigLoader), not pre-defaults merged_context
context_string = repr(self.additional_context)
self.context = Context(
additional_context=context_string,
build_only_mode=True,
detect_local_gpu_arch=detect_local_gpu_arch,
)
Comment thread
coketaste marked this conversation as resolved.

# Load credentials if available
Expand Down Expand Up @@ -288,6 +295,12 @@ def execute(
)
self._warn_if_mad_arch_unresolved_for_dockerfiles(models, builder)

resolved_arch = self.context.ctx.get("docker_build_arg", {}).get("MAD_SYSTEM_GPU_ARCHITECTURE")
if resolved_arch:
self.rich_console.print(
f"[green]✓ MAD_SYSTEM_GPU_ARCHITECTURE resolved: {resolved_arch}[/green]\n"
)

# Step 3: Build Docker images
self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]")

Expand Down
11 changes: 10 additions & 1 deletion src/madengine/orchestration/run_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,16 @@ def _build_phase(self, tags: list, registry: Optional[str] = None) -> str:
# Update args with tags
self.args.tags = tags

build_orch = BuildOrchestrator(self.args, self.additional_context)
# detect_local_gpu_arch=True: full workflow on a local single node — auto-detect
# MAD_SYSTEM_GPU_ARCHITECTURE before the build so Dockerfiles that require it
# (ARG MAD_SYSTEM_GPU_ARCHITECTURE with no default) are built correctly without
# requiring the user to manually pass --additional-context.
# The user's explicitly provided value (if any) is still respected and not overridden.
build_orch = BuildOrchestrator(
self.args,
self.additional_context,
detect_local_gpu_arch=True,
)
Comment thread
coketaste marked this conversation as resolved.
manifest_file = build_orch.execute(
registry=registry,
clean_cache=getattr(self.args, "clean_docker_cache", False),
Expand Down