diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 24763588..d67e8c6a 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -83,6 +83,7 @@ def __init__( additional_context_file: str = None, build_only_mode: bool = False, rocm_path: str = None, + detect_local_gpu_arch: bool = False, ) -> None: """Constructor of the Context class. @@ -91,6 +92,9 @@ def __init__( additional_context_file: The additional context file. build_only_mode: Whether running in build-only mode (no GPU detection). rocm_path: Optional ROCm installation path (overrides ROCM_PATH env; default /opt/rocm). + detect_local_gpu_arch: When True and in build_only_mode, attempt to auto-detect + MAD_SYSTEM_GPU_ARCHITECTURE from the local node and inject it into docker_build_arg. + Has no effect when build_only_mode=False (runtime mode detects it via init_gpu_context). Raises: RuntimeError: If GPU detection fails and not in build-only mode. @@ -100,6 +104,7 @@ def __init__( self.console = Console() self._gpu_context_initialized = False self._build_only_mode = build_only_mode + self._detect_local_gpu_arch = detect_local_gpu_arch self._system_context_initialized = False self._gpu_tool_manager = None # Lazy initialization @@ -137,17 +142,22 @@ def __init__( self.init_runtime_context() else: # For build-only mode, only initialize what's needed for building - self.init_build_context() + self.init_build_context(detect_gpu_arch=self._detect_local_gpu_arch) ## ADD MORE CONTEXTS HERE ## - def init_build_context(self) -> None: + def init_build_context(self, detect_gpu_arch: bool = False) -> None: """Initialize build-specific context. This method sets up only the context needed for Docker builds, avoiding GPU detection that would fail on build-only nodes. System-specific contexts (host_os, numa_balancing, etc.) should be provided via --additional-context for build-only nodes if needed. 
+ + Args: + detect_gpu_arch: When True, attempt to auto-detect MAD_SYSTEM_GPU_ARCHITECTURE + from the local node and inject it into docker_build_arg. Fails gracefully + if no GPU is present (e.g., on a pure CI build node). """ print("Initializing build-only context...") @@ -168,9 +178,26 @@ def init_build_context(self) -> None: "Consider providing host_os via --additional-context if needed for build" ) - # Don't detect GPU-specific contexts in build-only mode - # These should be provided via additional_context if needed for build args - # (GPU arch guidance is emitted in BuildOrchestrator after model/Dockerfile discovery.) + # Optionally auto-detect GPU architecture for local full-workflow builds (build+run). + # Skipped for standalone `madengine build` on non-GPU/CI nodes (detect_gpu_arch=False). + if detect_gpu_arch and "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get("docker_build_arg", {}): + try: + from madengine.utils.gpu_validator import detect_gpu_vendor + from madengine.execution.dockerfile_utils import normalize_architecture_name + + vendor = detect_gpu_vendor(self._rocm_path) + if vendor in (GPUVendor.AMD, GPUVendor.NVIDIA): + manager = get_gpu_tool_manager(vendor, self._rocm_path) + raw_arch = manager.get_gpu_architecture() + arch = normalize_architecture_name(raw_arch) or raw_arch.strip() + self.ctx.setdefault("docker_build_arg", {})["MAD_SYSTEM_GPU_ARCHITECTURE"] = arch + print(f"Auto-detected GPU architecture for build: {arch}") + else: + print("Warning: No supported GPU detected; MAD_SYSTEM_GPU_ARCHITECTURE will not be set automatically.") + print("Consider providing it via --additional-context if needed for build args.") + except Exception as e: + print(f"Warning: Could not auto-detect GPU architecture for build: {e}") + print("Consider providing MAD_SYSTEM_GPU_ARCHITECTURE via --additional-context if needed for build args.") # Don't initialize NUMA balancing check for build-only nodes # This is runtime-specific and should be handled on execution nodes diff 
--git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index da06f91f..0825ab16 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -46,13 +46,17 @@ class BuildOrchestrator: - Save deployment_config from --additional-context """ - def __init__(self, args, additional_context: Optional[Dict] = None): + def __init__(self, args, additional_context: Optional[Dict] = None, detect_local_gpu_arch: bool = False): """ Initialize build orchestrator. Args: args: CLI arguments namespace additional_context: Dict from --additional-context (merged with args if present) + detect_local_gpu_arch: When True, auto-detect MAD_SYSTEM_GPU_ARCHITECTURE from the + local node before building. Intended for full workflow (build+run) on a local + single node. Has no effect if the user already provided the value via + --additional-context. Default False preserves existing standalone-build behavior. """ self.args = args self.console = Console(live_output=getattr(args, "live_output", True)) @@ -120,7 +124,9 @@ def __init__(self, args, additional_context: Optional[Dict] = None): )) self.rich_console.print() - # Initialize context in build-only mode (no GPU detection) + # Initialize context in build-only mode (no GPU detection by default). + # Pass detect_local_gpu_arch so Context.init_build_context() can optionally + # auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for full workflow (build+run) runs. 
# Context expects additional_context as a string representation of Python dict # Use repr() instead of json.dumps() because Context uses ast.literal_eval() # Use self.additional_context (post-ConfigLoader), not pre-defaults merged_context @@ -128,6 +134,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None): self.context = Context( additional_context=context_string, build_only_mode=True, + detect_local_gpu_arch=detect_local_gpu_arch, ) # Load credentials if available @@ -288,6 +295,12 @@ def execute( ) self._warn_if_mad_arch_unresolved_for_dockerfiles(models, builder) + resolved_arch = self.context.ctx.get("docker_build_arg", {}).get("MAD_SYSTEM_GPU_ARCHITECTURE") + if resolved_arch: + self.rich_console.print( + f"[green]✓ MAD_SYSTEM_GPU_ARCHITECTURE resolved: {resolved_arch}[/green]\n" + ) + # Step 3: Build Docker images self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]") diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 6725a457..67749514 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -345,7 +345,16 @@ def _build_phase(self, tags: list, registry: Optional[str] = None) -> str: # Update args with tags self.args.tags = tags - build_orch = BuildOrchestrator(self.args, self.additional_context) + # detect_local_gpu_arch=True: full workflow on a local single node — auto-detect + # MAD_SYSTEM_GPU_ARCHITECTURE before the build so Dockerfiles that require it + # (ARG MAD_SYSTEM_GPU_ARCHITECTURE with no default) are built correctly without + # requiring the user to manually pass --additional-context. + # The user's explicitly provided value (if any) is still respected and not overridden. 
+ build_orch = BuildOrchestrator( + self.args, + self.additional_context, + detect_local_gpu_arch=True, + ) manifest_file = build_orch.execute( registry=registry, clean_cache=getattr(self.args, "clean_docker_cache", False), diff --git a/tests/unit/test_context_logic.py b/tests/unit/test_context_logic.py index 7f50f491..17d1de5d 100644 --- a/tests/unit/test_context_logic.py +++ b/tests/unit/test_context_logic.py @@ -7,9 +7,10 @@ """ import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock, MagicMock, patch from madengine.core.context import Context +from madengine.utils.gpu_validator import GPUVendor @pytest.mark.unit @@ -94,4 +95,97 @@ def test_build_only_no_mad_arch_info_line(self, mock_host, mock_ctx): assert not any("MAD_SYSTEM_GPU_ARCHITECTURE" in m for m in msgs) -# Total: 5 unit tests +def _make_build_only_ctx(additional_context="{}") -> Context: + """Create a Context in build_only_mode with __init__'s init_build_context call suppressed. + + Returns a fully constructed Context whose ctx dict is populated from additional_context + but whose init_build_context has NOT yet run, so callers can invoke it in a controlled way. + """ + with patch.object(Context, "init_build_context"), \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"): + ctx = Context(additional_context=additional_context, build_only_mode=True) + return ctx + + +@pytest.mark.unit +class TestBuildContextGpuArchAutoDetect: + """Test GPU architecture auto-detection in init_build_context (detect_gpu_arch=True).""" + + def test_auto_detect_injects_arch_when_absent(self): + """Auto-detected arch should be injected into docker_build_arg when absent.""" + ctx = _make_build_only_ctx() + + manager = MagicMock() + manager.get_gpu_architecture.return_value = "gfx942" + + # get_gpu_tool_manager is a module-level import in context.py; patch it there. 
+ # detect_gpu_vendor / normalize_architecture_name are imported locally inside + # init_build_context, so patch them at their source modules. + with patch("madengine.core.context.get_gpu_tool_manager", return_value=manager), \ + patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.AMD), \ + patch("madengine.execution.dockerfile_utils.normalize_architecture_name", return_value="gfx942"), \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"): + ctx.init_build_context(detect_gpu_arch=True) + + assert ctx.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx942" + + def test_auto_detect_does_not_override_user_value(self): + """User-provided MAD_SYSTEM_GPU_ARCHITECTURE must not be overridden.""" + ctx = _make_build_only_ctx( + additional_context="{'docker_build_arg': {'MAD_SYSTEM_GPU_ARCHITECTURE': 'gfx90a'}}" + ) + + manager = MagicMock() + manager.get_gpu_architecture.return_value = "gfx942" + + with patch("madengine.core.context.get_gpu_tool_manager", return_value=manager), \ + patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.AMD), \ + patch("madengine.execution.dockerfile_utils.normalize_architecture_name", return_value="gfx942"), \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"): + ctx.init_build_context(detect_gpu_arch=True) + + # User value must be preserved; auto-detect must not overwrite it. 
+ assert ctx.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx90a" + + def test_auto_detect_warns_on_no_gpu(self): + """Should warn (not crash) when no supported GPU is detected.""" + ctx = _make_build_only_ctx() + + with patch("madengine.utils.gpu_validator.detect_gpu_vendor", return_value=GPUVendor.UNKNOWN), \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"), \ + patch("builtins.print") as mock_print: + ctx.init_build_context(detect_gpu_arch=True) + + msgs = [str(c.args[0]) for c in mock_print.call_args_list if c.args] + assert any("No supported GPU detected" in m for m in msgs) + assert "MAD_SYSTEM_GPU_ARCHITECTURE" not in ctx.ctx.get("docker_build_arg", {}) + + def test_auto_detect_handles_exception_gracefully(self): + """Detection failure should warn, not raise.""" + ctx = _make_build_only_ctx() + + with patch("madengine.utils.gpu_validator.detect_gpu_vendor", side_effect=RuntimeError("rocminfo not found")), \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"), \ + patch("builtins.print") as mock_print: + ctx.init_build_context(detect_gpu_arch=True) + + msgs = [str(c.args[0]) for c in mock_print.call_args_list if c.args] + assert any("Could not auto-detect GPU architecture" in m for m in msgs) + assert "MAD_SYSTEM_GPU_ARCHITECTURE" not in ctx.ctx.get("docker_build_arg", {}) + + def test_no_detection_when_flag_is_false(self): + """detect_gpu_arch=False should skip detection entirely.""" + ctx = _make_build_only_ctx() + + with patch("madengine.utils.gpu_validator.detect_gpu_vendor") as mock_detect, \ + patch.object(Context, "get_ctx_test", return_value="test"), \ + patch.object(Context, "get_host_os", return_value="linux"): + ctx.init_build_context(detect_gpu_arch=False) + + mock_detect.assert_not_called() + assert "MAD_SYSTEM_GPU_ARCHITECTURE" not in ctx.ctx.get("docker_build_arg", {})