diff --git a/.gitignore b/.gitignore index 1285852..e2d872c 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,10 @@ venv.bak/ # Results *.csv *.json +!models.json +!perf_super.json +!perf_entry_super.json +!perf_entry.json *.log *.out *.html diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..6907329 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "scripts/Primus"] + path = scripts/Primus + url = https://github.com/AMD-AGI/Primus diff --git a/README.md b/README.md index 40b3a62..2e98cc7 100644 --- a/README.md +++ b/README.md @@ -24,11 +24,13 @@ Below are blueprints of supported models along with their documentation. | [**vLLM disaggregated P/D inference**](scripts/vllm_dissag/README.MD) | Distributed Inference P/D disaggregation with vLLM | DeepSeek-V3, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, gpt-oss-120b | | [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 | | [**KVCache Transfer Bench**](scripts/kvcache_transfer_bench/README.md) | Inter-node Transfer Benchmark | no specific models | +| [**Primus pretrain**](#primus-pretrain) | LLM pretraining through the [Primus](https://github.com/AMD-AGI/Primus) launcher (Megatron, TorchTitan, MaxText, and other backends) | Config-driven; see `scripts/Primus/examples/` | ## Table of Contents - [Prerequisites](#prerequisites) - [Quick Start](#quick-start) +- [Primus pretrain](#primus-pretrain) - [Usage Guide](#usage-guide) - [Running Models](#running-models) - [Tag Functionality](#tag-functionality) @@ -50,11 +52,15 @@ Below are blueprints of supported models along with their documentation. ## Quick Start -1. **Clone the repository**: +1. 
**Clone the repository** (include the Primus submodule if you use [Primus pretrain](#primus-pretrain)): ```bash - git clone <repository-url> + git clone --recurse-submodules <repository-url> cd MAD ``` + If you already cloned without submodules, initialize Primus with: + ```bash + git submodule update --init scripts/Primus + ``` 2. **Install dependencies**: ```bash @@ -66,7 +72,20 @@ Below are blueprints of supported models along with their documentation. madengine run --tags pyt_huggingface_bert ``` +## Primus pretrain + +MAD integrates [AMD-AGI/Primus](https://github.com/AMD-AGI/Primus) as a Git submodule at **`scripts/Primus`**. The **`primus_pretrain`** entry in `models.json` uses **`docker/primus.ubuntu.amd.Dockerfile`** and **`scripts/primus_pretrain/`** (`run.sh` wraps Primus `examples/run_pretrain.sh`, copies logs under the madengine run directory, and writes **`primus_perf_output.csv`** for throughput / TFLOPs / MFU when logs include those metrics). + +- **Run with madengine** (tags include `primus`, `training`, `pretrain`): + ```bash + madengine run --tags primus_pretrain + ``` +- **Choose a config**: pass Primus YAML via script args, e.g. `--config_path examples/torchtitan/configs/MI300X/your_config.yaml` (path is relative to the Primus repo root). For SLURM or Kubernetes, you can set **`PRIMUS_CONFIG_PATH`** to the same path instead. +- **Hugging Face–backed configs**: set **`HF_TOKEN`**, or **`MAD_SECRET_HFTOKEN`** (madengine v2 can inject the latter via `additional_context.docker_env_vars`). +- **Docker build**: build from the **repository root** so `COPY scripts/Primus/` in `docker/primus.ubuntu.amd.Dockerfile` resolves; `madengine build` uses repo context for Dockerfiles whose path contains `primus`. +- **Optional discovery**: `scripts/primus_pretrain/get_models_json.py` can expose individual Primus example YAMLs as separate models when used with madengine’s discover-models flow. 
+For more detail, see comments in `docker/primus.ubuntu.amd.Dockerfile` and `scripts/primus_pretrain/run.sh`. ## Usage Guide diff --git a/docker/primus.ubuntu.amd.Dockerfile b/docker/primus.ubuntu.amd.Dockerfile new file mode 100644 index 0000000..52d0122 --- /dev/null +++ b/docker/primus.ubuntu.amd.Dockerfile @@ -0,0 +1,42 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# Primus launcher for MAD: one image for all Primus pretrain configs (torchtitan, megatron, MaxText, …). +# +# Build context must be the repo root (so COPY scripts/Primus works). Manual: +# docker build -f docker/primus.ubuntu.amd.Dockerfile . +# `madengine build` uses context `.` for models whose dockerfile path contains "primus" +# (see DockerBuilder.get_context_path in madengine). +# +# PRIMUS_ROOT is /workspace/Primus (Primus repo root: examples/run_pretrain.sh, examples//…). +# WORKSPACE_DIR is the generic working directory /workspace; madengine places manifests and +# run_directory there. Do not set PRIMUS_ROOT=/workspace — that would collide with those files. +# +# Kubernetes: the Job mounts an emptyDir on /workspace, so image layers under /workspace are not +# visible in the pod. Madengine bundles `scripts/Primus/examples/...` into the ConfigMap as +# `Primus/examples/...` so the init container recreates /workspace/Primus (see madengine k8s). +# +# Local Docker / SLURM: bind-mount or shared filesystem provides scripts/Primus; run.sh prefers +# that checkout when present, else uses PRIMUS_ROOT from this image. +ARG BASE_DOCKER=docker.io/rocm/primus:v26.1 + +FROM $BASE_DOCKER + +USER root + +ENV WORKSPACE_DIR=/workspace +ENV PRIMUS_ROOT=/workspace/Primus + +RUN mkdir -p "$WORKSPACE_DIR" +WORKDIR $WORKSPACE_DIR + +LABEL mad.launcher=primus + +# rocm/primus base often has /workspace/Primus as a full git clone (.git is a directory). +# A submodule checkout uses .git as a file (gitlink). COPY cannot replace that tree — remove first. 
+RUN rm -rf /workspace/Primus + +# Bake Primus from the build context (submodule). No git clone — matches CI and local builds. +COPY scripts/Primus/ /workspace/Primus/ + +RUN test -f /workspace/Primus/examples/run_pretrain.sh + +RUN pip3 list 2>/dev/null || true diff --git a/models.json b/models.json index b24ac6c..9dfed92 100644 --- a/models.json +++ b/models.json @@ -1052,6 +1052,16 @@ "args": "--model_repo pyt_train_qwen3-32b" }, + { + "name": "primus_pretrain", + "dockerfile": "docker/primus", + "scripts": "scripts/primus_pretrain", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "bf16", + "tags": ["training", "primus", "megatron", "pretrain"], + "args": "" + }, { "name": "primus_pyt_train_llama-3.1-8b", "url": "", diff --git a/scripts/Primus b/scripts/Primus new file mode 160000 index 0000000..e50a78b --- /dev/null +++ b/scripts/Primus @@ -0,0 +1 @@ +Subproject commit e50a78b09599dac4d3f5f404d8551b54b5b5d83b diff --git a/scripts/primus_pretrain/extract_primus_perf.py b/scripts/primus_pretrain/extract_primus_perf.py new file mode 100755 index 0000000..e7634cc --- /dev/null +++ b/scripts/primus_pretrain/extract_primus_perf.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Extract Primus/Torchtitan performance metrics from training log and write +madengine multiple_results CSV (one row per metric). + +Expected log line (last step): + step: 50 loss: ... tps: 1,444 tflops: 300.32 mfu: 23.10% + +Output CSV format (model, performance, metric) — one row per metric: + model,performance,metric + primus_run,1444,tokens_per_second + primus_run,300.32,tflops + primus_run,23.10,model_flops_utilization +""" +import argparse +import csv +import re +import sys + + +def extract_metrics(log_path: str) -> dict: + """Parse log file and return tps, tflops, mfu from the last step line.""" + tps = tflops = mfu = None + # Match lines containing step, tps, tflops, mfu (e.g. 
Torchtitan/Primus format) + tps_re = re.compile(r"tps:\s*([0-9][0-9.,eE+-]*)") + tflops_re = re.compile(r"tflops:\s*([0-9][0-9.eE+-]*)") + mfu_re = re.compile(r"mfu:\s*([0-9][0-9.]*)%?") + + try: + with open(log_path, "r", encoding="utf-8", errors="ignore") as f: + for line in f: + if "tps:" in line and "tflops:" in line and "mfu:" in line: + m = tps_re.search(line) + if m: + tps = m.group(1).replace(",", "").strip() + m = tflops_re.search(line) + if m: + tflops = m.group(1).strip() + m = mfu_re.search(line) + if m: + mfu = m.group(1).strip() + except OSError as e: + print(f"Error reading log {log_path}: {e}", file=sys.stderr) + return {} + + return {"tps": tps, "tflops": tflops, "mfu": mfu} + + +def main(): + parser = argparse.ArgumentParser(description="Extract Primus perf metrics to multiple_results CSV") + parser.add_argument("log_path", help="Path to training log (e.g. output/log_mp_pretrain_*.txt)") + parser.add_argument("output_csv", help="Path to output CSV (e.g. run_directory/primus_perf_output.csv)") + parser.add_argument("--model-id", default="primus_run", help="Model id for the CSV rows") + args = parser.parse_args() + + metrics = extract_metrics(args.log_path) + if not metrics or metrics.get("tps") is None: + print("Warning: No tps/tflops/mfu found in log; writing empty rows.", file=sys.stderr) + metrics = {"tps": "", "tflops": "", "mfu": ""} + + # One row per metric: model, performance, metric + rows = [ + {"model": args.model_id, "performance": metrics.get("tps") or "", "metric": "tokens_per_second"}, + {"model": args.model_id, "performance": metrics.get("tflops") or "", "metric": "tflops"}, + {"model": args.model_id, "performance": metrics.get("mfu") or "", "metric": "model_flops_utilization"}, + ] + + with open(args.output_csv, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=["model", "performance", "metric"]) + writer.writeheader() + writer.writerows(rows) + + print( + f"Wrote {args.output_csv}: {len(rows)} rows 
(tokens_per_second={rows[0]['performance']}, " + f"tflops={rows[1]['performance']}, model_flops_utilization={rows[2]['performance']})" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/primus_pretrain/get_models_json.py b/scripts/primus_pretrain/get_models_json.py new file mode 100644 index 0000000..ddfc830 --- /dev/null +++ b/scripts/primus_pretrain/get_models_json.py @@ -0,0 +1,56 @@ +""" +Discover Primus example configs as madengine models (optional). + +Convention-based: globs examples/*/configs/**/*.yaml from the Primus submodule (scripts/Primus), +so all launchers (megatron, megatron_bridge, torchtitan, maxtext, moe_package, etc.) are +discovered. New launchers added under examples//configs/ are picked up automatically. +All discovered models use the same dockerfile and run.sh; args pass --config_path . +For SLURM/K8s, supply distributed (launcher, nnodes, primus.config_path) via additional_context. +""" +import os +import glob + +try: + from madengine.utils.discover_models import CustomModel # madengine v2 +except ImportError: + from madengine.tools.discover_models import CustomModel # madengine v1 + +# This file lives in scripts/primus_pretrain; Primus submodule is scripts/Primus +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +PRIMUS_ROOT = os.path.normpath(os.path.join(THIS_DIR, "..", "Primus")) +# One glob for all launchers: examples//configs/**/*.yaml +CONFIGS_GLOB = os.path.join(PRIMUS_ROOT, "examples", "*", "configs", "**", "*.yaml") + + +def list_models(): + models = [] + if not os.path.isdir(PRIMUS_ROOT): + return models + for yaml_path in sorted(glob.glob(CONFIGS_GLOB)): + rel_path = os.path.relpath(yaml_path, PRIMUS_ROOT) + # Path shape: examples//configs//.yaml + parts = rel_path.split(os.sep) + if len(parts) < 5: + continue + launcher = parts[1] # megatron, torchtitan, megatron_bridge, etc. + arch = parts[3] # MI300X, MI355X, etc. 
+ short_name = os.path.splitext(os.path.basename(yaml_path))[0] + # discover_models prefixes with dirname (primus_pretrain/), so no prefix here + name = f"{launcher}_{arch}_{short_name}" + tags = ["primus", launcher, arch, short_name] + models.append( + CustomModel( + name=name, + dockerfile="../../docker/primus", + scripts="run.sh", + data="", + n_gpus="8", + owner="mad.support@amd.com", + timeout=86400, + training_precision="bf16", + tags=tags, + args=f"--config_path {rel_path}", + multiple_results="primus_perf_output.csv", + ) + ) + return models diff --git a/scripts/primus_pretrain/run.sh b/scripts/primus_pretrain/run.sh new file mode 100755 index 0000000..cc3725e --- /dev/null +++ b/scripts/primus_pretrain/run.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# Wrapper for Primus pretrain when run via madengine (local, SLURM, or K8s). +# Sets EXP from PRIMUS_CONFIG_PATH or --config_path, infers BACKEND from path, +# then runs Primus examples/run_pretrain.sh. For HF-backed configs set HF_TOKEN +# or MAD_SECRET_HFTOKEN (e.g. via additional_context.docker_env_vars in madengine v2). +# Primus root: set PRIMUS_ROOT to override; else auto-detect. +# After training, extracts tps/tflops/mfu from log and writes primus_perf_output.csv for madengine multiple_results. 
set -euo pipefail

# Print an error message to stderr and abort with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print a non-fatal warning to stderr.
warn() {
  printf 'WARNING: %s\n' "$*" >&2
}

# Directory the script was invoked from. madengine runs
# `cd run_directory && bash run.sh ...`, so logs, checkpoints and the perf CSV
# are all written relative to this directory.
RUN_DIR="$(pwd)"

# Directory that contains this script (used to locate the sibling Primus
# checkout and, as a fallback, the perf extractor).
script_dir="$(cd "$(dirname -- "$0")" && pwd)"

# Primus root resolution. The first candidate that actually contains
# examples/run_pretrain.sh wins:
#   1) Sibling checkout ../Primus (repo submodule scripts/Primus; local Docker / SLURM)
#   2) /workspace/Primus (Dockerfile COPY and madengine K8s init)
#   3) PRIMUS_ROOT from the environment (image default)
#   4) Legacy locations /opt/primus and /workspace
# Note: a PRIMUS_ROOT taken from the environment does NOT override 1) or 2).
# It is only accepted when it really contains run_pretrain.sh, so a stale
# value no longer hides the legacy fallbacks.
readonly RUN_PRETRAIN_REL="examples/run_pretrain.sh"
if [[ -f "$script_dir/../Primus/$RUN_PRETRAIN_REL" ]]; then
  PRIMUS_ROOT="$(cd "$script_dir/../Primus" && pwd)"
elif [[ -f "/workspace/Primus/$RUN_PRETRAIN_REL" ]]; then
  PRIMUS_ROOT="/workspace/Primus"
elif [[ -n "${PRIMUS_ROOT:-}" && -f "$PRIMUS_ROOT/$RUN_PRETRAIN_REL" ]]; then
  : # keep the value supplied by the environment
elif [[ -f "/opt/primus/$RUN_PRETRAIN_REL" ]]; then
  PRIMUS_ROOT="/opt/primus"
elif [[ -f "/workspace/$RUN_PRETRAIN_REL" ]]; then
  PRIMUS_ROOT="/workspace"
else
  die "Could not find Primus run_pretrain.sh. Set PRIMUS_ROOT or use a repo with scripts/Primus submodule."
fi
export PRIMUS_ROOT

# EXP (required by Primus run_pretrain.sh): prefer PRIMUS_CONFIG_PATH
# (SLURM/K8s); otherwise take the value following the first `--config_path`
# argument; otherwise fall back to the default megatron example.
if [[ -n "${PRIMUS_CONFIG_PATH:-}" ]]; then
  EXP="$PRIMUS_CONFIG_PATH"
else
  EXP="examples/megatron/exp_pretrain.yaml"
  prev_arg=""
  for arg in "$@"; do
    if [[ "$prev_arg" == "--config_path" && -n "$arg" ]]; then
      EXP="$arg"
      break
    fi
    prev_arg="$arg"
  done
fi
export EXP

# Infer BACKEND from the EXP path so run_pretrain.sh picks the matching runner.
# Matching is case-insensitive on a path component; Primus expects the
# spelling "MaxText" for Jax/MaxText and lowercase names for the others.
exp_lower="$(printf '%s' "$EXP" | tr '[:upper:]' '[:lower:]')"
case "$exp_lower" in
  */maxtext/*) BACKEND="MaxText" ;;
  */torchtitan/*) BACKEND="torchtitan" ;;
  */megatron_bridge/*) BACKEND="megatron_bridge" ;;
  */moe_package/*) BACKEND="moe_package" ;;
  *) BACKEND="megatron" ;;
esac
export BACKEND

# HF_TOKEN for HF-backed configs: keep an existing HF_TOKEN, otherwise fall
# back to MAD_SECRET_HFTOKEN (madengine v2, additional_context.docker_env_vars).
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
elif [[ -n "${MAD_SECRET_HFTOKEN:-}" ]]; then
  export HF_TOKEN="$MAD_SECRET_HFTOKEN"
fi

# Redirect Primus output into the run directory: output/ holds logs,
# outputs/ holds checkpoints. TRAIN_LOG and DUMP_HLO_DIR are exported for
# run_pretrain.sh; a caller-provided DUMP_HLO_DIR is respected.
mkdir -p "$RUN_DIR/output" "$RUN_DIR/outputs"
TRAIN_LOG="$RUN_DIR/output/log_mp_pretrain_$(basename -- "$EXP" .yaml).txt"
export TRAIN_LOG
export DUMP_HLO_DIR="${DUMP_HLO_DIR:-$RUN_DIR/output/xla_dump_hlo}"

# Run from PRIMUS_ROOT so a relative EXP (e.g. examples/torchtitan/configs/...)
# resolves. Not exec'd, because the perf extractor must run afterwards.
# The exit status is captured with `|| exitcode=$?`: under `set -e` a plain
# `cmd; exitcode=$?` would abort the script before the status could be saved,
# skipping both the extractor and the final `exit "$exitcode"`.
cd "$PRIMUS_ROOT" || die "Cannot change directory to PRIMUS_ROOT: $PRIMUS_ROOT"
exitcode=0
bash "$PRIMUS_ROOT/examples/run_pretrain.sh" "$@" --job.dump_folder "$RUN_DIR/outputs" \
  || exitcode=$?

# Extract tps/tflops/mfu from the training log into primus_perf_output.csv.
# The CSV has columns model,performance,metric with one row per metric.
# Extraction is best-effort: problems are reported on stderr but never change
# the exit status, which always reflects the training run.
extractor=""
for candidate in "$RUN_DIR/extract_primus_perf.py" "$script_dir/extract_primus_perf.py"; do
  if [[ -f "$candidate" ]]; then
    extractor="$candidate"
    break
  fi
done

if [[ ! -f "$TRAIN_LOG" ]]; then
  warn "training log not found, skipping perf extraction: $TRAIN_LOG"
elif [[ -z "$extractor" ]]; then
  warn "extract_primus_perf.py not found in $RUN_DIR or $script_dir, skipping perf extraction"
elif ! python3 "$extractor" "$TRAIN_LOG" "$RUN_DIR/primus_perf_output.csv"; then
  warn "perf extraction failed for $TRAIN_LOG"
fi

exit "$exitcode"