From b2cdfeb83bfb4f9df8ea10bab751b1e157959b1d Mon Sep 17 00:00:00 2001 From: YongHwan Yoo Date: Sat, 25 Apr 2026 04:25:18 +0900 Subject: [PATCH 1/3] Add MIG scripts for g7e (RTX PRO 6000 Blackwell) on EKS Companion to the existing p5.48xlarge (H100) MIG guide. The smallest g7e size, g7e.2xlarge, ships a single RTX PRO 6000 Blackwell Server Edition and is the cheapest way to exercise MIG end-to-end on EKS. Scripts create a managed nodegroup, install the NVIDIA GPU operator with mig-manager (mixed strategy), partition the GPU (default all-1g.24gb -> 4 slices), run a smoke-test pod on a MIG slice, and tear everything down. README documents the gotchas we hit during validation: - G7e ICE is common and `describe-instance-type-offerings` lies about it, so SUBNETS can be used to pin a specific AZ after discovery. - AL2023 NVIDIA AMI ships containerd v3 but the gpu-operator v26 toolkit still emits a v2-style drop-in without `SystemdCgroup=true`, crashing every pod with "expected cgroupsPath ... slice:prefix:name". The operator script now writes a correct v3 drop-in and restarts containerd+kubelet so step 3 can actually advertise nvidia.com/mig-*. --- .../g7e-blackwell/01-create-nodegroup.sh | 88 +++++++++ .../g7e-blackwell/02-install-gpu-operator.sh | 170 ++++++++++++++++++ .../g7e-blackwell/03-apply-mig-config.sh | 67 +++++++ .../g7e-blackwell/04-test-mig.sh | 73 ++++++++ .../g7e-blackwell/99-cleanup.sh | 62 +++++++ .../g7e-blackwell/README.md | 135 ++++++++++++++ .../mig-gpu-partitioning/g7e-blackwell/env.sh | 44 +++++ 7 files changed, 639 insertions(+) create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh create mode 100644 2.projects/mig-gpu-partitioning/g7e-blackwell/README.md create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh new file mode 100755 index 0000000..70cd8ec --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# Create an EKS managed nodegroup with a single g7e.2xlarge instance +# (NVIDIA RTX PRO 6000 Blackwell Server Edition) and wait for it to become ACTIVE. +# +# WARNING: this starts an ON_DEMAND g7e.2xlarge — on-demand billing starts +# immediately. Run `./99-cleanup.sh` when done. +set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +log "Looking up VPC details for cluster $CLUSTER_NAME in $AWS_REGION" +cluster_json=$(aws eks describe-cluster --region "$AWS_REGION" --name "$CLUSTER_NAME") +subnet_ids=$(echo "$cluster_json" | jq -r '.cluster.resourcesVpcConfig.subnetIds[]') + +# Allow caller to pin a specific subnet set via SUBNETS="subnet-aaa,subnet-bbb". +# Useful when a given instance type has InsufficientInstanceCapacity in some AZs — +# g7e in particular is capacity-constrained and we saw us-west-2a return ICE. 
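+#
+# Example (hypothetical subnet ID; substitute one from your VPC):
+#   SUBNETS="subnet-0123456789abcdef0" ./01-create-nodegroup.sh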
+private_subnets=() +if [[ -n "${SUBNETS:-}" ]]; then + IFS=',' read -r -a private_subnets <<<"$SUBNETS" + log "Using SUBNETS override: ${private_subnets[*]}" +else + # Fall back to every private subnet (MapPublicIpOnLaunch=false) in the cluster VPC. + for s in $subnet_ids; do + is_public=$(aws ec2 describe-subnets --region "$AWS_REGION" --subnet-ids "$s" \ + --query 'Subnets[0].MapPublicIpOnLaunch' --output text) + if [[ "$is_public" == "False" ]]; then + private_subnets+=("$s") + fi + done + log "Private subnets: ${private_subnets[*]}" +fi + +# Reuse the IAM node role from the existing osmo-gpu-nodes nodegroup so we don't +# have to provision a new role / policies for this throwaway test. +log "Looking up an existing GPU nodegroup to copy its node IAM role" +existing_ng=$(aws eks list-nodegroups --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \ + --query 'nodegroups[?contains(@, `gpu`)] | [0]' --output text) +if [[ -z "$existing_ng" || "$existing_ng" == "None" ]]; then + echo "No existing GPU nodegroup found to copy node role from; aborting." >&2 + exit 1 +fi +node_role=$(aws eks describe-nodegroup --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$existing_ng" --query 'nodegroup.nodeRole' --output text) +log "Reusing node role from $existing_ng: $node_role" + +# Turn the label/taint env vars into CLI args. +labels_arg=$(python3 -c ' +import os, json +pairs = [kv.split("=", 1) for kv in os.environ["NODE_LABELS"].split(",") if kv] +print(json.dumps(dict(pairs))) +') +taints_arg="$NODE_TAINTS_JSON" + +log "Creating nodegroup $NODEGROUP_NAME (instance=$INSTANCE_TYPE, ami=$AMI_TYPE)" +aws eks create-nodegroup \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" \ + --scaling-config "minSize=$MIN_SIZE,maxSize=$MAX_SIZE,desiredSize=$DESIRED_SIZE" \ + --disk-size "$DISK_SIZE" \ + --subnets "${private_subnets[@]}" \ + --instance-types "$INSTANCE_TYPE" \ + --ami-type "$AMI_TYPE" \ + --capacity-type "$CAPACITY_TYPE" \ + --node-role "$node_role" \ + --labels "$labels_arg" \ + --taints "$taints_arg" \ + --tags "project=g7e-mig-test,owner=$USER" \ + --output json >/dev/null + +log "Waiting for nodegroup to become ACTIVE (this usually takes 3-5 minutes)..." +aws eks wait nodegroup-active \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" + +log "Nodegroup $NODEGROUP_NAME is ACTIVE. EC2 instance:" +aws ec2 describe-instances --region "$AWS_REGION" \ + --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \ + "Name=tag:eks:nodegroup-name,Values=$NODEGROUP_NAME" \ + "Name=instance-state-name,Values=running,pending" \ + --query 'Reservations[].Instances[].[InstanceId,InstanceType,PrivateIpAddress,State.Name]' \ + --output table + +log "Configure kubectl, then: kubectl get nodes -l node-type=g7e-mig" diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh new file mode 100755 index 0000000..d149ecc --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash +# Install the NVIDIA GPU Operator via Helm with MIG manager enabled (mixed strategy). +# The operator deploys the driver, container toolkit, device plugin, DCGM exporter, +# node feature discovery, and mig-manager. On Blackwell the operator picks the +# right driver automatically (open kernel modules / 570+ branch). 
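+#
+# Sanity check once the driver daemonset is up (pod name differs per cluster):
+#   kubectl exec -n gpu-operator <driver-pod> -- nvidia-smi --query-gpu=name,driver_version --format=csv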
+set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +if ! command -v helm >/dev/null; then + echo "helm is required but not installed" >&2 + exit 1 +fi + +log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION" +aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null + +log "Waiting for a Ready node with label node-type=g7e-mig" +for _ in $(seq 1 60); do + ready=$(kubectl get nodes -l node-type=g7e-mig \ + -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \ + 2>/dev/null | grep -c '^True$' || true) + if [[ "$ready" -ge 1 ]]; then + break + fi + sleep 10 +done +kubectl get nodes -l node-type=g7e-mig -L node.kubernetes.io/instance-type + +log "Adding NVIDIA helm repo and updating" +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia >/dev/null 2>&1 || true +helm repo update >/dev/null + +log "Installing gpu-operator into namespace $GPU_OPERATOR_NAMESPACE (release=$GPU_OPERATOR_RELEASE)" +# Key flags: +# mig.strategy=mixed -> expose differently-sized MIG partitions as distinct resources +# migManager.default=all-disabled -> start with MIG off; we flip it on via the node label +# once the operator is healthy (in 03-apply-mig-config.sh). +# migManager.WITH_REBOOT=true -> let mig-manager reboot the node when required (Blackwell +# often needs a reboot after the first MIG enable). +# driver.enabled=true -> install NVIDIA driver via the operator's driver container +# (the EKS-optimized NVIDIA AMI ships a driver, but the +# operator's driver container is what mig-manager is +# validated against; toolkit-only mode is flaky on Blackwell). +# +# If you prefer to keep the host driver (from the EKS AL2023 NVIDIA AMI) and skip the +# operator's driver container, override with DRIVER_ENABLED=false on the command line. +DRIVER_ENABLED="${DRIVER_ENABLED:-true}" + +helm upgrade --install "$GPU_OPERATOR_RELEASE" nvidia/gpu-operator \ + --namespace "$GPU_OPERATOR_NAMESPACE" --create-namespace \ + --set mig.strategy=mixed \ + --set migManager.enabled=true \ + --set migManager.default=all-disabled \ + --set migManager.env[0].name=WITH_REBOOT \ + --set-string migManager.env[0].value="true" \ + --set driver.enabled="$DRIVER_ENABLED" \ + --set toolkit.enabled=true \ + --set devicePlugin.enabled=true \ + --set dcgmExporter.enabled=true \ + --set nodeStatusExporter.enabled=true \ + --wait --timeout 20m + +log "Operator install complete. Pods:" +kubectl get pods -n "$GPU_OPERATOR_NAMESPACE" -o wide + +# --------------------------------------------------------------------------- +# AL2023 NVIDIA AMI ships containerd v3 with NO pre-configured `nvidia` +# runtime and no explicit SystemdCgroup setting, while the gpu-operator +# v26.x toolkit (nvcr.io/nvidia/k8s/container-toolkit:v1.19.0) generates a +# v2-style drop-in that doesn't carry SystemdCgroup=true. That mismatch causes +# EVERY gpu-operator pod on the node to crash with: +# runc create failed: expected cgroupsPath to be of format +# "slice:prefix:name" for systemd cgroups, got "/kubepods/..." instead +# and blocks nvidia-device-plugin from advertising nvidia.com/mig-* resources +# (it also takes aws-node down in the process, breaking pod networking). +# +# We paper over it by writing a correct v3 drop-in to +# /etc/containerd/conf.d/99-nvidia.toml (runc+nvidia, both with +# SystemdCgroup=true, nvidia BinaryName pointing at the toolkit-installed +# binary), then restarting containerd+kubelet. 
+# The existing config.toml on the
+# AL2023 NVIDIA AMI already `imports = ["/etc/containerd/conf.d/*.toml"]`.
+log "Applying AL2023+containerd-v3 runtime drop-in (needed for gpu-operator v26.x)"
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+if [[ -z "$node" ]]; then
+  echo "No g7e node found — cannot apply runtime fix" >&2
+  exit 1
+fi
+
+cat <<YAML | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: containerd-runtime-fix
+spec:
+  nodeName: $node
+  restartPolicy: Never
+  tolerations:
+  - operator: Exists   # tolerate the nvidia.com/gpu taint on the node
+  containers:
+  - name: fix
+    # Any small image with a shell works; we only chroot into the host.
+    image: public.ecr.aws/amazonlinux/amazonlinux:2023
+    securityContext:
+      privileged: true
+    command:
+    - sh
+    - -c
+    - |
+      cat > /host/etc/containerd/conf.d/99-nvidia.toml <<'EOF'
+      version = 3
+
+      [plugins."io.containerd.cri.v1.runtime".containerd]
+        default_runtime_name = "runc"
+
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc]
+        runtime_type = "io.containerd.runc.v2"
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options]
+        SystemdCgroup = true
+
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia]
+        runtime_type = "io.containerd.runc.v2"
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia.options]
+        BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
+        SystemdCgroup = true
+      EOF
+      # Remove any stale drop-in nvidia-ctk wrote (v2 schema, wrong
+      # BinaryName) before we bounce the daemons.
+      rm -f /host/etc/containerd/conf.d/nvidia.toml
+      chroot /host systemctl restart containerd
+      chroot /host systemctl restart kubelet
+      echo "runtime fix applied"
+    volumeMounts:
+    - name: host
+      mountPath: /host
+  volumes:
+  - name: host
+    hostPath:
+      path: /
+YAML
+
+log "Waiting for fix pod to complete"
+# kubelet restart on the same node makes the pod transition surprising; poll.
+for _ in $(seq 1 30); do
+  phase=$(kubectl get pod containerd-runtime-fix -o jsonpath='{.status.phase}' 2>/dev/null || true)
+  [[ "$phase" == "Succeeded" || "$phase" == "Failed" ]] && break
+  sleep 5
+done
+kubectl logs containerd-runtime-fix 2>&1 || true
+kubectl delete pod containerd-runtime-fix --ignore-not-found >/dev/null
+
+log "Waiting for nvidia-container-toolkit daemonset to report Ready on the node"
+for _ in $(seq 1 60); do
+  ready=$(kubectl -n "$GPU_OPERATOR_NAMESPACE" get pod \
+    -l app=nvidia-container-toolkit-daemonset \
+    --field-selector "spec.nodeName=$node" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "")
+  [[ "$ready" == "true" ]] && break
+  sleep 5
+done
+kubectl -n "$GPU_OPERATOR_NAMESPACE" get pods --field-selector "spec.nodeName=$node" -o wide
+
+log "Next: run ./03-apply-mig-config.sh to partition the GPU."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh
new file mode 100755
index 0000000..0231f1f
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Ask mig-manager to enable MIG on the g7e node and apply a partition profile.
+#
+# Blackwell RTX PRO 6000 Server Edition supports MIG with up to 4 GPU instances
+# per physical GPU (different granularity than A100's 7). Valid profiles depend on
+# the exact SKU & driver; query with `nvidia-smi mig -lgip` after the operator is up
+# (see: kubectl exec -n gpu-operator <driver-pod> -- nvidia-smi mig -lgip).
+#
+# Default profile: `all-1g.24gb` — 4 equal partitions, ~24 GiB each on the 96 GiB
+# RTX PRO 6000 Server Edition SKU that ships in g7e. Valid alternatives for this
+# SKU include `all-2g.48gb` (2 partitions), `all-4g.96gb` (1 MIG partition covering
+# the whole GPU), and `all-disabled`.
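+# (Naming: `<N>g.<M>gb` describes one GPU-instance shape, N compute slices and
+# ~M GiB of memory per instance; the `all-` prefix applies that shape to every
+# GPU on the node, packing in as many instances as fit.)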
+# Override with: MIG_PROFILE=all-2g.48gb ./03-apply-mig-config.sh
+set -euo pipefail
+cd "$(dirname "$0")"
+source ./env.sh
+
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+MIG_PROFILE="${MIG_PROFILE:-all-1g.24gb}"
+
+log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
+aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null
+
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+if [[ -z "$node" ]]; then
+  echo "No node with label node-type=g7e-mig found" >&2
+  exit 1
+fi
+log "Target node: $node"
+
+log "Available MIG profiles (from mig-parted configmap):"
+kubectl get configmap -n "$GPU_OPERATOR_NAMESPACE" default-mig-parted-config \
+  -o jsonpath='{.data.config\.yaml}' 2>/dev/null | \
+  grep -E '^\s{2}[a-zA-Z0-9._-]+:\s*$' | sed 's/^/  /' || \
+  log "(configmap not yet present — operator may still be bootstrapping)"
+
+log "Labeling $node with nvidia.com/mig.config=$MIG_PROFILE"
+kubectl label node "$node" "nvidia.com/mig.config=$MIG_PROFILE" --overwrite
+
+log "Watching mig.config.state (expect: pending -> rebooting (if needed) -> success)"
+# mig-manager may reboot the node, which pauses the watch. Poll with a timeout.
+deadline=$(( $(date +%s) + 20*60 ))
+last_state=""
+while [[ $(date +%s) -lt $deadline ]]; do
+  state=$(kubectl get node "$node" \
+    -o jsonpath='{.metadata.labels.nvidia\.com/mig\.config\.state}' 2>/dev/null || true)
+  if [[ "$state" != "$last_state" ]]; then
+    log "mig.config.state=$state"
+    last_state="$state"
+  fi
+  case "$state" in
+    success) break ;;
+    failed) echo "mig-manager reported FAILED — inspect: kubectl logs -n $GPU_OPERATOR_NAMESPACE -l app=nvidia-mig-manager" >&2; exit 1 ;;
+  esac
+  sleep 10
+done
+
+if [[ "$last_state" != "success" ]]; then
+  echo "Timed out waiting for mig.config.state=success (last=$last_state)" >&2
+  exit 1
+fi
+
+log "MIG partitioning done. Advertised GPU resources on the node:"
+kubectl get node "$node" -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/")))'
+
+log "Next: run ./04-test-mig.sh to schedule a pod onto a MIG slice."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh
new file mode 100755
index 0000000..437e69b
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Schedule a pod that requests one MIG slice and verifies nvidia-smi sees it.
+# Also prints the full partition map on the node.
+set -euo pipefail
+cd "$(dirname "$0")"
+source ./env.sh
+
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
+aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null
+
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+log "Target node: $node"
+
+# Discover the first nvidia.com/mig-* resource the node advertises.
+resource=$(kubectl get node "$node" -o json | \
+  jq -r '.status.allocatable | to_entries[] | select(.key | startswith("nvidia.com/mig-")) | .key' | head -n1)
+if [[ -z "$resource" ]]; then
+  echo "No nvidia.com/mig-* resources advertised on $node. Did 03-apply-mig-config.sh succeed?" >&2
+  exit 1
+fi
+log "Will request one of: $resource"
+
+cat <<YAML | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: mig-smoke-test
+spec:
+  restartPolicy: Never
+  nodeSelector:
+    node-type: g7e-mig
+  tolerations:
+  - key: nvidia.com/gpu
+    operator: Equal
+    value: "true"
+    effect: NoSchedule
+  containers:
+  - name: cuda
+    # Any recent CUDA base tag works; nvidia-smi is injected from the host driver.
+    image: nvidia/cuda:12.4.1-base-ubuntu22.04
+    command: ["nvidia-smi", "-L"]
+    resources:
+      limits:
+        $resource: 1
+YAML
+kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/mig-smoke-test --timeout=300s >/dev/null || true
+# Container is short-lived — tail logs once it terminates.
+for _ in $(seq 1 30); do + phase=$(kubectl get pod mig-smoke-test -o jsonpath='{.status.phase}') + [[ "$phase" == "Succeeded" || "$phase" == "Failed" ]] && break + sleep 5 +done + +log "--- pod logs (mig-smoke-test) ---" +kubectl logs mig-smoke-test || true + +log "--- node allocatable (nvidia.com/*) ---" +kubectl get node "$node" -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/")))' + +log "--- host-side nvidia-smi from the driver daemonset ---" +driver_pod=$(kubectl get pod -n "$GPU_OPERATOR_NAMESPACE" -l app=nvidia-driver-daemonset \ + --field-selector spec.nodeName="$node" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) +if [[ -n "$driver_pod" ]]; then + kubectl exec -n "$GPU_OPERATOR_NAMESPACE" "$driver_pod" -- nvidia-smi -L || true + kubectl exec -n "$GPU_OPERATOR_NAMESPACE" "$driver_pod" -- nvidia-smi mig -lgi || true +fi + +log "Cleanup: kubectl delete pod mig-smoke-test" diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh new file mode 100755 index 0000000..edbcc96 --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Tear everything down: smoke-test pod, helm release, and the EKS nodegroup. +# This stops g7e.2xlarge on-demand billing. Safe to rerun. +set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION" +aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null || true + +log "Deleting smoke-test pod (if any)" +kubectl delete pod mig-smoke-test --ignore-not-found + +log "Uninstalling helm release $GPU_OPERATOR_RELEASE" +helm uninstall "$GPU_OPERATOR_RELEASE" -n "$GPU_OPERATOR_NAMESPACE" 2>/dev/null || true +kubectl delete namespace "$GPU_OPERATOR_NAMESPACE" --ignore-not-found + +log "Deleting nodegroup $NODEGROUP_NAME (this is what stops the billing)" +aws eks delete-nodegroup \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" >/dev/null 2>&1 || \ + log "Nodegroup $NODEGROUP_NAME not found (already deleted?)" + +log "Waiting for nodegroup deletion to complete" +aws eks wait nodegroup-deleted \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" 2>/dev/null || true + +log "Verify no g7e instances remain:" +aws ec2 describe-instances --region "$AWS_REGION" \ + --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \ + "Name=instance-type,Values=g7e.*" \ + "Name=instance-state-name,Values=running,pending" \ + --query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name]' --output table + +# Optionally delete scratch subnets you created to add AZ coverage for g7e capacity. +# Opt-in via EXTRA_SUBNETS="subnet-aaa,subnet-bbb" — the script will try to +# disassociate each from its route table and delete the subnet. +if [[ -n "${EXTRA_SUBNETS:-}" ]]; then + log "Cleaning up scratch subnets: $EXTRA_SUBNETS" + IFS=',' read -r -a extras <<<"$EXTRA_SUBNETS" + for s in "${extras[@]}"; do + # Find any route table associations for this subnet and drop them. 
+    assoc_ids=$(aws ec2 describe-route-tables --region "$AWS_REGION" \
+      --filters "Name=association.subnet-id,Values=$s" \
+      --query 'RouteTables[].Associations[?SubnetId==`'"$s"'`].RouteTableAssociationId' \
+      --output text 2>/dev/null || true)
+    for a in $assoc_ids; do
+      log "  disassociating $a from $s"
+      aws ec2 disassociate-route-table --region "$AWS_REGION" --association-id "$a" >/dev/null 2>&1 || true
+    done
+    log "  deleting subnet $s"
+    aws ec2 delete-subnet --region "$AWS_REGION" --subnet-id "$s" 2>&1 || \
+      log "  (delete-subnet failed for $s — it may still have ENIs attached; retry in a minute)"
+  done
+fi
+
+log "Done."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
new file mode 100644
index 0000000..dfa3e19
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
@@ -0,0 +1,135 @@
+# MIG on Amazon EC2 G7e (NVIDIA RTX PRO 6000 Blackwell Server Edition) with EKS
+
+---
+
+This directory is a companion to the parent [`mig-gpu-partitioning`](../) guide, which walks through MIG on a `p5.48xlarge` (H100). Here we do the same on the smallest G7e size — `g7e.2xlarge` — which ships a single **NVIDIA RTX PRO 6000 Blackwell Server Edition** GPU.
+
+Blackwell RTX PRO 6000 supports MIG with a different granularity than A100/H100: up to **4 GPU instances per physical GPU** (vs. 7 on A100/H100), with memory/compute slice sizes determined by the SKU and current driver. Because a `g7e.2xlarge` is a single-GPU node, this setup is the cheapest way to exercise MIG end-to-end on EKS.
+
+## Scripts
+
+| Script | What it does |
+|---|---|
+| `env.sh` | Shared environment (region, cluster, instance type, labels, taints). Source this; don't run it. |
+| `01-create-nodegroup.sh` | Creates an EKS managed nodegroup `g7e-mig-test` with one `g7e.2xlarge`, reusing the IAM role of an existing `*gpu*` nodegroup. |
+| `02-install-gpu-operator.sh` | `helm install` NVIDIA GPU Operator with `mig.strategy=mixed` and `migManager.enabled=true` (default `all-disabled` — we flip MIG on in step 3). |
+| `03-apply-mig-config.sh` | Labels the node `nvidia.com/mig.config=<profile>` so mig-manager partitions the GPU. Default profile: `all-1g.24gb` (4 equal 24 GiB partitions). |
+| `04-test-mig.sh` | Schedules a short-lived pod that requests one `nvidia.com/mig-*` slice, runs `nvidia-smi`, and dumps the node's advertised MIG resources. |
+| `99-cleanup.sh` | Uninstalls the operator and deletes the nodegroup — this is what stops g7e on-demand billing. |
+
+## Prerequisites
+
+- `aws` CLI with credentials for the target account (the scripts assume `us-west-2` / cluster `osmo`; override via env vars)
+- `kubectl`, `helm`, `jq`, and `python3` on `PATH`
+- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`)
+- G-instance vCPU quota in the target region (g7e is covered by the "Running On-Demand G and VT instances" quota)
+- Private subnets in an AZ where g7e actually has capacity (see the [Capacity](#capacity-insufficientinstancecapacity) section below — in us-west-2 we only found g7e.2xlarge capacity in us-west-2d during testing)
+
+> **Heads up on `AWS_REGION`**: the scripts default to `us-west-2` via `${AWS_REGION:-us-west-2}`, so if your shell already exports `AWS_REGION` to a different region, that value wins and the scripts will try to talk to a cluster in the wrong region. Either `unset AWS_REGION` before running, or prefix each command with `AWS_REGION=us-west-2`.
+
+## Usage
+
+```bash
+# 0. (Optional) override any defaults in env.sh
+export AWS_REGION=us-west-2
+export CLUSTER_NAME=osmo
+
+# If a specific AZ has g7e capacity, pin it:
+# export SUBNETS=subnet-0abc,subnet-0def
+
+# 1. Spin up the node (starts on-demand billing)
+./01-create-nodegroup.sh
+
+# 2. Install the NVIDIA GPU Operator with MIG manager
+./02-install-gpu-operator.sh
+
+# 3. Partition the GPU
+./03-apply-mig-config.sh
+# or pick a different profile:
+MIG_PROFILE=all-2g.48gb ./03-apply-mig-config.sh
+
+# 4. Run a smoke-test pod on a MIG slice
+./04-test-mig.sh
+
+# 5. Tear everything down
+./99-cleanup.sh
+```
+
+## Discovering valid MIG profiles
+
+The set of profiles accepted by `nvidia.com/mig.config` comes from the `default-mig-parted-config` ConfigMap that the operator installs.
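+Each named profile in that ConfigMap is a list of per-GPU specs in mig-parted's config schema. A representative entry looks roughly like this (illustrative sketch; the operator-shipped file adds `device-filter` entries per GPU model):
+
+```yaml
+mig-configs:
+  all-1g.24gb:
+    - devices: all          # apply to every GPU on the node
+      mig-enabled: true
+      mig-devices:
+        "1g.24gb": 4        # four 1g.24gb GPU instances per GPU
+```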
+To enumerate what's compiled in for Blackwell on your driver:
+
+```bash
+kubectl get configmap -n gpu-operator default-mig-parted-config \
+  -o jsonpath='{.data.config\.yaml}' | yq '.mig-configs | keys'
+```
+
+To see the raw per-GPU profile list from the driver itself:
+
+```bash
+DRIVER_POD=$(kubectl get pod -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath='{.items[0].metadata.name}')
+kubectl exec -n gpu-operator "$DRIVER_POD" -- nvidia-smi mig -lgip
+```
+
+For the `g7e.2xlarge` (RTX PRO 6000 Server Edition, 96 GiB) SKU we verified the following profiles work:
+
+| Profile | Partitions | Per-slice memory |
+|---|---|---|
+| `all-1g.24gb` (default) | 4 | ~24 GiB |
+| `all-2g.48gb` | 2 | ~48 GiB |
+| `all-4g.96gb` | 1 | ~95 GiB (whole GPU, MIG mode on) |
+
+If your Blackwell SKU differs (different memory capacity or newer driver), pick a profile from the ConfigMap that matches what `nvidia-smi mig -lgip` reports.
+
+## Capacity: `InsufficientInstanceCapacity`
+
+G7e is supply-constrained. `aws ec2 describe-instance-type-offerings` will happily list g7e.2xlarge in every AZ of a region even when AWS has zero actual capacity in those AZs — **offering availability ≠ real capacity**. A dry-run `RunInstances` doesn't check capacity either (it only validates syntax/IAM).
+
+When `01-create-nodegroup.sh` hits ICE, the nodegroup goes `CREATE_FAILED` with a health issue like:
+
+```
+AsgInstanceLaunchFailures: Could not launch On-Demand Instances.
+InsufficientInstanceCapacity - We currently do not have sufficient g7e.2xlarge
+capacity in the Availability Zone you requested (us-west-2a). ...
+You can currently get g7e.2xlarge capacity by ... choosing us-west-2b, us-west-2c, us-west-2d.
+```
+
+A `CREATE_FAILED` nodegroup can't recover — the ASG keeps retrying past EKS's internal create timeout, so you can even end up with an instance that launched **after** EKS already gave up. Always run `./99-cleanup.sh` (or `aws eks delete-nodegroup …`) immediately when you see `CREATE_FAILED` so you don't get billed for an orphan instance.
+
+Practical playbook:
+
+1. Run `./01-create-nodegroup.sh` with all private subnets (default) and let ICE tell you which AZ currently has capacity.
+2. Delete the failed nodegroup.
+3. Re-run pinned to just that AZ: `SUBNETS=subnet-xxxxxxxx ./01-create-nodegroup.sh`.
+4. If none of your VPC's private subnets are in an AZ with capacity, create one: an extra private subnet tagged `kubernetes.io/cluster/<cluster-name>=shared` and associated with an existing NAT route table. See `99-cleanup.sh` for the reverse (deleting the scratch subnet after teardown).
+
+## Gotcha: AL2023 NVIDIA AMI + containerd v3 + gpu-operator v26
+
+The EKS-optimized `AL2023_x86_64_NVIDIA` AMI ships **containerd v3**, which changes the plugin path layout (`io.containerd.cri.v1.runtime` instead of the v1/v2 `io.containerd.grpc.v1.cri`). The AMI's default `/etc/containerd/config.toml` does NOT include an explicit `SystemdCgroup = true` under the runc runtime — it relies on containerd's defaults.
+
+The NVIDIA GPU operator (as of v26.3) ships a toolkit container (`container-toolkit:v1.19.0`) whose `nvidia-ctk runtime configure` command still emits a **v2-style drop-in** without `SystemdCgroup`. When that drop-in merges with the v3 base config, runc receives a non-systemd-shaped cgroupsPath and every single pod on the node starts failing with:
+
+```
+runc create failed: expected cgroupsPath to be of format "slice:prefix:name"
+for systemd cgroups, got "/kubepods/besteffort/..." instead
+```
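+
+You can probe a node for the mismatch directly (sketch, run from an SSM/SSH session on the node): a `version = 3` config with no `SystemdCgroup = true` under the runc runtime means the merged config will produce exactly this failure.
+
+```bash
+containerd config dump | grep -E '^version|SystemdCgroup'
+```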
+
+This blocks the device plugin from ever advertising `nvidia.com/mig-*` and also takes `aws-node` down (breaking pod networking on the node).
+
+`02-install-gpu-operator.sh` papers over this automatically by writing a correct v3 drop-in to `/etc/containerd/conf.d/99-nvidia.toml` containing both runc and nvidia runtimes with `SystemdCgroup = true`, then restarting `containerd` and `kubelet`. Without this patch, steps 3 and 4 will stall forever. If you're running these scripts on a different Blackwell AMI (say, a future AL2023 image where AWS pre-wires the nvidia runtime) you can drop the patch — check `containerd config dump | grep SystemdCgroup` and `crictl info` for the runtime section first.
+
+## Notes & gotchas
+
+- **Blackwell reboot**: mig-manager is configured with `WITH_REBOOT=true`. The very first MIG enable on a Blackwell node may require a reboot before the partitions become visible; the node will cycle and the script will keep polling `nvidia.com/mig.config.state`. (In our testing on the `g7e.2xlarge` / driver 580.126 combo, the first enable succeeded without a reboot.)
+- **Taints**: the nodegroup is tainted `nvidia.com/gpu=true:NoSchedule`. Pods must tolerate this to land on the node (step 4's smoke-test does).
+- **Billing**: `g7e.2xlarge` is on-demand. The only way to stop billing is to delete the nodegroup — run `./99-cleanup.sh`. Verify no `g7e.*` instances remain at the end (the script prints a table).
+- **EKS access for `kubectl`**: if `kubectl get nodes` fails with `the server has asked for the client to provide credentials`, your IAM principal doesn't have a cluster access entry yet. On a cluster with `authenticationMode=API` or `API_AND_CONFIG_MAP`, run:
+  ```bash
+  aws eks create-access-entry --cluster-name "$CLUSTER_NAME" \
+    --principal-arn arn:aws:iam::<account-id>:user/<user-name> --type STANDARD
+  aws eks associate-access-policy --cluster-name "$CLUSTER_NAME" \
+    --principal-arn arn:aws:iam::<account-id>:user/<user-name> \
+    --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \
+    --access-scope type=cluster
+  ```
+- **Why reuse an IAM role?**: the operator needs `AmazonEKSWorkerNodePolicy`, `AmazonEC2ContainerRegistryReadOnly`, and `AmazonEKS_CNI_Policy` on the node role. Reusing the role attached to an existing GPU nodegroup avoids provisioning a one-off role for a throwaway test. For a permanent setup, define your own.
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh
new file mode 100755
index 0000000..bb09f1b
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Shared environment variables for the g7e (RTX PRO 6000 Blackwell) MIG test scripts.
+# Source this file from each step script: `source ./env.sh`
+
+# NOTE on AWS_REGION: because of the default-if-unset (:-) syntax below, if your shell
+# already exports AWS_REGION to something else (e.g. us-east-1), that value wins and
+# the scripts will try to talk to the cluster in the wrong region. Either
+# `unset AWS_REGION` before running, or prefix each command with AWS_REGION=us-west-2.
+export AWS_REGION="${AWS_REGION:-us-west-2}"
+export CLUSTER_NAME="${CLUSTER_NAME:-osmo}"
+
+# Optional AZ pinning: comma-separated list of subnet IDs passed to
+# `aws eks create-nodegroup --subnets`. Useful when g7e has ICE in some AZs
+# (see README "Capacity" section). When unset, the script falls back to every
+# private subnet it can find in the cluster VPC.
+# e.g. export SUBNETS=subnet-0abc,subnet-0def
+# export SUBNETS=
+
+# g7e.2xlarge = 1 x NVIDIA RTX PRO 6000 Blackwell Server Edition, 8 vCPU, 32 GiB.
+# Smallest g7e size, sufficient to exercise MIG partitioning on a single GPU.
+export INSTANCE_TYPE="${INSTANCE_TYPE:-g7e.2xlarge}"
+export NODEGROUP_NAME="${NODEGROUP_NAME:-g7e-mig-test}"
+
+# AL2023 NVIDIA is the recommended path for new GPU node groups on EKS 1.30+.
+# AMI_TYPE=AL2023_x86_64_NVIDIA picks the latest EKS-optimized Blackwell-capable image.
+export AMI_TYPE="${AMI_TYPE:-AL2023_x86_64_NVIDIA}"
+
+export DISK_SIZE="${DISK_SIZE:-100}"
+export CAPACITY_TYPE="${CAPACITY_TYPE:-ON_DEMAND}"
+export DESIRED_SIZE="${DESIRED_SIZE:-1}"
+export MIN_SIZE="${MIN_SIZE:-0}"
+export MAX_SIZE="${MAX_SIZE:-1}"
+
+# MIG-specific node labels. The NVIDIA GPU operator's mig-manager watches
+# `nvidia.com/mig.config` and (re)partitions the GPU when the value changes.
+export NODE_LABELS="node-type=g7e-mig,nvidia.com/mig.config=all-disabled"
+
+# Taint keeps non-GPU workloads off this node. Pods must tolerate
+# nvidia.com/gpu=true:NoSchedule to land here.
+export NODE_TAINTS_JSON='[{"key":"nvidia.com/gpu","value":"true","effect":"NO_SCHEDULE"}]'
+
+# Helm release name & namespace for the NVIDIA GPU operator.
+export GPU_OPERATOR_NAMESPACE="${GPU_OPERATOR_NAMESPACE:-gpu-operator}"
+export GPU_OPERATOR_RELEASE="${GPU_OPERATOR_RELEASE:-gpu-operator}"

From 96b6e2a48826163b849d4ae080990c767a029229 Mon Sep 17 00:00:00 2001
From: YongHwan Yoo
Date: Sat, 25 Apr 2026 12:23:37 +0900
Subject: [PATCH 2/3] Link g7e-blackwell MIG guide from top-level and
 2.projects READMEs

Follow CONTRIBUTING.md guidance to list new projects in both README
indexes.

Co-Authored-By: Claude Opus 4.7
---
 2.projects/README.md | 3 ++-
 README.md            | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/2.projects/README.md b/2.projects/README.md
index 1e4a74e..d05cb80 100644
--- a/2.projects/README.md
+++ b/2.projects/README.md
@@ -94,6 +94,7 @@ These days, the challenge with ML Inference workloads, is that not all workloads
 In 2020, NVIDIA released Multi-Instance GPU (MIG), alongside the Ampere Architecture that powers the NVIDIA A100 (EC2 P4) and NVIDIA A10G (EC2 G5) GPUs. With MIG, administrators can partition a single GPU into multiple smaller GPU units (called “MIG devices”). Each of these smaller GPU units are fully isolated, with their own high-bandwidth memory, cache, and compute cores.
 
 ### Files & Directories
-1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Yes, this process is simple enough to only have a README! Note: This project only shows you how to set MIG up, and assumes you already have the cluster(s) set up, and your deployment ready to go.
+1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Walkthrough for MIG on a `p5.48xlarge` (H100). Assumes the cluster(s) are already set up.
+2. [g7e-blackwell/](https://github.com/aws-samples/awsome-inference/tree/main/2.projects/mig-gpu-partitioning/g7e-blackwell): Companion bash scripts that exercise MIG on `g7e.2xlarge` (1 × NVIDIA RTX PRO 6000 Blackwell Server Edition, 96 GiB) on EKS — the smallest/cheapest single-GPU setup to validate MIG end-to-end. Creates a managed nodegroup, installs the NVIDIA GPU Operator (with mig-manager), partitions the GPU (default `all-1g.24gb` → 4 × 24 GiB slices), runs a smoke-test pod on a MIG slice, and tears everything down. Includes a workaround for the AL2023 NVIDIA AMI + containerd v3 + gpu-operator v26 cgroup mismatch.
diff --git a/README.md b/README.md
index 4b4c103..cfcfb5b 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ These examples shows how to deploy LLMs like T5, Mistral using NVIDIA Triton TRT
 
 ### MIG
 
-This directory contains a README on how you can leverage Multi-Instance GPUs (MIGs) to partition your GPUs based on your workload. For more detailed documentation, check out the [MIG user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
+This directory contains a README on how you can leverage Multi-Instance GPUs (MIGs) to partition your GPUs based on your workload. The [`g7e-blackwell/`](2.projects/mig-gpu-partitioning/g7e-blackwell) sub-directory provides end-to-end bash scripts that exercise MIG on the smallest G7e size (`g7e.2xlarge`, 1 × NVIDIA RTX PRO 6000 Blackwell Server Edition) on EKS. For more detailed documentation, check out the [MIG user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
 
 ## USE-CASES
 These are real life use-case examples on using projects from `2.PROJECTS/` to demonstrate catering the projects to real-life scenarios.

From 3e5ad5992f3d3a22d55367b1b9781465dbd48f9e Mon Sep 17 00:00:00 2001
From: YongHwan Yoo
Date: Sat, 25 Apr 2026 14:31:29 +0900
Subject: [PATCH 3/3] Align g7e-blackwell prereqs with repo convention

Call out that scripts assume an existing EKS cluster (same convention
used by every other 2.projects/* guide) and link to 1.infrastructure/
for the cluster setup. Drop the fork-specific default cluster name.

Co-Authored-By: Claude Opus 4.7
---
 2.projects/mig-gpu-partitioning/g7e-blackwell/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
index dfa3e19..df9ffe2 100644
--- a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
@@ -19,9 +19,11 @@ Blackwell RTX PRO 6000 supports MIG with a different granularity than A100/H100:
 
 ## Prerequisites
 
-- `aws` CLI with credentials for the target account (the scripts assume `us-west-2` / cluster `osmo`; override via env vars)
+These scripts assume an EKS cluster is already up. If you don't have one, start from the [`1.infrastructure/`](../../../1.infrastructure) guide to provision a VPC + EKS cluster. You'll then need:
+
+- `aws` CLI with credentials for the target account (defaults to `us-west-2`; override via `AWS_REGION` / `CLUSTER_NAME`)
 - `kubectl`, `helm`, `jq`, and `python3` on `PATH`
-- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`)
+- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`). For a permanent setup you should provision a dedicated node role instead — see the note at the bottom.
 - G-instance vCPU quota in the target region (g7e is covered by the "Running On-Demand G and VT instances" quota)
 - Private subnets in an AZ where g7e actually has capacity (see the [Capacity](#capacity-insufficientinstancecapacity) section below — in us-west-2 we only found g7e.2xlarge capacity in us-west-2d during testing)