From b2cdfeb83bfb4f9df8ea10bab751b1e157959b1d Mon Sep 17 00:00:00 2001 From: YongHwan Yoo Date: Sat, 25 Apr 2026 04:25:18 +0900 Subject: [PATCH 1/3] Add MIG scripts for g7e (RTX PRO 6000 Blackwell) on EKS Companion to the existing p5.48xlarge (H100) MIG guide. The smallest g7e size, g7e.2xlarge, ships a single RTX PRO 6000 Blackwell Server Edition and is the cheapest way to exercise MIG end-to-end on EKS. Scripts create a managed nodegroup, install the NVIDIA GPU operator with mig-manager (mixed strategy), partition the GPU (default all-1g.24gb -> 4 slices), run a smoke-test pod on a MIG slice, and tear everything down. README documents the gotchas we hit during validation: - G7e ICE is common and `describe-instance-type-offerings` lies about it, so SUBNETS can be used to pin a specific AZ after discovery. - AL2023 NVIDIA AMI ships containerd v3 but the gpu-operator v26 toolkit still emits a v2-style drop-in without `SystemdCgroup=true`, crashing every pod with "expected cgroupsPath ... slice:prefix:name". The operator script now writes a correct v3 drop-in and restarts containerd+kubelet so step 3 can actually advertise nvidia.com/mig-*. --- .../g7e-blackwell/01-create-nodegroup.sh | 88 +++++++++ .../g7e-blackwell/02-install-gpu-operator.sh | 170 ++++++++++++++++++ .../g7e-blackwell/03-apply-mig-config.sh | 67 +++++++ .../g7e-blackwell/04-test-mig.sh | 73 ++++++++ .../g7e-blackwell/99-cleanup.sh | 62 +++++++ .../g7e-blackwell/README.md | 135 ++++++++++++++ .../mig-gpu-partitioning/g7e-blackwell/env.sh | 44 +++++ 7 files changed, 639 insertions(+) create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh create mode 100644 2.projects/mig-gpu-partitioning/g7e-blackwell/README.md create mode 100755 2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh new file mode 100755 index 0000000..70cd8ec --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/01-create-nodegroup.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# Create an EKS managed nodegroup with a single g7e.2xlarge instance +# (NVIDIA RTX PRO 6000 Blackwell Server Edition) and wait for it to become ACTIVE. +# +# WARNING: this starts an ON_DEMAND g7e.2xlarge — on-demand billing starts +# immediately. Run `./99-cleanup.sh` when done. +set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +log "Looking up VPC details for cluster $CLUSTER_NAME in $AWS_REGION" +cluster_json=$(aws eks describe-cluster --region "$AWS_REGION" --name "$CLUSTER_NAME") +subnet_ids=$(echo "$cluster_json" | jq -r '.cluster.resourcesVpcConfig.subnetIds[]') + +# Allow caller to pin a specific subnet set via SUBNETS="subnet-aaa,subnet-bbb". +# Useful when a given instance type has InsufficientInstanceCapacity in some AZs — +# g7e in particular is capacity-constrained and we saw us-west-2a return ICE. 
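+#
+# Example (hypothetical subnet ID; substitute one from your VPC):
+#   SUBNETS="subnet-0123456789abcdef0" ./01-create-nodegroup.sh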
+private_subnets=() +if [[ -n "${SUBNETS:-}" ]]; then + IFS=',' read -r -a private_subnets <<<"$SUBNETS" + log "Using SUBNETS override: ${private_subnets[*]}" +else + # Fall back to every private subnet (MapPublicIpOnLaunch=false) in the cluster VPC. + for s in $subnet_ids; do + is_public=$(aws ec2 describe-subnets --region "$AWS_REGION" --subnet-ids "$s" \ + --query 'Subnets[0].MapPublicIpOnLaunch' --output text) + if [[ "$is_public" == "False" ]]; then + private_subnets+=("$s") + fi + done + log "Private subnets: ${private_subnets[*]}" +fi + +# Reuse the IAM node role from the existing osmo-gpu-nodes nodegroup so we don't +# have to provision a new role / policies for this throwaway test. +log "Looking up an existing GPU nodegroup to copy its node IAM role" +existing_ng=$(aws eks list-nodegroups --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \ + --query 'nodegroups[?contains(@, `gpu`)] | [0]' --output text) +if [[ -z "$existing_ng" || "$existing_ng" == "None" ]]; then + echo "No existing GPU nodegroup found to copy node role from; aborting." >&2 + exit 1 +fi +node_role=$(aws eks describe-nodegroup --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$existing_ng" --query 'nodegroup.nodeRole' --output text) +log "Reusing node role from $existing_ng: $node_role" + +# Turn the label/taint env vars into CLI args. +labels_arg=$(python3 -c ' +import os, json +pairs = [kv.split("=", 1) for kv in os.environ["NODE_LABELS"].split(",") if kv] +print(json.dumps(dict(pairs))) +') +taints_arg="$NODE_TAINTS_JSON" + +log "Creating nodegroup $NODEGROUP_NAME (instance=$INSTANCE_TYPE, ami=$AMI_TYPE)" +aws eks create-nodegroup \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" \ + --scaling-config "minSize=$MIN_SIZE,maxSize=$MAX_SIZE,desiredSize=$DESIRED_SIZE" \ + --disk-size "$DISK_SIZE" \ + --subnets "${private_subnets[@]}" \ + --instance-types "$INSTANCE_TYPE" \ + --ami-type "$AMI_TYPE" \ + --capacity-type "$CAPACITY_TYPE" \ + --node-role "$node_role" \ + --labels "$labels_arg" \ + --taints "$taints_arg" \ + --tags "project=g7e-mig-test,owner=$USER" \ + --output json >/dev/null + +log "Waiting for nodegroup to become ACTIVE (this usually takes 3-5 minutes)..." +aws eks wait nodegroup-active \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" + +log "Nodegroup $NODEGROUP_NAME is ACTIVE. EC2 instance:" +aws ec2 describe-instances --region "$AWS_REGION" \ + --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \ + "Name=tag:eks:nodegroup-name,Values=$NODEGROUP_NAME" \ + "Name=instance-state-name,Values=running,pending" \ + --query 'Reservations[].Instances[].[InstanceId,InstanceType,PrivateIpAddress,State.Name]' \ + --output table + +log "Configure kubectl, then: kubectl get nodes -l node-type=g7e-mig" diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh new file mode 100755 index 0000000..d149ecc --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/02-install-gpu-operator.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash +# Install the NVIDIA GPU Operator via Helm with MIG manager enabled (mixed strategy). +# The operator deploys the driver, container toolkit, device plugin, DCGM exporter, +# node feature discovery, and mig-manager. On Blackwell the operator picks the +# right driver automatically (open kernel modules / 570+ branch). 
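+#
+# Sanity check once the driver daemonset is up (pod name differs per cluster):
+#   kubectl exec -n gpu-operator <driver-pod> -- nvidia-smi --query-gpu=name,driver_version --format=csv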
+set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +if ! command -v helm >/dev/null; then + echo "helm is required but not installed" >&2 + exit 1 +fi + +log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION" +aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null + +log "Waiting for a Ready node with label node-type=g7e-mig" +for _ in $(seq 1 60); do + ready=$(kubectl get nodes -l node-type=g7e-mig \ + -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \ + 2>/dev/null | grep -c '^True$' || true) + if [[ "$ready" -ge 1 ]]; then + break + fi + sleep 10 +done +kubectl get nodes -l node-type=g7e-mig -L node.kubernetes.io/instance-type + +log "Adding NVIDIA helm repo and updating" +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia >/dev/null 2>&1 || true +helm repo update >/dev/null + +log "Installing gpu-operator into namespace $GPU_OPERATOR_NAMESPACE (release=$GPU_OPERATOR_RELEASE)" +# Key flags: +# mig.strategy=mixed -> expose differently-sized MIG partitions as distinct resources +# migManager.default=all-disabled -> start with MIG off; we flip it on via the node label +# once the operator is healthy (in 03-apply-mig-config.sh). +# migManager.WITH_REBOOT=true -> let mig-manager reboot the node when required (Blackwell +# often needs a reboot after the first MIG enable). +# driver.enabled=true -> install NVIDIA driver via the operator's driver container +# (the EKS-optimized NVIDIA AMI ships a driver, but the +# operator's driver container is what mig-manager is +# validated against; toolkit-only mode is flaky on Blackwell). +# +# If you prefer to keep the host driver (from the EKS AL2023 NVIDIA AMI) and skip the +# operator's driver container, override with DRIVER_ENABLED=false on the command line. +DRIVER_ENABLED="${DRIVER_ENABLED:-true}" + +helm upgrade --install "$GPU_OPERATOR_RELEASE" nvidia/gpu-operator \ + --namespace "$GPU_OPERATOR_NAMESPACE" --create-namespace \ + --set mig.strategy=mixed \ + --set migManager.enabled=true \ + --set migManager.default=all-disabled \ + --set migManager.env[0].name=WITH_REBOOT \ + --set-string migManager.env[0].value="true" \ + --set driver.enabled="$DRIVER_ENABLED" \ + --set toolkit.enabled=true \ + --set devicePlugin.enabled=true \ + --set dcgmExporter.enabled=true \ + --set nodeStatusExporter.enabled=true \ + --wait --timeout 20m + +log "Operator install complete. Pods:" +kubectl get pods -n "$GPU_OPERATOR_NAMESPACE" -o wide + +# --------------------------------------------------------------------------- +# AL2023 NVIDIA AMI ships containerd v3 with NO pre-configured `nvidia` +# runtime and no explicit SystemdCgroup setting, while the gpu-operator +# v26.x toolkit (nvcr.io/nvidia/k8s/container-toolkit:v1.19.0) generates a +# v2-style drop-in that doesn't carry SystemdCgroup=true. That mismatch causes +# EVERY gpu-operator pod on the node to crash with: +# runc create failed: expected cgroupsPath to be of format +# "slice:prefix:name" for systemd cgroups, got "/kubepods/..." instead +# and blocks nvidia-device-plugin from advertising nvidia.com/mig-* resources +# (it also takes aws-node down in the process, breaking pod networking). +# +# We paper over it by writing a correct v3 drop-in to +# /etc/containerd/conf.d/99-nvidia.toml (runc+nvidia, both with +# SystemdCgroup=true, nvidia BinaryName pointing at the toolkit-installed +# binary), then restarting containerd+kubelet. 
+# The existing config.toml on the
+# AL2023 NVIDIA AMI already `imports = ["/etc/containerd/conf.d/*.toml"]`.
+log "Applying AL2023+containerd-v3 runtime drop-in (needed for gpu-operator v26.x)"
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+if [[ -z "$node" ]]; then
+  echo "No g7e node found — cannot apply runtime fix" >&2
+  exit 1
+fi
+
+cat <<YAML | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: containerd-runtime-fix
+spec:
+  nodeName: $node
+  restartPolicy: Never
+  tolerations:
+  - operator: Exists   # tolerate the nvidia.com/gpu taint on the node
+  containers:
+  - name: fix
+    # Any small image with a shell works; we only chroot into the host.
+    image: public.ecr.aws/amazonlinux/amazonlinux:2023
+    securityContext:
+      privileged: true
+    command:
+    - sh
+    - -c
+    - |
+      cat > /host/etc/containerd/conf.d/99-nvidia.toml <<'EOF'
+      version = 3
+
+      [plugins."io.containerd.cri.v1.runtime".containerd]
+        default_runtime_name = "runc"
+
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc]
+        runtime_type = "io.containerd.runc.v2"
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options]
+        SystemdCgroup = true
+
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia]
+        runtime_type = "io.containerd.runc.v2"
+      [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia.options]
+        BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
+        SystemdCgroup = true
+      EOF
+      # Remove any stale drop-in nvidia-ctk wrote (v2 schema, wrong
+      # BinaryName) before we bounce the daemons.
+      rm -f /host/etc/containerd/conf.d/nvidia.toml
+      chroot /host systemctl restart containerd
+      chroot /host systemctl restart kubelet
+      echo "runtime fix applied"
+    volumeMounts:
+    - name: host
+      mountPath: /host
+  volumes:
+  - name: host
+    hostPath:
+      path: /
+YAML
+
+log "Waiting for fix pod to complete"
+# kubelet restart on the same node makes the pod transition surprising; poll.
+for _ in $(seq 1 30); do
+  phase=$(kubectl get pod containerd-runtime-fix -o jsonpath='{.status.phase}' 2>/dev/null || true)
+  [[ "$phase" == "Succeeded" || "$phase" == "Failed" ]] && break
+  sleep 5
+done
+kubectl logs containerd-runtime-fix 2>&1 || true
+kubectl delete pod containerd-runtime-fix --ignore-not-found >/dev/null
+
+log "Waiting for nvidia-container-toolkit daemonset to report Ready on the node"
+for _ in $(seq 1 60); do
+  ready=$(kubectl -n "$GPU_OPERATOR_NAMESPACE" get pod \
+    -l app=nvidia-container-toolkit-daemonset \
+    --field-selector "spec.nodeName=$node" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "")
+  [[ "$ready" == "true" ]] && break
+  sleep 5
+done
+kubectl -n "$GPU_OPERATOR_NAMESPACE" get pods --field-selector "spec.nodeName=$node" -o wide
+
+log "Next: run ./03-apply-mig-config.sh to partition the GPU."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh
new file mode 100755
index 0000000..0231f1f
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/03-apply-mig-config.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Ask mig-manager to enable MIG on the g7e node and apply a partition profile.
+#
+# Blackwell RTX PRO 6000 Server Edition supports MIG with up to 4 GPU instances
+# per physical GPU (different granularity than A100's 7). Valid profiles depend on
+# the exact SKU & driver; query with `nvidia-smi mig -lgip` after the operator is up
+# (see: kubectl exec -n gpu-operator <driver-pod> -- nvidia-smi mig -lgip).
+#
+# Default profile: `all-1g.24gb` — 4 equal partitions, ~24 GiB each on the 96 GiB
+# RTX PRO 6000 Server Edition SKU that ships in g7e. Valid alternatives for this
+# SKU include `all-2g.48gb` (2 partitions), `all-4g.96gb` (1 MIG partition covering
+# the whole GPU), and `all-disabled`.
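+# (Naming: `<N>g.<M>gb` describes one GPU-instance shape, N compute slices and
+# ~M GiB of memory per instance; the `all-` prefix applies that shape to every
+# GPU on the node, packing in as many instances as fit.)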
+# Override with: MIG_PROFILE=all-2g.48gb ./03-apply-mig-config.sh
+set -euo pipefail
+cd "$(dirname "$0")"
+source ./env.sh
+
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+MIG_PROFILE="${MIG_PROFILE:-all-1g.24gb}"
+
+log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
+aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null
+
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+if [[ -z "$node" ]]; then
+  echo "No node with label node-type=g7e-mig found" >&2
+  exit 1
+fi
+log "Target node: $node"
+
+log "Available MIG profiles (from mig-parted configmap):"
+kubectl get configmap -n "$GPU_OPERATOR_NAMESPACE" default-mig-parted-config \
+  -o jsonpath='{.data.config\.yaml}' 2>/dev/null | \
+  grep -E '^\s{2}[a-zA-Z0-9._-]+:\s*$' | sed 's/^/  /' || \
+  log "(configmap not yet present — operator may still be bootstrapping)"
+
+log "Labeling $node with nvidia.com/mig.config=$MIG_PROFILE"
+kubectl label node "$node" "nvidia.com/mig.config=$MIG_PROFILE" --overwrite
+
+log "Watching mig.config.state (expect: pending -> rebooting (if needed) -> success)"
+# mig-manager may reboot the node, which pauses the watch. Poll with a timeout.
+deadline=$(( $(date +%s) + 20*60 ))
+last_state=""
+while [[ $(date +%s) -lt $deadline ]]; do
+  state=$(kubectl get node "$node" \
+    -o jsonpath='{.metadata.labels.nvidia\.com/mig\.config\.state}' 2>/dev/null || true)
+  if [[ "$state" != "$last_state" ]]; then
+    log "mig.config.state=$state"
+    last_state="$state"
+  fi
+  case "$state" in
+    success) break ;;
+    failed) echo "mig-manager reported FAILED — inspect: kubectl logs -n $GPU_OPERATOR_NAMESPACE -l app=nvidia-mig-manager" >&2; exit 1 ;;
+  esac
+  sleep 10
+done
+
+if [[ "$last_state" != "success" ]]; then
+  echo "Timed out waiting for mig.config.state=success (last=$last_state)" >&2
+  exit 1
+fi
+
+log "MIG partitioning done. Advertised GPU resources on the node:"
+kubectl get node "$node" -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/")))'
+
+log "Next: run ./04-test-mig.sh to schedule a pod onto a MIG slice."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh
new file mode 100755
index 0000000..437e69b
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/04-test-mig.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Schedule a pod that requests one MIG slice and verifies nvidia-smi sees it.
+# Also prints the full partition map on the node.
+set -euo pipefail
+cd "$(dirname "$0")"
+source ./env.sh
+
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
+aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null
+
+node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
+log "Target node: $node"
+
+# Discover the first nvidia.com/mig-* resource the node advertises.
+resource=$(kubectl get node "$node" -o json | \
+  jq -r '.status.allocatable | to_entries[] | select(.key | startswith("nvidia.com/mig-")) | .key' | head -n1)
+if [[ -z "$resource" ]]; then
+  echo "No nvidia.com/mig-* resources advertised on $node. Did 03-apply-mig-config.sh succeed?" >&2
+  exit 1
+fi
+log "Will request one of: $resource"
+
+cat <<YAML | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: mig-smoke-test
+spec:
+  restartPolicy: Never
+  nodeSelector:
+    node-type: g7e-mig
+  tolerations:
+  - key: nvidia.com/gpu
+    operator: Equal
+    value: "true"
+    effect: NoSchedule
+  containers:
+  - name: cuda
+    # Any recent CUDA base tag works; nvidia-smi is injected from the host driver.
+    image: nvidia/cuda:12.4.1-base-ubuntu22.04
+    command: ["nvidia-smi", "-L"]
+    resources:
+      limits:
+        $resource: 1
+YAML
+kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/mig-smoke-test --timeout=300s >/dev/null || true
+# Container is short-lived — tail logs once it terminates.
+for _ in $(seq 1 30); do + phase=$(kubectl get pod mig-smoke-test -o jsonpath='{.status.phase}') + [[ "$phase" == "Succeeded" || "$phase" == "Failed" ]] && break + sleep 5 +done + +log "--- pod logs (mig-smoke-test) ---" +kubectl logs mig-smoke-test || true + +log "--- node allocatable (nvidia.com/*) ---" +kubectl get node "$node" -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/")))' + +log "--- host-side nvidia-smi from the driver daemonset ---" +driver_pod=$(kubectl get pod -n "$GPU_OPERATOR_NAMESPACE" -l app=nvidia-driver-daemonset \ + --field-selector spec.nodeName="$node" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) +if [[ -n "$driver_pod" ]]; then + kubectl exec -n "$GPU_OPERATOR_NAMESPACE" "$driver_pod" -- nvidia-smi -L || true + kubectl exec -n "$GPU_OPERATOR_NAMESPACE" "$driver_pod" -- nvidia-smi mig -lgi || true +fi + +log "Cleanup: kubectl delete pod mig-smoke-test" diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh new file mode 100755 index 0000000..edbcc96 --- /dev/null +++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/99-cleanup.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Tear everything down: smoke-test pod, helm release, and the EKS nodegroup. +# This stops g7e.2xlarge on-demand billing. Safe to rerun. +set -euo pipefail +cd "$(dirname "$0")" +source ./env.sh + +log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; } + +log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION" +aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null || true + +log "Deleting smoke-test pod (if any)" +kubectl delete pod mig-smoke-test --ignore-not-found + +log "Uninstalling helm release $GPU_OPERATOR_RELEASE" +helm uninstall "$GPU_OPERATOR_RELEASE" -n "$GPU_OPERATOR_NAMESPACE" 2>/dev/null || true +kubectl delete namespace "$GPU_OPERATOR_NAMESPACE" --ignore-not-found + +log "Deleting nodegroup $NODEGROUP_NAME (this is what stops the billing)" +aws eks delete-nodegroup \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" >/dev/null 2>&1 || \ + log "Nodegroup $NODEGROUP_NAME not found (already deleted?)" + +log "Waiting for nodegroup deletion to complete" +aws eks wait nodegroup-deleted \ + --region "$AWS_REGION" \ + --cluster-name "$CLUSTER_NAME" \ + --nodegroup-name "$NODEGROUP_NAME" 2>/dev/null || true + +log "Verify no g7e instances remain:" +aws ec2 describe-instances --region "$AWS_REGION" \ + --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \ + "Name=instance-type,Values=g7e.*" \ + "Name=instance-state-name,Values=running,pending" \ + --query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name]' --output table + +# Optionally delete scratch subnets you created to add AZ coverage for g7e capacity. +# Opt-in via EXTRA_SUBNETS="subnet-aaa,subnet-bbb" — the script will try to +# disassociate each from its route table and delete the subnet. +if [[ -n "${EXTRA_SUBNETS:-}" ]]; then + log "Cleaning up scratch subnets: $EXTRA_SUBNETS" + IFS=',' read -r -a extras <<<"$EXTRA_SUBNETS" + for s in "${extras[@]}"; do + # Find any route table associations for this subnet and drop them. 
+    assoc_ids=$(aws ec2 describe-route-tables --region "$AWS_REGION" \
+      --filters "Name=association.subnet-id,Values=$s" \
+      --query 'RouteTables[].Associations[?SubnetId==`'"$s"'`].RouteTableAssociationId' \
+      --output text 2>/dev/null || true)
+    for a in $assoc_ids; do
+      log "  disassociating $a from $s"
+      aws ec2 disassociate-route-table --region "$AWS_REGION" --association-id "$a" >/dev/null 2>&1 || true
+    done
+    log "  deleting subnet $s"
+    aws ec2 delete-subnet --region "$AWS_REGION" --subnet-id "$s" 2>&1 || \
+      log "  (delete-subnet failed for $s — it may still have ENIs attached; retry in a minute)"
+  done
+fi
+
+log "Done."
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
new file mode 100644
index 0000000..dfa3e19
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
@@ -0,0 +1,135 @@
+# MIG on Amazon EC2 G7e (NVIDIA RTX PRO 6000 Blackwell Server Edition) with EKS
+
+---
+
+This directory is a companion to the parent [`mig-gpu-partitioning`](../) guide, which walks through MIG on a `p5.48xlarge` (H100). Here we do the same on the smallest G7e size — `g7e.2xlarge` — which ships a single **NVIDIA RTX PRO 6000 Blackwell Server Edition** GPU.
+
+Blackwell RTX PRO 6000 supports MIG with a different granularity than A100/H100: up to **4 GPU instances per physical GPU** (vs. 7 on A100/H100), with memory/compute slice sizes determined by the SKU and current driver. Because a `g7e.2xlarge` is a single-GPU node, this setup is the cheapest way to exercise MIG end-to-end on EKS.
+
+## Scripts
+
+| Script | What it does |
+|---|---|
+| `env.sh` | Shared environment (region, cluster, instance type, labels, taints). Source this; don't run it. |
+| `01-create-nodegroup.sh` | Creates an EKS managed nodegroup `g7e-mig-test` with one `g7e.2xlarge`, reusing the IAM role of an existing `*gpu*` nodegroup. |
+| `02-install-gpu-operator.sh` | `helm install` NVIDIA GPU Operator with `mig.strategy=mixed` and `migManager.enabled=true` (default `all-disabled` — we flip MIG on in step 3). |
+| `03-apply-mig-config.sh` | Labels the node `nvidia.com/mig.config=<profile>` so mig-manager partitions the GPU. Default profile: `all-1g.24gb` (4 equal 24 GiB partitions). |
+| `04-test-mig.sh` | Schedules a short-lived pod that requests one `nvidia.com/mig-*` slice, runs `nvidia-smi`, and dumps the node's advertised MIG resources. |
+| `99-cleanup.sh` | Uninstalls the operator and deletes the nodegroup — this is what stops g7e on-demand billing. |
+
+## Prerequisites
+
+- `aws` CLI with credentials for the target account (the scripts assume `us-west-2` / cluster `osmo`; override via env vars)
+- `kubectl`, `helm`, `jq`, and `python3` on `PATH`
+- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`)
+- G-instance vCPU quota in the target region (g7e is covered by the "Running On-Demand G and VT instances" quota)
+- Private subnets in an AZ where g7e actually has capacity (see the [Capacity](#capacity-insufficientinstancecapacity) section below — in us-west-2 we only found g7e.2xlarge capacity in us-west-2d during testing)
+
+> **Heads up on `AWS_REGION`**: the scripts default to `us-west-2` via `${AWS_REGION:-us-west-2}`, so if your shell already exports `AWS_REGION` to a different region, that value wins and the scripts will try to talk to a cluster in the wrong region. Either `unset AWS_REGION` before running, or prefix each command with `AWS_REGION=us-west-2`.
+
+## Usage
+
+```bash
+# 0. (Optional) override any defaults in env.sh
+export AWS_REGION=us-west-2
+export CLUSTER_NAME=osmo
+
+# If a specific AZ has g7e capacity, pin it:
+# export SUBNETS=subnet-0abc,subnet-0def
+
+# 1. Spin up the node (starts on-demand billing)
+./01-create-nodegroup.sh
+
+# 2. Install the NVIDIA GPU Operator with MIG manager
+./02-install-gpu-operator.sh
+
+# 3. Partition the GPU
+./03-apply-mig-config.sh
+# or pick a different profile:
+MIG_PROFILE=all-2g.48gb ./03-apply-mig-config.sh
+
+# 4. Run a smoke-test pod on a MIG slice
+./04-test-mig.sh
+
+# 5. Tear everything down
+./99-cleanup.sh
+```
+
+## Discovering valid MIG profiles
+
+The set of profiles accepted by `nvidia.com/mig.config` comes from the `default-mig-parted-config` ConfigMap that the operator installs.
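+Each named profile in that ConfigMap is a list of per-GPU specs in mig-parted's config schema. A representative entry looks roughly like this (illustrative sketch; the operator-shipped file adds `device-filter` entries per GPU model):
+
+```yaml
+mig-configs:
+  all-1g.24gb:
+    - devices: all          # apply to every GPU on the node
+      mig-enabled: true
+      mig-devices:
+        "1g.24gb": 4        # four 1g.24gb GPU instances per GPU
+```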
+To enumerate what's compiled in for Blackwell on your driver:
+
+```bash
+kubectl get configmap -n gpu-operator default-mig-parted-config \
+  -o jsonpath='{.data.config\.yaml}' | yq '.mig-configs | keys'
+```
+
+To see the raw per-GPU profile list from the driver itself:
+
+```bash
+DRIVER_POD=$(kubectl get pod -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath='{.items[0].metadata.name}')
+kubectl exec -n gpu-operator "$DRIVER_POD" -- nvidia-smi mig -lgip
+```
+
+For the `g7e.2xlarge` (RTX PRO 6000 Server Edition, 96 GiB) SKU we verified the following profiles work:
+
+| Profile | Partitions | Per-slice memory |
+|---|---|---|
+| `all-1g.24gb` (default) | 4 | ~24 GiB |
+| `all-2g.48gb` | 2 | ~48 GiB |
+| `all-4g.96gb` | 1 | ~95 GiB (whole GPU, MIG mode on) |
+
+If your Blackwell SKU differs (different memory capacity or newer driver), pick a profile from the ConfigMap that matches what `nvidia-smi mig -lgip` reports.
+
+## Capacity: `InsufficientInstanceCapacity`
+
+G7e is supply-constrained. `aws ec2 describe-instance-type-offerings` will happily list g7e.2xlarge in every AZ of a region even when AWS has zero actual capacity in those AZs — **offering availability ≠ real capacity**. A dry-run `RunInstances` doesn't check capacity either (it only validates syntax/IAM).
+
+When `01-create-nodegroup.sh` hits ICE, the nodegroup goes `CREATE_FAILED` with a health issue like:
+
+```
+AsgInstanceLaunchFailures: Could not launch On-Demand Instances.
+InsufficientInstanceCapacity - We currently do not have sufficient g7e.2xlarge
+capacity in the Availability Zone you requested (us-west-2a). ...
+You can currently get g7e.2xlarge capacity by ... choosing us-west-2b, us-west-2c, us-west-2d.
+```
+
+A `CREATE_FAILED` nodegroup can't recover — the ASG keeps retrying past EKS's internal create timeout, so you can even end up with an instance that launched **after** EKS already gave up. Always run `./99-cleanup.sh` (or `aws eks delete-nodegroup …`) immediately when you see `CREATE_FAILED` so you don't get billed for an orphan instance.
+
+Practical playbook:
+
+1. Run `./01-create-nodegroup.sh` with all private subnets (default) and let ICE tell you which AZ currently has capacity.
+2. Delete the failed nodegroup.
+3. Re-run pinned to just that AZ: `SUBNETS=subnet-xxxxxxxx ./01-create-nodegroup.sh`.
+4. If none of your VPC's private subnets are in an AZ with capacity, create one: an extra private subnet tagged `kubernetes.io/cluster/<cluster-name>=shared` and associated with an existing NAT route table. See `99-cleanup.sh` for the reverse (deleting the scratch subnet after teardown).
+
+## Gotcha: AL2023 NVIDIA AMI + containerd v3 + gpu-operator v26
+
+The EKS-optimized `AL2023_x86_64_NVIDIA` AMI ships **containerd v3**, which changes the plugin path layout (`io.containerd.cri.v1.runtime` instead of the v1/v2 `io.containerd.grpc.v1.cri`). The AMI's default `/etc/containerd/config.toml` does NOT include an explicit `SystemdCgroup = true` under the runc runtime — it relies on containerd's defaults.
+
+The NVIDIA GPU operator (as of v26.3) ships a toolkit container (`container-toolkit:v1.19.0`) whose `nvidia-ctk runtime configure` command still emits a **v2-style drop-in** without `SystemdCgroup`. When that drop-in merges with the v3 base config, runc receives a non-systemd-shaped cgroupsPath and every single pod on the node starts failing with:
+
+```
+runc create failed: expected cgroupsPath to be of format "slice:prefix:name"
+for systemd cgroups, got "/kubepods/besteffort/..." instead
+```
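+
+You can probe a node for the mismatch directly (sketch, run from an SSM/SSH session on the node): a `version = 3` config with no `SystemdCgroup = true` under the runc runtime means the merged config will produce exactly this failure.
+
+```bash
+containerd config dump | grep -E '^version|SystemdCgroup'
+```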
+
+This blocks the device plugin from ever advertising `nvidia.com/mig-*` and also takes `aws-node` down (breaking pod networking on the node).
+
+`02-install-gpu-operator.sh` papers over this automatically by writing a correct v3 drop-in to `/etc/containerd/conf.d/99-nvidia.toml` containing both runc and nvidia runtimes with `SystemdCgroup = true`, then restarting `containerd` and `kubelet`. Without this patch, steps 3 and 4 will stall forever. If you're running these scripts on a different Blackwell AMI (say, a future AL2023 image where AWS pre-wires the nvidia runtime) you can drop the patch — check `containerd config dump | grep SystemdCgroup` and `crictl info` for the runtime section first.
+
+## Notes & gotchas
+
+- **Blackwell reboot**: mig-manager is configured with `WITH_REBOOT=true`. The very first MIG enable on a Blackwell node may require a reboot before the partitions become visible; the node will cycle and the script will keep polling `nvidia.com/mig.config.state`. (In our testing on the `g7e.2xlarge` / driver 580.126 combo, the first enable succeeded without a reboot.)
+- **Taints**: the nodegroup is tainted `nvidia.com/gpu=true:NoSchedule`. Pods must tolerate this to land on the node (step 4's smoke-test does).
+- **Billing**: `g7e.2xlarge` is on-demand. The only way to stop billing is to delete the nodegroup — run `./99-cleanup.sh`. Verify no `g7e.*` instances remain at the end (the script prints a table).
+- **EKS access for `kubectl`**: if `kubectl get nodes` fails with `the server has asked for the client to provide credentials`, your IAM principal doesn't have a cluster access entry yet. On a cluster with `authenticationMode=API` or `API_AND_CONFIG_MAP`, run:
+  ```bash
+  aws eks create-access-entry --cluster-name "$CLUSTER_NAME" \
+    --principal-arn arn:aws:iam::<account-id>:user/<user-name> --type STANDARD
+  aws eks associate-access-policy --cluster-name "$CLUSTER_NAME" \
+    --principal-arn arn:aws:iam::<account-id>:user/<user-name> \
+    --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \
+    --access-scope type=cluster
+  ```
+- **Why reuse an IAM role?**: the operator needs `AmazonEKSWorkerNodePolicy`, `AmazonEC2ContainerRegistryReadOnly`, and `AmazonEKS_CNI_Policy` on the node role. Reusing the role attached to an existing GPU nodegroup avoids provisioning a one-off role for a throwaway test. For a permanent setup, define your own.
diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh b/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh
new file mode 100755
index 0000000..bb09f1b
--- /dev/null
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/env.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Shared environment variables for the g7e (RTX PRO 6000 Blackwell) MIG test scripts.
+# Source this file from each step script: `source ./env.sh`
+
+# NOTE on AWS_REGION: because of the default-if-unset (:-) syntax below, if your shell
+# already exports AWS_REGION to something else (e.g. us-east-1), that value wins and
+# the scripts will try to talk to the cluster in the wrong region. Either
+# `unset AWS_REGION` before running, or prefix each command with AWS_REGION=us-west-2.
+export AWS_REGION="${AWS_REGION:-us-west-2}"
+export CLUSTER_NAME="${CLUSTER_NAME:-osmo}"
+
+# Optional AZ pinning: comma-separated list of subnet IDs passed to
+# `aws eks create-nodegroup --subnets`. Useful when g7e has ICE in some AZs
+# (see README "Capacity" section). When unset, the script falls back to every
+# private subnet it can find in the cluster VPC.
+# e.g. export SUBNETS=subnet-0abc,subnet-0def
+# export SUBNETS=
+
+# g7e.2xlarge = 1 x NVIDIA RTX PRO 6000 Blackwell Server Edition, 8 vCPU, 32 GiB.
+# Smallest g7e size, sufficient to exercise MIG partitioning on a single GPU.
+export INSTANCE_TYPE="${INSTANCE_TYPE:-g7e.2xlarge}"
+export NODEGROUP_NAME="${NODEGROUP_NAME:-g7e-mig-test}"
+
+# AL2023 NVIDIA is the recommended path for new GPU node groups on EKS 1.30+.
+# AMI_TYPE=AL2023_x86_64_NVIDIA picks the latest EKS-optimized Blackwell-capable image.
+export AMI_TYPE="${AMI_TYPE:-AL2023_x86_64_NVIDIA}"
+
+export DISK_SIZE="${DISK_SIZE:-100}"
+export CAPACITY_TYPE="${CAPACITY_TYPE:-ON_DEMAND}"
+export DESIRED_SIZE="${DESIRED_SIZE:-1}"
+export MIN_SIZE="${MIN_SIZE:-0}"
+export MAX_SIZE="${MAX_SIZE:-1}"
+
+# MIG-specific node labels. The NVIDIA GPU operator's mig-manager watches
+# `nvidia.com/mig.config` and (re)partitions the GPU when the value changes.
+export NODE_LABELS="node-type=g7e-mig,nvidia.com/mig.config=all-disabled"
+
+# Taint keeps non-GPU workloads off this node. Pods must tolerate
+# nvidia.com/gpu=true:NoSchedule to land here.
+export NODE_TAINTS_JSON='[{"key":"nvidia.com/gpu","value":"true","effect":"NO_SCHEDULE"}]'
+
+# Helm release name & namespace for the NVIDIA GPU operator.
+export GPU_OPERATOR_NAMESPACE="${GPU_OPERATOR_NAMESPACE:-gpu-operator}"
+export GPU_OPERATOR_RELEASE="${GPU_OPERATOR_RELEASE:-gpu-operator}"

From 96b6e2a48826163b849d4ae080990c767a029229 Mon Sep 17 00:00:00 2001
From: YongHwan Yoo
Date: Sat, 25 Apr 2026 12:23:37 +0900
Subject: [PATCH 2/3] Link g7e-blackwell MIG guide from top-level and
 2.projects READMEs

Follow CONTRIBUTING.md guidance to list new projects in both README
indexes.

Co-Authored-By: Claude Opus 4.7
---
 2.projects/README.md | 3 ++-
 README.md            | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/2.projects/README.md b/2.projects/README.md
index 1e4a74e..d05cb80 100644
--- a/2.projects/README.md
+++ b/2.projects/README.md
@@ -94,6 +94,7 @@ These days, the challenge with ML Inference workloads, is that not all workloads
 In 2020, NVIDIA released Multi-Instance GPU (MIG), alongside the Ampere Architecture that powers the NVIDIA A100 (EC2 P4) and NVIDIA A10G (EC2 G5) GPUs. With MIG, administrators can partition a single GPU into multiple smaller GPU units (called “MIG devices”). Each of these smaller GPU units are fully isolated, with their own high-bandwidth memory, cache, and compute cores.
 
 ### Files & Directories
-1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Yes, this process is simple enough to only have a README! Note: This project only shows you how to set MIG up, and assumes you already have the cluster(s) set up, and your deployment ready to go.
+1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Walkthrough for MIG on a `p5.48xlarge` (H100). Assumes the cluster(s) are already set up.
+2. [g7e-blackwell/](https://github.com/aws-samples/awsome-inference/tree/main/2.projects/mig-gpu-partitioning/g7e-blackwell): Companion bash scripts that exercise MIG on `g7e.2xlarge` (1 × NVIDIA RTX PRO 6000 Blackwell Server Edition, 96 GiB) on EKS — the smallest/cheapest single-GPU setup to validate MIG end-to-end. Creates a managed nodegroup, installs the NVIDIA GPU Operator (with mig-manager), partitions the GPU (default `all-1g.24gb` → 4 × 24 GiB slices), runs a smoke-test pod on a MIG slice, and tears everything down. Includes a workaround for the AL2023 NVIDIA AMI + containerd v3 + gpu-operator v26 cgroup mismatch.
diff --git a/README.md b/README.md
index 4b4c103..cfcfb5b 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ These examples shows how to deploy LLMs like T5, Mistral using NVIDIA Triton TRT
 
 ### MIG
 
-This directory contains a README on how you can leverage Multi-Instance GPUs (MIGs) to partition your GPUs based on your workload. For more detailed documentation, check out the [MIG user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
+This directory contains a README on how you can leverage Multi-Instance GPUs (MIGs) to partition your GPUs based on your workload. The [`g7e-blackwell/`](2.projects/mig-gpu-partitioning/g7e-blackwell) sub-directory provides end-to-end bash scripts that exercise MIG on the smallest G7e size (`g7e.2xlarge`, 1 × NVIDIA RTX PRO 6000 Blackwell Server Edition) on EKS. For more detailed documentation, check out the [MIG user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
 
 ## USE-CASES
 These are real life use-case examples on using projects from `2.PROJECTS/` to demonstrate catering the projects to real-life scenarios.

From 3e5ad5992f3d3a22d55367b1b9781465dbd48f9e Mon Sep 17 00:00:00 2001
From: YongHwan Yoo
Date: Sat, 25 Apr 2026 14:31:29 +0900
Subject: [PATCH 3/3] Align g7e-blackwell prereqs with repo convention

Call out that scripts assume an existing EKS cluster (same convention
used by every other 2.projects/* guide) and link to 1.infrastructure/
for the cluster setup. Drop the fork-specific default cluster name.

Co-Authored-By: Claude Opus 4.7
---
 2.projects/mig-gpu-partitioning/g7e-blackwell/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
index dfa3e19..df9ffe2 100644
--- a/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
+++ b/2.projects/mig-gpu-partitioning/g7e-blackwell/README.md
@@ -19,9 +19,11 @@ Blackwell RTX PRO 6000 supports MIG with a different granularity than A100/H100:
 
 ## Prerequisites
 
-- `aws` CLI with credentials for the target account (the scripts assume `us-west-2` / cluster `osmo`; override via env vars)
+These scripts assume an EKS cluster is already up. If you don't have one, start from the [`1.infrastructure/`](../../../1.infrastructure) guide to provision a VPC + EKS cluster. You'll then need:
+
+- `aws` CLI with credentials for the target account (defaults to `us-west-2`; override via `AWS_REGION` / `CLUSTER_NAME`)
 - `kubectl`, `helm`, `jq`, and `python3` on `PATH`
-- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`)
+- An existing EKS cluster with at least one GPU managed nodegroup whose IAM role we can reuse (the scripts auto-detect one named `*gpu*`). For a permanent setup you should provision a dedicated node role instead — see the note at the bottom.
 - G-instance vCPU quota in the target region (g7e is covered by the "Running On-Demand G and VT instances" quota)
 - Private subnets in an AZ where g7e actually has capacity (see the [Capacity](#capacity-insufficientinstancecapacity) section below — in us-west-2 we only found g7e.2xlarge capacity in us-west-2d during testing)