2.projects/README.md (3 changes: 2 additions & 1 deletion)
@@ -94,6 +94,7 @@ These days, the challenge with ML Inference workloads is that not all workloads
In 2020, NVIDIA released Multi-Instance GPU (MIG) alongside the Ampere Architecture that powers the NVIDIA A100 (EC2 P4) and NVIDIA A10G (EC2 G5) GPUs. With MIG, administrators can partition a single GPU into multiple smaller GPU units (called “MIG devices”). Each of these smaller GPU units is fully isolated, with its own high-bandwidth memory, cache, and compute cores.

### Files & Directories
1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Yes, this process is simple enough to only have a README! Note: This project only shows you how to set MIG up, and assumes you already have the cluster(s) set up, and your deployment ready to go.
1. [README.md](https://github.com/aws-samples/awsome-inference/blob/main/2.projects/mig-gpu-partitioning/README.md): Walkthrough for MIG on a `p5.48xlarge` (H100). Assumes the cluster(s) are already set up.
2. [g7e-blackwell/](https://github.com/aws-samples/awsome-inference/tree/main/2.projects/mig-gpu-partitioning/g7e-blackwell): Companion bash scripts that exercise MIG on `g7e.2xlarge` (1 × NVIDIA RTX PRO 6000 Blackwell Server Edition, 96 GiB) on EKS — the smallest/cheapest single-GPU setup to validate MIG end-to-end. Creates a managed nodegroup, installs the NVIDIA GPU Operator (with mig-manager), partitions the GPU (default `all-1g.24gb` → 4 × 24 GiB slices), runs a smoke-test pod on a MIG slice, and tears everything down. Includes a workaround for the AL2023 NVIDIA AMI + containerd v3 + gpu-operator v26 cgroup mismatch.
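
For orientation, here is what consuming one of those MIG slices looks like once the scripts have run. This is a minimal sketch assuming the default `all-1g.24gb` profile and the `mixed` strategy these scripts configure; the pod name and image are placeholders, not the repo's actual smoke-test pod.

```bash
# Hypothetical example: request a single 1g.24gb MIG slice and list it with nvidia-smi.
cat <<'MANIFEST' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: mig-slice-demo                 # placeholder name
spec:
  restartPolicy: Never
  tolerations:                         # matches the nvidia.com/gpu=true:NoSchedule taint these scripts assume
    - key: nvidia.com/gpu
      operator: Equal
      value: "true"
      effect: NoSchedule
  containers:
    - name: smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # any CUDA base image works
      command: ["nvidia-smi", "-L"]
      resources:
        limits:
          nvidia.com/mig-1g.24gb: 1    # one MIG slice instead of a whole nvidia.com/gpu
MANIFEST
# Then: kubectl logs mig-slice-demo   # should list exactly one MIG device
```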


@@ -0,0 +1,88 @@
#!/usr/bin/env bash
# Create an EKS managed nodegroup with a single g7e.2xlarge instance
# (NVIDIA RTX PRO 6000 Blackwell Server Edition) and wait for it to become ACTIVE.
#
# WARNING: this starts an ON_DEMAND g7e.2xlarge — on-demand billing starts
# immediately. Run `./99-cleanup.sh` when done.
set -euo pipefail
cd "$(dirname "$0")"
source ./env.sh

log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

log "Looking up VPC details for cluster $CLUSTER_NAME in $AWS_REGION"
cluster_json=$(aws eks describe-cluster --region "$AWS_REGION" --name "$CLUSTER_NAME")
subnet_ids=$(echo "$cluster_json" | jq -r '.cluster.resourcesVpcConfig.subnetIds[]')

# Allow caller to pin a specific subnet set via SUBNETS="subnet-aaa,subnet-bbb".
# Useful when a given instance type has InsufficientInstanceCapacity in some AZs —
# g7e in particular is capacity-constrained and we saw us-west-2a return ICE.
private_subnets=()
if [[ -n "${SUBNETS:-}" ]]; then
  IFS=',' read -r -a private_subnets <<<"$SUBNETS"
  log "Using SUBNETS override: ${private_subnets[*]}"
else
  # Fall back to every private subnet (MapPublicIpOnLaunch=false) in the cluster VPC.
  for s in $subnet_ids; do
    is_public=$(aws ec2 describe-subnets --region "$AWS_REGION" --subnet-ids "$s" \
      --query 'Subnets[0].MapPublicIpOnLaunch' --output text)
    if [[ "$is_public" == "False" ]]; then
      private_subnets+=("$s")
    fi
  done
  log "Private subnets: ${private_subnets[*]}"
fi

# Reuse the IAM node role from the existing osmo-gpu-nodes nodegroup so we don't
# have to provision a new role / policies for this throwaway test.
log "Looking up an existing GPU nodegroup to copy its node IAM role"
existing_ng=$(aws eks list-nodegroups --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \
  --query 'nodegroups[?contains(@, `gpu`)] | [0]' --output text)
if [[ -z "$existing_ng" || "$existing_ng" == "None" ]]; then
  echo "No existing GPU nodegroup found to copy node role from; aborting." >&2
  exit 1
fi
node_role=$(aws eks describe-nodegroup --region "$AWS_REGION" --cluster-name "$CLUSTER_NAME" \
  --nodegroup-name "$existing_ng" --query 'nodegroup.nodeRole' --output text)
log "Reusing node role from $existing_ng: $node_role"

# Turn the label/taint env vars into CLI args.
labels_arg=$(python3 -c '
import os, json
pairs = [kv.split("=", 1) for kv in os.environ["NODE_LABELS"].split(",") if kv]
print(json.dumps(dict(pairs)))
')
taints_arg="$NODE_TAINTS_JSON"
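
# For reference, env.sh (not shown in this diff) is expected to provide values along
# these lines (illustrative only, not the committed defaults):
#   NODE_LABELS="node-type=g7e-mig"   # later scripts select the node via -l node-type=g7e-mig
#   NODE_TAINTS_JSON='[{"key":"nvidia.com/gpu","value":"true","effect":"NO_SCHEDULE"}]'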

log "Creating nodegroup $NODEGROUP_NAME (instance=$INSTANCE_TYPE, ami=$AMI_TYPE)"
aws eks create-nodegroup \
  --region "$AWS_REGION" \
  --cluster-name "$CLUSTER_NAME" \
  --nodegroup-name "$NODEGROUP_NAME" \
  --scaling-config "minSize=$MIN_SIZE,maxSize=$MAX_SIZE,desiredSize=$DESIRED_SIZE" \
  --disk-size "$DISK_SIZE" \
  --subnets "${private_subnets[@]}" \
  --instance-types "$INSTANCE_TYPE" \
  --ami-type "$AMI_TYPE" \
  --capacity-type "$CAPACITY_TYPE" \
  --node-role "$node_role" \
  --labels "$labels_arg" \
  --taints "$taints_arg" \
  --tags "project=g7e-mig-test,owner=$USER" \
  --output json >/dev/null

log "Waiting for nodegroup to become ACTIVE (this usually takes 3-5 minutes)..."
aws eks wait nodegroup-active \
  --region "$AWS_REGION" \
  --cluster-name "$CLUSTER_NAME" \
  --nodegroup-name "$NODEGROUP_NAME"

log "Nodegroup $NODEGROUP_NAME is ACTIVE. EC2 instance:"
aws ec2 describe-instances --region "$AWS_REGION" \
  --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \
            "Name=tag:eks:nodegroup-name,Values=$NODEGROUP_NAME" \
            "Name=instance-state-name,Values=running,pending" \
  --query 'Reservations[].Instances[].[InstanceId,InstanceType,PrivateIpAddress,State.Name]' \
  --output table

log "Configure kubectl, then: kubectl get nodes -l node-type=g7e-mig"
@@ -0,0 +1,170 @@
#!/usr/bin/env bash
# Install the NVIDIA GPU Operator via Helm with MIG manager enabled (mixed strategy).
# The operator deploys the driver, container toolkit, device plugin, DCGM exporter,
# node feature discovery, and mig-manager. On Blackwell the operator picks the
# right driver automatically (open kernel modules / 570+ branch).
set -euo pipefail
cd "$(dirname "$0")"
source ./env.sh

log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

if ! command -v helm >/dev/null; then
  echo "helm is required but not installed" >&2
  exit 1
fi

log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null

log "Waiting for a Ready node with label node-type=g7e-mig"
for _ in $(seq 1 60); do
  ready=$(kubectl get nodes -l node-type=g7e-mig \
    -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
    2>/dev/null | grep -c '^True$' || true)
  if [[ "$ready" -ge 1 ]]; then
    break
  fi
  sleep 10
done
kubectl get nodes -l node-type=g7e-mig -L node.kubernetes.io/instance-type

log "Adding NVIDIA helm repo and updating"
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia >/dev/null 2>&1 || true
helm repo update >/dev/null

log "Installing gpu-operator into namespace $GPU_OPERATOR_NAMESPACE (release=$GPU_OPERATOR_RELEASE)"
# Key flags:
#   mig.strategy=mixed              -> expose differently-sized MIG partitions as distinct resources
#   migManager.default=all-disabled -> start with MIG off; we flip it on via the node label
#                                      once the operator is healthy (in 03-apply-mig-config.sh).
#   migManager.WITH_REBOOT=true     -> let mig-manager reboot the node when required (Blackwell
#                                      often needs a reboot after the first MIG enable).
#   driver.enabled=true             -> install NVIDIA driver via the operator's driver container
#                                      (the EKS-optimized NVIDIA AMI ships a driver, but the
#                                      operator's driver container is what mig-manager is
#                                      validated against; toolkit-only mode is flaky on Blackwell).
#
# If you prefer to keep the host driver (from the EKS AL2023 NVIDIA AMI) and skip the
# operator's driver container, override with DRIVER_ENABLED=false on the command line.
DRIVER_ENABLED="${DRIVER_ENABLED:-true}"

helm upgrade --install "$GPU_OPERATOR_RELEASE" nvidia/gpu-operator \
  --namespace "$GPU_OPERATOR_NAMESPACE" --create-namespace \
  --set mig.strategy=mixed \
  --set migManager.enabled=true \
  --set migManager.default=all-disabled \
  --set migManager.env[0].name=WITH_REBOOT \
  --set-string migManager.env[0].value="true" \
  --set driver.enabled="$DRIVER_ENABLED" \
  --set toolkit.enabled=true \
  --set devicePlugin.enabled=true \
  --set dcgmExporter.enabled=true \
  --set nodeStatusExporter.enabled=true \
  --wait --timeout 20m

log "Operator install complete. Pods:"
kubectl get pods -n "$GPU_OPERATOR_NAMESPACE" -o wide
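
# Optional sanity check (illustrative): with the operator's driver container enabled,
# the driver daemonset should be able to see the GPU. The daemonset name below is the
# gpu-operator default; skip this when DRIVER_ENABLED=false.
#   kubectl -n "$GPU_OPERATOR_NAMESPACE" exec ds/nvidia-driver-daemonset -- nvidia-smi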

# ---------------------------------------------------------------------------
# AL2023 NVIDIA AMI ships containerd v3 with NO pre-configured `nvidia`
# runtime and no explicit SystemdCgroup setting, while the gpu-operator
# v26.x toolkit (nvcr.io/nvidia/k8s/container-toolkit:v1.19.0) generates a
# v2-style drop-in that doesn't carry SystemdCgroup=true. That mismatch causes
# EVERY gpu-operator pod on the node to crash with:
# runc create failed: expected cgroupsPath to be of format
# "slice:prefix:name" for systemd cgroups, got "/kubepods/..." instead
# and blocks nvidia-device-plugin from advertising nvidia.com/mig-* resources
# (it also takes aws-node down in the process, breaking pod networking).
#
# We paper over it by writing a correct v3 drop-in to
# /etc/containerd/conf.d/99-nvidia.toml (runc+nvidia, both with
# SystemdCgroup=true, nvidia BinaryName pointing at the toolkit-installed
# binary), then restarting containerd+kubelet. The existing config.toml on the
# AL2023 NVIDIA AMI already `imports = ["/etc/containerd/conf.d/*.toml"]`.
log "Applying AL2023+containerd-v3 runtime drop-in (needed for gpu-operator v26.x)"
node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
if [[ -z "$node" ]]; then
  echo "No g7e node found — cannot apply runtime fix" >&2
  exit 1
fi

cat <<YAML | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: containerd-runtime-fix
  namespace: default
spec:
  nodeName: $node
  hostPID: true
  hostNetwork: true
  restartPolicy: Never
  tolerations:
    - key: nvidia.com/gpu
      operator: Equal
      value: "true"
      effect: NoSchedule
  containers:
    - name: fix
      image: alpine:3.20
      securityContext:
        privileged: true
      command: ["/bin/sh","-c"]
      args:
        - |
          set -eu
          cat > /host/etc/containerd/conf.d/99-nvidia.toml <<'EOF'
          version = 3

          [plugins."io.containerd.cri.v1.runtime".containerd]
          default_runtime_name = "runc"

          [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"
          [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options]
          SystemdCgroup = true

          [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia]
          runtime_type = "io.containerd.runc.v2"
          [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia.options]
          BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
          SystemdCgroup = true
          EOF
          # Remove any stale drop-in nvidia-ctk wrote (v2 schema, wrong
          # BinaryName) before we bounce the daemons.
          rm -f /host/etc/containerd/conf.d/nvidia.toml
          chroot /host systemctl restart containerd
          chroot /host systemctl restart kubelet
          echo "runtime fix applied"
      volumeMounts:
        - name: host
          mountPath: /host
  volumes:
    - name: host
      hostPath:
        path: /
YAML

log "Waiting for fix pod to complete"
# kubelet restart on the same node makes the pod transition surprising; poll.
for _ in $(seq 1 30); do
  phase=$(kubectl get pod containerd-runtime-fix -o jsonpath='{.status.phase}' 2>/dev/null || true)
  [[ "$phase" == "Succeeded" || "$phase" == "Failed" ]] && break
  sleep 5
done
kubectl logs containerd-runtime-fix 2>&1 || true
kubectl delete pod containerd-runtime-fix --ignore-not-found >/dev/null

log "Waiting for nvidia-container-toolkit daemonset to report Ready on the node"
for _ in $(seq 1 60); do
  ready=$(kubectl -n "$GPU_OPERATOR_NAMESPACE" get pod \
    -l app=nvidia-container-toolkit-daemonset \
    --field-selector "spec.nodeName=$node" \
    -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "")
  [[ "$ready" == "true" ]] && break
  sleep 5
done
kubectl -n "$GPU_OPERATOR_NAMESPACE" get pods --field-selector "spec.nodeName=$node" -o wide

log "Next: run ./03-apply-mig-config.sh to partition the GPU."
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Ask mig-manager to enable MIG on the g7e node and apply a partition profile.
#
# Blackwell RTX PRO 6000 Server Edition supports MIG with up to 4 GPU instances
# per physical GPU (different granularity than A100's 7). Valid profiles depend on
# the exact SKU & driver; query with `nvidia-smi mig -lgip` after the operator is up
# (see: kubectl exec -n gpu-operator <nvidia-driver-daemonset-...> -- nvidia-smi mig -lgip).
#
# Default profile: `all-1g.24gb` — 4 equal partitions, ~24 GiB each on the 96 GiB
# RTX PRO 6000 Server Edition SKU that ships in g7e. Valid alternatives for this
# SKU include `all-2g.48gb` (2 partitions), `all-4g.96gb` (1 MIG partition covering
# the whole GPU), and `all-disabled`.
# Override with: MIG_PROFILE=all-2g.48gb ./03-apply-mig-config.sh
set -euo pipefail
cd "$(dirname "$0")"
source ./env.sh

log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

MIG_PROFILE="${MIG_PROFILE:-all-1g.24gb}"

log "Configuring kubectl for $CLUSTER_NAME in $AWS_REGION"
aws eks update-kubeconfig --region "$AWS_REGION" --name "$CLUSTER_NAME" >/dev/null

node=$(kubectl get nodes -l node-type=g7e-mig -o jsonpath='{.items[0].metadata.name}')
if [[ -z "$node" ]]; then
  echo "No node with label node-type=g7e-mig found" >&2
  exit 1
fi
log "Target node: $node"

log "Available MIG profiles (from mig-parted configmap):"
kubectl get configmap -n "$GPU_OPERATOR_NAMESPACE" default-mig-parted-config \
  -o jsonpath='{.data.config\.yaml}' 2>/dev/null | \
  grep -E '^\s{2}[a-zA-Z0-9._-]+:\s*$' | sed 's/^/ /' || \
  log "(configmap not yet present — operator may still be bootstrapping)"

log "Labeling $node with nvidia.com/mig.config=$MIG_PROFILE"
kubectl label node "$node" "nvidia.com/mig.config=$MIG_PROFILE" --overwrite

log "Watching mig.config.state (expect: pending -> rebooting (if needed) -> success)"
# mig-manager may reboot the node, which pauses the watch. Poll with a timeout.
deadline=$(( $(date +%s) + 20*60 ))
last_state=""
while [[ $(date +%s) -lt $deadline ]]; do
  state=$(kubectl get node "$node" \
    -o jsonpath='{.metadata.labels.nvidia\.com/mig\.config\.state}' 2>/dev/null || true)
  if [[ "$state" != "$last_state" ]]; then
    log "mig.config.state=$state"
    last_state="$state"
  fi
  case "$state" in
    success) break ;;
    failed) echo "mig-manager reported FAILED — inspect: kubectl logs -n $GPU_OPERATOR_NAMESPACE -l app=nvidia-mig-manager" >&2; exit 1 ;;
  esac
  sleep 10
done

if [[ "$last_state" != "success" ]]; then
  echo "Timed out waiting for mig.config.state=success (last=$last_state)" >&2
  exit 1
fi

log "MIG partitioning done. Advertised GPU resources on the node:"
kubectl get node "$node" -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/")))'
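
# With the default all-1g.24gb profile and mig.strategy=mixed, the allocatable map is
# expected to look roughly like this (illustrative, not captured from a live node):
#   { "nvidia.com/gpu": "0", "nvidia.com/mig-1g.24gb": "4" }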

log "Next: run ./04-test-mig.sh to schedule a pod onto a MIG slice."