From 5c96f9497d12cd1c6d3017648866a10fe3cad11e Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Tue, 2 Sep 2025 09:57:56 -0500 Subject: [PATCH 01/15] Updating CF stack to allow for local zone deployments for GB200 --- .../sagemaker-hyperpod.yaml | 102 +++++++++++++++++- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index f340fe3c4..43232fc40 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -13,6 +13,7 @@ Description: > 1.2 TB storage which can be overridden by parameter. A role is also created which helps to execute HyperPod cluster operations. +#TODO: DO THIS FOR EKS TOO. #################### ## Stack Metadata ## @@ -31,6 +32,7 @@ Metadata: Parameters: - PrimarySubnetAZ - BackupSubnetAZ + - IsLocalZone - Label: default: FSx Lustre configuration Parameters: @@ -59,7 +61,7 @@ Metadata: SSMDocumentName: default: True/False; Create SSM Session Manager Document. Only set to False if SSM-SessionManagerRunShellAsUbuntu document exists in your account. PrimarySubnetAZ: - default: Availability zone id to deploy the primary subnets + default: Availability zone id to deploy the primary subnets (OR set this to your Local Zone ID if you set IsLocalZone to True. Example use1-dfw2-az1) BackupSubnetAZ: default: (Optional) Availability zone id to deploy the backup private subnet CreateS3Endpoint: @@ -175,6 +177,14 @@ Parameters: Default: 0 MinValue: 0 MaxValue: 400000 + + IsLocalZone: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Set to true if you are using a local zone for GB200 (DFW only currently). 
############################### ## Conditions for Parameters ## @@ -186,6 +196,8 @@ Conditions: CreateSSMDocument: !Equals [!Ref 'SSMDocumentName', 'true'] CreateOpenZFSCondition: !Equals [!Ref 'CreateOpenZFS', 'true'] ConfigureCustomIops: !Not [!Equals [!Ref OpenZFSIops, 0]] + UseNATInstance: !Equals [!Ref IsLocalZone, 'true'] + UseNATGateway: !Equals [!Ref IsLocalZone, 'false'] ########################## @@ -279,15 +291,97 @@ Resources: # Create a NAT GW then add it to the public subnet NATGateway: + Condition: UseNATGateway Type: AWS::EC2::NatGateway Properties: AllocationId: !GetAtt ElasticIP.AllocationId SubnetId: !Ref PublicSubnet ElasticIP: + Condition: UseNATGateway + Type: AWS::EC2::EIP + Properties: + Domain: vpc + + # For GB200: Create a NAT Instance for the DFW Local Zone and an EIP + LocalZoneEIP: + Condition: UseNATInstance Type: AWS::EC2::EIP Properties: Domain: vpc + NetworkBorderGroup: !Sub "${AWS::Region}-dfw-2" # TODO: CURRENTLY HARDCODED TO DFW. + + NATInstance: + Condition: UseNATInstance + Type: AWS::EC2::Instance + Properties: + ImageId: ami-00ca32bbc84273381 # TODO: CURRENTLY HARDCODED TO IAD -- create map. 
+ InstanceType: c6i.2xlarge + SubnetId: !Ref PublicSubnet + SecurityGroupIds: [!Ref NATSecurityGroup] + SourceDestCheck: false + IamInstanceProfile: !Ref NATInstanceProfile + UserData: + Fn::Base64: !Sub | + #!/bin/bash -ex + yum update -y + yum install iptables-services -y + systemctl enable iptables + systemctl start iptables + echo 'net.ipv4.ip_forward=1' > /etc/sysctl.d/custom-ip-forwarding.conf + sysctl -p /etc/sysctl.d/custom-ip-forwarding.conf + /sbin/iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + /sbin/iptables -F FORWARD + service iptables save + + # Associate EIP and disable source/dest check + TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") + INSTANCEID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) + aws --region ${AWS::Region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${LocalZoneEIP.AllocationId} + + NATSecurityGroup: + Condition: UseNATInstance + Type: AWS::EC2::SecurityGroup + Properties: + VpcId: !Ref VPC + GroupDescription: NAT Instance Security Group + SecurityGroupIngress: + - CidrIp: 10.0.0.0/8 + IpProtocol: "-1" + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + IpProtocol: "-1" + Tags: + - Key: Name + Value: NATSecurityGroup + + NATInstanceProfile: + Condition: UseNATInstance + Type: AWS::IAM::InstanceProfile + Properties: + Roles: [!Ref NATInstanceRole] + + NATInstanceRole: + Condition: UseNATInstance + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Principal: + Service: ["ec2.amazonaws.com"] + Action: "sts:AssumeRole" + Policies: + - PolicyName: AssociateEIP + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - ec2:AssociateAddress + - ec2:ModifyInstanceAttribute + Resource: "*" # NOTE: when you create additional security groups, you must ensure that every # security group has ingress/egress from/to 
its own security group id. Failure @@ -392,7 +486,9 @@ Resources: Properties: RouteTableId: !Ref PrivateRouteTable DestinationCidrBlock: 0.0.0.0/0 - NatGatewayId: !Ref NATGateway + NatGatewayId: !If [UseNATGateway, !Ref NATGateway, !Ref 'AWS::NoValue'] + InstanceId: !If [UseNATInstance, !Ref NATInstance, !Ref 'AWS::NoValue'] + # Associate the public route table to the public subnet PublicSubnetRouteTableAssociation: @@ -630,4 +726,4 @@ Outputs: FSxOpenZFSFileSystemDNSname: Condition: CreateOpenZFSCondition Description: The DNS of the FSxOpenZFS filesystem that has been created - Value: !GetAtt FSxOpenZFSFileSystem.DNSName \ No newline at end of file + Value: !GetAtt FSxOpenZFSFileSystem.DNSName From d0525070b43decf7bd52e746f9978a177bff8e56 Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:41:38 -0500 Subject: [PATCH 02/15] Adding options for both NAT Gateway (High Availability with cross AZ charges) + NAT Instance (not HA, but no cross AZ) Need to test both. --- .../sagemaker-hyperpod.yaml | 79 +++++++++++++++---- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index 43232fc40..5b786ebd6 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -13,8 +13,6 @@ Description: > 1.2 TB storage which can be overridden by parameter. A role is also created which helps to execute HyperPod cluster operations. -#TODO: DO THIS FOR EKS TOO. 
- #################### ## Stack Metadata ## #################### @@ -33,6 +31,8 @@ Metadata: - PrimarySubnetAZ - BackupSubnetAZ - IsLocalZone + - LocalZoneNATType + - NATGatewayAZ - Label: default: FSx Lustre configuration Parameters: @@ -186,6 +186,19 @@ Parameters: - 'false' Description: Set to true if you are using a local zone for GB200 (DFW only currently). + LocalZoneNATType: + Type: String + Default: 'gateway' + AllowedValues: + - 'gateway' + - 'instance' + Description: For Local Zones - NAT Gateway (high availability, cross-AZ charges) or NAT Instance (no cross-AZ charges, no HA) + + NATGatewayAZ: + Type: String + Default: 'use1-az2' + Description: Standard AZ for NAT Gateway when using Local Zone with gateway option + ############################### ## Conditions for Parameters ## ############################### @@ -196,8 +209,10 @@ Conditions: CreateSSMDocument: !Equals [!Ref 'SSMDocumentName', 'true'] CreateOpenZFSCondition: !Equals [!Ref 'CreateOpenZFS', 'true'] ConfigureCustomIops: !Not [!Equals [!Ref OpenZFSIops, 0]] - UseNATInstance: !Equals [!Ref IsLocalZone, 'true'] - UseNATGateway: !Equals [!Ref IsLocalZone, 'false'] + UseLocalZoneNATGateway: !And [!Equals [!Ref IsLocalZone, 'true'], !Equals [!Ref LocalZoneNATType, 'gateway']] + UseLocalZoneNATInstance: !And [!Equals [!Ref IsLocalZone, 'true'], !Equals [!Ref LocalZoneNATType, 'instance']] + UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false'] + ########################## @@ -291,28 +306,60 @@ Resources: # Create a NAT GW then add it to the public subnet NATGateway: - Condition: UseNATGateway + Condition: UseStandardNATGateway Type: AWS::EC2::NatGateway Properties: AllocationId: !GetAtt ElasticIP.AllocationId SubnetId: !Ref PublicSubnet ElasticIP: - Condition: UseNATGateway + Condition: UseStandardNATGateway Type: AWS::EC2::EIP Properties: Domain: vpc - # For GB200: Create a NAT Instance for the DFW Local Zone and an EIP + ### IF YOU ARE USING A LOCAL ZONE, THIS CF STACK WILL EITHER CREATE A 
NAT GATEWAY OR A NAT INSTANCE + NATGatewaySubnet: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::Subnet + Properties: + MapPublicIpOnLaunch: true + VpcId: !Ref VPC + CidrBlock: !Select [2, !Cidr [!GetAtt VPC.CidrBlock, 4, 8]] + AvailabilityZoneId: !Ref NATGatewayAZ + Tags: + - Key: Name + Value: NAT Gateway Subnet + + NATGatewaySubnetRouteTableAssociation: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref NATGatewaySubnet + RouteTableId: !Ref PublicRouteTable + + LocalZoneNATGateway: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::NatGateway + Properties: + AllocationId: !GetAtt LocalZoneNATGatewayEIP.AllocationId + SubnetId: !Ref NATGatewaySubnet + + LocalZoneNATGatewayEIP: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::EIP + Properties: + Domain: vpc + LocalZoneEIP: - Condition: UseNATInstance + Condition: UseLocalZoneNATInstance Type: AWS::EC2::EIP Properties: Domain: vpc NetworkBorderGroup: !Sub "${AWS::Region}-dfw-2" # TODO: CURRENTLY HARDCODED TO DFW. NATInstance: - Condition: UseNATInstance + Condition: UseLocalZoneNATInstance Type: AWS::EC2::Instance Properties: ImageId: ami-00ca32bbc84273381 # TODO: CURRENTLY HARDCODED TO IAD -- create map. 
@@ -340,7 +387,7 @@ Resources: aws --region ${AWS::Region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${LocalZoneEIP.AllocationId} NATSecurityGroup: - Condition: UseNATInstance + Condition: UseLocalZoneNATInstance Type: AWS::EC2::SecurityGroup Properties: VpcId: !Ref VPC @@ -356,13 +403,13 @@ Resources: Value: NATSecurityGroup NATInstanceProfile: - Condition: UseNATInstance + Condition: UseLocalZoneNATInstance Type: AWS::IAM::InstanceProfile Properties: Roles: [!Ref NATInstanceRole] NATInstanceRole: - Condition: UseNATInstance + Condition: UseLocalZoneNATInstance Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: @@ -486,9 +533,11 @@ Resources: Properties: RouteTableId: !Ref PrivateRouteTable DestinationCidrBlock: 0.0.0.0/0 - NatGatewayId: !If [UseNATGateway, !Ref NATGateway, !Ref 'AWS::NoValue'] - InstanceId: !If [UseNATInstance, !Ref NATInstance, !Ref 'AWS::NoValue'] - + NatGatewayId: !If + - UseStandardNATGateway + - !Ref NATGateway + - !If [UseLocalZoneNATGateway, !Ref LocalZoneNATGateway, !Ref 'AWS::NoValue'] + InstanceId: !If [UseLocalZoneNATInstance, !Ref NATInstance, !Ref 'AWS::NoValue'] # Associate the public route table to the public subnet PublicSubnetRouteTableAssociation: From 862e50d879798f785f3eb7fbc76711f062eb22e3 Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Fri, 12 Sep 2025 08:56:15 -0700 Subject: [PATCH 03/15] Update sagemaker-hyperpod.yaml --- .../sagemaker-hyperpod.yaml | 98 +------------------ 1 file changed, 4 insertions(+), 94 deletions(-) diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index 5b786ebd6..53844058a 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -31,7 +31,6 @@ Metadata: - PrimarySubnetAZ - BackupSubnetAZ - IsLocalZone - - LocalZoneNATType - NATGatewayAZ - Label: 
default: FSx Lustre configuration @@ -185,14 +184,6 @@ Parameters: - 'true' - 'false' Description: Set to true if you are using a local zone for GB200 (DFW only currently). - - LocalZoneNATType: - Type: String - Default: 'gateway' - AllowedValues: - - 'gateway' - - 'instance' - Description: For Local Zones - NAT Gateway (high availability, cross-AZ charges) or NAT Instance (no cross-AZ charges, no HA) NATGatewayAZ: Type: String @@ -209,8 +200,7 @@ Conditions: CreateSSMDocument: !Equals [!Ref 'SSMDocumentName', 'true'] CreateOpenZFSCondition: !Equals [!Ref 'CreateOpenZFS', 'true'] ConfigureCustomIops: !Not [!Equals [!Ref OpenZFSIops, 0]] - UseLocalZoneNATGateway: !And [!Equals [!Ref IsLocalZone, 'true'], !Equals [!Ref LocalZoneNATType, 'gateway']] - UseLocalZoneNATInstance: !And [!Equals [!Ref IsLocalZone, 'true'], !Equals [!Ref LocalZoneNATType, 'instance']] + UseLocalZoneNATGateway: !And [!Equals [!Ref IsLocalZone, 'true']] UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false'] @@ -318,7 +308,7 @@ Resources: Properties: Domain: vpc - ### IF YOU ARE USING A LOCAL ZONE, THIS CF STACK WILL EITHER CREATE A NAT GATEWAY OR A NAT INSTANCE + ### IF YOU ARE USING A LOCAL ZONE, THIS CF STACK WILL CREATE A NAT GATEWAY NATGatewaySubnet: Condition: UseLocalZoneNATGateway Type: AWS::EC2::Subnet @@ -350,86 +340,7 @@ Resources: Type: AWS::EC2::EIP Properties: Domain: vpc - - LocalZoneEIP: - Condition: UseLocalZoneNATInstance - Type: AWS::EC2::EIP - Properties: - Domain: vpc - NetworkBorderGroup: !Sub "${AWS::Region}-dfw-2" # TODO: CURRENTLY HARDCODED TO DFW. - - NATInstance: - Condition: UseLocalZoneNATInstance - Type: AWS::EC2::Instance - Properties: - ImageId: ami-00ca32bbc84273381 # TODO: CURRENTLY HARDCODED TO IAD -- create map. 
- InstanceType: c6i.2xlarge - SubnetId: !Ref PublicSubnet - SecurityGroupIds: [!Ref NATSecurityGroup] - SourceDestCheck: false - IamInstanceProfile: !Ref NATInstanceProfile - UserData: - Fn::Base64: !Sub | - #!/bin/bash -ex - yum update -y - yum install iptables-services -y - systemctl enable iptables - systemctl start iptables - echo 'net.ipv4.ip_forward=1' > /etc/sysctl.d/custom-ip-forwarding.conf - sysctl -p /etc/sysctl.d/custom-ip-forwarding.conf - /sbin/iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - /sbin/iptables -F FORWARD - service iptables save - - # Associate EIP and disable source/dest check - TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") - INSTANCEID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) - aws --region ${AWS::Region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${LocalZoneEIP.AllocationId} - - NATSecurityGroup: - Condition: UseLocalZoneNATInstance - Type: AWS::EC2::SecurityGroup - Properties: - VpcId: !Ref VPC - GroupDescription: NAT Instance Security Group - SecurityGroupIngress: - - CidrIp: 10.0.0.0/8 - IpProtocol: "-1" - SecurityGroupEgress: - - CidrIp: 0.0.0.0/0 - IpProtocol: "-1" - Tags: - - Key: Name - Value: NATSecurityGroup - - NATInstanceProfile: - Condition: UseLocalZoneNATInstance - Type: AWS::IAM::InstanceProfile - Properties: - Roles: [!Ref NATInstanceRole] - - NATInstanceRole: - Condition: UseLocalZoneNATInstance - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Principal: - Service: ["ec2.amazonaws.com"] - Action: "sts:AssumeRole" - Policies: - - PolicyName: AssociateEIP - PolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: Allow - Action: - - ec2:AssociateAddress - - ec2:ModifyInstanceAttribute - Resource: "*" - + # NOTE: when you create additional security groups, you must ensure that every # security 
group has ingress/egress from/to its own security group id. Failure # to do so may cause trn1/p4d/p4de/p5 SMHP cluster creation to fail: @@ -536,8 +447,7 @@ Resources: NatGatewayId: !If - UseStandardNATGateway - !Ref NATGateway - - !If [UseLocalZoneNATGateway, !Ref LocalZoneNATGateway, !Ref 'AWS::NoValue'] - InstanceId: !If [UseLocalZoneNATInstance, !Ref NATInstance, !Ref 'AWS::NoValue'] + - !Ref LocalZoneNATGateway # Associate the public route table to the public subnet PublicSubnetRouteTableAssociation: From 8b3efda48bc855b724b8c94739346dadf021c8b6 Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Fri, 12 Sep 2025 08:59:43 -0700 Subject: [PATCH 04/15] Removed NAT Instance From 0f9cbc35df9f6f0557ebbf5ee5de0b5ad3092e3f Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Mon, 15 Sep 2025 12:53:50 -0500 Subject: [PATCH 05/15] Fixing syntax error and removing !And call to get rid of NAT Instance --- 1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index 53844058a..4074ae189 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -200,7 +200,7 @@ Conditions: CreateSSMDocument: !Equals [!Ref 'SSMDocumentName', 'true'] CreateOpenZFSCondition: !Equals [!Ref 'CreateOpenZFS', 'true'] ConfigureCustomIops: !Not [!Equals [!Ref OpenZFSIops, 0]] - UseLocalZoneNATGateway: !And [!Equals [!Ref IsLocalZone, 'true']] + UseLocalZoneNATGateway: !Equals [!Ref IsLocalZone, 'true'] UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false'] From 46e6c16bda1bc50b85e18d91f20d56d37e87326a Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Mon, 15 Sep 2025 18:10:29 
-0500 Subject: [PATCH 06/15] Update sagemaker-hyperpod.yaml -- Brute forcing NATGW subnet --- 1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index 4074ae189..39ae6e06c 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -315,7 +315,7 @@ Resources: Properties: MapPublicIpOnLaunch: true VpcId: !Ref VPC - CidrBlock: !Select [2, !Cidr [!GetAtt VPC.CidrBlock, 4, 8]] + CidrBlock: 10.0.128.0/24 AvailabilityZoneId: !Ref NATGatewayAZ Tags: - Key: Name From af02f06f836f77a6cd035ab3ad7d61cc7d490088 Mon Sep 17 00:00:00 2001 From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:47:16 -0500 Subject: [PATCH 07/15] Editing EKS SMHP CF deployment to include LZ (for GB200) - main stack --- .../nested-stacks/main-stack.yaml | 199 ++++++++++++++++-- 1 file changed, 180 insertions(+), 19 deletions(-) diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml index 0ff66f824..83b24ed05 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml +++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml @@ -31,6 +31,11 @@ Metadata: Parameters: - AvailabilityZoneId - PrivateSubnet1CIDR + - Label: + default: Local Zone Configuration (for GB200) + Parameters: + - IsLocalZone + - NATGatewayAZ - Label: default: Parameter Checks Recommended If EKSClusterStack == true Parameters: @@ -50,6 +55,11 @@ Metadata: Parameters: - HyperPodClusterName - NodeRecovery + - UseContinuousNodeProvisioningMode + - Label: + default: Parameters for Accelerated Instance Group + Parameters: + 
- CreateAcceleratedInstanceGroup - AcceleratedInstanceGroupName - AcceleratedInstanceType - AcceleratedInstanceCount @@ -58,6 +68,11 @@ Metadata: - EnableInstanceStressCheck - EnableInstanceConnectivityCheck - AcceleratedLifeCycleConfigOnCreate + - AcceleratedTrainingPlanArn + - AcceleratedImageId + - Label: + default: Parameters for General Purpose Instance Group + Parameters: - CreateGeneralPurposeInstanceGroup - GeneralPurposeInstanceGroupName - GeneralPurposeInstanceType @@ -65,6 +80,22 @@ Metadata: - GeneralPurposeEBSVolumeSize - GeneralPurposeThreadsPerCore - GeneralPurposeLifeCycleConfigOnCreate + - GeneralPurposeImageId + - Label: + default: Parameters for Restricted Instance Group + Parameters: + - CreateRestrictedInstanceGroup + - RestrictedInstanceGroupName + - RestrictedInstanceType + - RestrictedInstanceCount + - RestrictedPerUnitStorageThroughput + - RestrictedSizeInGiB + - RestrictedEBSVolumeSize + - EnableRestrictedInstanceStressCheck + - EnableRestrictedInstanceConnectivityCheck + - RestrictedThreadsPerCore + - RestrictedTrainingPlanArn + - Label: default: Parameter Overrides Required If CreateVPCStack == false and Any(CreatePrivateSubnetStack, CreateSecurityGroupStack, CreateEKSClusterStack, CreateS3EndpointStack) == true Parameters: @@ -217,12 +248,23 @@ Parameters: Type: String Default: nat-1234567890abcdef0 + IsLocalZone: + Type: String + Default: 'false' + AllowedValues: ['true', 'false'] + Description: Set to true if using Local Zone Deployment (for GB200) + + NATGatewayAZ: + Type: String + Default: 'use1-az2' + Description: Standard AZ for NAT Gateway when using Local Zone (for GB200) + ### ---------------- EKS Params ----------------### KubernetesVersion: Description: The Kubernetes version to use for the EKS cluster. Type: String - Default: '1.31' + Default: '1.32' EKSClusterName: Description: The name of the newly created of preexisting EKS cluster you wish to use. 
@@ -313,6 +355,14 @@ Parameters: AllowedValues: - Automatic - None + + UseContinuousNodeProvisioningMode: + Description: Whether to enable continuous node provisioning mode for the HyperPod cluster. + Type: String + Default: true + AllowedValues: + - true + - false SageMakerIAMRoleName: Description: The name of the IAM role that SageMaker will use to access the AWS resources on your behalf. @@ -327,12 +377,20 @@ Parameters: Type: String Default: subnet-1234567890abcdef0 - ### ---------------- Accelerated Instance Group 1 Params----------------### + ### ---------------- Accelerated Instance Group Params----------------### + + CreateAcceleratedInstanceGroup: + Description: Whether to create an accelerated instance group for the HyperPod cluster. + Type: String + Default: true + AllowedValues: + - true + - false AcceleratedInstanceGroupName: Description: The name of the accelerated instance group for the HyperPod cluster. Type: String - Default: accelerated-worker-group-1 + Default: accelerated-instance-group AcceleratedInstanceType: Description: The instance type of the accelerated instance group for the HyperPod cluster. @@ -354,14 +412,14 @@ Parameters: AcceleratedThreadsPerCore: Description: The number of threads per CPU core in the accelerated instance group for the HyperPod cluster. 
Type: Number + Default: 1 AllowedValues: - 1 - 2 - Default: 1 EnableInstanceStressCheck: Type: String - Description: Enable Instance Stress deep health check + Description: Enable instance stress deep health check Default: true AllowedValues: - true @@ -369,7 +427,7 @@ Parameters: EnableInstanceConnectivityCheck: Type: String - Description: Enable Instance Connectivity deep health check + Description: Enable instance connectivity deep health check Default: true AllowedValues: - true @@ -380,12 +438,22 @@ Parameters: Type: String Default: on_create.sh + AcceleratedTrainingPlanArn: + Description: The ARN of the accelerated instance group training plan + Type: String + Default: "" + + AcceleratedImageId: + Description: The AMI ID of the accelerated instance group for the HyperPod cluster. + Type: String + Default: "" + ### ---------------- General Purpose Instance Group 2 Params ----------------### CreateGeneralPurposeInstanceGroup: Description: Whether to create a general purpose instance group for the HyperPod cluster. Type: String - Default: true + Default: false AllowedValues: - true - false @@ -393,7 +461,7 @@ Parameters: GeneralPurposeInstanceGroupName: Description: The name of the general purpose instance group for the HyperPod cluster. Type: String - Default: general-purpose-worker-group-2 + Default: general-purpose-instance-group GeneralPurposeInstanceType: Description: The instance type of the general purpose instance group for the HyperPod cluster. @@ -415,15 +483,90 @@ Parameters: GeneralPurposeThreadsPerCore: Description: The number of threads per CPU core in the general purpose instance group for the HyperPod cluster. Type: Number + Default: 1 AllowedValues: - 1 - 2 - Default: 1 GeneralPurposeLifeCycleConfigOnCreate: Description: The file name of lifecycle script for the general purpose instance group. This script runs during cluster creation. 
Type: String Default: on_create.sh + + GeneralPurposeImageId: + Description: The AMI ID of the general purpose instance group for the HyperPod cluster. + Type: String + Default: "" + + ### ---------------- Restricted Instance Group Params ----------------### + CreateRestrictedInstanceGroup: + Description: Whether to create a restricted instance group for the HyperPod cluster. + Type: String + Default: false + AllowedValues: + - true + - false + + RestrictedInstanceGroupName: + Description: The name of the restricted instance group for the HyperPod cluster. + Type: String + Default: restricted-instance-group + + RestrictedInstanceType: + Description: The instance type of the restricted instance group for the HyperPod cluster. + Type: String + Default: ml.g5.8xlarge + + RestrictedInstanceCount: + Description: The number of instances in the restricted instance group for the HyperPod cluster. + Type: Number + Default: 1 + + RestrictedPerUnitStorageThroughput: + Description: The throughput for the service-managed FSx for Lustre file system + Type: Number + Default: 250 + + RestrictedSizeInGiB: + Description: The volume size for the service-managed FSx for Lustre file system + Type: Number + Default: 1200 + + RestrictedEBSVolumeSize: + Description: > + The size in gigabytes (GB) of the additional EBS volume to be attached + to the instances in the restricted instance group for the HyperPod cluster. + Type: Number + Default: 500 + + EnableRestrictedInstanceStressCheck: + Type: String + Description: Enable restricted instance stress deep health check + Default: true + AllowedValues: + - true + - false + + EnableRestrictedInstanceConnectivityCheck: + Type: String + Description: Enable restricted instance connectivity deep health check + Default: true + AllowedValues: + - true + - false + + RestrictedThreadsPerCore: + Description: The number of threads per CPU core in the restricted instance group for the HyperPod cluster. 
+ Type: Number + Default: 1 + AllowedValues: + - 1 + - 2 + + RestrictedTrainingPlanArn: + Description: The ARN of the restricted instance group training plan + Type: String + Default: "" ### ---------------- Condition Params ----------------### @@ -573,6 +716,8 @@ Resources: VpcCIDR: !Ref VpcCIDR PublicSubnet1CIDR: !Ref PublicSubnet1CIDR PublicSubnet2CIDR: !Ref PublicSubnet2CIDR + IsLocalZone: !Ref IsLocalZone + NATGatewayAZ: !Ref NATGatewayAZ PrivateSubnetStack: Type: AWS::CloudFormation::Stack @@ -774,6 +919,8 @@ Resources: Parameters: HyperPodClusterName: !Ref HyperPodClusterName NodeRecovery: !Ref NodeRecovery + UseContinuousNodeProvisioningMode: !Ref UseContinuousNodeProvisioningMode + CreateAcceleratedInstanceGroup: !Ref CreateAcceleratedInstanceGroup AcceleratedInstanceGroupName: !Ref AcceleratedInstanceGroupName AcceleratedInstanceType: !Ref AcceleratedInstanceType AcceleratedInstanceCount: !Ref AcceleratedInstanceCount @@ -782,6 +929,8 @@ Resources: EnableInstanceStressCheck: !Ref EnableInstanceStressCheck EnableInstanceConnectivityCheck: !Ref EnableInstanceConnectivityCheck AcceleratedLifeCycleConfigOnCreate: !Ref AcceleratedLifeCycleConfigOnCreate + AcceleratedTrainingPlanArn: !Ref AcceleratedTrainingPlanArn + AcceleratedImageId: !Ref AcceleratedImageId CreateGeneralPurposeInstanceGroup: !Ref CreateGeneralPurposeInstanceGroup GeneralPurposeInstanceGroupName: !Ref GeneralPurposeInstanceGroupName GeneralPurposeInstanceType: !Ref GeneralPurposeInstanceType @@ -789,6 +938,18 @@ Resources: GeneralPurposeEBSVolumeSize: !Ref GeneralPurposeEBSVolumeSize GeneralPurposeThreadsPerCore: !Ref GeneralPurposeThreadsPerCore GeneralPurposeLifeCycleConfigOnCreate: !Ref GeneralPurposeLifeCycleConfigOnCreate + GeneralPurposeImageId: !Ref GeneralPurposeImageId + CreateRestrictedInstanceGroup: !Ref CreateRestrictedInstanceGroup + RestrictedInstanceGroupName: !Ref RestrictedInstanceGroupName + RestrictedInstanceType: !Ref RestrictedInstanceType + RestrictedInstanceCount: !Ref 
RestrictedInstanceCount + RestrictedPerUnitStorageThroughput: !Ref RestrictedPerUnitStorageThroughput + RestrictedSizeInGiB: !Ref RestrictedSizeInGiB + RestrictedEBSVolumeSize: !Ref RestrictedEBSVolumeSize + EnableRestrictedInstanceStressCheck: !Ref EnableRestrictedInstanceStressCheck + EnableRestrictedInstanceConnectivityCheck: !Ref EnableRestrictedInstanceConnectivityCheck + RestrictedThreadsPerCore: !Ref RestrictedThreadsPerCore + RestrictedTrainingPlanArn: !Ref RestrictedTrainingPlanArn # Used to conditionally force HyperPod to wait on the HelmChartStack (if deployed) HelmChartStatus: !If - CreateHelmChartStack @@ -817,38 +978,38 @@ Resources: ### ---------------- Outputs ----------------### Outputs: - VpcId: + OutputVpcId: Condition: CreateVPCStack Value: !GetAtt VPCStack.Outputs.VpcId - PrivateSubnetId: + OutputPrivateSubnetIds: Condition: CreatePrivateSubnetStack Value: !GetAtt PrivateSubnetStack.Outputs.PrivateSubnetId - SecurityGroupId: + OutputSecurityGroupId: Condition: CreateSecurityGroupStack Value: !GetAtt SecurityGroupStack.Outputs.SecurityGroupId - EKSClusterArn: + OutputEKSClusterArn: Condition: CreateEKSClusterStack Value: !GetAtt EKSClusterStack.Outputs.EKSClusterArn - EKSClusterName: + OutputEKSClusterName: Condition: CreateEKSClusterStack Value: !GetAtt EKSClusterStack.Outputs.EKSClusterName - SageMakerIAMRoleArn: + OutputSageMakerIAMRoleArn: Condition: CreateSageMakerIAMRoleStack Value: !GetAtt SageMakerIAMRoleStack.Outputs.SageMakerIAMRoleArn - S3BucketName: + OutputS3BucketName: Condition: CreateS3BucketStack Value: !GetAtt S3BucketStack.Outputs.S3BucketName - HyperPodClusterName: + OutputHyperPodClusterName: Condition: CreateHyperPodClusterStack Value: !GetAtt HyperPodClusterStack.Outputs.HyperPodClusterName - HyperPodClusterArn: + OutputHyperPodClusterArn: Condition: CreateHyperPodClusterStack - Value: !GetAtt HyperPodClusterStack.Outputs.HyperPodClusterArn \ No newline at end of file + Value: !GetAtt 
HyperPodClusterStack.Outputs.HyperPodClusterArn

From 9f8c10cb582299e392def7302b51cef6fbd08fb0 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Tue, 16 Sep 2025 13:48:23 -0500
Subject: [PATCH 08/15] Adding EKS SMHP support for LZ/GB200 -- VPC Stack

---
Review notes (text between the three-dash line and the diff is ignored by git am):
- LocalZoneNATGatewaySubnet sets AvailabilityZoneId rather than
  AvailabilityZone, because NATGatewayAZ carries a zone ID (default
  'use1-az2'), not a zone name.
- LocalZoneNATGatewayEIP declares DependsOn: InternetGatewayAttachment, the
  same way NatGateway1EIP and NatGateway2EIP do.
- The post-image blob hash on the "index" line predates these two edits.
- NOTE(review): LocalZoneNATGatewaySubnet hard-codes CidrBlock
  10.192.128.0/24 while the public-subnet CIDR above is exposed as a
  parameter; consider parameterizing it.

 .../nested-stacks/vpc-stack.yaml | 64 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml
index 8333bb970..e644e1443 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml
@@ -21,9 +21,23 @@ Parameters:
     Description: The IP range (CIDR notation) for the public subnet in the second Availability Zone.
     Type: String
     Default: 10.192.11.0/24
+
+  IsLocalZone:
+    Type: String
+    Default: 'false'
+    AllowedValues: ['true', 'false']
+    Description: Set to true if using Local Zone Deployment (for GB200)
+
+  NATGatewayAZ:
+    Type: String
+    Default: 'use1-az2'
+    Description: Standard Availability Zone Id (for example use1-az2) for the NAT Gateway subnet when using Local Zone Deployment (for GB200)
 
-Resources:
+Conditions:
+  UseLocalZoneNATGateway: !Equals [!Ref IsLocalZone, 'true']
+  UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false']
 
+Resources:
   VPC:
     Type: AWS::EC2::VPC
     Properties:
@@ -69,30 +83,71 @@ Resources:
         - Key: Name
           Value: !Sub ${ResourceNamePrefix}-SMHP-Public2
 
+  # Standard NAT Gateway
+
   NatGateway1EIP:
     Type: AWS::EC2::EIP
+    Condition: UseStandardNATGateway
     DependsOn: InternetGatewayAttachment
     Properties:
       Domain: vpc
 
   NatGateway2EIP:
     Type: AWS::EC2::EIP
+    Condition: UseStandardNATGateway
     DependsOn: InternetGatewayAttachment
     Properties:
       Domain: vpc
 
   NatGateway1:
     Type: AWS::EC2::NatGateway
+    Condition: UseStandardNATGateway
     Properties:
       AllocationId: !GetAtt NatGateway1EIP.AllocationId
       SubnetId: !Ref PublicSubnet1
 
   NatGateway2:
     Type: AWS::EC2::NatGateway
+    Condition: UseStandardNATGateway
     Properties:
       AllocationId: !GetAtt NatGateway2EIP.AllocationId
       SubnetId: !Ref PublicSubnet2
 
+  # Local Zone NAT Gateway
+
+  LocalZoneNATGatewaySubnet:
+    Condition: UseLocalZoneNATGateway
+    Type: AWS::EC2::Subnet
+    Properties:
+      VpcId: !Ref VPC
+      CidrBlock: 10.192.128.0/24
+      AvailabilityZoneId: !Ref NATGatewayAZ
+      MapPublicIpOnLaunch: true
+      Tags:
+        - Key: Name
+          Value: !Sub ${ResourceNamePrefix}-SMHP-Public-NAT-GW-Subnet
+
+  LocalZoneNATGatewaySubnetAssociation:
+    Condition: UseLocalZoneNATGateway
+    Type: AWS::EC2::SubnetRouteTableAssociation
+    Properties:
+      RouteTableId: !Ref PublicRouteTable
+      SubnetId: !Ref LocalZoneNATGatewaySubnet
+
+  LocalZoneNATGatewayEIP:
+    Condition: UseLocalZoneNATGateway
+    Type: AWS::EC2::EIP
+    DependsOn: InternetGatewayAttachment
+    Properties:
+      Domain: vpc
+
+  LocalZoneNATGateway:
+    Condition: UseLocalZoneNATGateway
+    Type: AWS::EC2::NatGateway
+    Properties:
+      AllocationId: !GetAtt LocalZoneNATGatewayEIP.AllocationId
+      SubnetId: !Ref LocalZoneNATGatewaySubnet
+
   PublicRouteTable:
     Type: AWS::EC2::RouteTable
     Properties:
@@ -124,8 +179,11 @@ Resources:
 Outputs:
   NatGatewayId:
     Description: Nat Gateway Id
-    Value: !Ref NatGateway1
+    Value: !If
+      - UseStandardNATGateway
+      - !Ref NatGateway1
+      - !Ref LocalZoneNATGateway
 
   VpcId:
     Description: VPC Id
-    Value: !Ref VPC
\ No newline at end of file
+    Value: !Ref VPC

From 4f1b5b1a5b268965988e3a167f4ac00fd14a10f9 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 15:55:54 -0700
Subject: [PATCH 09/15] Update main-stack.yaml

Relax constraint for AZ so that Local Zone Ids are accepted
---
Review notes (ignored by git am):
- Instead of commenting the validation out, AllowedPattern is widened with an
  optional "-<3 letters><digit>" segment so that both standard AZ Ids
  (use1-az4) and Local Zone Ids (use1-dfw2-az1) pass.
- The post-image blob hash on the "index" line predates this edit.

 .../cfn-templates/nested-stacks/main-stack.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index 83b24ed05..35810bada 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -229,9 +229,9 @@ Parameters:
       The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity.
     Type: String
     Default: usw2-az2
-    AllowedPattern: ^[a-z]{3,4}[0-9]-az[0-9]$
-    ConstraintDescription: The Availability Zone Id must match the expression
-      ^[a-z]{3,4}[0-9]-az[0-9]$. For example, use1-az4, usw2-az2, or apse1-az2.
+    AllowedPattern: '^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$'
+    ConstraintDescription: The Availability Zone Id must be a standard AZ Id or a Local Zone Id.
+      For example, use1-az4, usw2-az2, apse1-az2, or use1-dfw2-az1.
 
   PrivateSubnet1CIDR:
     Description: The IP range (CIDR notation) for the private subnet to be created in the target Availability Zone.

From 98d724c8857d1cccd09bf6f870feb6a77140d5f8 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 16:14:04 -0700
Subject: [PATCH 10/15] Update main-stack.yaml

---
Review notes (ignored by git am):
- Repoints the us-east-1 entry of AssetS3Buckets at
  awsome-distributed-training and keeps the previous bucket name as a comment.

 .../cfn-templates/nested-stacks/main-stack.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index 35810bada..a478d3c49 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -174,7 +174,8 @@ Mappings:
     sa-east-1:
       BucketName: ws-assets-prod-iad-r-gru-527b8c19222c1182
     us-east-1:
-      BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee
+      # BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee
+      BucketName: awsome-distributed-training
     ap-northeast-2:
       BucketName: ws-assets-prod-iad-r-icn-ced060f0d38bc0b0
     ap-northeast-3:

From 2c1d41447627c9c7334d4206c9a3018a38c11748 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 16:21:26 -0700
Subject: [PATCH 11/15] Update main-stack.yaml

---
Review notes (ignored by git am):
- Switches the key prefix of all ten nested-stack TemplateURLs from
  2433d39e-ccfe-4c00-9d3d-9917b729258e/ to gb200-stacks/.
- NOTE(review): the prefix changes for every region, but within this series
  only the us-east-1 bucket entry is repointed (PATCH 10/15); confirm the
  other regional buckets actually hold gb200-stacks/ objects.

 .../nested-stacks/main-stack.yaml | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index a478d3c49..85fae9e4d 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -706,7 +706,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - 'https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/vpc-stack.yaml'
+          - 'https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/vpc-stack.yaml'
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -726,7 +726,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/private-subnet-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/private-subnet-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -751,7 +751,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/security-group-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/security-group-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -777,7 +777,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/eks-cluster-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/eks-cluster-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -806,7 +806,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/s3-bucket-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/s3-bucket-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -821,7 +821,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/s3-endpoint-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/s3-endpoint-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -843,7 +843,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/lifecycle-script-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/lifecycle-script-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -862,7 +862,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/sagemaker-iam-role-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/sagemaker-iam-role-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -881,7 +881,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/helm-chart-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/helm-chart-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets
@@ -911,7 +911,7 @@ Resources:
     Properties:
       TemplateURL:
         Fn::Sub:
-          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/hyperpod-cluster-stack.yaml"
+          - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/hyperpod-cluster-stack.yaml"
           - BucketName:
               Fn::FindInMap:
                 - AssetS3Buckets

From fff5ec62876a56e16b3bf76c97c5741eee7aed80 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 16:24:53 -0700
Subject: [PATCH 12/15] Update main-stack.yaml

---
Review notes (ignored by git am):
- Adds the GB200 hint (use1-dfw2-az1) to the AvailabilityZoneId description.
- The trailing context lines reflect the relaxed AllowedPattern introduced in
  PATCH 09/15 of this series.
- NOTE(review): the removed/added pairs for the "accelerated compute
  capacity." line and the "Default:" line differ only in whitespace that is
  not recoverable from this copy; verify them against the commit.

 .../cfn-templates/nested-stacks/main-stack.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index 85fae9e4d..31c9ee5b2 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -227,9 +227,10 @@ Parameters:
     Description: >
       The Availability Zone Id you wish to create a private subnet in.
       This private subnet will be used by HyperPod to deploy cross-account ENIs.
-      The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity.
+      The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity.
+      If this is for GB200, use use1-dfw2-az1
     Type: String
-    Default: usw2-az2
+    Default: usw2-az2
     AllowedPattern: '^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$'
     ConstraintDescription: The Availability Zone Id must be a standard AZ Id or a Local Zone Id.
       For example, use1-az4, usw2-az2, apse1-az2, or use1-dfw2-az1.
From 2bb82bd17b9d4ba34a4f35a113c0e9cefdc00ca2 Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 17:05:07 -0700
Subject: [PATCH 13/15] Update private-subnet-stack.yaml

Relax the Availability Zone Id constraint so that Local Zone Ids are accepted
---
Review notes (text between the three-dash line and the diff is ignored by git am):
- Instead of commenting the validation out, AllowedPattern is widened with an
  optional "-<3 letters><digit>" segment so that both standard AZ Ids
  (use1-az4) and Local Zone Ids (use1-dfw2-az1) pass.
- The post-image blob hash on the "index" line predates this edit.

 .../cfn-templates/nested-stacks/private-subnet-stack.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml
index 3d4301853..79e4ea59a 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml
@@ -20,8 +20,8 @@ Parameters:
       The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity.
     Type: String
     Default: usw2-az2
-    AllowedPattern: ^[a-z]{3,4}[0-9]-az[0-9]$
-    ConstraintDescription: The Availability Zone Id must match the expression ^[a-z]{3,4}[0-9]-az[0-9]$. For example, use1-az4, usw2-az2, or apse1-az2.
+    AllowedPattern: '^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$'
+    ConstraintDescription: The Availability Zone Id must be a standard AZ Id (for example use1-az4, usw2-az2, or apse1-az2) or a Local Zone Id (for example use1-dfw2-az1).
 
   VpcId:
     Description: The ID of the VPC you wish to use if you do not want to create a new VPC.
From a2b79001d93affed874ef535c3db30e73f5971ab Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 17:32:01 -0700
Subject: [PATCH 14/15] Update main-stack.yaml

---
Review notes (text between the three-dash line and the diff is ignored by git am):
- First hunk: adds a "us-east-1-extra" key to the AssetS3Buckets mapping that
  holds the bucket name the us-east-1 entry used before it was repointed.
- Second hunk: CustomResourceS3Bucket stops looking the bucket up by
  AWS::Region and uses a fixed mapping key instead.
- The fixed key written here is "us-east-1-alt", which does not match the
  "us-east-1-extra" key added by the first hunk; PATCH 15/15 corrects it.
- NOTE(review): with a fixed key, CustomResourceS3Bucket resolves to the same
  bucket in every region; confirm that is intended for deployments outside
  us-east-1.

 .../cfn-templates/nested-stacks/main-stack.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index 31c9ee5b2..8a88900fb 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -176,6 +176,8 @@ Mappings:
     us-east-1:
       # BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee
       BucketName: awsome-distributed-training
+    us-east-1-extra:
+      BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee
     ap-northeast-2:
       BucketName: ws-assets-prod-iad-r-icn-ced060f0d38bc0b0
     ap-northeast-3:
@@ -897,7 +899,7 @@ Resources:
         CustomResourceS3Bucket:
           Fn::FindInMap:
             - AssetS3Buckets
-            - Ref: 'AWS::Region'
+            - us-east-1-alt
             - BucketName
         LayerS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/lambda-layer.zip'
         FunctionS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/function.zip'

From e528f6d14fef14fec3d641b7050b87104c6930da Mon Sep 17 00:00:00 2001
From: Aman Shanbhag <55571601+amanshanbhag@users.noreply.github.com>
Date: Wed, 15 Oct 2025 17:32:54 -0700
Subject: [PATCH 15/15] Update main-stack.yaml

---
Review notes (ignored by git am):
- Renames the fixed mapping key used by CustomResourceS3Bucket from
  "us-east-1-alt" to "us-east-1-extra", the key that PATCH 14/15 added to
  AssetS3Buckets.

 .../cfn-templates/nested-stacks/main-stack.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
index 8a88900fb..69b3f0673 100644
--- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
+++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml
@@ -899,7 +899,7 @@ Resources:
         CustomResourceS3Bucket:
           Fn::FindInMap:
             - AssetS3Buckets
-            - us-east-1-alt
+            - us-east-1-extra
             - BucketName
         LayerS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/lambda-layer.zip'
         FunctionS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/function.zip'