diff --git a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml index f340fe3c4..39ae6e06c 100644 --- a/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml +++ b/1.architectures/5.sagemaker-hyperpod/sagemaker-hyperpod.yaml @@ -13,7 +13,6 @@ Description: > 1.2 TB storage which can be overridden by parameter. A role is also created which helps to execute HyperPod cluster operations. - #################### ## Stack Metadata ## #################### @@ -31,6 +30,8 @@ Metadata: Parameters: - PrimarySubnetAZ - BackupSubnetAZ + - IsLocalZone + - NATGatewayAZ - Label: default: FSx Lustre configuration Parameters: @@ -59,7 +60,7 @@ Metadata: SSMDocumentName: default: True/False; Create SSM Session Manager Document. Only set to False if SSM-SessionManagerRunShellAsUbuntu document exists in your account. PrimarySubnetAZ: - default: Availability zone id to deploy the primary subnets + default: Availability zone id to deploy the primary subnets (OR set this to your Local Zone ID if you set IsLocalZone to True. Example use1-dfw2-az1) BackupSubnetAZ: default: (Optional) Availability zone id to deploy the backup private subnet CreateS3Endpoint: @@ -175,6 +176,19 @@ Parameters: Default: 0 MinValue: 0 MaxValue: 400000 + + IsLocalZone: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Set to true if you are using a local zone for GB200 (DFW only currently). 
+ + NATGatewayAZ: + Type: String + Default: 'use1-az2' + Description: Standard AZ for NAT Gateway when using Local Zone with gateway option ############################### ## Conditions for Parameters ## @@ -186,6 +200,9 @@ Conditions: CreateSSMDocument: !Equals [!Ref 'SSMDocumentName', 'true'] CreateOpenZFSCondition: !Equals [!Ref 'CreateOpenZFS', 'true'] ConfigureCustomIops: !Not [!Equals [!Ref OpenZFSIops, 0]] + UseLocalZoneNATGateway: !Equals [!Ref IsLocalZone, 'true'] + UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false'] + ########################## @@ -279,16 +296,51 @@ Resources: # Create a NAT GW then add it to the public subnet NATGateway: + Condition: UseStandardNATGateway Type: AWS::EC2::NatGateway Properties: AllocationId: !GetAtt ElasticIP.AllocationId SubnetId: !Ref PublicSubnet ElasticIP: + Condition: UseStandardNATGateway Type: AWS::EC2::EIP Properties: Domain: vpc + + ### IF YOU ARE USING A LOCAL ZONE, THIS CF STACK WILL CREATE A NAT GATEWAY + NATGatewaySubnet: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::Subnet + Properties: + MapPublicIpOnLaunch: true + VpcId: !Ref VPC + CidrBlock: 10.0.128.0/24 + AvailabilityZoneId: !Ref NATGatewayAZ + Tags: + - Key: Name + Value: NAT Gateway Subnet + + NATGatewaySubnetRouteTableAssociation: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + SubnetId: !Ref NATGatewaySubnet + RouteTableId: !Ref PublicRouteTable + LocalZoneNATGateway: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::NatGateway + Properties: + AllocationId: !GetAtt LocalZoneNATGatewayEIP.AllocationId + SubnetId: !Ref NATGatewaySubnet + + LocalZoneNATGatewayEIP: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::EIP + Properties: + Domain: vpc + # NOTE: when you create additional security groups, you must ensure that every # security group has ingress/egress from/to its own security group id. 
Failure # to do so may cause trn1/p4d/p4de/p5 SMHP cluster creation to fail: @@ -392,7 +444,10 @@ Resources: Properties: RouteTableId: !Ref PrivateRouteTable DestinationCidrBlock: 0.0.0.0/0 - NatGatewayId: !Ref NATGateway + NatGatewayId: !If + - UseStandardNATGateway + - !Ref NATGateway + - !Ref LocalZoneNATGateway # Associate the public route table to the public subnet PublicSubnetRouteTableAssociation: @@ -630,4 +685,4 @@ Outputs: FSxOpenZFSFileSystemDNSname: Condition: CreateOpenZFSCondition Description: The DNS of the FSxOpenZFS filesystem that has been created - Value: !GetAtt FSxOpenZFSFileSystem.DNSName \ No newline at end of file + Value: !GetAtt FSxOpenZFSFileSystem.DNSName diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml index 0ff66f824..69b3f0673 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml +++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/main-stack.yaml @@ -31,6 +31,11 @@ Metadata: Parameters: - AvailabilityZoneId - PrivateSubnet1CIDR + - Label: + default: Local Zone Configuration (for GB200) + Parameters: + - IsLocalZone + - NATGatewayAZ - Label: default: Parameter Checks Recommended If EKSClusterStack == true Parameters: @@ -50,6 +55,11 @@ Metadata: Parameters: - HyperPodClusterName - NodeRecovery + - UseContinuousNodeProvisioningMode + - Label: + default: Parameters for Accelerated Instance Group + Parameters: + - CreateAcceleratedInstanceGroup - AcceleratedInstanceGroupName - AcceleratedInstanceType - AcceleratedInstanceCount @@ -58,6 +68,11 @@ Metadata: - EnableInstanceStressCheck - EnableInstanceConnectivityCheck - AcceleratedLifeCycleConfigOnCreate + - AcceleratedTrainingPlanArn + - AcceleratedImageId + - Label: + default: Parameters for General Purpose Instance Group + Parameters: - CreateGeneralPurposeInstanceGroup - 
GeneralPurposeInstanceGroupName - GeneralPurposeInstanceType @@ -65,6 +80,22 @@ Metadata: - GeneralPurposeEBSVolumeSize - GeneralPurposeThreadsPerCore - GeneralPurposeLifeCycleConfigOnCreate + - GeneralPurposeImageId + - Label: + default: Parameters for Restricted Instance Group + Parameters: + - CreateRestrictedInstanceGroup + - RestrictedInstanceGroupName + - RestrictedInstanceType + - RestrictedInstanceCount + - RestrictedPerUnitStorageThroughput + - RestrictedSizeInGiB + - RestrictedEBSVolumeSize + - EnableRestrictedInstanceStressCheck + - EnableRestrictedInstanceConnectivityCheck + - RestrictedThreadsPerCore + - RestrictedTrainingPlanArn + - Label: default: Parameter Overrides Required If CreateVPCStack == false and Any(CreatePrivateSubnetStack, CreateSecurityGroupStack, CreateEKSClusterStack, CreateS3EndpointStack) == true Parameters: @@ -143,6 +174,9 @@ Mappings: sa-east-1: BucketName: ws-assets-prod-iad-r-gru-527b8c19222c1182 us-east-1: + # BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee + BucketName: awsome-distributed-training + us-east-1-extra: BucketName: ws-assets-prod-iad-r-iad-ed304a55c2ca1aee ap-northeast-2: BucketName: ws-assets-prod-iad-r-icn-ced060f0d38bc0b0 @@ -195,12 +229,13 @@ Parameters: Description: > The Availability Zone Id you wish to create a private subnet in. This private subnet will be used by HyperPod to deploy cross-account ENIs. - The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity. + The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity. + If this is for GB200, use use1-dfw2-az1 Type: String - Default: usw2-az2 - AllowedPattern: ^[a-z]{3,4}[0-9]-az[0-9]$ - ConstraintDescription: The Availability Zone Id must match the expression - ^[a-z]{3,4}[0-9]-az[0-9]$. For example, use1-az4, usw2-az2, or apse1-az2. 
+ Default: usw2-az2 + AllowedPattern: ^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$ + ConstraintDescription: The Availability Zone Id must match the expression + ^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$. For example, use1-az4, usw2-az2, or use1-dfw2-az1 (Local Zone). PrivateSubnet1CIDR: Description: The IP range (CIDR notation) for the private subnet to be created in the target Availability Zone. @@ -217,12 +252,23 @@ Parameters: Type: String Default: nat-1234567890abcdef0 + IsLocalZone: + Type: String + Default: 'false' + AllowedValues: ['true', 'false'] + Description: Set to true if using Local Zone Deployment (for GB200) + + NATGatewayAZ: + Type: String + Default: 'use1-az2' + Description: Standard AZ for NAT Gateway when using Local Zone (for GB200) + ### ---------------- EKS Params ----------------### KubernetesVersion: Description: The Kubernetes version to use for the EKS cluster. Type: String - Default: '1.31' + Default: '1.32' EKSClusterName: Description: The name of the newly created of preexisting EKS cluster you wish to use. @@ -313,6 +359,14 @@ Parameters: AllowedValues: - Automatic - None + + UseContinuousNodeProvisioningMode: + Description: Whether to enable continuous node provisioning mode for the HyperPod cluster. + Type: String + Default: true + AllowedValues: + - true + - false SageMakerIAMRoleName: Description: The name of the IAM role that SageMaker will use to access the AWS resources on your behalf. @@ -327,12 +381,20 @@ Parameters: Type: String Default: subnet-1234567890abcdef0 - ### ---------------- Accelerated Instance Group 1 Params----------------### + ### ---------------- Accelerated Instance Group Params----------------### + + CreateAcceleratedInstanceGroup: + Description: Whether to create an accelerated instance group for the HyperPod cluster. + Type: String + Default: true + AllowedValues: + - true + - false AcceleratedInstanceGroupName: Description: The name of the accelerated instance group for the HyperPod cluster. 
Type: String - Default: accelerated-worker-group-1 + Default: accelerated-instance-group AcceleratedInstanceType: Description: The instance type of the accelerated instance group for the HyperPod cluster. @@ -354,14 +416,14 @@ Parameters: AcceleratedThreadsPerCore: Description: The number of threads per CPU core in the accelerated instance group for the HyperPod cluster. Type: Number + Default: 1 AllowedValues: - 1 - 2 - Default: 1 EnableInstanceStressCheck: Type: String - Description: Enable Instance Stress deep health check + Description: Enable instance stress deep health check Default: true AllowedValues: - true @@ -369,7 +431,7 @@ Parameters: EnableInstanceConnectivityCheck: Type: String - Description: Enable Instance Connectivity deep health check + Description: Enable instance connectivity deep health check Default: true AllowedValues: - true @@ -380,12 +442,22 @@ Parameters: Type: String Default: on_create.sh + AcceleratedTrainingPlanArn: + Description: The ARN of the accelerated instance group training plan + Type: String + Default: "" + + AcceleratedImageId: + Description: The AMI ID of the accelerated instance group for the HyperPod cluster. + Type: String + Default: "" + ### ---------------- General Purpose Instance Group 2 Params ----------------### CreateGeneralPurposeInstanceGroup: Description: Whether to create a general purpose instance group for the HyperPod cluster. Type: String - Default: true + Default: false AllowedValues: - true - false @@ -393,7 +465,7 @@ Parameters: GeneralPurposeInstanceGroupName: Description: The name of the general purpose instance group for the HyperPod cluster. Type: String - Default: general-purpose-worker-group-2 + Default: general-purpose-instance-group GeneralPurposeInstanceType: Description: The instance type of the general purpose instance group for the HyperPod cluster. 
@@ -415,15 +487,90 @@ Parameters: GeneralPurposeThreadsPerCore: Description: The number of threads per CPU core in the general purpose instance group for the HyperPod cluster. Type: Number + Default: 1 AllowedValues: - 1 - 2 - Default: 1 GeneralPurposeLifeCycleConfigOnCreate: Description: The file name of lifecycle script for the general purpose instance group. This script runs during cluster creation. Type: String Default: on_create.sh + + GeneralPurposeImageId: + Description: The AMI ID of the general purpose instance group for the HyperPod cluster. + Type: String + Default: "" + + ### ---------------- Restricted Instance Group Params ----------------### + CreateRestrictedInstanceGroup: + Description: Whether to create a restricted instance group for the HyperPod cluster. + Type: String + Default: false + AllowedValues: + - true + - false + + RestrictedInstanceGroupName: + Description: The name of the restricted instance group for the HyperPod cluster. + Type: String + Default: restricted-instance-group + + RestrictedInstanceType: + Description: The instance type of the restricted instance group for the HyperPod cluster. + Type: String + Default: ml.g5.8xlarge + + RestrictedInstanceCount: + Description: The number of instances in the restricted instance group for the HyperPod cluster. + Type: Number + Default: 1 + + RestrictedPerUnitStorageThroughput: + Description: The throughput for the service-managed FSx for Lustre file system + Type: Number + Default: 250 + + RestrictedSizeInGiB: + Description: The volume size for the service-managed FSx for Lustre file system + Type: Number + Default: 1200 + + RestrictedEBSVolumeSize: + Description: > + The size in gigabytes (GB) of the additional EBS volume to be attached + to the instances in the restricted instance group for the HyperPod cluster. 
+ Type: Number + Default: 500 + + EnableRestrictedInstanceStressCheck: + Type: String + Description: Enable restricted instance stress deep health check + Default: true + AllowedValues: + - true + - false + + EnableRestrictedInstanceConnectivityCheck: + Type: String + Description: Enable restricted instance connectivity deep health check + Default: true + AllowedValues: + - true + - false + + RestrictedThreadsPerCore: + Description: The number of threads per CPU core in the restricted instance group for the HyperPod cluster. + Type: Number + Default: 1 + AllowedValues: + - 1 + - 2 + + RestrictedTrainingPlanArn: + Description: The ARN of the restricted instance group training plan + Type: String + Default: "" ### ---------------- Condition Params ----------------### @@ -562,7 +709,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - 'https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/vpc-stack.yaml' + - 'https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/vpc-stack.yaml' - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -573,6 +720,8 @@ Resources: VpcCIDR: !Ref VpcCIDR PublicSubnet1CIDR: !Ref PublicSubnet1CIDR PublicSubnet2CIDR: !Ref PublicSubnet2CIDR + IsLocalZone: !Ref IsLocalZone + NATGatewayAZ: !Ref NATGatewayAZ PrivateSubnetStack: Type: AWS::CloudFormation::Stack @@ -580,7 +729,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/private-subnet-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/private-subnet-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -605,7 +754,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/security-group-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/security-group-stack.yaml" - BucketName: Fn::FindInMap: - 
AssetS3Buckets @@ -631,7 +780,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/eks-cluster-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/eks-cluster-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -660,7 +809,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/s3-bucket-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/s3-bucket-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -675,7 +824,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/s3-endpoint-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/s3-endpoint-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -697,7 +846,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/lifecycle-script-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/lifecycle-script-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -716,7 +865,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/sagemaker-iam-role-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/sagemaker-iam-role-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -735,7 +884,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/helm-chart-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/helm-chart-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ 
-750,7 +899,7 @@ Resources: CustomResourceS3Bucket: Fn::FindInMap: - AssetS3Buckets - - Ref: 'AWS::Region' + - us-east-1-extra - BucketName LayerS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/lambda-layer.zip' FunctionS3Key: '2433d39e-ccfe-4c00-9d3d-9917b729258e/function.zip' @@ -765,7 +914,7 @@ Resources: Properties: TemplateURL: Fn::Sub: - - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/hyperpod-cluster-stack.yaml" + - "https://${BucketName}.s3.${AWS::Region}.amazonaws.com/gb200-stacks/hyperpod-cluster-stack.yaml" - BucketName: Fn::FindInMap: - AssetS3Buckets @@ -774,6 +923,8 @@ Resources: Parameters: HyperPodClusterName: !Ref HyperPodClusterName NodeRecovery: !Ref NodeRecovery + UseContinuousNodeProvisioningMode: !Ref UseContinuousNodeProvisioningMode + CreateAcceleratedInstanceGroup: !Ref CreateAcceleratedInstanceGroup AcceleratedInstanceGroupName: !Ref AcceleratedInstanceGroupName AcceleratedInstanceType: !Ref AcceleratedInstanceType AcceleratedInstanceCount: !Ref AcceleratedInstanceCount @@ -782,6 +933,8 @@ Resources: EnableInstanceStressCheck: !Ref EnableInstanceStressCheck EnableInstanceConnectivityCheck: !Ref EnableInstanceConnectivityCheck AcceleratedLifeCycleConfigOnCreate: !Ref AcceleratedLifeCycleConfigOnCreate + AcceleratedTrainingPlanArn: !Ref AcceleratedTrainingPlanArn + AcceleratedImageId: !Ref AcceleratedImageId CreateGeneralPurposeInstanceGroup: !Ref CreateGeneralPurposeInstanceGroup GeneralPurposeInstanceGroupName: !Ref GeneralPurposeInstanceGroupName GeneralPurposeInstanceType: !Ref GeneralPurposeInstanceType @@ -789,6 +942,18 @@ Resources: GeneralPurposeEBSVolumeSize: !Ref GeneralPurposeEBSVolumeSize GeneralPurposeThreadsPerCore: !Ref GeneralPurposeThreadsPerCore GeneralPurposeLifeCycleConfigOnCreate: !Ref GeneralPurposeLifeCycleConfigOnCreate + GeneralPurposeImageId: !Ref GeneralPurposeImageId + CreateRestrictedInstanceGroup: !Ref CreateRestrictedInstanceGroup + RestrictedInstanceGroupName: !Ref 
RestrictedInstanceGroupName + RestrictedInstanceType: !Ref RestrictedInstanceType + RestrictedInstanceCount: !Ref RestrictedInstanceCount + RestrictedPerUnitStorageThroughput: !Ref RestrictedPerUnitStorageThroughput + RestrictedSizeInGiB: !Ref RestrictedSizeInGiB + RestrictedEBSVolumeSize: !Ref RestrictedEBSVolumeSize + EnableRestrictedInstanceStressCheck: !Ref EnableRestrictedInstanceStressCheck + EnableRestrictedInstanceConnectivityCheck: !Ref EnableRestrictedInstanceConnectivityCheck + RestrictedThreadsPerCore: !Ref RestrictedThreadsPerCore + RestrictedTrainingPlanArn: !Ref RestrictedTrainingPlanArn # Used to conditionally force HyperPod to wait on the HelmChartStack (if deployed) HelmChartStatus: !If - CreateHelmChartStack @@ -817,38 +982,38 @@ Resources: ### ---------------- Outputs ----------------### Outputs: - VpcId: + OutputVpcId: Condition: CreateVPCStack Value: !GetAtt VPCStack.Outputs.VpcId - PrivateSubnetId: + OutputPrivateSubnetIds: Condition: CreatePrivateSubnetStack Value: !GetAtt PrivateSubnetStack.Outputs.PrivateSubnetId - SecurityGroupId: + OutputSecurityGroupId: Condition: CreateSecurityGroupStack Value: !GetAtt SecurityGroupStack.Outputs.SecurityGroupId - EKSClusterArn: + OutputEKSClusterArn: Condition: CreateEKSClusterStack Value: !GetAtt EKSClusterStack.Outputs.EKSClusterArn - EKSClusterName: + OutputEKSClusterName: Condition: CreateEKSClusterStack Value: !GetAtt EKSClusterStack.Outputs.EKSClusterName - SageMakerIAMRoleArn: + OutputSageMakerIAMRoleArn: Condition: CreateSageMakerIAMRoleStack Value: !GetAtt SageMakerIAMRoleStack.Outputs.SageMakerIAMRoleArn - S3BucketName: + OutputS3BucketName: Condition: CreateS3BucketStack Value: !GetAtt S3BucketStack.Outputs.S3BucketName - HyperPodClusterName: + OutputHyperPodClusterName: Condition: CreateHyperPodClusterStack Value: !GetAtt HyperPodClusterStack.Outputs.HyperPodClusterName - HyperPodClusterArn: + OutputHyperPodClusterArn: Condition: CreateHyperPodClusterStack - Value: !GetAtt 
HyperPodClusterStack.Outputs.HyperPodClusterArn \ No newline at end of file + Value: !GetAtt HyperPodClusterStack.Outputs.HyperPodClusterArn diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml index 3d4301853..79e4ea59a 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml +++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/private-subnet-stack.yaml @@ -20,8 +20,8 @@ Parameters: The Availability Zone Id you specify should correspond to the location of your accelerated compute capacity. Type: String Default: usw2-az2 - AllowedPattern: ^[a-z]{3,4}[0-9]-az[0-9]$ - ConstraintDescription: The Availability Zone Id must match the expression ^[a-z]{3,4}[0-9]-az[0-9]$. For example, use1-az4, usw2-az2, or apse1-az2. + AllowedPattern: ^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$ + ConstraintDescription: The Availability Zone Id must match the expression ^[a-z]{3,4}[0-9](-[a-z]{3}[0-9])?-az[0-9]$. For example, use1-az4, usw2-az2, or use1-dfw2-az1 (Local Zone). VpcId: Description: The ID of the VPC you wish to use if you do not want to create a new VPC. diff --git a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml index 8333bb970..e644e1443 100644 --- a/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml +++ b/1.architectures/7.sagemaker-hyperpod-eks/cfn-templates/nested-stacks/vpc-stack.yaml @@ -21,9 +21,23 @@ Parameters: Description: The IP range (CIDR notation) for the public subnet in the second Availability Zone. 
Type: String Default: 10.192.11.0/24 + + IsLocalZone: + Type: String + Default: 'false' + AllowedValues: ['true', 'false'] + Description: Set to true if using Local Zone Deployment (for GB200) + + NATGatewayAZ: + Type: String + Default: 'use1-az2' + Description: Standard AZ for NAT Gateway when using Local Zone Deployment (for GB200) -Resources: +Conditions: + UseLocalZoneNATGateway: !Equals [!Ref IsLocalZone, 'true'] + UseStandardNATGateway: !Equals [!Ref IsLocalZone, 'false'] +Resources: VPC: Type: AWS::EC2::VPC Properties: @@ -69,30 +83,71 @@ Resources: - Key: Name Value: !Sub ${ResourceNamePrefix}-SMHP-Public2 + # Standard NAT Gateway + NatGateway1EIP: Type: AWS::EC2::EIP + Condition: UseStandardNATGateway DependsOn: InternetGatewayAttachment Properties: Domain: vpc NatGateway2EIP: Type: AWS::EC2::EIP + Condition: UseStandardNATGateway DependsOn: InternetGatewayAttachment Properties: Domain: vpc NatGateway1: Type: AWS::EC2::NatGateway + Condition: UseStandardNATGateway Properties: AllocationId: !GetAtt NatGateway1EIP.AllocationId SubnetId: !Ref PublicSubnet1 NatGateway2: Type: AWS::EC2::NatGateway + Condition: UseStandardNATGateway Properties: AllocationId: !GetAtt NatGateway2EIP.AllocationId SubnetId: !Ref PublicSubnet2 + # Local Zone NAT Gateway + + LocalZoneNATGatewaySubnet: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + CidrBlock: 10.192.128.0/24 + AvailabilityZoneId: !Ref NATGatewayAZ + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${ResourceNamePrefix}-SMHP-Public-NAT-GW-Subnet + + LocalZoneNATGatewaySubnetAssociation: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref LocalZoneNATGatewaySubnet + + LocalZoneNATGatewayEIP: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::EIP + DependsOn: InternetGatewayAttachment + Properties: + Domain: vpc + + LocalZoneNATGateway: + Condition: UseLocalZoneNATGateway + Type: AWS::EC2::NatGateway + 
Properties: + AllocationId: !GetAtt LocalZoneNATGatewayEIP.AllocationId + SubnetId: !Ref LocalZoneNATGatewaySubnet + PublicRouteTable: Type: AWS::EC2::RouteTable Properties: @@ -124,8 +179,11 @@ Resources: Outputs: NatGatewayId: Description: Nat Gateway Id - Value: !Ref NatGateway1 + Value: !If + - UseStandardNATGateway + - !Ref NatGateway1 + - !Ref LocalZoneNATGateway VpcId: Description: VPC Id - Value: !Ref VPC \ No newline at end of file + Value: !Ref VPC