diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index cada78b616..f42b0ff1b2 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -1,33 +1,60 @@ +/** + * PMM HA EKS Cleanup Pipeline + * + * Manages cleanup of PMM HA test clusters. Supports manual and scheduled runs. + * Deletes Route53 records, ALB ingress, and EKS clusters. + * + * Actions: + * - LIST_ONLY: List all test clusters with age + * - DELETE_CLUSTER: Delete a specific cluster + * - DELETE_ALL: Delete all test clusters (respects SKIP_NEWEST) + * - DELETE_OLD (cron): Delete clusters older than 24 hours + * - CLEANUP_ORPHANS: Delete orphaned VPCs and failed CF stacks + * + * Related: + * - Create: pmm3-ha-eks.groovy + * - Shared library: vars/pmmHaEks.groovy + */ +library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([ + $class: 'GitSCMSource', + remote: 'https://github.com/Percona-Lab/jenkins-pipelines' +]) + pipeline { agent { - label 'agent-amd64-ol9' + label 'cli' } triggers { - cron('H 0,12 * * *') // Runs twice daily at 00:00 & 12:00 + cron('H 0,12 * * *') } parameters { choice( name: 'ACTION', - choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL'], + choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL', 'CLEANUP_ORPHANS'], description: ''' LIST_ONLY - list all test clusters
DELETE_CLUSTER - delete a specific cluster (requires CLUSTER_NAME)
- DELETE_ALL - delete all test clusters

+ DELETE_ALL - delete all test clusters (respects SKIP_NEWEST)
+ CLEANUP_ORPHANS - delete orphaned VPCs and failed CloudFormation stacks

Note: Daily cron automatically deletes clusters older than 1 day. ''' ) string(name: 'CLUSTER_NAME', defaultValue: '', description: 'Required only for DELETE_CLUSTER') + booleanParam(name: 'SKIP_NEWEST', defaultValue: true, description: 'Skip the most recent cluster (protects in-progress builds)') } options { buildDiscarder(logRotator(numToKeepStr: '30')) + disableConcurrentBuilds() + timeout(time: 60, unit: 'MINUTES') } environment { - REGION = "us-east-2" - CLUSTER_PREFIX = "pmm-ha-test-" + REGION = 'us-east-2' + CLUSTER_PREFIX = 'pmm-ha-test-' + R53_ZONE_NAME = 'cd.percona.com' } stages { @@ -36,14 +63,14 @@ pipeline { script { if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause')) { env.ACTION = 'DELETE_OLD' - echo "Triggered by cron - will delete clusters older than 1 day." + echo 'Triggered by cron - will delete clusters older than 1 day.' } else { env.ACTION = params.ACTION echo "Manual run with ACTION=${params.ACTION}" } if (env.ACTION == 'DELETE_CLUSTER' && !params.CLUSTER_NAME) { - error("CLUSTER_NAME is required for DELETE_CLUSTER.") + error('CLUSTER_NAME is required for DELETE_CLUSTER.') } if (params.CLUSTER_NAME && !params.CLUSTER_NAME.startsWith(env.CLUSTER_PREFIX)) { error("Cluster name must start with ${env.CLUSTER_PREFIX}") @@ -56,29 +83,30 @@ pipeline { when { expression { env.ACTION == 'LIST_ONLY' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - set +x - - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \ - --output text) - - if [ -z "$CLUSTERS" ]; then - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - for c in $CLUSTERS; do - CREATED=$(aws eks describe-cluster \ - --name "$c" --region "$REGION" \ - --query "cluster.createdAt" --output text) - - CREATED_EPOCH=$(date -d "$CREATED" +%s) - AGE_HOURS=$(( ( $(date +%s) - CREATED_EPOCH ) / 3600 )) - - echo "• $c | Created: $CREATED | Age: ${AGE_HOURS}h" - done - ''' + script { + def clusters = pmmHaEks.listClusters(env.REGION) + + if (!clusters) { + echo "No clusters found with prefix '${env.CLUSTER_PREFIX}'." + return + } + + echo "Found ${clusters.size()} cluster(s):" + clusters.each { clusterName -> + def info = sh( + script: """ + CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${env.REGION} \ + --query 'cluster.createdAt' --output text) + CREATED_EPOCH=\$(date -d "\${CREATED}" +%s) + AGE_HOURS=\$(( ( \$(date +%s) - CREATED_EPOCH ) / 3600 )) + echo "\${CREATED}|\${AGE_HOURS}" + """, + returnStdout: true + ).trim() + def parts = info.split('\\|') + echo "* ${clusterName} | Created: ${parts[0]} | Age: ${parts[1]}h" + } + } } } } @@ -87,15 +115,22 @@ pipeline { when { expression { env.ACTION == 'DELETE_CLUSTER' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - if ! aws eks describe-cluster --region "${REGION}" --name "${CLUSTER_NAME}" >/dev/null 2>&1; then - echo "Cluster '${CLUSTER_NAME}' not found in region '${REGION}'." 
- exit 0 - fi - - eksctl delete cluster --region "${REGION}" --name "${CLUSTER_NAME}" \ - --disable-nodegroup-eviction --wait - ''' + script { + def clusterExists = sh( + script: "aws eks describe-cluster --region ${REGION} --name ${params.CLUSTER_NAME} >/dev/null 2>&1", + returnStatus: true + ) == 0 + + if (clusterExists) { + pmmHaEks.deleteCluster( + clusterName: params.CLUSTER_NAME, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } else { + echo "Cluster '${params.CLUSTER_NAME}' not found in region '${REGION}'." + } + } } } } @@ -104,20 +139,14 @@ pipeline { when { expression { env.ACTION == 'DELETE_ALL' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text) - - if [ -z "$CLUSTERS" ]; then - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - for c in $CLUSTERS; do - eksctl delete cluster --region "$REGION" --name "$c" \ - --disable-nodegroup-eviction --wait - done - ''' + script { + pmmHaEks.deleteAllClusters( + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME, + skipNewest: params.SKIP_NEWEST, + maxAgeHours: 0 // Delete all regardless of age + ) + } } } } @@ -126,36 +155,25 @@ pipeline { when { expression { env.ACTION == 'DELETE_OLD' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text) - - if [ -z "$CLUSTERS" ]; then - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - CUTOFF=$(date -d "1 day ago" +%s) - - for c in $CLUSTERS; do - CREATED=$(aws eks describe-cluster --name "$c" --region "$REGION" \ - --query "cluster.createdAt" --output text 2>/dev/null || true) - - if [ -z "$CREATED" ] || [ "$CREATED" == "None" ]; then - echo "Unable to fetch creation time for $c — skipping." - continue - fi - - CREATED_EPOCH=$(date -d "$CREATED" +%s) + script { + pmmHaEks.deleteAllClusters( + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME, + skipNewest: true, // Always protect newest during cron + maxAgeHours: 24 // Only delete clusters older than 24h + ) + } + } + } + } - if [ "$CREATED_EPOCH" -lt "$CUTOFF" ]; then - eksctl delete cluster --region "$REGION" --name "$c" \ - --disable-nodegroup-eviction --wait - else - echo "Skipping recent cluster: $c (created within last 24h)" - fi - done - ''' + stage('Cleanup Orphan Resources') { + when { expression { env.ACTION == 'CLEANUP_ORPHANS' } } + steps { + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + script { + pmmHaEks.cleanupOrphans(region: env.REGION) + } } } } diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 580bec6446..3e0b05edc5 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -1,26 +1,69 @@ +/** + * PMM HA EKS Test Pipeline + * + * Creates an EKS cluster with PMM High Availability deployment for testing. + * Includes ALB ingress with ACM certificate and Route53 DNS. 
+ * + * Related: + * - Cleanup: pmm3-ha-eks-cleanup.groovy + * - Shared library: vars/pmmHaEks.groovy + */ +library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([ + $class: 'GitSCMSource', + remote: 'https://github.com/Percona-Lab/jenkins-pipelines' +]) + pipeline { agent { - label 'agent-amd64-ol9' + label 'cli' + } + + options { + disableConcurrentBuilds() + timeout(time: 90, unit: 'MINUTES') } parameters { choice( name: 'K8S_VERSION', - choices: ['1.32', '1.31', '1.30', '1.29', '1.28'], + choices: ['1.32', '1.33', '1.31', '1.30', '1.29'], description: 'Select Kubernetes cluster version' ) + // PMM HA charts are not yet merged to percona/percona-helm-charts main branch. + // theTibi/PMM-14420 contains both pmm-ha and pmm-ha-dependencies charts. + // Once merged to percona main, update default to 'main' and swap repo priority. + string( + name: 'HELM_CHART_BRANCH', + defaultValue: 'PMM-14420', + description: 'Branch of percona-helm-charts repo (theTibi/PMM-14420 has both pmm-ha and pmm-ha-dependencies)' + ) + string( + name: 'PMM_IMAGE_TAG', + defaultValue: '', + description: 'PMM Server image tag (leave empty for chart default)' + ) } - environment { + environment { CLUSTER_NAME = "pmm-ha-test-${BUILD_NUMBER}" - REGION = "us-east-2" + REGION = 'us-east-2' KUBECONFIG = "${WORKSPACE}/kubeconfig/config" + PMM_NAMESPACE = 'pmm' + ACM_CERT_ARN = 'arn:aws:acm:us-east-2:119175775298:certificate/9bd3a0c8-8205-4092-8003-7304ca762143' + R53_ZONE_NAME = 'cd.percona.com' + PMM_DOMAIN = "pmm-ha-test-${BUILD_NUMBER}.${R53_ZONE_NAME}" } stages { stage('Write Cluster Config') { steps { - sh ''' + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + sh ''' + # Discover available AZs dynamically + AZS=$(aws ec2 describe-availability-zones --region "${REGION}" \ + --query 'AvailabilityZones[?State==`available`].ZoneName' \ + --output json) + cat > cluster-config.yaml <= 5) { + error('Maximum limit of 5 test clusters reached.') + } - echo "Existing clusters: $EXISTING_COUNT / 5" - ''' + echo "Cluster count: ${count} / 5" + } } } } + stage('Validate Helm Chart') { + steps { + sh ''' + set -e + echo "Validating Helm chart branch: ${HELM_CHART_BRANCH}" + + # Try theTibi fork first (has PMM-14420 with both charts), then percona repo + TIBI_REPO="https://github.com/theTibi/percona-helm-charts.git" + PERCONA_REPO="https://github.com/percona/percona-helm-charts.git" + + rm -rf charts-repo-check + if git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${TIBI_REPO}" charts-repo-check 2>/dev/null; then + echo "Found branch in: ${TIBI_REPO}" + elif git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${PERCONA_REPO}" charts-repo-check 2>/dev/null; then + echo "Found branch in: ${PERCONA_REPO}" + else + echo "ERROR: Branch '${HELM_CHART_BRANCH}' not found in theTibi or percona helm chart repos" + exit 1 + fi + + # Check required charts exist + if [ ! -d "charts-repo-check/charts/pmm-ha" ]; then + echo "ERROR: pmm-ha chart not found in branch '${HELM_CHART_BRANCH}'" + echo "Available charts:" + ls -la charts-repo-check/charts/ || true + rm -rf charts-repo-check + exit 1 + fi + + if [ ! 
-d "charts-repo-check/charts/pmm-ha-dependencies" ]; then + echo "ERROR: pmm-ha-dependencies chart not found in branch '${HELM_CHART_BRANCH}'" + echo "Available charts:" + ls -la charts-repo-check/charts/ || true + rm -rf charts-repo-check + exit 1 + fi + + echo "Helm charts validated successfully (pmm-ha + pmm-ha-dependencies)" + rm -rf charts-repo-check + ''' + } + } + stage('Create EKS Cluster') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { sh ''' eksctl create cluster -f cluster-config.yaml --timeout=40m --verbose=4 - - # Map EKSAdminRole for IAM users - eksctl create iamidentitymapping \ - --cluster "${CLUSTER_NAME}" \ - --region "${REGION}" \ - --arn arn:aws:iam::119175775298:role/EKSAdminRole \ - --username eks-admin \ - --group system:masters ''' } } } + stage('Configure Cluster Access') { + steps { + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + script { + pmmHaEks.configureAccess( + clusterName: env.CLUSTER_NAME, + region: env.REGION + ) + } + } + } + } + stage('Export kubeconfig') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { sh ''' + rm -rf kubeconfig mkdir -p kubeconfig aws eks update-kubeconfig \ @@ -129,50 +221,45 @@ EOF } } - stage('Configure GP3 Storage Class') { + stage('Setup Infrastructure') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' - - cat </dev/null || echo "pending") + + echo "============================================" + echo "Access Commands" + echo "============================================" + echo "kubectl:" + echo " aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}" + echo "" + echo "PMM UI:" + echo " https://${PMM_DOMAIN}" + echo "" + echo "ALB Hostname:" + echo " ${ALB_HOSTNAME}" + + # Save credentials to file for archiving + mkdir -p pmm-credentials + cat > pmm-credentials/access-info.txt </dev/null || echo 'unknown'", + returnStdout: true + ).trim() + + currentBuild.description = "https://${PMM_DOMAIN} | admin / ${pmmPassword} | ${chartRepo}/${HELM_CHART_BRANCH}" + + echo "Cluster ${CLUSTER_NAME} with PMM HA created successfully." + echo "PMM URL: https://${PMM_DOMAIN}" + echo 'User: admin' + echo "Password: ${pmmPassword}" + echo "Chart: ${chartRepo}/${HELM_CHART_BRANCH}" + } + } } failure { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - if eksctl get cluster \ - --region "${REGION}" \ - --name "${CLUSTER_NAME}" >/dev/null 2>&1 - then - eksctl delete cluster \ - --region "${REGION}" \ - --name "${CLUSTER_NAME}" \ - --disable-nodegroup-eviction \ - --wait - fi - ''' + script { + // Check if cluster exists before cleanup + def clusterExists = sh( + script: "eksctl get cluster --region ${REGION} --name ${CLUSTER_NAME} >/dev/null 2>&1", + returnStatus: true + ) == 0 + + if (clusterExists) { + pmmHaEks.deleteCluster( + clusterName: env.CLUSTER_NAME, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } else { + echo "Cluster ${CLUSTER_NAME} not found, nothing to clean up." + } + } } } } diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy new file mode 100644 index 0000000000..468b47b4d1 --- /dev/null +++ b/vars/pmmHaEks.groovy @@ -0,0 +1,730 @@ +/** + * PMM HA EKS Shared Library + * + * Reusable functions for PMM High Availability testing on EKS clusters. 
+ * + * Functions: + * - configureAccess() Configure EKS Access Entries (IAM roles, users, SSO) + * - setupInfrastructure() Install GP3 storage, Node Termination Handler, ALB Controller + * - installPmm() Deploy PMM HA stack (operators, secrets, helm chart) + * - createIngress() Create ALB Ingress and Route53 DNS record + * - deleteCluster() Delete Route53, ALB, and EKS cluster + * - listClusters() List PMM HA test clusters sorted by creation time (newest first) + * - deleteAllClusters() Delete all/old clusters with SKIP_NEWEST support + * - cleanupOrphans() Clean up orphaned VPCs and failed CF stacks + * + * Related: + * - Create pipeline: pmm/v3/pmm3-ha-eks.groovy + * - Cleanup pipeline: pmm/v3/pmm3-ha-eks-cleanup.groovy + */ + +// Constants +def CLUSTER_PREFIX = 'pmm-ha-test-' + +/** + * Configure EKS Access Entries for cluster authentication. + * + * Grants cluster admin access to: + * - EKSAdminRole (for automation) + * - Members of pmm-eks-admins IAM group (dynamically resolved) + * - SSO AdministratorAccess role (for console users) + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + * @param adminGroupName IAM group for admin access (default: pmm-eks-admins) + */ +def configureAccess(Map config) { + def clusterName = config.clusterName ?: error('clusterName is required') + def region = config.region ?: 'us-east-2' + def adminGroupName = config.adminGroupName ?: 'pmm-eks-admins' + + sh """ + set -euo pipefail + + CLUSTER_NAME="${clusterName}" + REGION="${region}" + + ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) + echo "AWS Account ID: \${ACCOUNT_ID}" + + # Add EKSAdminRole with cluster admin access + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + + # Add IAM group members dynamically + USERS=\$(aws iam get-group --group-name ${adminGroupName} --query 'Users[].Arn' --output text 2>/dev/null || echo "") + for USER_ARN in \${USERS}; do + echo "Adding access for \${USER_ARN}..." 
+ aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${USER_ARN}" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${USER_ARN}" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + done + + # Add SSO AdministratorAccess role (discover dynamically) + SSO_ROLE_ARN=\$(aws iam list-roles \\ + --query "Roles[?contains(RoleName, 'AWSReservedSSO_AdministratorAccess')].Arn | [0]" \\ + --output text 2>/dev/null | head -1 | tr -d '[:space:]') + + if [ -n "\${SSO_ROLE_ARN}" ] && [ "\${SSO_ROLE_ARN}" != "None" ]; then + echo "Adding SSO role: \${SSO_ROLE_ARN}" + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${SSO_ROLE_ARN}" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${SSO_ROLE_ARN}" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + else + echo "No SSO AdministratorAccess role found, skipping" + fi + + echo "Access entries configured:" + aws eks list-access-entries --cluster-name "\${CLUSTER_NAME}" --region "\${REGION}" + """ +} + +/** + * Setup EKS infrastructure components for PMM HA. + * + * Installs and configures: + * - GP3 storage class (encrypted, default) + * - AWS Node Termination Handler (for spot instance draining) + * - AWS Load Balancer Controller (for ALB ingress) + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + */ +def setupInfrastructure(Map config) { + def clusterName = config.clusterName ?: error('clusterName is required') + def region = config.region ?: 'us-east-2' + + sh """ + set -euo pipefail + + CLUSTER_NAME="${clusterName}" + REGION="${region}" + + ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) + echo "AWS Account ID: \${ACCOUNT_ID}" + + # Configure GP3 as default storage class + kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true + + cat </dev/null; then + echo "Cloned from: \${TIBI_REPO}" + echo "theTibi" > .chart-repo-source + elif git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${PERCONA_REPO}" charts-repo 2>/dev/null; then + echo "Cloned from: \${PERCONA_REPO}" + echo "percona" > .chart-repo-source + else + echo "ERROR: Branch \${HELM_CHART_BRANCH} not found in either repository" + exit 1 + fi + + # Add required Helm repos + helm repo add percona https://percona.github.io/percona-helm-charts/ || true + helm repo add vm https://victoriametrics.github.io/helm-charts/ || true + helm repo add altinity https://docs.altinity.com/helm-charts/ || true + helm repo update + + # Install PMM HA dependencies (operators) + helm dependency update charts-repo/charts/pmm-ha-dependencies + helm upgrade --install pmm-operators charts-repo/charts/pmm-ha-dependencies \\ + --namespace "\${PMM_NAMESPACE}" \\ + --create-namespace \\ + --wait \\ + --timeout 10m + + echo "Waiting for operators to be ready..." 
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=victoria-metrics-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=altinity-clickhouse-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=pg-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + + # Generate passwords + PMM_ADMIN_PASSWORD=\$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9' | head -c 16) + PG_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + GF_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + CH_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + VM_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + + # Pre-create pmm-secret before helm install + # The chart's pg-user-credentials-secrets.yaml uses lookup() at template time + # GF_SECURITY_ADMIN_PASSWORD is needed because with secret.create=false, + # the chart doesn't explicitly set this env var (only secretRef is used) + kubectl create secret generic pmm-secret \\ + --namespace "\${PMM_NAMESPACE}" \\ + --from-literal=PMM_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\ + --from-literal=GF_SECURITY_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\ + --from-literal=PG_PASSWORD="\${PG_PASSWORD}" \\ + --from-literal=GF_PASSWORD="\${GF_PASSWORD}" \\ + --from-literal=PMM_CLICKHOUSE_USER="clickhouse_pmm" \\ + --from-literal=PMM_CLICKHOUSE_PASSWORD="\${CH_PASSWORD}" \\ + --from-literal=VMAGENT_remoteWrite_basicAuth_username="victoriametrics_pmm" \\ + --from-literal=VMAGENT_remoteWrite_basicAuth_password="\${VM_PASSWORD}" \\ + --dry-run=client -o yaml | kubectl apply -f - + + helm dependency update charts-repo/charts/pmm-ha + + HELM_CMD="helm upgrade --install pmm-ha charts-repo/charts/pmm-ha" + HELM_CMD="\${HELM_CMD} --namespace \${PMM_NAMESPACE}" + HELM_CMD="\${HELM_CMD} --set secret.create=false" + HELM_CMD="\${HELM_CMD} --set secret.name=pmm-secret" + # Increase ClickHouse memory for merge operations (default 4Gi is insufficient) + HELM_CMD="\${HELM_CMD} --set clickhouse.resources.requests.memory=4Gi" + HELM_CMD="\${HELM_CMD} --set clickhouse.resources.limits.memory=10Gi" + if [ -n "\${PMM_IMAGE_TAG}" ]; then + HELM_CMD="\${HELM_CMD} --set image.tag=\${PMM_IMAGE_TAG}" + fi + HELM_CMD="\${HELM_CMD} --wait --timeout 15m" + + eval "\${HELM_CMD}" + + echo "Waiting for PMM HA components..." + kubectl rollout status statefulset/pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true + kubectl wait --for=condition=ready pod -l clickhouse.altinity.com/chi=pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmselect -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmstorage -n "\${PMM_NAMESPACE}" --timeout=300s || true + + echo "PMM HA installed" + kubectl get pods -n "\${PMM_NAMESPACE}" + """ +} + +/** + * Create ALB Ingress and Route53 DNS record for PMM HA. + * + * Creates: + * - ALB Ingress with ACM certificate (HTTPS) + * - Route53 alias record pointing to ALB + * + * Waits up to 5 minutes for ALB provisioning. 
+ * + * @param namespace Kubernetes namespace (default: pmm) + * @param domain FQDN for PMM access (required) + * @param certArn ACM certificate ARN for TLS (required) + * @param r53ZoneName Route53 hosted zone name (required, e.g., cd.percona.com) + * @param region AWS region (default: us-east-2) + */ +def createIngress(Map config) { + def namespace = config.namespace ?: 'pmm' + def domain = config.domain ?: error('domain is required') + def certArn = config.certArn ?: error('certArn is required') + def r53ZoneName = config.r53ZoneName ?: error('r53ZoneName is required') + def region = config.region ?: 'us-east-2' + + sh """ + set -euo pipefail + + PMM_NAMESPACE="${namespace}" + PMM_DOMAIN="${domain}" + ACM_CERT_ARN="${certArn}" + R53_ZONE_NAME="${r53ZoneName}" + REGION="${region}" + + # Resolve Route53 zone ID from zone name (public zones only, exact match) + R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\ + --dns-name "\${R53_ZONE_NAME}" \\ + --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\ + --output text | sed 's|/hostedzone/||g') + + # Validate we got exactly one zone + zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ') + if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then + echo "ERROR: No public Route53 zone found for \${R53_ZONE_NAME}" + exit 1 + elif [ "\${zone_count}" -gt 1 ]; then + echo "ERROR: Multiple public Route53 zones found for \${R53_ZONE_NAME}: \${R53_ZONE_IDS}" + exit 1 + fi + R53_ZONE_ID="\${R53_ZONE_IDS}" + echo "Resolved Route53 zone ID: \${R53_ZONE_ID}" + + # Create ALB Ingress + cat </dev/null || echo "") + if [ -n "\${ALB_HOSTNAME}" ]; then + echo "ALB provisioned: \${ALB_HOSTNAME}" + break + fi + echo "Waiting for ALB... (\${attempt}/30)" + sleep 10 + done + + if [ -z "\${ALB_HOSTNAME}" ]; then + echo "WARNING: ALB not provisioned within timeout" + kubectl describe ingress pmm-ha-alb -n "\${PMM_NAMESPACE}" + exit 1 + fi + + ALB_ZONE_ID=\$(aws elbv2 describe-load-balancers --region "\${REGION}" \\ + --query "LoadBalancers[?DNSName=='\${ALB_HOSTNAME}'].CanonicalHostedZoneId" \\ + --output text) + + if [ -n "\${ALB_ZONE_ID}" ]; then + aws route53 change-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --change-batch '{ + "Changes": [{ + "Action": "UPSERT", + "ResourceRecordSet": { + "Name": "'"\${PMM_DOMAIN}"'", + "Type": "A", + "AliasTarget": { + "HostedZoneId": "'"\${ALB_ZONE_ID}"'", + "DNSName": "'"\${ALB_HOSTNAME}"'", + "EvaluateTargetHealth": true + } + } + }] + }' + echo "Route53 record created: \${PMM_DOMAIN} -> \${ALB_HOSTNAME}" + else + echo "WARNING: Could not get ALB zone ID, skipping Route53 record" + fi + """ +} + +/** + * Delete PMM HA EKS cluster and all associated AWS resources. + * + * Cleanup order (to avoid dependency errors): + * 1. Route53 alias record + * 2. ALB Ingress (triggers ALB deletion) + * 3. 
EKS cluster via eksctl + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com) + */ +def deleteCluster(Map config) { + def clusterName = config.clusterName ?: error('clusterName is required') + def region = config.region ?: 'us-east-2' + def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com' + + sh """ + set -euo pipefail + + cluster_name="${clusterName}" + REGION="${region}" + R53_ZONE_NAME="${r53ZoneName}" + + # Resolve Route53 zone ID from zone name (public zones only, exact match) + R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\ + --dns-name "\${R53_ZONE_NAME}" \\ + --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\ + --output text | sed 's|/hostedzone/||g') + + # Validate we got exactly one zone + zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ') + if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then + echo "WARNING: No public Route53 zone found for \${R53_ZONE_NAME}, skipping DNS cleanup" + R53_ZONE_ID="" + elif [ "\${zone_count}" -gt 1 ]; then + echo "WARNING: Multiple public Route53 zones found for \${R53_ZONE_NAME}, skipping DNS cleanup" + R53_ZONE_ID="" + else + R53_ZONE_ID="\${R53_ZONE_IDS}" + echo "Resolved Route53 zone ID: \${R53_ZONE_ID}" + fi + + echo "============================================" + echo "Cleaning up cluster: \${cluster_name}" + echo "============================================" + + # Delete Route53 record (if zone was resolved) + domain_name="\${cluster_name}.\${R53_ZONE_NAME}" + if [ -n "\${R53_ZONE_ID}" ]; then + echo "Deleting Route53 record for \${domain_name}..." + record=\$(aws route53 list-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --query "ResourceRecordSets[?Name=='\${domain_name}.']" \\ + --output json 2>/dev/null || echo "[]") + + if [ "\${record}" != "[]" ] && [ -n "\${record}" ]; then + record_type=\$(echo "\${record}" | jq -r '.[0].Type') + if [ "\${record_type}" = "A" ]; then + alias_target=\$(echo "\${record}" | jq -r '.[0].AliasTarget') + aws route53 change-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --change-batch '{ + "Changes": [{ + "Action": "DELETE", + "ResourceRecordSet": { + "Name": "'"\${domain_name}"'", + "Type": "A", + "AliasTarget": '"\${alias_target}"' + } + }] + }' && echo "Route53 record deleted" || echo "Warning: Failed to delete Route53 record" + fi + else + echo "No Route53 record found for \${domain_name}" + fi + else + echo "Skipping Route53 record deletion (zone not resolved)" + fi + + # Delete ALB ingress (triggers ALB deletion) + echo "Deleting ALB ingress..." + if aws eks update-kubeconfig --name "\${cluster_name}" --region "\${REGION}" 2>/dev/null; then + kubectl delete ingress pmm-ha-alb -n pmm --ignore-not-found=true + fi + + # Wait for ALB cleanup + echo "Waiting for ALB cleanup..." + sleep 30 + + # Disable termination protection on all CloudFormation stacks for this cluster + echo "Disabling termination protection on CloudFormation stacks..." 
+ for stack_name in \$(aws cloudformation list-stacks --region "\${REGION}" \\ + --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \\ + --query "StackSummaries[?starts_with(StackName, 'eksctl-\${cluster_name}')].StackName" \\ + --output text 2>/dev/null); do + echo " Disabling protection: \${stack_name}" + aws cloudformation update-termination-protection \\ + --region "\${REGION}" \\ + --stack-name "\${stack_name}" \\ + --no-enable-termination-protection 2>/dev/null || true + done + + echo "Deleting EKS cluster \${cluster_name}..." + eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\ + --disable-nodegroup-eviction --wait + """ +} + +/** + * List PMM HA test clusters sorted by creation time (newest first). + * + * @param region AWS region (default: us-east-2) + * @return List of cluster names sorted newest first, empty list if none found + */ +def listClusters(String region = 'us-east-2') { + def clusterPrefix = 'pmm-ha-test-' + + // Get all clusters matching prefix, then describe each and sort by createdAt (newest first) + // Output: one cluster name per line, sorted by creation time descending + def output = sh( + script: """ + aws eks list-clusters --region ${region} --output json 2>/dev/null | \\ + jq -r '.clusters[] | select(startswith("${clusterPrefix}"))' | \\ + while read cluster; do + CREATED=\$(aws eks describe-cluster --name "\$cluster" --region ${region} \\ + --query 'cluster.createdAt' --output text 2>/dev/null) + [ -n "\$CREATED" ] && [ "\$CREATED" != "None" ] && echo "\$CREATED|\$cluster" + done | sort -r | cut -d'|' -f2 + """, + returnStdout: true + ).trim() + + if (!output) { + return [] + } + + return output.split('\n').findAll { it } +} + +/** + * Delete multiple clusters with optional SKIP_NEWEST and age filter. + * + * Supports parallel deletion for faster cleanup. + * + * @param region AWS region (default: us-east-2) + * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com) + * @param skipNewest Skip the most recent cluster (default: true) + * @param maxAgeHours Only delete clusters older than this (0 = delete all, default: 0) + */ +def deleteAllClusters(Map config = [:]) { + def region = config.region ?: 'us-east-2' + def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com' + def skipNewest = config.skipNewest != null ? config.skipNewest : true + def maxAgeHours = config.maxAgeHours ?: 0 + + def clusterList = listClusters(region) + + if (!clusterList) { + echo "No clusters found with prefix 'pmm-ha-test-'." 
+ return + } + + def clustersToDelete = clusterList + if (skipNewest && clusterList.size() > 1) { + // clusterList is sorted newest first, so skip first element + clustersToDelete = clusterList.drop(1) + echo "Skipping newest cluster: ${clusterList[0]} (SKIP_NEWEST=true)" + } + + // Filter by age if maxAgeHours > 0 + if (maxAgeHours > 0) { + def cutoffMs = System.currentTimeMillis() - (maxAgeHours * 60 * 60 * 1000) + def filtered = [] + + clustersToDelete.each { clusterName -> + def createdMs = sh( + script: """ + CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${region} \\ + --query 'cluster.createdAt' --output text 2>/dev/null || echo '') + if [ -z "\$CREATED" ] || [ "\$CREATED" = "None" ]; then + echo "" + else + date -d "\$CREATED" +%s000 2>/dev/null || echo "" + fi + """, + returnStdout: true + ).trim() + + if (createdMs && createdMs.toLong() < cutoffMs) { + filtered.add(clusterName) + } else { + echo "Skipping recent cluster: ${clusterName} (created within last ${maxAgeHours}h)" + } + } + clustersToDelete = filtered + } + + if (!clustersToDelete) { + echo 'No clusters to delete after applying filters.' + return + } + + // Delete clusters in parallel + def parallelStages = [:] + clustersToDelete.each { clusterName -> + parallelStages["Delete ${clusterName}"] = { + deleteCluster( + clusterName: clusterName, + region: region, + r53ZoneName: r53ZoneName + ) + } + } + parallel parallelStages +} + +/** + * Clean up orphaned VPCs and failed CloudFormation stacks. + * + * Finds: + * - VPCs with eksctl-pmm-ha-test-* tags but no matching EKS cluster + * - CloudFormation stacks in DELETE_FAILED or ROLLBACK_COMPLETE state + * + * @param region AWS region (default: us-east-2) + */ +def cleanupOrphans(Map config = [:]) { + def region = config.region ?: 'us-east-2' + def clusterPrefix = 'pmm-ha-test-' + + // Get list of active EKS clusters + def activeClusters = sh( + script: """ + aws eks list-clusters --region ${region} \\ + --query "clusters[?starts_with(@, '${clusterPrefix}')]" \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim().split(/\s+/).findAll { it } + + echo "Active EKS clusters: ${activeClusters}" + + // Find orphaned VPCs (VPCs without matching EKS cluster) + def orphanedVpcs = sh( + script: """ + aws ec2 describe-vpcs --region ${region} \\ + --filters "Name=tag:Name,Values=eksctl-${clusterPrefix}*-cluster/VPC" \\ + --query 'Vpcs[*].[VpcId,Tags[?Key==`Name`].Value|[0]]' \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim() + + if (orphanedVpcs) { + orphanedVpcs.split('\n').each { line -> + def parts = line.split('\t') + if (parts.size() >= 2) { + def vpcId = parts[0] + def vpcName = parts[1] + // Extract cluster name from VPC name (eksctl-pmm-ha-test-XX-cluster/VPC) + def matcher = vpcName =~ /eksctl-(${clusterPrefix}\d+)-cluster/ + if (matcher) { + def clusterName = matcher[0][1] + if (!activeClusters.contains(clusterName)) { + echo "Found orphaned VPC: ${vpcId} (${vpcName}) - cluster ${clusterName} does not exist" + // Delete VPC using eksctl (handles all dependencies) + sh """ + eksctl delete cluster --name ${clusterName} --region ${region} --wait=false 2>/dev/null || true + """ + } + } + } + } + } else { + echo 'No orphaned VPCs found.' 
+ } + + // Find and delete failed CloudFormation stacks + def failedStacks = sh( + script: """ + aws cloudformation list-stacks --region ${region} \\ + --stack-status-filter DELETE_FAILED ROLLBACK_COMPLETE \\ + --query "StackSummaries[?contains(StackName, '${clusterPrefix}')].StackName" \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim() + + if (failedStacks) { + failedStacks.split(/\s+/).each { stackName -> + echo "Deleting failed stack: ${stackName}" + sh "aws cloudformation delete-stack --region ${region} --stack-name ${stackName} || true" + } + } else { + echo 'No failed CloudFormation stacks found.' + } +}
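
For reference, a minimal scripted-pipeline sketch of driving the pmmHaEks helpers introduced above, end to end. The library call, credentials ID, agent label, region, and Route53 zone name are taken from the pipelines in this change; acting on the newest listed cluster and the standalone kubeconfig refresh are illustrative assumptions, not something these jobs do verbatim.

library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
    $class: 'GitSCMSource',
    remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
])

node('cli') {
    withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
        // Newest-first list of pmm-ha-test-* clusters
        def clusters = pmmHaEks.listClusters('us-east-2')
        echo "Found ${clusters.size()} test cluster(s)"

        if (clusters) {
            def target = clusters[0]   // illustrative: operate on the newest cluster

            // Grant EKS Access Entries, then point kubectl at the cluster
            pmmHaEks.configureAccess(clusterName: target, region: 'us-east-2')
            sh "aws eks update-kubeconfig --name ${target} --region us-east-2"

            // GP3 default storage class, Node Termination Handler, ALB Controller
            pmmHaEks.setupInfrastructure(clusterName: target, region: 'us-east-2')
        }

        // Cron-style sweep: delete clusters older than 24h, always keeping the newest
        pmmHaEks.deleteAllClusters(
            region: 'us-east-2',
            r53ZoneName: 'cd.percona.com',
            skipNewest: true,
            maxAgeHours: 24
        )
    }
}

A standalone orphan sweep follows the same pattern with pmmHaEks.cleanupOrphans(region: 'us-east-2') inside the credential block.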