From b38da5ee5bd7ed5d3493451afdffe93d9a602b9c Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 00:27:57 +0100 Subject: [PATCH 01/19] feat(pmm-ha): add EKS testing pipeline with ALB, Route53, and Access Entries - Add AWS Load Balancer Controller with IRSA for ALB ingress - Add ALB Ingress with ACM certificate (*.cd.percona.com wildcard) - Add Route53 alias records for friendly URLs (pmm-ha-test-N.cd.percona.com) - Replace ConfigMap-based auth with EKS Access Entries API - Add pmm-eks-admins IAM group for kubectl access - Add SSO AdministratorAccess role support - Add cleanup job with Route53/ALB cleanup before cluster deletion - Extract shared library vars/pmmHaEks.groovy for reusable functions Jira: PMM-14346 --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 166 ++++++---- pmm/v3/pmm3-ha-eks.groovy | 305 ++++++++++++++---- vars/pmmHaEks.groovy | 515 ++++++++++++++++++++++++++++++ 3 files changed, 856 insertions(+), 130 deletions(-) create mode 100644 vars/pmmHaEks.groovy diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index cada78b616..dfb74ee79b 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -1,10 +1,31 @@ +/** + * PMM HA EKS Cleanup Pipeline + * + * Manages cleanup of PMM HA test clusters. Supports manual and scheduled runs. + * Deletes Route53 records, ALB ingress, and EKS clusters. + * + * Actions: + * - LIST_ONLY: List all test clusters with age + * - DELETE_CLUSTER: Delete a specific cluster + * - DELETE_ALL: Delete all test clusters + * - DELETE_OLD (cron): Delete clusters older than 24 hours + * + * Related: + * - Create: pmm3-ha-eks.groovy + * - Shared library: vars/pmmHaEks.groovy + */ +library changelog: false, identifier: 'lib@master', retriever: modernSCM([ + $class: 'GitSCMSource', + remote: 'https://github.com/Percona-Lab/jenkins-pipelines' +]) + pipeline { agent { - label 'agent-amd64-ol9' + label 'cli' } triggers { - cron('H 0,12 * * *') // Runs twice daily at 00:00 & 12:00 + cron('H 0,12 * * *') } parameters { @@ -23,11 +44,14 @@ pipeline { options { buildDiscarder(logRotator(numToKeepStr: '30')) + disableConcurrentBuilds() + timeout(time: 60, unit: 'MINUTES') } environment { - REGION = "us-east-2" - CLUSTER_PREFIX = "pmm-ha-test-" + REGION = 'us-east-2' + CLUSTER_PREFIX = 'pmm-ha-test-' + R53_ZONE_NAME = 'cd.percona.com' } stages { @@ -36,14 +60,14 @@ pipeline { script { if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause')) { env.ACTION = 'DELETE_OLD' - echo "Triggered by cron - will delete clusters older than 1 day." + echo 'Triggered by cron - will delete clusters older than 1 day.' } else { env.ACTION = params.ACTION echo "Manual run with ACTION=${params.ACTION}" } if (env.ACTION == 'DELETE_CLUSTER' && !params.CLUSTER_NAME) { - error("CLUSTER_NAME is required for DELETE_CLUSTER.") + error('CLUSTER_NAME is required for DELETE_CLUSTER.') } if (params.CLUSTER_NAME && !params.CLUSTER_NAME.startsWith(env.CLUSTER_PREFIX)) { error("Cluster name must start with ${env.CLUSTER_PREFIX}") @@ -59,24 +83,24 @@ pipeline { sh ''' set +x - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ + CLUSTERS=$(aws eks list-clusters --region "${REGION}" \ --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \ --output text) - if [ -z "$CLUSTERS" ]; then + if [ -z "${CLUSTERS}" ]; then echo "No clusters found with prefix '${CLUSTER_PREFIX}'." 
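The LIST_ONLY age report in this stage is plain epoch arithmetic. A minimal sketch of the same check as a reusable step, assuming GNU date and a configured AWS CLI on the agent (the helper name is illustrative, not part of the patch):

def clusterAgeHours(String clusterName, String region) {
    // Sketch only: reproduces the LIST_ONLY age math for a single cluster.
    return sh(
        script: """
            CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${region} \\
                --query 'cluster.createdAt' --output text)
            echo \$(( ( \$(date +%s) - \$(date -d "\${CREATED}" +%s) ) / 3600 ))
        """,
        returnStdout: true
    ).trim().toInteger()
}

// Example: skip anything younger than a day.
// if (clusterAgeHours(params.CLUSTER_NAME, env.REGION) < 24) { echo 'too new, skipping' }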
exit 0 fi - for c in $CLUSTERS; do + for cluster in ${CLUSTERS}; do CREATED=$(aws eks describe-cluster \ - --name "$c" --region "$REGION" \ + --name "${cluster}" --region "${REGION}" \ --query "cluster.createdAt" --output text) - CREATED_EPOCH=$(date -d "$CREATED" +%s) + CREATED_EPOCH=$(date -d "${CREATED}" +%s) AGE_HOURS=$(( ( $(date +%s) - CREATED_EPOCH ) / 3600 )) - echo "• $c | Created: $CREATED | Age: ${AGE_HOURS}h" + echo "* ${cluster} | Created: ${CREATED} | Age: ${AGE_HOURS}h" done ''' } @@ -87,15 +111,22 @@ pipeline { when { expression { env.ACTION == 'DELETE_CLUSTER' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - if ! aws eks describe-cluster --region "${REGION}" --name "${CLUSTER_NAME}" >/dev/null 2>&1; then - echo "Cluster '${CLUSTER_NAME}' not found in region '${REGION}'." - exit 0 - fi - - eksctl delete cluster --region "${REGION}" --name "${CLUSTER_NAME}" \ - --disable-nodegroup-eviction --wait - ''' + script { + def clusterExists = sh( + script: "aws eks describe-cluster --region ${REGION} --name ${params.CLUSTER_NAME} >/dev/null 2>&1", + returnStatus: true + ) == 0 + + if (clusterExists) { + pmmHaEks.deleteCluster( + clusterName: params.CLUSTER_NAME, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } else { + echo "Cluster '${params.CLUSTER_NAME}' not found in region '${REGION}'." + } + } } } } @@ -104,20 +135,25 @@ pipeline { when { expression { env.ACTION == 'DELETE_ALL' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text) + script { + def clusters = sh( + script: "aws eks list-clusters --region ${REGION} --query \"clusters[?starts_with(@, '${CLUSTER_PREFIX}')]\" --output text", + returnStdout: true + ).trim() - if [ -z "$CLUSTERS" ]; then + if (!clusters) { echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - for c in $CLUSTERS; do - eksctl delete cluster --region "$REGION" --name "$c" \ - --disable-nodegroup-eviction --wait - done - ''' + return + } + + clusters.split(/\s+/).each { clusterName -> + pmmHaEks.deleteCluster( + clusterName: clusterName, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } + } } } } @@ -126,36 +162,44 @@ pipeline { when { expression { env.ACTION == 'DELETE_OLD' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - CLUSTERS=$(aws eks list-clusters --region "$REGION" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text) + script { + def clusters = sh( + script: "aws eks list-clusters --region ${REGION} --query \"clusters[?starts_with(@, '${CLUSTER_PREFIX}')]\" --output text", + returnStdout: true + ).trim() - if [ -z "$CLUSTERS" ]; then + if (!clusters) { echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - CUTOFF=$(date -d "1 day ago" +%s) - - for c in $CLUSTERS; do - CREATED=$(aws eks describe-cluster --name "$c" --region "$REGION" \ - --query "cluster.createdAt" --output text 2>/dev/null || true) - - if [ -z "$CREATED" ] || [ "$CREATED" == "None" ]; then - echo "Unable to fetch creation time for $c — skipping." 
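For readers new to the shared-library style used by these deletion stages: Groovy collects the key: value arguments of a call such as pmmHaEks.deleteCluster(clusterName: ..., region: ...) into a single Map, and the library defaults any omitted keys. A simplified sketch of the receiving side (the real function also performs the AWS cleanup):

def deleteCluster(Map config) {
    // Required argument: fail fast with a clear message if the caller omits it.
    def clusterName = config.clusterName ?: error('clusterName is required')
    // Optional arguments fall back to the defaults used throughout these pipelines.
    def region      = config.region      ?: 'us-east-2'
    def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com'
    echo "would delete ${clusterName} in ${region} (zone ${r53ZoneName})"
}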
- continue - fi - - CREATED_EPOCH=$(date -d "$CREATED" +%s) - - if [ "$CREATED_EPOCH" -lt "$CUTOFF" ]; then - eksctl delete cluster --region "$REGION" --name "$c" \ - --disable-nodegroup-eviction --wait - else - echo "Skipping recent cluster: $c (created within last 24h)" - fi - done - ''' + return + } + + def cutoffMs = System.currentTimeMillis() - (24 * 60 * 60 * 1000) // 1 day ago + + clusters.split(/\s+/).each { clusterName -> + def createdAt = sh( + script: "aws eks describe-cluster --name ${clusterName} --region ${REGION} --query 'cluster.createdAt' --output text 2>/dev/null || echo ''", + returnStdout: true + ).trim() + + if (!createdAt || createdAt == 'None') { + echo "Unable to fetch creation time for ${clusterName} - skipping." + return // continue to next iteration + } + + // Parse ISO 8601 timestamp + def createdMs = Date.parse("yyyy-MM-dd'T'HH:mm:ss", createdAt.take(19)).time + + if (createdMs < cutoffMs) { + pmmHaEks.deleteCluster( + clusterName: clusterName, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } else { + echo "Skipping recent cluster: ${clusterName} (created within last 24h)" + } + } + } } } } diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 580bec6446..02ea2c3fb3 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -1,20 +1,57 @@ +/** + * PMM HA EKS Test Pipeline + * + * Creates an EKS cluster with PMM High Availability deployment for testing. + * Includes ALB ingress with ACM certificate and Route53 DNS. + * + * Related: + * - Cleanup: pmm3-ha-eks-cleanup.groovy + * - Shared library: vars/pmmHaEks.groovy + */ +library changelog: false, identifier: 'lib@master', retriever: modernSCM([ + $class: 'GitSCMSource', + remote: 'https://github.com/Percona-Lab/jenkins-pipelines' +]) + pipeline { agent { label 'agent-amd64-ol9' } + options { + disableConcurrentBuilds() + timeout(time: 90, unit: 'MINUTES') + } + parameters { choice( name: 'K8S_VERSION', - choices: ['1.32', '1.31', '1.30', '1.29', '1.28'], + choices: ['1.32', '1.33', '1.34', '1.31', '1.30', '1.29'], description: 'Select Kubernetes cluster version' ) + // PMM HA charts are not yet merged to percona/percona-helm-charts main branch. + // theTibi/PMM-14420 contains both pmm-ha and pmm-ha-dependencies charts. + // Once merged to percona main, update default to 'main' and swap repo priority. 
+ string( + name: 'HELM_CHART_BRANCH', + defaultValue: 'PMM-14420', + description: 'Branch of percona-helm-charts repo (theTibi/PMM-14420 has both pmm-ha and pmm-ha-dependencies)' + ) + string( + name: 'PMM_IMAGE_TAG', + defaultValue: '', + description: 'PMM Server image tag (leave empty for chart default)' + ) } - environment { + environment { CLUSTER_NAME = "pmm-ha-test-${BUILD_NUMBER}" - REGION = "us-east-2" + REGION = 'us-east-2' KUBECONFIG = "${WORKSPACE}/kubeconfig/config" + PMM_NAMESPACE = 'pmm' + ACM_CERT_ARN = 'arn:aws:acm:us-east-2:119175775298:certificate/9bd3a0c8-8205-4092-8003-7304ca762143' + R53_ZONE_NAME = 'cd.percona.com' + PMM_DOMAIN = "pmm-ha-test-${BUILD_NUMBER}.${R53_ZONE_NAME}" } stages { @@ -35,6 +72,9 @@ metadata: build-number: "${BUILD_NUMBER}" purpose: "pmm-ha-testing" +accessConfig: + authenticationMode: API + iam: withOIDC: true @@ -55,7 +95,7 @@ managedNodeGroups: spot: true minSize: 2 maxSize: 5 - desiredCapacity: 3 + desiredCapacity: 4 tags: iit-billing-tag: "pmm" nodegroup: "spot" @@ -75,14 +115,14 @@ EOF EXISTING_CLUSTERS=$(aws eks list-clusters --region "${REGION}" \ --query "clusters[?starts_with(@, 'pmm-ha-test-')]" --output text) - if [ -z "$EXISTING_CLUSTERS" ]; then + if [ -z "${EXISTING_CLUSTERS}" ]; then EXISTING_COUNT=0 else - EXISTING_COUNT=$(echo "$EXISTING_CLUSTERS" | wc -w) - echo "$EXISTING_CLUSTERS" | tr '\\t' '\\n' + EXISTING_COUNT=$(echo "${EXISTING_CLUSTERS}" | wc -w) + echo "${EXISTING_CLUSTERS}" | tr '\\t' '\\n' fi - if [ "$EXISTING_COUNT" -ge 5 ]; then + if [ "${EXISTING_COUNT}" -ge 5 ]; then echo "ERROR: Maximum limit of 5 test clusters reached." exit 1 fi @@ -93,24 +133,72 @@ EOF } } + stage('Validate Helm Chart') { + steps { + sh ''' + set -e + echo "Validating Helm chart branch: ${HELM_CHART_BRANCH}" + + # Try theTibi fork first (has PMM-14420 with both charts), then percona repo + TIBI_REPO="https://github.com/theTibi/percona-helm-charts.git" + PERCONA_REPO="https://github.com/percona/percona-helm-charts.git" + + rm -rf charts-repo-check + if git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${TIBI_REPO}" charts-repo-check 2>/dev/null; then + echo "Found branch in: ${TIBI_REPO}" + elif git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${PERCONA_REPO}" charts-repo-check 2>/dev/null; then + echo "Found branch in: ${PERCONA_REPO}" + else + echo "ERROR: Branch '${HELM_CHART_BRANCH}' not found in theTibi or percona helm chart repos" + exit 1 + fi + + # Check required charts exist + if [ ! -d "charts-repo-check/charts/pmm-ha" ]; then + echo "ERROR: pmm-ha chart not found in branch '${HELM_CHART_BRANCH}'" + echo "Available charts:" + ls -la charts-repo-check/charts/ || true + rm -rf charts-repo-check + exit 1 + fi + + if [ ! 
-d "charts-repo-check/charts/pmm-ha-dependencies" ]; then + echo "ERROR: pmm-ha-dependencies chart not found in branch '${HELM_CHART_BRANCH}'" + echo "Available charts:" + ls -la charts-repo-check/charts/ || true + rm -rf charts-repo-check + exit 1 + fi + + echo "Helm charts validated successfully (pmm-ha + pmm-ha-dependencies)" + rm -rf charts-repo-check + ''' + } + } + stage('Create EKS Cluster') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { sh ''' eksctl create cluster -f cluster-config.yaml --timeout=40m --verbose=4 - - # Map EKSAdminRole for IAM users - eksctl create iamidentitymapping \ - --cluster "${CLUSTER_NAME}" \ - --region "${REGION}" \ - --arn arn:aws:iam::119175775298:role/EKSAdminRole \ - --username eks-admin \ - --group system:masters ''' } } } + stage('Configure Cluster Access') { + steps { + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + script { + pmmHaEks.configureAccess( + clusterName: env.CLUSTER_NAME, + region: env.REGION + ) + } + } + } + } + stage('Export kubeconfig') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { @@ -129,50 +217,45 @@ EOF } } - stage('Configure GP3 Storage Class') { + stage('Setup Infrastructure') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' - - cat </dev/null || echo "pending") + + echo "============================================" + echo "Access Commands" + echo "============================================" + echo "kubectl:" + echo " aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}" + echo "" + echo "PMM UI:" + echo " https://${PMM_DOMAIN}" + echo "" + echo "ALB Hostname:" + echo " ${ALB_HOSTNAME}" + + # Save credentials to file for archiving + mkdir -p pmm-credentials + cat > pmm-credentials/access-info.txt </dev/null || echo 'unknown'", + returnStdout: true + ).trim() + + currentBuild.description = "https://${PMM_DOMAIN} | admin / ${pmmPassword} | ${chartRepo}/${HELM_CHART_BRANCH}" + + echo "Cluster ${CLUSTER_NAME} with PMM HA created successfully." + echo "PMM URL: https://${PMM_DOMAIN}" + echo 'User: admin' + echo "Password: ${pmmPassword}" + echo "Chart: ${chartRepo}/${HELM_CHART_BRANCH}" + } + } } failure { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - if eksctl get cluster \ - --region "${REGION}" \ - --name "${CLUSTER_NAME}" >/dev/null 2>&1 - then - eksctl delete cluster \ - --region "${REGION}" \ - --name "${CLUSTER_NAME}" \ - --disable-nodegroup-eviction \ - --wait - fi - ''' + script { + // Check if cluster exists before cleanup + def clusterExists = sh( + script: "eksctl get cluster --region ${REGION} --name ${CLUSTER_NAME} >/dev/null 2>&1", + returnStatus: true + ) == 0 + + if (clusterExists) { + pmmHaEks.deleteCluster( + clusterName: env.CLUSTER_NAME, + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME + ) + } else { + echo "Cluster ${CLUSTER_NAME} not found, nothing to clean up." + } + } } } } diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy new file mode 100644 index 0000000000..3490732c4e --- /dev/null +++ b/vars/pmmHaEks.groovy @@ -0,0 +1,515 @@ +/** + * PMM HA EKS Shared Library + * + * Reusable functions for PMM High Availability testing on EKS clusters. 
+ * + * Functions: + * - configureAccess() Configure EKS Access Entries (IAM roles, users, SSO) + * - setupInfrastructure() Install GP3 storage, Node Termination Handler, ALB Controller + * - installPmm() Deploy PMM HA stack (operators, secrets, helm chart) + * - createIngress() Create ALB Ingress and Route53 DNS record + * - deleteCluster() Delete Route53, ALB, and EKS cluster + * + * Related: + * - Create pipeline: pmm/v3/pmm3-ha-eks.groovy + * - Cleanup pipeline: pmm/v3/pmm3-ha-eks-cleanup.groovy + */ + +/** + * Configure EKS Access Entries for cluster authentication. + * + * Grants cluster admin access to: + * - EKSAdminRole (for automation) + * - Members of pmm-eks-admins IAM group (dynamically resolved) + * - SSO AdministratorAccess role (for console users) + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + * @param accountId AWS account ID (default: 119175775298) + * @param adminGroupName IAM group for admin access (default: pmm-eks-admins) + */ +def configureAccess(Map config) { + def clusterName = config.clusterName ?: error("clusterName is required") + def region = config.region ?: 'us-east-2' + def accountId = config.accountId ?: '119175775298' + def adminGroupName = config.adminGroupName ?: 'pmm-eks-admins' + + sh """ + set -euo pipefail + + CLUSTER_NAME="${clusterName}" + REGION="${region}" + ACCOUNT_ID="${accountId}" + + # Add EKSAdminRole with cluster admin access + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + + # Add IAM group members dynamically + USERS=\$(aws iam get-group --group-name ${adminGroupName} --query 'Users[].Arn' --output text 2>/dev/null || echo "") + for USER_ARN in \${USERS}; do + echo "Adding access for \${USER_ARN}..." + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${USER_ARN}" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${USER_ARN}" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + done + + # Add SSO AdministratorAccess role + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/aws-reserved/sso.amazonaws.com/AWSReservedSSO_AdministratorAccess_5922b1e9e802dfa5" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/aws-reserved/sso.amazonaws.com/AWSReservedSSO_AdministratorAccess_5922b1e9e802dfa5" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + + echo "Access entries configured:" + aws eks list-access-entries --cluster-name "\${CLUSTER_NAME}" --region "\${REGION}" + """ +} + +/** + * Setup EKS infrastructure components for PMM HA. 
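For context on the ALB piece that setupInfrastructure() installs below: the controller needs an IAM role wired to its Kubernetes service account via IRSA. A conceptual sketch with the standard upstream steps; the policy name, the iam_policy.json file (published by the upstream aws-load-balancer-controller project) and an exported ACCOUNT_ID are assumptions of the sketch, not taken from this patch:

sh '''
    # IAM policy for the controller (policy document shipped by the upstream project)
    aws iam create-policy --policy-name AWSLoadBalancerControllerIAMPolicy \
        --policy-document file://iam_policy.json || true

    # IRSA: service account backed by an IAM role (requires the cluster OIDC provider)
    eksctl create iamserviceaccount --cluster "${CLUSTER_NAME}" --region "${REGION}" \
        --namespace kube-system --name aws-load-balancer-controller \
        --attach-policy-arn "arn:aws:iam::${ACCOUNT_ID}:policy/AWSLoadBalancerControllerIAMPolicy" \
        --approve

    # Chart pinned to the pre-created service account
    helm repo add eks https://aws.github.io/eks-charts
    helm upgrade --install aws-load-balancer-controller eks/aws-load-balancer-controller \
        --namespace kube-system \
        --set clusterName="${CLUSTER_NAME}" \
        --set serviceAccount.create=false \
        --set serviceAccount.name=aws-load-balancer-controller
'''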
+ * + * Installs and configures: + * - GP3 storage class (encrypted, default) + * - AWS Node Termination Handler (for spot instance draining) + * - AWS Load Balancer Controller (for ALB ingress) + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + * @param accountId AWS account ID for IAM policy ARN (default: 119175775298) + */ +def setupInfrastructure(Map config) { + def clusterName = config.clusterName ?: error("clusterName is required") + def region = config.region ?: 'us-east-2' + def accountId = config.accountId ?: '119175775298' + + sh """ + set -euo pipefail + + CLUSTER_NAME="${clusterName}" + REGION="${region}" + ACCOUNT_ID="${accountId}" + + # Configure GP3 as default storage class + kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true + + cat </dev/null; then + echo "Cloned from: \${TIBI_REPO}" + echo "theTibi" > .chart-repo-source + elif git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${PERCONA_REPO}" charts-repo 2>/dev/null; then + echo "Cloned from: \${PERCONA_REPO}" + echo "percona" > .chart-repo-source + else + echo "ERROR: Branch \${HELM_CHART_BRANCH} not found in either repository" + exit 1 + fi + + # Add required Helm repos + helm repo add percona https://percona.github.io/percona-helm-charts/ || true + helm repo add vm https://victoriametrics.github.io/helm-charts/ || true + helm repo add altinity https://docs.altinity.com/helm-charts/ || true + helm repo update + + # Install PMM HA dependencies (operators) + helm dependency update charts-repo/charts/pmm-ha-dependencies + helm upgrade --install pmm-operators charts-repo/charts/pmm-ha-dependencies \\ + --namespace "\${PMM_NAMESPACE}" \\ + --create-namespace \\ + --wait \\ + --timeout 10m + + echo "Waiting for operators to be ready..." 
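The operator wait block that follows repeats one kubectl invocation per label selector; a small helper keeps the pattern in one place. Sketch only, helper name illustrative; '|| true' mirrors the non-fatal behaviour used in the library:

def waitForPods(String labelSelector, String namespace, String timeout = '300s') {
    sh "kubectl wait --for=condition=ready pod -l ${labelSelector} -n ${namespace} --timeout=${timeout} || true"
}

// waitForPods('app.kubernetes.io/name=victoria-metrics-operator', 'pmm')
// waitForPods('app.kubernetes.io/name=pg-operator', 'pmm')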
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=victoria-metrics-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=altinity-clickhouse-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=pg-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true + + # Generate passwords + PMM_ADMIN_PASSWORD=\$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9' | head -c 16) + PG_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + GF_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + CH_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + VM_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + + # Pre-create pmm-secret before helm install + # The chart's pg-user-credentials-secrets.yaml uses lookup() at template time + # GF_SECURITY_ADMIN_PASSWORD is needed because with secret.create=false, + # the chart doesn't explicitly set this env var (only secretRef is used) + kubectl create secret generic pmm-secret \\ + --namespace "\${PMM_NAMESPACE}" \\ + --from-literal=PMM_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\ + --from-literal=GF_SECURITY_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\ + --from-literal=PG_PASSWORD="\${PG_PASSWORD}" \\ + --from-literal=GF_PASSWORD="\${GF_PASSWORD}" \\ + --from-literal=PMM_CLICKHOUSE_USER="clickhouse_pmm" \\ + --from-literal=PMM_CLICKHOUSE_PASSWORD="\${CH_PASSWORD}" \\ + --from-literal=VMAGENT_remoteWrite_basicAuth_username="victoriametrics_pmm" \\ + --from-literal=VMAGENT_remoteWrite_basicAuth_password="\${VM_PASSWORD}" \\ + --dry-run=client -o yaml | kubectl apply -f - + + # Install PMM HA + helm dependency update charts-repo/charts/pmm-ha + + HELM_CMD="helm upgrade --install pmm-ha charts-repo/charts/pmm-ha" + HELM_CMD="\${HELM_CMD} --namespace \${PMM_NAMESPACE}" + HELM_CMD="\${HELM_CMD} --set secret.create=false" + HELM_CMD="\${HELM_CMD} --set secret.name=pmm-secret" + if [ -n "\${PMM_IMAGE_TAG}" ]; then + HELM_CMD="\${HELM_CMD} --set image.tag=\${PMM_IMAGE_TAG}" + fi + HELM_CMD="\${HELM_CMD} --wait --timeout 15m" + + eval "\${HELM_CMD}" + + # Wait for components + echo "Waiting for PMM HA components..." + kubectl rollout status statefulset/pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true + kubectl wait --for=condition=ready pod -l clickhouse.altinity.com/chi=pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmselect -n "\${PMM_NAMESPACE}" --timeout=300s || true + kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmstorage -n "\${PMM_NAMESPACE}" --timeout=300s || true + + echo "PMM HA installed" + kubectl get pods -n "\${PMM_NAMESPACE}" + """ +} + +/** + * Create ALB Ingress and Route53 DNS record for PMM HA. + * + * Creates: + * - ALB Ingress with ACM certificate (HTTPS) + * - Route53 alias record pointing to ALB + * + * Waits up to 5 minutes for ALB provisioning. 
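Once the ALB and the Route53 alias exist, an end-to-end probe catches certificate or target-group problems early. A sketch of a check that could follow createIngress(); /v1/readyz is assumed to be the PMM readiness path and PMM_DOMAIN to be set as in the create pipeline:

retry(10) {
    sleep(time: 30, unit: 'SECONDS')
    // -k tolerates the short window where DNS already resolves but the ALB target is not healthy yet
    sh 'curl -sSf -k --max-time 15 "https://${PMM_DOMAIN}/v1/readyz" > /dev/null'
}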
+ * + * @param namespace Kubernetes namespace (default: pmm) + * @param domain FQDN for PMM access (required) + * @param certArn ACM certificate ARN for TLS (required) + * @param r53ZoneName Route53 hosted zone name (required, e.g., cd.percona.com) + * @param region AWS region (default: us-east-2) + */ +def createIngress(Map config) { + def namespace = config.namespace ?: 'pmm' + def domain = config.domain ?: error("domain is required") + def certArn = config.certArn ?: error("certArn is required") + def r53ZoneName = config.r53ZoneName ?: error("r53ZoneName is required") + def region = config.region ?: 'us-east-2' + + sh """ + set -euo pipefail + + PMM_NAMESPACE="${namespace}" + PMM_DOMAIN="${domain}" + ACM_CERT_ARN="${certArn}" + R53_ZONE_NAME="${r53ZoneName}" + REGION="${region}" + + # Resolve Route53 zone ID from zone name (public zones only, exact match) + R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\ + --dns-name "\${R53_ZONE_NAME}" \\ + --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\ + --output text | sed 's|/hostedzone/||g') + + # Validate we got exactly one zone + zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ') + if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then + echo "ERROR: No public Route53 zone found for \${R53_ZONE_NAME}" + exit 1 + elif [ "\${zone_count}" -gt 1 ]; then + echo "ERROR: Multiple public Route53 zones found for \${R53_ZONE_NAME}: \${R53_ZONE_IDS}" + exit 1 + fi + R53_ZONE_ID="\${R53_ZONE_IDS}" + echo "Resolved Route53 zone ID: \${R53_ZONE_ID}" + + # Create ALB Ingress + cat </dev/null || echo "") + if [ -n "\${ALB_HOSTNAME}" ]; then + echo "ALB provisioned: \${ALB_HOSTNAME}" + break + fi + echo "Waiting for ALB... (\${attempt}/30)" + sleep 10 + done + + if [ -z "\${ALB_HOSTNAME}" ]; then + echo "WARNING: ALB not provisioned within timeout" + kubectl describe ingress pmm-ha-alb -n "\${PMM_NAMESPACE}" + exit 1 + fi + + # Create Route53 record + ALB_ZONE_ID=\$(aws elbv2 describe-load-balancers --region "\${REGION}" \\ + --query "LoadBalancers[?DNSName=='\${ALB_HOSTNAME}'].CanonicalHostedZoneId" \\ + --output text) + + if [ -n "\${ALB_ZONE_ID}" ]; then + aws route53 change-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --change-batch '{ + "Changes": [{ + "Action": "UPSERT", + "ResourceRecordSet": { + "Name": "'"\${PMM_DOMAIN}"'", + "Type": "A", + "AliasTarget": { + "HostedZoneId": "'"\${ALB_ZONE_ID}"'", + "DNSName": "'"\${ALB_HOSTNAME}"'", + "EvaluateTargetHealth": true + } + } + }] + }' + echo "Route53 record created: \${PMM_DOMAIN} -> \${ALB_HOSTNAME}" + else + echo "WARNING: Could not get ALB zone ID, skipping Route53 record" + fi + """ +} + +/** + * Delete PMM HA EKS cluster and all associated AWS resources. + * + * Cleanup order (to avoid dependency errors): + * 1. Route53 alias record + * 2. ALB Ingress (triggers ALB deletion) + * 3. 
EKS cluster via eksctl + * + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) + * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com) + */ +def deleteCluster(Map config) { + def clusterName = config.clusterName ?: error("clusterName is required") + def region = config.region ?: 'us-east-2' + def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com' + + sh """ + set -euo pipefail + + cluster_name="${clusterName}" + REGION="${region}" + R53_ZONE_NAME="${r53ZoneName}" + + # Resolve Route53 zone ID from zone name (public zones only, exact match) + R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\ + --dns-name "\${R53_ZONE_NAME}" \\ + --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\ + --output text | sed 's|/hostedzone/||g') + + # Validate we got exactly one zone + zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ') + if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then + echo "WARNING: No public Route53 zone found for \${R53_ZONE_NAME}, skipping DNS cleanup" + R53_ZONE_ID="" + elif [ "\${zone_count}" -gt 1 ]; then + echo "WARNING: Multiple public Route53 zones found for \${R53_ZONE_NAME}, skipping DNS cleanup" + R53_ZONE_ID="" + else + R53_ZONE_ID="\${R53_ZONE_IDS}" + echo "Resolved Route53 zone ID: \${R53_ZONE_ID}" + fi + + echo "============================================" + echo "Cleaning up cluster: \${cluster_name}" + echo "============================================" + + # Delete Route53 record (if zone was resolved) + domain_name="\${cluster_name}.\${R53_ZONE_NAME}" + if [ -n "\${R53_ZONE_ID}" ]; then + echo "Deleting Route53 record for \${domain_name}..." + record=\$(aws route53 list-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --query "ResourceRecordSets[?Name=='\${domain_name}.']" \\ + --output json 2>/dev/null || echo "[]") + + if [ "\${record}" != "[]" ] && [ -n "\${record}" ]; then + record_type=\$(echo "\${record}" | jq -r '.[0].Type') + if [ "\${record_type}" = "A" ]; then + alias_target=\$(echo "\${record}" | jq -r '.[0].AliasTarget') + aws route53 change-resource-record-sets \\ + --hosted-zone-id "\${R53_ZONE_ID}" \\ + --change-batch '{ + "Changes": [{ + "Action": "DELETE", + "ResourceRecordSet": { + "Name": "'"\${domain_name}"'", + "Type": "A", + "AliasTarget": '"\${alias_target}"' + } + }] + }' && echo "Route53 record deleted" || echo "Warning: Failed to delete Route53 record" + fi + else + echo "No Route53 record found for \${domain_name}" + fi + else + echo "Skipping Route53 record deletion (zone not resolved)" + fi + + # Delete ALB ingress (triggers ALB deletion) + echo "Deleting ALB ingress..." + if aws eks update-kubeconfig --name "\${cluster_name}" --region "\${REGION}" 2>/dev/null; then + kubectl delete ingress pmm-ha-alb -n pmm --ignore-not-found=true + fi + + # Wait for ALB cleanup + echo "Waiting for ALB cleanup..." + sleep 30 + + # Delete the EKS cluster + echo "Deleting EKS cluster \${cluster_name}..." 
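The fixed 30-second sleep above is usually enough, but the controller-created ALB can also be polled until it is actually gone. Sketch, assuming the elbv2.k8s.aws/cluster tag the AWS Load Balancer Controller puts on its load balancers and a CLUSTER_NAME variable exported as in the create pipeline:

sh '''
    # Optional: poll instead of a fixed sleep until no tagged ALB remains.
    for i in $(seq 1 30); do
        ALB_ARNS=$(aws resourcegroupstaggingapi get-resources --region "${REGION}" \
            --resource-type-filters elasticloadbalancing:loadbalancer \
            --tag-filters "Key=elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \
            --query 'ResourceTagMappingList[].ResourceARN' --output text)
        if [ -z "${ALB_ARNS}" ] || [ "${ALB_ARNS}" = "None" ]; then break; fi
        echo "ALB still present, waiting... (${i}/30)"
        sleep 10
    done
'''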
+ eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\ + --disable-nodegroup-eviction --wait + """ +} From f3ef4fc8c93e510216c0beea3e7dc9108e3bd592 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 00:32:50 +0100 Subject: [PATCH 02/19] refactor: derive AWS account ID and SSO role dynamically - Remove hardcoded account ID (119175775298), use aws sts get-caller-identity - Remove hardcoded SSO role suffix, discover via aws iam list-roles - Skip SSO role gracefully if not found in account - Revert library branch to feature branch for testing --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 2 +- pmm/v3/pmm3-ha-eks.groovy | 2 +- vars/pmmHaEks.groovy | 61 ++++++++++++++++++------------- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index dfb74ee79b..0ab32ca88c 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -14,7 +14,7 @@ * - Create: pmm3-ha-eks.groovy * - Shared library: vars/pmmHaEks.groovy */ -library changelog: false, identifier: 'lib@master', retriever: modernSCM([ +library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([ $class: 'GitSCMSource', remote: 'https://github.com/Percona-Lab/jenkins-pipelines' ]) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 02ea2c3fb3..615e311957 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -8,7 +8,7 @@ * - Cleanup: pmm3-ha-eks-cleanup.groovy * - Shared library: vars/pmmHaEks.groovy */ -library changelog: false, identifier: 'lib@master', retriever: modernSCM([ +library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([ $class: 'GitSCMSource', remote: 'https://github.com/Percona-Lab/jenkins-pipelines' ]) diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index 3490732c4e..0f0e1a5475 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -23,15 +23,13 @@ * - Members of pmm-eks-admins IAM group (dynamically resolved) * - SSO AdministratorAccess role (for console users) * - * @param clusterName EKS cluster name (required) - * @param region AWS region (default: us-east-2) - * @param accountId AWS account ID (default: 119175775298) + * @param clusterName EKS cluster name (required) + * @param region AWS region (default: us-east-2) * @param adminGroupName IAM group for admin access (default: pmm-eks-admins) */ def configureAccess(Map config) { - def clusterName = config.clusterName ?: error("clusterName is required") + def clusterName = config.clusterName ?: error('clusterName is required') def region = config.region ?: 'us-east-2' - def accountId = config.accountId ?: '119175775298' def adminGroupName = config.adminGroupName ?: 'pmm-eks-admins' sh """ @@ -39,7 +37,10 @@ def configureAccess(Map config) { CLUSTER_NAME="${clusterName}" REGION="${region}" - ACCOUNT_ID="${accountId}" + + # Get AWS account ID dynamically + ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) + echo "AWS Account ID: \${ACCOUNT_ID}" # Add EKSAdminRole with cluster admin access aws eks create-access-entry \\ @@ -71,18 +72,27 @@ def configureAccess(Map config) { --access-scope type=cluster || true done - # Add SSO AdministratorAccess role - aws eks create-access-entry \\ - --cluster-name "\${CLUSTER_NAME}" \\ - --region "\${REGION}" \\ - --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/aws-reserved/sso.amazonaws.com/AWSReservedSSO_AdministratorAccess_5922b1e9e802dfa5" 
|| true + # Add SSO AdministratorAccess role (discover dynamically) + SSO_ROLE_ARN=\$(aws iam list-roles \\ + --query "Roles[?contains(RoleName, 'AWSReservedSSO_AdministratorAccess')].Arn | [0]" \\ + --output text 2>/dev/null || echo "") - aws eks associate-access-policy \\ - --cluster-name "\${CLUSTER_NAME}" \\ - --region "\${REGION}" \\ - --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/aws-reserved/sso.amazonaws.com/AWSReservedSSO_AdministratorAccess_5922b1e9e802dfa5" \\ - --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ - --access-scope type=cluster || true + if [ -n "\${SSO_ROLE_ARN}" ] && [ "\${SSO_ROLE_ARN}" != "None" ]; then + echo "Adding SSO role: \${SSO_ROLE_ARN}" + aws eks create-access-entry \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${SSO_ROLE_ARN}" || true + + aws eks associate-access-policy \\ + --cluster-name "\${CLUSTER_NAME}" \\ + --region "\${REGION}" \\ + --principal-arn "\${SSO_ROLE_ARN}" \\ + --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\ + --access-scope type=cluster || true + else + echo "No SSO AdministratorAccess role found, skipping" + fi echo "Access entries configured:" aws eks list-access-entries --cluster-name "\${CLUSTER_NAME}" --region "\${REGION}" @@ -99,19 +109,20 @@ def configureAccess(Map config) { * * @param clusterName EKS cluster name (required) * @param region AWS region (default: us-east-2) - * @param accountId AWS account ID for IAM policy ARN (default: 119175775298) */ def setupInfrastructure(Map config) { - def clusterName = config.clusterName ?: error("clusterName is required") + def clusterName = config.clusterName ?: error('clusterName is required') def region = config.region ?: 'us-east-2' - def accountId = config.accountId ?: '119175775298' sh """ set -euo pipefail CLUSTER_NAME="${clusterName}" REGION="${region}" - ACCOUNT_ID="${accountId}" + + # Get AWS account ID dynamically + ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) + echo "AWS Account ID: \${ACCOUNT_ID}" # Configure GP3 as default storage class kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true @@ -303,9 +314,9 @@ def installPmm(Map config) { */ def createIngress(Map config) { def namespace = config.namespace ?: 'pmm' - def domain = config.domain ?: error("domain is required") - def certArn = config.certArn ?: error("certArn is required") - def r53ZoneName = config.r53ZoneName ?: error("r53ZoneName is required") + def domain = config.domain ?: error('domain is required') + def certArn = config.certArn ?: error('certArn is required') + def r53ZoneName = config.r53ZoneName ?: error('r53ZoneName is required') def region = config.region ?: 'us-east-2' sh """ @@ -430,7 +441,7 @@ EOF * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com) */ def deleteCluster(Map config) { - def clusterName = config.clusterName ?: error("clusterName is required") + def clusterName = config.clusterName ?: error('clusterName is required') def region = config.region ?: 'us-east-2' def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com' From f180443e2d3f6f7142abedf9e51cb7d20aea5f52 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 00:40:34 +0100 Subject: [PATCH 03/19] fix: drop unsupported K8s version 1.34 --- pmm/v3/pmm3-ha-eks.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy 
b/pmm/v3/pmm3-ha-eks.groovy
index 615e311957..cfdb994923 100644
--- a/pmm/v3/pmm3-ha-eks.groovy
+++ b/pmm/v3/pmm3-ha-eks.groovy
@@ -26,7 +26,7 @@ pipeline {
     parameters {
         choice(
             name: 'K8S_VERSION',
-            choices: ['1.32', '1.33', '1.34', '1.31', '1.30', '1.29'],
+            choices: ['1.32', '1.33', '1.31', '1.30', '1.29'],
             description: 'Select Kubernetes cluster version'
         )
         // PMM HA charts are not yet merged to percona/percona-helm-charts main branch.

From 06e1139138e3b3b0520c6a55f340ad783479416b Mon Sep 17 00:00:00 2001
From: Anderson Nogueira
Date: Sun, 30 Nov 2025 01:36:13 +0100
Subject: [PATCH 04/19] fix(pmm-ha): sanitize SSO role ARN from aws iam list-roles output

The jmespath query returns trailing None values when using --output text,
causing the access entry creation to fail with an embedded newline.
---
 vars/pmmHaEks.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy
index 0f0e1a5475..f7cb8c464e 100644
--- a/vars/pmmHaEks.groovy
+++ b/vars/pmmHaEks.groovy
@@ -75,7 +75,7 @@ def configureAccess(Map config) {
         # Add SSO AdministratorAccess role (discover dynamically)
         SSO_ROLE_ARN=\$(aws iam list-roles \\
             --query "Roles[?contains(RoleName, 'AWSReservedSSO_AdministratorAccess')].Arn | [0]" \\
-            --output text 2>/dev/null || echo "")
+            --output text 2>/dev/null | head -1 | tr -d '[:space:]')
 
         if [ -n "\${SSO_ROLE_ARN}" ] && [ "\${SSO_ROLE_ARN}" != "None" ]; then
             echo "Adding SSO role: \${SSO_ROLE_ARN}"

From 0ba9672cba83197fd21aed4945785bdb0f269a2d Mon Sep 17 00:00:00 2001
From: Anderson Nogueira
Date: Sun, 30 Nov 2025 02:25:36 +0100
Subject: [PATCH 05/19] feat(pmm-ha): spread nodes across all available AZs dynamically

Discover availability zones from AWS at runtime instead of hardcoding.
Improves spot instance resilience - if one AZ has interruptions, pods can
reschedule to nodes in other AZs.
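The hunk that follows amounts to feeding the discovered zone list straight into the eksctl config; since the AWS CLI JSON array is valid YAML, no reformatting is needed. Condensed sketch with the ClusterConfig trimmed to the relevant field (EKS itself requires at least two zones):

sh '''
    AZS=$(aws ec2 describe-availability-zones --region "${REGION}" \
        --query 'AvailabilityZones[?State==`available`].ZoneName' --output json)

    cat > cluster-config.yaml <<EOF
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${REGION}
availabilityZones: ${AZS}
EOF
'''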
--- pmm/v3/pmm3-ha-eks.groovy | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index cfdb994923..20e1517187 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -57,7 +57,13 @@ pipeline { stages { stage('Write Cluster Config') { steps { - sh ''' + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + sh ''' + # Discover available AZs dynamically + AZS=$(aws ec2 describe-availability-zones --region "${REGION}" \ + --query 'AvailabilityZones[?State==`available`].ZoneName' \ + --output json) + cat > cluster-config.yaml < Date: Sun, 30 Nov 2025 02:33:24 +0100 Subject: [PATCH 06/19] chore(pmm-ha): remove obvious comments Remove comments that merely restate what the code does: - 'Get AWS account ID dynamically' - 'Install PMM HA' - 'Wait for components' - 'Wait for ALB' - 'Create Route53 record' - 'Delete the EKS cluster' --- vars/pmmHaEks.groovy | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index f7cb8c464e..c2a6115123 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -38,7 +38,6 @@ def configureAccess(Map config) { CLUSTER_NAME="${clusterName}" REGION="${region}" - # Get AWS account ID dynamically ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) echo "AWS Account ID: \${ACCOUNT_ID}" @@ -120,7 +119,6 @@ def setupInfrastructure(Map config) { CLUSTER_NAME="${clusterName}" REGION="${region}" - # Get AWS account ID dynamically ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text) echo "AWS Account ID: \${ACCOUNT_ID}" @@ -271,7 +269,6 @@ def installPmm(Map config) { --from-literal=VMAGENT_remoteWrite_basicAuth_password="\${VM_PASSWORD}" \\ --dry-run=client -o yaml | kubectl apply -f - - # Install PMM HA helm dependency update charts-repo/charts/pmm-ha HELM_CMD="helm upgrade --install pmm-ha charts-repo/charts/pmm-ha" @@ -285,7 +282,6 @@ def installPmm(Map config) { eval "\${HELM_CMD}" - # Wait for components echo "Waiting for PMM HA components..." kubectl rollout status statefulset/pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true kubectl wait --for=condition=ready pod -l clickhouse.altinity.com/chi=pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true @@ -379,7 +375,6 @@ spec: number: 443 EOF - # Wait for ALB echo "Waiting for ALB to be provisioned..." ALB_HOSTNAME="" for attempt in \$(seq 1 30); do @@ -399,7 +394,6 @@ EOF exit 1 fi - # Create Route53 record ALB_ZONE_ID=\$(aws elbv2 describe-load-balancers --region "\${REGION}" \\ --query "LoadBalancers[?DNSName=='\${ALB_HOSTNAME}'].CanonicalHostedZoneId" \\ --output text) @@ -518,7 +512,6 @@ def deleteCluster(Map config) { echo "Waiting for ALB cleanup..." sleep 30 - # Delete the EKS cluster echo "Deleting EKS cluster \${cluster_name}..." eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\ --disable-nodegroup-eviction --wait From a3832ce268a7817eac730edab186fcb83dda8268 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 02:38:01 +0100 Subject: [PATCH 07/19] fix(pmm-ha): use agent-amd64-ol9 for cleanup pipeline Cleanup requires kubectl and eksctl which may not be available on cli agents. 
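Related to the agent-label changes in the surrounding commits: a cheap preflight stage makes a wrong label fail fast with an explicit message instead of deep inside a later stage. Sketch for the declarative pipelines used here; the tool list mirrors what these jobs actually call:

stage('Preflight') {
    steps {
        sh '''
            for tool in aws eksctl kubectl helm jq; do
                command -v "${tool}" >/dev/null || { echo "Missing required tool: ${tool}"; exit 1; }
            done
        '''
    }
}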
--- pmm/v3/pmm3-ha-eks-cleanup.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index 0ab32ca88c..401a973b14 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -21,7 +21,7 @@ library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retri pipeline { agent { - label 'cli' + label 'agent-amd64-ol9' } triggers { From 5d62e824335db4635d9f971580137edf3f871fc6 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 02:38:33 +0100 Subject: [PATCH 08/19] Revert "fix(pmm-ha): use agent-amd64-ol9 for cleanup pipeline" This reverts commit a3832ce268a7817eac730edab186fcb83dda8268. --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index 401a973b14..0ab32ca88c 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -21,7 +21,7 @@ library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retri pipeline { agent { - label 'agent-amd64-ol9' + label 'cli' } triggers { From 89bad25e88baeb6c1875b4db8a0da3b636081c50 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 02:40:01 +0100 Subject: [PATCH 09/19] chore(pmm-ha): use cli agent for create pipeline CLI agents have kubectl, eksctl, helm, and AWS CLI - same as cleanup. --- pmm/v3/pmm3-ha-eks.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 20e1517187..bde2431b22 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -15,7 +15,7 @@ library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retri pipeline { agent { - label 'agent-amd64-ol9' + label 'cli' } options { From 6f107ce1c8bc689c186f7125096920f8f91e1817 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 02:51:21 +0100 Subject: [PATCH 10/19] chore(pmm-ha): rename stage to 'Configure External PMM Access' --- pmm/v3/pmm3-ha-eks.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index bde2431b22..79df84e163 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -252,7 +252,7 @@ EOF } } - stage('Create Ingress') { + stage('Configure External PMM Access') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { script { From b496a5ec37a4be4069cdf0eaeb9d27232c8c3c76 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Sun, 30 Nov 2025 03:39:16 +0100 Subject: [PATCH 11/19] chore(pmm-ha): rename stage to 'Setup External Access' --- pmm/v3/pmm3-ha-eks.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 79df84e163..c0ec44638a 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -252,7 +252,7 @@ EOF } } - stage('Configure External PMM Access') { + stage('Setup External Access') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { script { From 3e61fce24e091e4d0d46e6020871f2a1b648c4fb Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Mon, 1 Dec 2025 09:31:49 +0100 Subject: [PATCH 12/19] fix(pmm-ha): clean kubeconfig directory before export Removes stale kubeconfig entries from previous builds that could persist in the Jenkins workspace, ensuring the artifact contains only the current cluster 
configuration. --- pmm/v3/pmm3-ha-eks.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index c0ec44638a..41b55c7710 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -211,6 +211,7 @@ EOF steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { sh ''' + rm -rf kubeconfig mkdir -p kubeconfig aws eks update-kubeconfig \ From 1461b4f2e68e7ec15a3e1f25fe84b97b77eb21f5 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 12:51:34 +0100 Subject: [PATCH 13/19] fix(pmm-ha-eks): increase node size to xlarge for ClickHouse memory ClickHouse merge operations failing with MEMORY_LIMIT_EXCEEDED on *.large instances (8GB RAM). Upgrade to *.xlarge (16GB RAM) to provide sufficient memory headroom for the full PMM HA stack. --- pmm/v3/pmm3-ha-eks.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 41b55c7710..9a0e9d59a1 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -93,10 +93,10 @@ managedNodeGroups: - name: ng-spot amiFamily: AmazonLinux2023 instanceTypes: - - m5a.large - - m5n.large - - m7a.large - - m7i-flex.large + - m5a.xlarge + - m5n.xlarge + - m7a.xlarge + - m7i-flex.xlarge volumeSize: 80 spot: true minSize: 2 From cac43cca7dcaa1ee93264584f86701e2d4ba64b4 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 12:55:40 +0100 Subject: [PATCH 14/19] fix(pmm-ha-eks-cleanup): replace Date.parse with shell date command Date.parse() is not allowed in Jenkins sandbox, causing DELETE_OLD cron jobs to fail with RejectedAccessException. Use shell date -d to convert ISO 8601 timestamps to epoch milliseconds instead. Fixes cron builds #21, #22, #23 failing with: "No such static method found: staticMethod java.util.Date parse" --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index 0ab32ca88c..f3cf7a28a3 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -176,20 +176,26 @@ pipeline { def cutoffMs = System.currentTimeMillis() - (24 * 60 * 60 * 1000) // 1 day ago clusters.split(/\s+/).each { clusterName -> - def createdAt = sh( - script: "aws eks describe-cluster --name ${clusterName} --region ${REGION} --query 'cluster.createdAt' --output text 2>/dev/null || echo ''", + // Get creation time as epoch milliseconds using shell date command + def createdMs = sh( + script: """ + CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${REGION} --query 'cluster.createdAt' --output text 2>/dev/null || echo '') + if [ -z "\$CREATED" ] || [ "\$CREATED" = "None" ]; then + echo "" + else + # Convert ISO 8601 to epoch milliseconds + date -d "\$CREATED" +%s000 2>/dev/null || echo "" + fi + """, returnStdout: true ).trim() - if (!createdAt || createdAt == 'None') { + if (!createdMs) { echo "Unable to fetch creation time for ${clusterName} - skipping." 
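The sandbox-safe conversion above leans on GNU date: +%s prints epoch seconds and the literal 000 suffix turns that into milliseconds, comparable with System.currentTimeMillis(). A sketch of the same conversion isolated as a helper (name illustrative):

def isoToEpochMillis(String isoTimestamp) {
    if (!isoTimestamp || isoTimestamp == 'None') {
        return 0L
    }
    def out = sh(
        script: "date -d '${isoTimestamp}' +%s000 2>/dev/null || echo 0",
        returnStdout: true
    ).trim()
    // Returns 0 when date(1) cannot parse the timestamp.
    return out.isLong() ? out.toLong() : 0L
}

// isoToEpochMillis(createdAt) < System.currentTimeMillis() - 24 * 60 * 60 * 1000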
return // continue to next iteration } - // Parse ISO 8601 timestamp - def createdMs = Date.parse("yyyy-MM-dd'T'HH:mm:ss", createdAt.take(19)).time - - if (createdMs < cutoffMs) { + if (createdMs.toLong() < cutoffMs) { pmmHaEks.deleteCluster( clusterName: clusterName, region: env.REGION, From bc546ea5ba1a726f9fb0e13da115c4a467980766 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 14:13:49 +0100 Subject: [PATCH 15/19] fix(pmm-ha-eks): increase ClickHouse memory limits to 10Gi Default 4Gi memory limit causes MEMORY_LIMIT_EXCEEDED errors during merge operations. Increase to 10Gi with 4Gi requests to allow proper merge execution on xlarge nodes. --- vars/pmmHaEks.groovy | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index c2a6115123..ab6af77683 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -275,6 +275,9 @@ def installPmm(Map config) { HELM_CMD="\${HELM_CMD} --namespace \${PMM_NAMESPACE}" HELM_CMD="\${HELM_CMD} --set secret.create=false" HELM_CMD="\${HELM_CMD} --set secret.name=pmm-secret" + # Increase ClickHouse memory for merge operations (default 4Gi is insufficient) + HELM_CMD="\${HELM_CMD} --set clickhouse.resources.requests.memory=4Gi" + HELM_CMD="\${HELM_CMD} --set clickhouse.resources.limits.memory=10Gi" if [ -n "\${PMM_IMAGE_TAG}" ]; then HELM_CMD="\${HELM_CMD} --set image.tag=\${PMM_IMAGE_TAG}" fi From 77929b5a812ed695c876b7af8a9327f3e090698c Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 14:21:06 +0100 Subject: [PATCH 16/19] refactor(pmm-ha-eks): move deletion logic to shared library Move cluster management functions from cleanup pipeline to pmmHaEks.groovy: - listClusters(): returns clusters sorted newest first (CPS-safe) - deleteAllClusters(): parallel deletion with SKIP_NEWEST and age filter - cleanupOrphans(): removes orphaned VPCs and failed CF stacks Simplify pmm3-ha-eks-cleanup.groovy to high-level orchestration only. Add SKIP_NEWEST parameter and CLEANUP_ORPHANS action. --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 89 ++++--------- vars/pmmHaEks.groovy | 210 ++++++++++++++++++++++++++++++ 2 files changed, 238 insertions(+), 61 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index f3cf7a28a3..c5863a1ce1 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -7,8 +7,9 @@ * Actions: * - LIST_ONLY: List all test clusters with age * - DELETE_CLUSTER: Delete a specific cluster - * - DELETE_ALL: Delete all test clusters + * - DELETE_ALL: Delete all test clusters (respects SKIP_NEWEST) * - DELETE_OLD (cron): Delete clusters older than 24 hours + * - CLEANUP_ORPHANS: Delete orphaned VPCs and failed CF stacks * * Related: * - Create: pmm3-ha-eks.groovy @@ -31,15 +32,17 @@ pipeline { parameters { choice( name: 'ACTION', - choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL'], + choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL', 'CLEANUP_ORPHANS'], description: ''' LIST_ONLY - list all test clusters
DELETE_CLUSTER - delete a specific cluster (requires CLUSTER_NAME)
- DELETE_ALL - delete all test clusters

+ DELETE_ALL - delete all test clusters
+ CLEANUP_ORPHANS - delete orphaned VPCs and failed CF stacks

Note: Daily cron automatically deletes clusters older than 1 day. ''' ) string(name: 'CLUSTER_NAME', defaultValue: '', description: 'Required only for DELETE_CLUSTER') + booleanParam(name: 'SKIP_NEWEST', defaultValue: true, description: 'Skip the most recent cluster (protects in-progress builds)') } options { @@ -136,23 +139,12 @@ pipeline { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { script { - def clusters = sh( - script: "aws eks list-clusters --region ${REGION} --query \"clusters[?starts_with(@, '${CLUSTER_PREFIX}')]\" --output text", - returnStdout: true - ).trim() - - if (!clusters) { - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - return - } - - clusters.split(/\s+/).each { clusterName -> - pmmHaEks.deleteCluster( - clusterName: clusterName, - region: env.REGION, - r53ZoneName: env.R53_ZONE_NAME - ) - } + pmmHaEks.deleteAllClusters( + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME, + skipNewest: params.SKIP_NEWEST, + maxAgeHours: 0 // Delete all regardless of age + ) } } } @@ -163,48 +155,23 @@ pipeline { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { script { - def clusters = sh( - script: "aws eks list-clusters --region ${REGION} --query \"clusters[?starts_with(@, '${CLUSTER_PREFIX}')]\" --output text", - returnStdout: true - ).trim() - - if (!clusters) { - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - return - } + pmmHaEks.deleteAllClusters( + region: env.REGION, + r53ZoneName: env.R53_ZONE_NAME, + skipNewest: true, // Always protect newest during cron + maxAgeHours: 24 // Only delete clusters older than 24h + ) + } + } + } + } - def cutoffMs = System.currentTimeMillis() - (24 * 60 * 60 * 1000) // 1 day ago - - clusters.split(/\s+/).each { clusterName -> - // Get creation time as epoch milliseconds using shell date command - def createdMs = sh( - script: """ - CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${REGION} --query 'cluster.createdAt' --output text 2>/dev/null || echo '') - if [ -z "\$CREATED" ] || [ "\$CREATED" = "None" ]; then - echo "" - else - # Convert ISO 8601 to epoch milliseconds - date -d "\$CREATED" +%s000 2>/dev/null || echo "" - fi - """, - returnStdout: true - ).trim() - - if (!createdMs) { - echo "Unable to fetch creation time for ${clusterName} - skipping." 
- return // continue to next iteration - } - - if (createdMs.toLong() < cutoffMs) { - pmmHaEks.deleteCluster( - clusterName: clusterName, - region: env.REGION, - r53ZoneName: env.R53_ZONE_NAME - ) - } else { - echo "Skipping recent cluster: ${clusterName} (created within last 24h)" - } - } + stage('Cleanup Orphan Resources') { + when { expression { env.ACTION == 'CLEANUP_ORPHANS' } } + steps { + withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { + script { + pmmHaEks.cleanupOrphans(region: env.REGION) } } } diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index ab6af77683..13d3bbad63 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -9,12 +9,18 @@ * - installPmm() Deploy PMM HA stack (operators, secrets, helm chart) * - createIngress() Create ALB Ingress and Route53 DNS record * - deleteCluster() Delete Route53, ALB, and EKS cluster + * - listClusters() List PMM HA test clusters sorted by creation time (newest first) + * - deleteAllClusters() Delete all/old clusters with SKIP_NEWEST support + * - cleanupOrphans() Clean up orphaned VPCs and failed CF stacks * * Related: * - Create pipeline: pmm/v3/pmm3-ha-eks.groovy * - Cleanup pipeline: pmm/v3/pmm3-ha-eks-cleanup.groovy */ +// Constants +def CLUSTER_PREFIX = 'pmm-ha-test-' + /** * Configure EKS Access Entries for cluster authentication. * @@ -520,3 +526,207 @@ def deleteCluster(Map config) { --disable-nodegroup-eviction --wait """ } + +/** + * List PMM HA test clusters sorted by creation time (newest first). + * + * @param region AWS region (default: us-east-2) + * @return List of cluster names sorted newest first, empty list if none found + */ +def listClusters(String region = 'us-east-2') { + def clusterPrefix = 'pmm-ha-test-' + + def clustersJson = sh( + script: """ + aws eks list-clusters --region ${region} \\ + --query "clusters[?starts_with(@, '${clusterPrefix}')]" \\ + --output json 2>/dev/null || echo '[]' + """, + returnStdout: true + ).trim() + + def clusters = readJSON(text: clustersJson) + if (!clusters) { + return [] + } + + // Get creation times for all clusters + def clusterData = [] + clusters.each { clusterName -> + def createdAt = sh( + script: """ + aws eks describe-cluster --name ${clusterName} --region ${region} \\ + --query 'cluster.createdAt' --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim() + + if (createdAt && createdAt != 'None') { + clusterData.add([name: clusterName, createdAt: createdAt]) + } + } + + // Sort by createdAt descending (newest first) using string comparison (ISO 8601 sorts correctly) + // Note: toSorted() returns new list without modifying original (CPS-safe) + def sortedData = clusterData.toSorted { a, b -> b.createdAt <=> a.createdAt } + + return sortedData.collect { it.name } +} + +/** + * Delete multiple clusters with optional SKIP_NEWEST and age filter. + * + * Supports parallel deletion for faster cleanup. + * + * @param region AWS region (default: us-east-2) + * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com) + * @param skipNewest Skip the most recent cluster (default: true) + * @param maxAgeHours Only delete clusters older than this (0 = delete all, default: 0) + */ +def deleteAllClusters(Map config = [:]) { + def region = config.region ?: 'us-east-2' + def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com' + def skipNewest = config.skipNewest != null ? 
config.skipNewest : true + def maxAgeHours = config.maxAgeHours ?: 0 + + def clusterList = listClusters(region) + + if (!clusterList) { + echo "No clusters found with prefix 'pmm-ha-test-'." + return + } + + def clustersToDelete = clusterList + if (skipNewest && clusterList.size() > 1) { + // clusterList is sorted newest first, so skip first element + clustersToDelete = clusterList.drop(1) + echo "Skipping newest cluster: ${clusterList[0]} (SKIP_NEWEST=true)" + } + + // Filter by age if maxAgeHours > 0 + if (maxAgeHours > 0) { + def cutoffMs = System.currentTimeMillis() - (maxAgeHours * 60 * 60 * 1000) + def filtered = [] + + clustersToDelete.each { clusterName -> + def createdMs = sh( + script: """ + CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${region} \\ + --query 'cluster.createdAt' --output text 2>/dev/null || echo '') + if [ -z "\$CREATED" ] || [ "\$CREATED" = "None" ]; then + echo "" + else + date -d "\$CREATED" +%s000 2>/dev/null || echo "" + fi + """, + returnStdout: true + ).trim() + + if (createdMs && createdMs.toLong() < cutoffMs) { + filtered.add(clusterName) + } else { + echo "Skipping recent cluster: ${clusterName} (created within last ${maxAgeHours}h)" + } + } + clustersToDelete = filtered + } + + if (!clustersToDelete) { + echo 'No clusters to delete after applying filters.' + return + } + + // Delete clusters in parallel + def parallelStages = [:] + clustersToDelete.each { clusterName -> + parallelStages["Delete ${clusterName}"] = { + deleteCluster( + clusterName: clusterName, + region: region, + r53ZoneName: r53ZoneName + ) + } + } + parallel parallelStages +} + +/** + * Clean up orphaned VPCs and failed CloudFormation stacks. + * + * Finds: + * - VPCs with eksctl-pmm-ha-test-* tags but no matching EKS cluster + * - CloudFormation stacks in DELETE_FAILED or ROLLBACK_COMPLETE state + * + * @param region AWS region (default: us-east-2) + */ +def cleanupOrphans(Map config = [:]) { + def region = config.region ?: 'us-east-2' + def clusterPrefix = 'pmm-ha-test-' + + // Get list of active EKS clusters + def activeClusters = sh( + script: """ + aws eks list-clusters --region ${region} \\ + --query "clusters[?starts_with(@, '${clusterPrefix}')]" \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim().split(/\s+/).findAll { it } + + echo "Active EKS clusters: ${activeClusters}" + + // Find orphaned VPCs (VPCs without matching EKS cluster) + def orphanedVpcs = sh( + script: """ + aws ec2 describe-vpcs --region ${region} \\ + --filters "Name=tag:Name,Values=eksctl-${clusterPrefix}*-cluster/VPC" \\ + --query 'Vpcs[*].[VpcId,Tags[?Key==`Name`].Value|[0]]' \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim() + + if (orphanedVpcs) { + orphanedVpcs.split('\n').each { line -> + def parts = line.split('\t') + if (parts.size() >= 2) { + def vpcId = parts[0] + def vpcName = parts[1] + // Extract cluster name from VPC name (eksctl-pmm-ha-test-XX-cluster/VPC) + def matcher = vpcName =~ /eksctl-(${clusterPrefix}\d+)-cluster/ + if (matcher) { + def clusterName = matcher[0][1] + if (!activeClusters.contains(clusterName)) { + echo "Found orphaned VPC: ${vpcId} (${vpcName}) - cluster ${clusterName} does not exist" + // Delete VPC using eksctl (handles all dependencies) + sh """ + eksctl delete cluster --name ${clusterName} --region ${region} --wait=false 2>/dev/null || true + """ + } + } + } + } + } else { + echo 'No orphaned VPCs found.' 
+ } + + // Find and delete failed CloudFormation stacks + def failedStacks = sh( + script: """ + aws cloudformation list-stacks --region ${region} \\ + --stack-status-filter DELETE_FAILED ROLLBACK_COMPLETE \\ + --query "StackSummaries[?contains(StackName, '${clusterPrefix}')].StackName" \\ + --output text 2>/dev/null || echo '' + """, + returnStdout: true + ).trim() + + if (failedStacks) { + failedStacks.split(/\s+/).each { stackName -> + echo "Deleting failed stack: ${stackName}" + sh "aws cloudformation delete-stack --region ${region} --stack-name ${stackName} || true" + } + } else { + echo 'No failed CloudFormation stacks found.' + } +} From 4415d999f4cfd00eadff80b11c96e40cda1681ca Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 14:31:07 +0100 Subject: [PATCH 17/19] refactor(pmm-ha-eks): use listClusters() consistently in pipelines Replace inline shell cluster discovery with pmmHaEks.listClusters(): - pmm3-ha-eks.groovy: Check Existing Clusters stage - pmm3-ha-eks-cleanup.groovy: List Clusters stage Reduces code duplication and ensures consistent behavior. --- pmm/v3/pmm3-ha-eks-cleanup.groovy | 43 ++++++++++++++++--------------- pmm/v3/pmm3-ha-eks.groovy | 29 +++++++++------------ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy index c5863a1ce1..f42b0ff1b2 100644 --- a/pmm/v3/pmm3-ha-eks-cleanup.groovy +++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy @@ -83,29 +83,30 @@ pipeline { when { expression { env.ACTION == 'LIST_ONLY' } } steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - set +x - - CLUSTERS=$(aws eks list-clusters --region "${REGION}" \ - --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \ - --output text) - - if [ -z "${CLUSTERS}" ]; then - echo "No clusters found with prefix '${CLUSTER_PREFIX}'." - exit 0 - fi - - for cluster in ${CLUSTERS}; do - CREATED=$(aws eks describe-cluster \ - --name "${cluster}" --region "${REGION}" \ - --query "cluster.createdAt" --output text) + script { + def clusters = pmmHaEks.listClusters(env.REGION) - CREATED_EPOCH=$(date -d "${CREATED}" +%s) - AGE_HOURS=$(( ( $(date +%s) - CREATED_EPOCH ) / 3600 )) + if (!clusters) { + echo "No clusters found with prefix '${env.CLUSTER_PREFIX}'." 
+ return + } - echo "* ${cluster} | Created: ${CREATED} | Age: ${AGE_HOURS}h" - done - ''' + echo "Found ${clusters.size()} cluster(s):" + clusters.each { clusterName -> + def info = sh( + script: """ + CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${env.REGION} \ + --query 'cluster.createdAt' --output text) + CREATED_EPOCH=\$(date -d "\${CREATED}" +%s) + AGE_HOURS=\$(( ( \$(date +%s) - CREATED_EPOCH ) / 3600 )) + echo "\${CREATED}|\${AGE_HOURS}" + """, + returnStdout: true + ).trim() + def parts = info.split('\\|') + echo "* ${clusterName} | Created: ${parts[0]} | Age: ${parts[1]}h" + } + } } } } diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy index 9a0e9d59a1..3e0b05edc5 100644 --- a/pmm/v3/pmm3-ha-eks.groovy +++ b/pmm/v3/pmm3-ha-eks.groovy @@ -117,26 +117,21 @@ EOF stage('Check Existing Clusters') { steps { withCredentials([aws(credentialsId: 'pmm-staging-slave')]) { - sh ''' - set +x - - EXISTING_CLUSTERS=$(aws eks list-clusters --region "${REGION}" \ - --query "clusters[?starts_with(@, 'pmm-ha-test-')]" --output text) + script { + def clusters = pmmHaEks.listClusters(env.REGION) + def count = clusters.size() - if [ -z "${EXISTING_CLUSTERS}" ]; then - EXISTING_COUNT=0 - else - EXISTING_COUNT=$(echo "${EXISTING_CLUSTERS}" | wc -w) - echo "${EXISTING_CLUSTERS}" | tr '\\t' '\\n' - fi + if (clusters) { + echo "Existing clusters (${count}):" + clusters.each { echo " - ${it}" } + } - if [ "${EXISTING_COUNT}" -ge 5 ]; then - echo "ERROR: Maximum limit of 5 test clusters reached." - exit 1 - fi + if (count >= 5) { + error('Maximum limit of 5 test clusters reached.') + } - echo "Existing clusters: $EXISTING_COUNT / 5" - ''' + echo "Cluster count: ${count} / 5" + } } } } From 559b3ef05379e04d3b0eed71908dcc760d5cebdf Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 14:33:04 +0100 Subject: [PATCH 18/19] fix(pmm-ha-eks): use jq for JSON parsing in listClusters Replace readJSON (unavailable DSL method) with shell-based jq parsing. Sorting by createdAt is done in shell using sort -r for CPS safety. 
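Why sort -r is sufficient here: the createdAt values are ISO 8601 timestamps, which sort
lexicographically in the same order as the instants they encode (assuming a consistent UTC
offset in the CLI output), so prefixing each cluster name with its timestamp and reverse-sorting
gives newest-first without any date parsing in Groovy. A minimal sketch of the idea, with
hypothetical cluster names, illustrative only and not part of the patch:

    printf '%s\n' \
        '2025-12-01T10:00:00+00:00|pmm-ha-test-1' \
        '2025-12-02T09:30:00+00:00|pmm-ha-test-2' \
        | sort -r | cut -d'|' -f2
    # prints: pmm-ha-test-2 (newest) then pmm-ha-test-1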
--- vars/pmmHaEks.groovy | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index 13d3bbad63..d2a7699159 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -536,41 +536,26 @@ def deleteCluster(Map config) { def listClusters(String region = 'us-east-2') { def clusterPrefix = 'pmm-ha-test-' - def clustersJson = sh( + // Get all clusters matching prefix, then describe each and sort by createdAt (newest first) + // Output: one cluster name per line, sorted by creation time descending + def output = sh( script: """ - aws eks list-clusters --region ${region} \\ - --query "clusters[?starts_with(@, '${clusterPrefix}')]" \\ - --output json 2>/dev/null || echo '[]' + aws eks list-clusters --region ${region} --output json 2>/dev/null | \\ + jq -r '.clusters[] | select(startswith("${clusterPrefix}"))' | \\ + while read cluster; do + CREATED=\$(aws eks describe-cluster --name "\$cluster" --region ${region} \\ + --query 'cluster.createdAt' --output text 2>/dev/null) + [ -n "\$CREATED" ] && [ "\$CREATED" != "None" ] && echo "\$CREATED|\$cluster" + done | sort -r | cut -d'|' -f2 """, returnStdout: true ).trim() - def clusters = readJSON(text: clustersJson) - if (!clusters) { + if (!output) { return [] } - // Get creation times for all clusters - def clusterData = [] - clusters.each { clusterName -> - def createdAt = sh( - script: """ - aws eks describe-cluster --name ${clusterName} --region ${region} \\ - --query 'cluster.createdAt' --output text 2>/dev/null || echo '' - """, - returnStdout: true - ).trim() - - if (createdAt && createdAt != 'None') { - clusterData.add([name: clusterName, createdAt: createdAt]) - } - } - - // Sort by createdAt descending (newest first) using string comparison (ISO 8601 sorts correctly) - // Note: toSorted() returns new list without modifying original (CPS-safe) - def sortedData = clusterData.toSorted { a, b -> b.createdAt <=> a.createdAt } - - return sortedData.collect { it.name } + return output.split('\n').findAll { it } } /** From a45987151cd266ee23cbe7c07267f399c32cd256 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Tue, 2 Dec 2025 14:37:28 +0100 Subject: [PATCH 19/19] fix(pmm-ha-eks): disable stack termination protection before deletion eksctl cannot delete stacks with TerminationProtection enabled. Add step to disable protection on all cluster-related CF stacks before calling eksctl delete. --- vars/pmmHaEks.groovy | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy index d2a7699159..468b47b4d1 100644 --- a/vars/pmmHaEks.groovy +++ b/vars/pmmHaEks.groovy @@ -521,6 +521,19 @@ def deleteCluster(Map config) { echo "Waiting for ALB cleanup..." sleep 30 + # Disable termination protection on all CloudFormation stacks for this cluster + echo "Disabling termination protection on CloudFormation stacks..." + for stack_name in \$(aws cloudformation list-stacks --region "\${REGION}" \\ + --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \\ + --query "StackSummaries[?starts_with(StackName, 'eksctl-\${cluster_name}')].StackName" \\ + --output text 2>/dev/null); do + echo " Disabling protection: \${stack_name}" + aws cloudformation update-termination-protection \\ + --region "\${REGION}" \\ + --stack-name "\${stack_name}" \\ + --no-enable-termination-protection 2>/dev/null || true + done + echo "Deleting EKS cluster \${cluster_name}..." 
eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\ --disable-nodegroup-eviction --wait
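
Note: a quick way to inspect the protection flag on a cluster's stack before deletion
(illustrative only; the stack name below is a hypothetical example of the
eksctl-<cluster>-cluster naming that the loop above matches):

    aws cloudformation describe-stacks \
        --region us-east-2 \
        --stack-name eksctl-pmm-ha-test-1-cluster \
        --query 'Stacks[0].EnableTerminationProtection'
    # Prints true while protection is enabled; the new step flips it to false
    # before eksctl delete runs.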