diff --git a/pmm/v3/pmm3-ha-eks-cleanup.groovy b/pmm/v3/pmm3-ha-eks-cleanup.groovy
index cada78b616..f42b0ff1b2 100644
--- a/pmm/v3/pmm3-ha-eks-cleanup.groovy
+++ b/pmm/v3/pmm3-ha-eks-cleanup.groovy
@@ -1,33 +1,60 @@
+/**
+ * PMM HA EKS Cleanup Pipeline
+ *
+ * Manages cleanup of PMM HA test clusters. Supports manual and scheduled runs.
+ * Deletes Route53 records, ALB ingress, and EKS clusters.
+ *
+ * Actions:
+ * - LIST_ONLY: List all test clusters with age
+ * - DELETE_CLUSTER: Delete a specific cluster
+ * - DELETE_ALL: Delete all test clusters (respects SKIP_NEWEST)
+ * - DELETE_OLD (cron): Delete clusters older than 24 hours
+ * - CLEANUP_ORPHANS: Delete orphaned VPCs and failed CF stacks
+ *
+ * Related:
+ * - Create: pmm3-ha-eks.groovy
+ * - Shared library: vars/pmmHaEks.groovy
+ */
+library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
+ $class: 'GitSCMSource',
+ remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
+])
+
pipeline {
agent {
- label 'agent-amd64-ol9'
+ label 'cli'
}
triggers {
- cron('H 0,12 * * *') // Runs twice daily at 00:00 & 12:00
+ cron('H 0,12 * * *')
}
parameters {
choice(
name: 'ACTION',
- choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL'],
+ choices: ['LIST_ONLY', 'DELETE_CLUSTER', 'DELETE_ALL', 'CLEANUP_ORPHANS'],
description: '''
LIST_ONLY - list all test clusters
DELETE_CLUSTER - delete a specific cluster (requires CLUSTER_NAME)
- DELETE_ALL - delete all test clusters
+ DELETE_ALL - delete all test clusters
+ CLEANUP_ORPHANS - delete orphaned VPCs and failed CF stacks
Note: Daily cron automatically deletes clusters older than 1 day.
'''
)
string(name: 'CLUSTER_NAME', defaultValue: '', description: 'Required only for DELETE_CLUSTER')
+ booleanParam(name: 'SKIP_NEWEST', defaultValue: true, description: 'Skip the most recent cluster (protects in-progress builds)')
}
options {
buildDiscarder(logRotator(numToKeepStr: '30'))
+ disableConcurrentBuilds()
+ timeout(time: 60, unit: 'MINUTES')
}
environment {
- REGION = "us-east-2"
- CLUSTER_PREFIX = "pmm-ha-test-"
+ REGION = 'us-east-2'
+ CLUSTER_PREFIX = 'pmm-ha-test-'
+ R53_ZONE_NAME = 'cd.percona.com'
}
stages {
@@ -36,14 +63,14 @@ pipeline {
script {
if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause')) {
env.ACTION = 'DELETE_OLD'
- echo "Triggered by cron - will delete clusters older than 1 day."
+ echo 'Triggered by cron - will delete clusters older than 1 day.'
} else {
env.ACTION = params.ACTION
echo "Manual run with ACTION=${params.ACTION}"
}
if (env.ACTION == 'DELETE_CLUSTER' && !params.CLUSTER_NAME) {
- error("CLUSTER_NAME is required for DELETE_CLUSTER.")
+ error('CLUSTER_NAME is required for DELETE_CLUSTER.')
}
if (params.CLUSTER_NAME && !params.CLUSTER_NAME.startsWith(env.CLUSTER_PREFIX)) {
error("Cluster name must start with ${env.CLUSTER_PREFIX}")
@@ -56,29 +83,30 @@ pipeline {
when { expression { env.ACTION == 'LIST_ONLY' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- set +x
-
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" \
- --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- for c in $CLUSTERS; do
- CREATED=$(aws eks describe-cluster \
- --name "$c" --region "$REGION" \
- --query "cluster.createdAt" --output text)
-
- CREATED_EPOCH=$(date -d "$CREATED" +%s)
- AGE_HOURS=$(( ( $(date +%s) - CREATED_EPOCH ) / 3600 ))
-
- echo "• $c | Created: $CREATED | Age: ${AGE_HOURS}h"
- done
- '''
+ script {
+ def clusters = pmmHaEks.listClusters(env.REGION)
+
+ if (!clusters) {
+ echo "No clusters found with prefix '${env.CLUSTER_PREFIX}'."
+ return
+ }
+
+ echo "Found ${clusters.size()} cluster(s):"
+ clusters.each { clusterName ->
+ def info = sh(
+ script: """
+ CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${env.REGION} \
+ --query 'cluster.createdAt' --output text)
+ CREATED_EPOCH=\$(date -d "\${CREATED}" +%s)
+ AGE_HOURS=\$(( ( \$(date +%s) - CREATED_EPOCH ) / 3600 ))
+ echo "\${CREATED}|\${AGE_HOURS}"
+ """,
+ returnStdout: true
+ ).trim()
+ def parts = info.split('\\|')
+ echo "* ${clusterName} | Created: ${parts[0]} | Age: ${parts[1]}h"
+ }
+ }
}
}
}
@@ -87,15 +115,22 @@ pipeline {
when { expression { env.ACTION == 'DELETE_CLUSTER' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- if ! aws eks describe-cluster --region "${REGION}" --name "${CLUSTER_NAME}" >/dev/null 2>&1; then
- echo "Cluster '${CLUSTER_NAME}' not found in region '${REGION}'."
- exit 0
- fi
-
- eksctl delete cluster --region "${REGION}" --name "${CLUSTER_NAME}" \
- --disable-nodegroup-eviction --wait
- '''
+ script {
+ def clusterExists = sh(
+ script: "aws eks describe-cluster --region ${REGION} --name ${params.CLUSTER_NAME} >/dev/null 2>&1",
+ returnStatus: true
+ ) == 0
+
+ if (clusterExists) {
+ pmmHaEks.deleteCluster(
+ clusterName: params.CLUSTER_NAME,
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME
+ )
+ } else {
+ echo "Cluster '${params.CLUSTER_NAME}' not found in region '${REGION}'."
+ }
+ }
}
}
}
@@ -104,20 +139,14 @@ pipeline {
when { expression { env.ACTION == 'DELETE_ALL' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- for c in $CLUSTERS; do
- eksctl delete cluster --region "$REGION" --name "$c" \
- --disable-nodegroup-eviction --wait
- done
- '''
+ script {
+ pmmHaEks.deleteAllClusters(
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME,
+ skipNewest: params.SKIP_NEWEST,
+ maxAgeHours: 0 // Delete all regardless of age
+ )
+ }
}
}
}
@@ -126,36 +155,25 @@ pipeline {
when { expression { env.ACTION == 'DELETE_OLD' } }
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- CLUSTERS=$(aws eks list-clusters --region "$REGION" \
- --query "clusters[?starts_with(@, '${CLUSTER_PREFIX}')]" --output text)
-
- if [ -z "$CLUSTERS" ]; then
- echo "No clusters found with prefix '${CLUSTER_PREFIX}'."
- exit 0
- fi
-
- CUTOFF=$(date -d "1 day ago" +%s)
-
- for c in $CLUSTERS; do
- CREATED=$(aws eks describe-cluster --name "$c" --region "$REGION" \
- --query "cluster.createdAt" --output text 2>/dev/null || true)
-
- if [ -z "$CREATED" ] || [ "$CREATED" == "None" ]; then
- echo "Unable to fetch creation time for $c — skipping."
- continue
- fi
-
- CREATED_EPOCH=$(date -d "$CREATED" +%s)
-                        if [ "$CREATED_EPOCH" -lt "$CUTOFF" ]; then
-                            eksctl delete cluster --region "$REGION" --name "$c" \
-                                --disable-nodegroup-eviction --wait
-                        else
-                            echo "Skipping recent cluster: $c (created within last 24h)"
-                        fi
-                    done
-                '''
+                    script {
+                        pmmHaEks.deleteAllClusters(
+                            region: env.REGION,
+                            r53ZoneName: env.R53_ZONE_NAME,
+                            skipNewest: true,   // Always protect newest during cron
+                            maxAgeHours: 24     // Only delete clusters older than 24h
+                        )
+                    }
+                }
+            }
+        }
+ stage('Cleanup Orphan Resources') {
+ when { expression { env.ACTION == 'CLEANUP_ORPHANS' } }
+ steps {
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ script {
+ pmmHaEks.cleanupOrphans(region: env.REGION)
+ }
}
}
}
diff --git a/pmm/v3/pmm3-ha-eks.groovy b/pmm/v3/pmm3-ha-eks.groovy
index 580bec6446..3e0b05edc5 100644
--- a/pmm/v3/pmm3-ha-eks.groovy
+++ b/pmm/v3/pmm3-ha-eks.groovy
@@ -1,26 +1,69 @@
+/**
+ * PMM HA EKS Test Pipeline
+ *
+ * Creates an EKS cluster with PMM High Availability deployment for testing.
+ * Includes ALB ingress with ACM certificate and Route53 DNS.
+ *
+ * Related:
+ * - Cleanup: pmm3-ha-eks-cleanup.groovy
+ * - Shared library: vars/pmmHaEks.groovy
+ */
+library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
+ $class: 'GitSCMSource',
+ remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
+])
+
pipeline {
agent {
- label 'agent-amd64-ol9'
+ label 'cli'
+ }
+
+ options {
+ disableConcurrentBuilds()
+ timeout(time: 90, unit: 'MINUTES')
}
parameters {
choice(
name: 'K8S_VERSION',
- choices: ['1.32', '1.31', '1.30', '1.29', '1.28'],
+ choices: ['1.32', '1.33', '1.31', '1.30', '1.29'],
description: 'Select Kubernetes cluster version'
)
+ // PMM HA charts are not yet merged to percona/percona-helm-charts main branch.
+ // theTibi/PMM-14420 contains both pmm-ha and pmm-ha-dependencies charts.
+ // Once merged to percona main, update default to 'main' and swap repo priority.
+ string(
+ name: 'HELM_CHART_BRANCH',
+ defaultValue: 'PMM-14420',
+ description: 'Branch of percona-helm-charts repo (theTibi/PMM-14420 has both pmm-ha and pmm-ha-dependencies)'
+ )
+ string(
+ name: 'PMM_IMAGE_TAG',
+ defaultValue: '',
+ description: 'PMM Server image tag (leave empty for chart default)'
+ )
}
- environment {
+ environment {
CLUSTER_NAME = "pmm-ha-test-${BUILD_NUMBER}"
- REGION = "us-east-2"
+ REGION = 'us-east-2'
KUBECONFIG = "${WORKSPACE}/kubeconfig/config"
+ PMM_NAMESPACE = 'pmm'
+ ACM_CERT_ARN = 'arn:aws:acm:us-east-2:119175775298:certificate/9bd3a0c8-8205-4092-8003-7304ca762143'
+ R53_ZONE_NAME = 'cd.percona.com'
+ PMM_DOMAIN = "pmm-ha-test-${BUILD_NUMBER}.${R53_ZONE_NAME}"
}
stages {
stage('Write Cluster Config') {
steps {
- sh '''
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ sh '''
+ # Discover available AZs dynamically
+ AZS=$(aws ec2 describe-availability-zones --region "${REGION}" \
+ --query 'AvailabilityZones[?State==`available`].ZoneName' \
+ --output json)
+
cat > cluster-config.yaml <= 5) {
+ error('Maximum limit of 5 test clusters reached.')
+ }
- echo "Existing clusters: $EXISTING_COUNT / 5"
- '''
+ echo "Cluster count: ${count} / 5"
+ }
}
}
}
+ stage('Validate Helm Chart') {
+ steps {
+ sh '''
+ set -e
+ echo "Validating Helm chart branch: ${HELM_CHART_BRANCH}"
+
+ # Try theTibi fork first (has PMM-14420 with both charts), then percona repo
+ TIBI_REPO="https://github.com/theTibi/percona-helm-charts.git"
+ PERCONA_REPO="https://github.com/percona/percona-helm-charts.git"
+
+ rm -rf charts-repo-check
+ if git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${TIBI_REPO}" charts-repo-check 2>/dev/null; then
+ echo "Found branch in: ${TIBI_REPO}"
+ elif git clone --depth 1 --branch "${HELM_CHART_BRANCH}" "${PERCONA_REPO}" charts-repo-check 2>/dev/null; then
+ echo "Found branch in: ${PERCONA_REPO}"
+ else
+ echo "ERROR: Branch '${HELM_CHART_BRANCH}' not found in theTibi or percona helm chart repos"
+ exit 1
+ fi
+
+ # Check required charts exist
+ if [ ! -d "charts-repo-check/charts/pmm-ha" ]; then
+ echo "ERROR: pmm-ha chart not found in branch '${HELM_CHART_BRANCH}'"
+ echo "Available charts:"
+ ls -la charts-repo-check/charts/ || true
+ rm -rf charts-repo-check
+ exit 1
+ fi
+
+ if [ ! -d "charts-repo-check/charts/pmm-ha-dependencies" ]; then
+ echo "ERROR: pmm-ha-dependencies chart not found in branch '${HELM_CHART_BRANCH}'"
+ echo "Available charts:"
+ ls -la charts-repo-check/charts/ || true
+ rm -rf charts-repo-check
+ exit 1
+ fi
+
+ echo "Helm charts validated successfully (pmm-ha + pmm-ha-dependencies)"
+ rm -rf charts-repo-check
+ '''
+ }
+ }
+
stage('Create EKS Cluster') {
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
sh '''
eksctl create cluster -f cluster-config.yaml --timeout=40m --verbose=4
-
- # Map EKSAdminRole for IAM users
- eksctl create iamidentitymapping \
- --cluster "${CLUSTER_NAME}" \
- --region "${REGION}" \
- --arn arn:aws:iam::119175775298:role/EKSAdminRole \
- --username eks-admin \
- --group system:masters
'''
}
}
}
+ stage('Configure Cluster Access') {
+ steps {
+ withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
+ script {
+ pmmHaEks.configureAccess(
+ clusterName: env.CLUSTER_NAME,
+ region: env.REGION
+ )
+ }
+ }
+ }
+ }
+
stage('Export kubeconfig') {
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
sh '''
+ rm -rf kubeconfig
mkdir -p kubeconfig
aws eks update-kubeconfig \
@@ -129,50 +221,45 @@ EOF
}
}
- stage('Configure GP3 Storage Class') {
+ stage('Setup Infrastructure') {
steps {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'
-
-                    cat <<EOF
+                    ALB_HOSTNAME=$(kubectl get ingress pmm-ha-alb -n "${PMM_NAMESPACE}" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "pending")
+
+ echo "============================================"
+ echo "Access Commands"
+ echo "============================================"
+ echo "kubectl:"
+ echo " aws eks update-kubeconfig --name ${CLUSTER_NAME} --region ${REGION}"
+ echo ""
+ echo "PMM UI:"
+ echo " https://${PMM_DOMAIN}"
+ echo ""
+ echo "ALB Hostname:"
+ echo " ${ALB_HOSTNAME}"
+
+ # Save credentials to file for archiving
+ mkdir -p pmm-credentials
+                    cat > pmm-credentials/access-info.txt <<EOF
+                    def chartRepo = sh(
+                        script: "cat .chart-repo-source 2>/dev/null || echo 'unknown'",
+ returnStdout: true
+ ).trim()
+
+ currentBuild.description = "https://${PMM_DOMAIN} | admin / ${pmmPassword} | ${chartRepo}/${HELM_CHART_BRANCH}"
+
+ echo "Cluster ${CLUSTER_NAME} with PMM HA created successfully."
+ echo "PMM URL: https://${PMM_DOMAIN}"
+ echo 'User: admin'
+ echo "Password: ${pmmPassword}"
+ echo "Chart: ${chartRepo}/${HELM_CHART_BRANCH}"
+ }
+ }
}
failure {
withCredentials([aws(credentialsId: 'pmm-staging-slave')]) {
- sh '''
- if eksctl get cluster \
- --region "${REGION}" \
- --name "${CLUSTER_NAME}" >/dev/null 2>&1
- then
- eksctl delete cluster \
- --region "${REGION}" \
- --name "${CLUSTER_NAME}" \
- --disable-nodegroup-eviction \
- --wait
- fi
- '''
+ script {
+ // Check if cluster exists before cleanup
+ def clusterExists = sh(
+ script: "eksctl get cluster --region ${REGION} --name ${CLUSTER_NAME} >/dev/null 2>&1",
+ returnStatus: true
+ ) == 0
+
+ if (clusterExists) {
+ pmmHaEks.deleteCluster(
+ clusterName: env.CLUSTER_NAME,
+ region: env.REGION,
+ r53ZoneName: env.R53_ZONE_NAME
+ )
+ } else {
+ echo "Cluster ${CLUSTER_NAME} not found, nothing to clean up."
+ }
+ }
}
}
}
diff --git a/vars/pmmHaEks.groovy b/vars/pmmHaEks.groovy
new file mode 100644
index 0000000000..468b47b4d1
--- /dev/null
+++ b/vars/pmmHaEks.groovy
@@ -0,0 +1,730 @@
+/**
+ * PMM HA EKS Shared Library
+ *
+ * Reusable functions for PMM High Availability testing on EKS clusters.
+ *
+ * Functions:
+ * - configureAccess() Configure EKS Access Entries (IAM roles, users, SSO)
+ * - setupInfrastructure() Install GP3 storage, Node Termination Handler, ALB Controller
+ * - installPmm() Deploy PMM HA stack (operators, secrets, helm chart)
+ * - createIngress() Create ALB Ingress and Route53 DNS record
+ * - deleteCluster() Delete Route53, ALB, and EKS cluster
+ * - listClusters() List PMM HA test clusters sorted by creation time (newest first)
+ * - deleteAllClusters() Delete all/old clusters with SKIP_NEWEST support
+ * - cleanupOrphans() Clean up orphaned VPCs and failed CF stacks
+ *
+ * Related:
+ * - Create pipeline: pmm/v3/pmm3-ha-eks.groovy
+ * - Cleanup pipeline: pmm/v3/pmm3-ha-eks-cleanup.groovy
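+ *
+ * Usage sketch (illustrative; mirrors how the two pipelines above load and call this library):
+ *
+ *   library changelog: false, identifier: 'lib@fix/pmm-ha-eks-access-entries', retriever: modernSCM([
+ *       $class: 'GitSCMSource',
+ *       remote: 'https://github.com/Percona-Lab/jenkins-pipelines'
+ *   ])
+ *   // Inside a withCredentials([aws(credentialsId: 'pmm-staging-slave')]) block:
+ *   def clusters = pmmHaEks.listClusters('us-east-2')
+ *   pmmHaEks.cleanupOrphans(region: 'us-east-2')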
+ */
+
+// Constants
+def CLUSTER_PREFIX = 'pmm-ha-test-'
+
+/**
+ * Configure EKS Access Entries for cluster authentication.
+ *
+ * Grants cluster admin access to:
+ * - EKSAdminRole (for automation)
+ * - Members of pmm-eks-admins IAM group (dynamically resolved)
+ * - SSO AdministratorAccess role (for console users)
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
+ * @param adminGroupName IAM group for admin access (default: pmm-eks-admins)
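+ *
+ * Example (illustrative; matches the 'Configure Cluster Access' stage in pmm3-ha-eks.groovy):
+ *   pmmHaEks.configureAccess(clusterName: env.CLUSTER_NAME, region: env.REGION)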
+ */
+def configureAccess(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+ def adminGroupName = config.adminGroupName ?: 'pmm-eks-admins'
+
+ sh """
+ set -euo pipefail
+
+ CLUSTER_NAME="${clusterName}"
+ REGION="${region}"
+
+ ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text)
+ echo "AWS Account ID: \${ACCOUNT_ID}"
+
+ # Add EKSAdminRole with cluster admin access
+ aws eks create-access-entry \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" || true
+
+ aws eks associate-access-policy \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "arn:aws:iam::\${ACCOUNT_ID}:role/EKSAdminRole" \\
+ --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\
+ --access-scope type=cluster || true
+
+ # Add IAM group members dynamically
+ USERS=\$(aws iam get-group --group-name ${adminGroupName} --query 'Users[].Arn' --output text 2>/dev/null || echo "")
+ for USER_ARN in \${USERS}; do
+ echo "Adding access for \${USER_ARN}..."
+ aws eks create-access-entry \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${USER_ARN}" || true
+
+ aws eks associate-access-policy \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${USER_ARN}" \\
+ --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\
+ --access-scope type=cluster || true
+ done
+
+ # Add SSO AdministratorAccess role (discover dynamically)
+ SSO_ROLE_ARN=\$(aws iam list-roles \\
+ --query "Roles[?contains(RoleName, 'AWSReservedSSO_AdministratorAccess')].Arn | [0]" \\
+ --output text 2>/dev/null | head -1 | tr -d '[:space:]')
+
+ if [ -n "\${SSO_ROLE_ARN}" ] && [ "\${SSO_ROLE_ARN}" != "None" ]; then
+ echo "Adding SSO role: \${SSO_ROLE_ARN}"
+ aws eks create-access-entry \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${SSO_ROLE_ARN}" || true
+
+ aws eks associate-access-policy \\
+ --cluster-name "\${CLUSTER_NAME}" \\
+ --region "\${REGION}" \\
+ --principal-arn "\${SSO_ROLE_ARN}" \\
+ --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy \\
+ --access-scope type=cluster || true
+ else
+ echo "No SSO AdministratorAccess role found, skipping"
+ fi
+
+ echo "Access entries configured:"
+ aws eks list-access-entries --cluster-name "\${CLUSTER_NAME}" --region "\${REGION}"
+ """
+}
+
+/**
+ * Setup EKS infrastructure components for PMM HA.
+ *
+ * Installs and configures:
+ * - GP3 storage class (encrypted, default)
+ * - AWS Node Termination Handler (for spot instance draining)
+ * - AWS Load Balancer Controller (for ALB ingress)
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
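+ *
+ * Example (illustrative; assumes the cluster kubeconfig has already been exported):
+ *   pmmHaEks.setupInfrastructure(clusterName: env.CLUSTER_NAME, region: env.REGION)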
+ */
+def setupInfrastructure(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+
+ sh """
+ set -euo pipefail
+
+ CLUSTER_NAME="${clusterName}"
+ REGION="${region}"
+
+ ACCOUNT_ID=\$(aws sts get-caller-identity --query Account --output text)
+ echo "AWS Account ID: \${ACCOUNT_ID}"
+
+ # Configure GP3 as default storage class
+ kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true
+
+        cat <<EOF
+        if git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${TIBI_REPO}" charts-repo 2>/dev/null; then
+ echo "Cloned from: \${TIBI_REPO}"
+ echo "theTibi" > .chart-repo-source
+ elif git clone --depth 1 --branch "\${HELM_CHART_BRANCH}" "\${PERCONA_REPO}" charts-repo 2>/dev/null; then
+ echo "Cloned from: \${PERCONA_REPO}"
+ echo "percona" > .chart-repo-source
+ else
+ echo "ERROR: Branch \${HELM_CHART_BRANCH} not found in either repository"
+ exit 1
+ fi
+
+ # Add required Helm repos
+ helm repo add percona https://percona.github.io/percona-helm-charts/ || true
+ helm repo add vm https://victoriametrics.github.io/helm-charts/ || true
+ helm repo add altinity https://docs.altinity.com/helm-charts/ || true
+ helm repo update
+
+ # Install PMM HA dependencies (operators)
+ helm dependency update charts-repo/charts/pmm-ha-dependencies
+ helm upgrade --install pmm-operators charts-repo/charts/pmm-ha-dependencies \\
+ --namespace "\${PMM_NAMESPACE}" \\
+ --create-namespace \\
+ --wait \\
+ --timeout 10m
+
+ echo "Waiting for operators to be ready..."
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=victoria-metrics-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=altinity-clickhouse-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=pg-operator -n "\${PMM_NAMESPACE}" --timeout=300s || true
+
+ # Generate passwords
+ PMM_ADMIN_PASSWORD=\$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9' | head -c 16)
+ PG_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ GF_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ CH_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+ VM_PASSWORD=\$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
+
+ # Pre-create pmm-secret before helm install
+ # The chart's pg-user-credentials-secrets.yaml uses lookup() at template time
+ # GF_SECURITY_ADMIN_PASSWORD is needed because with secret.create=false,
+ # the chart doesn't explicitly set this env var (only secretRef is used)
+ kubectl create secret generic pmm-secret \\
+ --namespace "\${PMM_NAMESPACE}" \\
+ --from-literal=PMM_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\
+ --from-literal=GF_SECURITY_ADMIN_PASSWORD="\${PMM_ADMIN_PASSWORD}" \\
+ --from-literal=PG_PASSWORD="\${PG_PASSWORD}" \\
+ --from-literal=GF_PASSWORD="\${GF_PASSWORD}" \\
+ --from-literal=PMM_CLICKHOUSE_USER="clickhouse_pmm" \\
+ --from-literal=PMM_CLICKHOUSE_PASSWORD="\${CH_PASSWORD}" \\
+ --from-literal=VMAGENT_remoteWrite_basicAuth_username="victoriametrics_pmm" \\
+ --from-literal=VMAGENT_remoteWrite_basicAuth_password="\${VM_PASSWORD}" \\
+ --dry-run=client -o yaml | kubectl apply -f -
+
+ helm dependency update charts-repo/charts/pmm-ha
+
+ HELM_CMD="helm upgrade --install pmm-ha charts-repo/charts/pmm-ha"
+ HELM_CMD="\${HELM_CMD} --namespace \${PMM_NAMESPACE}"
+ HELM_CMD="\${HELM_CMD} --set secret.create=false"
+ HELM_CMD="\${HELM_CMD} --set secret.name=pmm-secret"
+ # Increase ClickHouse memory for merge operations (default 4Gi is insufficient)
+ HELM_CMD="\${HELM_CMD} --set clickhouse.resources.requests.memory=4Gi"
+ HELM_CMD="\${HELM_CMD} --set clickhouse.resources.limits.memory=10Gi"
+ if [ -n "\${PMM_IMAGE_TAG}" ]; then
+ HELM_CMD="\${HELM_CMD} --set image.tag=\${PMM_IMAGE_TAG}"
+ fi
+ HELM_CMD="\${HELM_CMD} --wait --timeout 15m"
+
+ eval "\${HELM_CMD}"
+
+ echo "Waiting for PMM HA components..."
+ kubectl rollout status statefulset/pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true
+ kubectl wait --for=condition=ready pod -l clickhouse.altinity.com/chi=pmm-ha -n "\${PMM_NAMESPACE}" --timeout=600s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmselect -n "\${PMM_NAMESPACE}" --timeout=300s || true
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/component=vmstorage -n "\${PMM_NAMESPACE}" --timeout=300s || true
+
+ echo "PMM HA installed"
+ kubectl get pods -n "\${PMM_NAMESPACE}"
+ """
+}
+
+/**
+ * Create ALB Ingress and Route53 DNS record for PMM HA.
+ *
+ * Creates:
+ * - ALB Ingress with ACM certificate (HTTPS)
+ * - Route53 alias record pointing to ALB
+ *
+ * Waits up to 5 minutes for ALB provisioning.
+ *
+ * @param namespace Kubernetes namespace (default: pmm)
+ * @param domain FQDN for PMM access (required)
+ * @param certArn ACM certificate ARN for TLS (required)
+ * @param r53ZoneName Route53 hosted zone name (required, e.g., cd.percona.com)
+ * @param region AWS region (default: us-east-2)
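+ *
+ * Example (illustrative; values taken from the create pipeline's environment block):
+ *   pmmHaEks.createIngress(
+ *       namespace:   env.PMM_NAMESPACE,
+ *       domain:      env.PMM_DOMAIN,
+ *       certArn:     env.ACM_CERT_ARN,
+ *       r53ZoneName: env.R53_ZONE_NAME,
+ *       region:      env.REGION
+ *   )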
+ */
+def createIngress(Map config) {
+ def namespace = config.namespace ?: 'pmm'
+ def domain = config.domain ?: error('domain is required')
+ def certArn = config.certArn ?: error('certArn is required')
+ def r53ZoneName = config.r53ZoneName ?: error('r53ZoneName is required')
+ def region = config.region ?: 'us-east-2'
+
+ sh """
+ set -euo pipefail
+
+ PMM_NAMESPACE="${namespace}"
+ PMM_DOMAIN="${domain}"
+ ACM_CERT_ARN="${certArn}"
+ R53_ZONE_NAME="${r53ZoneName}"
+ REGION="${region}"
+
+ # Resolve Route53 zone ID from zone name (public zones only, exact match)
+ R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\
+ --dns-name "\${R53_ZONE_NAME}" \\
+ --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\
+ --output text | sed 's|/hostedzone/||g')
+
+ # Validate we got exactly one zone
+ zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ')
+ if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then
+ echo "ERROR: No public Route53 zone found for \${R53_ZONE_NAME}"
+ exit 1
+ elif [ "\${zone_count}" -gt 1 ]; then
+ echo "ERROR: Multiple public Route53 zones found for \${R53_ZONE_NAME}: \${R53_ZONE_IDS}"
+ exit 1
+ fi
+ R53_ZONE_ID="\${R53_ZONE_IDS}"
+ echo "Resolved Route53 zone ID: \${R53_ZONE_ID}"
+
+ # Create ALB Ingress
+        cat <<EOF
+        for attempt in \$(seq 1 30); do
+            ALB_HOSTNAME=\$(kubectl get ingress pmm-ha-alb -n "\${PMM_NAMESPACE}" \\
+                -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "")
+ if [ -n "\${ALB_HOSTNAME}" ]; then
+ echo "ALB provisioned: \${ALB_HOSTNAME}"
+ break
+ fi
+ echo "Waiting for ALB... (\${attempt}/30)"
+ sleep 10
+ done
+
+ if [ -z "\${ALB_HOSTNAME}" ]; then
+ echo "WARNING: ALB not provisioned within timeout"
+ kubectl describe ingress pmm-ha-alb -n "\${PMM_NAMESPACE}"
+ exit 1
+ fi
+
+ ALB_ZONE_ID=\$(aws elbv2 describe-load-balancers --region "\${REGION}" \\
+ --query "LoadBalancers[?DNSName=='\${ALB_HOSTNAME}'].CanonicalHostedZoneId" \\
+ --output text)
+
+ if [ -n "\${ALB_ZONE_ID}" ]; then
+ aws route53 change-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --change-batch '{
+ "Changes": [{
+ "Action": "UPSERT",
+ "ResourceRecordSet": {
+ "Name": "'"\${PMM_DOMAIN}"'",
+ "Type": "A",
+ "AliasTarget": {
+ "HostedZoneId": "'"\${ALB_ZONE_ID}"'",
+ "DNSName": "'"\${ALB_HOSTNAME}"'",
+ "EvaluateTargetHealth": true
+ }
+ }
+ }]
+ }'
+ echo "Route53 record created: \${PMM_DOMAIN} -> \${ALB_HOSTNAME}"
+ else
+ echo "WARNING: Could not get ALB zone ID, skipping Route53 record"
+ fi
+ """
+}
+
+/**
+ * Delete PMM HA EKS cluster and all associated AWS resources.
+ *
+ * Cleanup order (to avoid dependency errors):
+ * 1. Route53 alias record
+ * 2. ALB Ingress (triggers ALB deletion)
+ * 3. EKS cluster via eksctl
+ *
+ * @param clusterName EKS cluster name (required)
+ * @param region AWS region (default: us-east-2)
+ * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com)
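+ *
+ * Example (as called from the cleanup pipeline's DELETE_CLUSTER stage):
+ *   pmmHaEks.deleteCluster(
+ *       clusterName: params.CLUSTER_NAME,
+ *       region:      env.REGION,
+ *       r53ZoneName: env.R53_ZONE_NAME
+ *   )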
+ */
+def deleteCluster(Map config) {
+ def clusterName = config.clusterName ?: error('clusterName is required')
+ def region = config.region ?: 'us-east-2'
+ def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com'
+
+ sh """
+ set -euo pipefail
+
+ cluster_name="${clusterName}"
+ REGION="${region}"
+ R53_ZONE_NAME="${r53ZoneName}"
+
+ # Resolve Route53 zone ID from zone name (public zones only, exact match)
+ R53_ZONE_IDS=\$(aws route53 list-hosted-zones-by-name \\
+ --dns-name "\${R53_ZONE_NAME}" \\
+ --query 'HostedZones[?Config.PrivateZone==`false` && Name==`'"\${R53_ZONE_NAME}"'.`].Id' \\
+ --output text | sed 's|/hostedzone/||g')
+
+ # Validate we got exactly one zone
+ zone_count=\$(echo "\${R53_ZONE_IDS}" | wc -w | tr -d ' ')
+ if [ "\${zone_count}" -eq 0 ] || [ -z "\${R53_ZONE_IDS}" ] || [ "\${R53_ZONE_IDS}" = "None" ]; then
+ echo "WARNING: No public Route53 zone found for \${R53_ZONE_NAME}, skipping DNS cleanup"
+ R53_ZONE_ID=""
+ elif [ "\${zone_count}" -gt 1 ]; then
+ echo "WARNING: Multiple public Route53 zones found for \${R53_ZONE_NAME}, skipping DNS cleanup"
+ R53_ZONE_ID=""
+ else
+ R53_ZONE_ID="\${R53_ZONE_IDS}"
+ echo "Resolved Route53 zone ID: \${R53_ZONE_ID}"
+ fi
+
+ echo "============================================"
+ echo "Cleaning up cluster: \${cluster_name}"
+ echo "============================================"
+
+ # Delete Route53 record (if zone was resolved)
+ domain_name="\${cluster_name}.\${R53_ZONE_NAME}"
+ if [ -n "\${R53_ZONE_ID}" ]; then
+ echo "Deleting Route53 record for \${domain_name}..."
+ record=\$(aws route53 list-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --query "ResourceRecordSets[?Name=='\${domain_name}.']" \\
+ --output json 2>/dev/null || echo "[]")
+
+ if [ "\${record}" != "[]" ] && [ -n "\${record}" ]; then
+ record_type=\$(echo "\${record}" | jq -r '.[0].Type')
+ if [ "\${record_type}" = "A" ]; then
+ alias_target=\$(echo "\${record}" | jq -r '.[0].AliasTarget')
+ aws route53 change-resource-record-sets \\
+ --hosted-zone-id "\${R53_ZONE_ID}" \\
+ --change-batch '{
+ "Changes": [{
+ "Action": "DELETE",
+ "ResourceRecordSet": {
+ "Name": "'"\${domain_name}"'",
+ "Type": "A",
+ "AliasTarget": '"\${alias_target}"'
+ }
+ }]
+ }' && echo "Route53 record deleted" || echo "Warning: Failed to delete Route53 record"
+ fi
+ else
+ echo "No Route53 record found for \${domain_name}"
+ fi
+ else
+ echo "Skipping Route53 record deletion (zone not resolved)"
+ fi
+
+ # Delete ALB ingress (triggers ALB deletion)
+ echo "Deleting ALB ingress..."
+ if aws eks update-kubeconfig --name "\${cluster_name}" --region "\${REGION}" 2>/dev/null; then
+ kubectl delete ingress pmm-ha-alb -n pmm --ignore-not-found=true
+ fi
+
+ # Wait for ALB cleanup
+ echo "Waiting for ALB cleanup..."
+ sleep 30
+
+ # Disable termination protection on all CloudFormation stacks for this cluster
+ echo "Disabling termination protection on CloudFormation stacks..."
+ for stack_name in \$(aws cloudformation list-stacks --region "\${REGION}" \\
+ --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \\
+ --query "StackSummaries[?starts_with(StackName, 'eksctl-\${cluster_name}')].StackName" \\
+ --output text 2>/dev/null); do
+ echo " Disabling protection: \${stack_name}"
+ aws cloudformation update-termination-protection \\
+ --region "\${REGION}" \\
+ --stack-name "\${stack_name}" \\
+ --no-enable-termination-protection 2>/dev/null || true
+ done
+
+ echo "Deleting EKS cluster \${cluster_name}..."
+ eksctl delete cluster --region "\${REGION}" --name "\${cluster_name}" \\
+ --disable-nodegroup-eviction --wait
+ """
+}
+
+/**
+ * List PMM HA test clusters sorted by creation time (newest first).
+ *
+ * @param region AWS region (default: us-east-2)
+ * @return List of cluster names sorted newest first, empty list if none found
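+ *
+ * Example (illustrative):
+ *   def clusters = pmmHaEks.listClusters(env.REGION)
+ *   echo "Found ${clusters.size()} cluster(s), newest: ${clusters ? clusters[0] : 'none'}"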
+ */
+def listClusters(String region = 'us-east-2') {
+ def clusterPrefix = 'pmm-ha-test-'
+
+ // Get all clusters matching prefix, then describe each and sort by createdAt (newest first)
+ // Output: one cluster name per line, sorted by creation time descending
+ def output = sh(
+ script: """
+ aws eks list-clusters --region ${region} --output json 2>/dev/null | \\
+ jq -r '.clusters[] | select(startswith("${clusterPrefix}"))' | \\
+ while read cluster; do
+ CREATED=\$(aws eks describe-cluster --name "\$cluster" --region ${region} \\
+ --query 'cluster.createdAt' --output text 2>/dev/null)
+ [ -n "\$CREATED" ] && [ "\$CREATED" != "None" ] && echo "\$CREATED|\$cluster"
+ done | sort -r | cut -d'|' -f2
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (!output) {
+ return []
+ }
+
+ return output.split('\n').findAll { it }
+}
+
+/**
+ * Delete multiple clusters with optional SKIP_NEWEST and age filter.
+ *
+ * Supports parallel deletion for faster cleanup.
+ *
+ * @param region AWS region (default: us-east-2)
+ * @param r53ZoneName Route53 hosted zone name (default: cd.percona.com)
+ * @param skipNewest Skip the most recent cluster (default: true)
+ * @param maxAgeHours Only delete clusters older than this (0 = delete all, default: 0)
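+ *
+ * Examples (as called from the cleanup pipeline):
+ *   // DELETE_ALL: delete everything, optionally keeping the newest cluster
+ *   pmmHaEks.deleteAllClusters(region: env.REGION, r53ZoneName: env.R53_ZONE_NAME,
+ *       skipNewest: params.SKIP_NEWEST, maxAgeHours: 0)
+ *   // DELETE_OLD (cron): only clusters older than 24h, always keeping the newest
+ *   pmmHaEks.deleteAllClusters(region: env.REGION, r53ZoneName: env.R53_ZONE_NAME,
+ *       skipNewest: true, maxAgeHours: 24)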
+ */
+def deleteAllClusters(Map config = [:]) {
+ def region = config.region ?: 'us-east-2'
+ def r53ZoneName = config.r53ZoneName ?: 'cd.percona.com'
+ def skipNewest = config.skipNewest != null ? config.skipNewest : true
+ def maxAgeHours = config.maxAgeHours ?: 0
+
+ def clusterList = listClusters(region)
+
+ if (!clusterList) {
+ echo "No clusters found with prefix 'pmm-ha-test-'."
+ return
+ }
+
+ def clustersToDelete = clusterList
+ if (skipNewest && clusterList.size() > 1) {
+ // clusterList is sorted newest first, so skip first element
+ clustersToDelete = clusterList.drop(1)
+ echo "Skipping newest cluster: ${clusterList[0]} (SKIP_NEWEST=true)"
+ }
+
+ // Filter by age if maxAgeHours > 0
+ if (maxAgeHours > 0) {
+ def cutoffMs = System.currentTimeMillis() - (maxAgeHours * 60 * 60 * 1000)
+ def filtered = []
+
+ clustersToDelete.each { clusterName ->
+ def createdMs = sh(
+ script: """
+ CREATED=\$(aws eks describe-cluster --name ${clusterName} --region ${region} \\
+ --query 'cluster.createdAt' --output text 2>/dev/null || echo '')
+ if [ -z "\$CREATED" ] || [ "\$CREATED" = "None" ]; then
+ echo ""
+ else
+ date -d "\$CREATED" +%s000 2>/dev/null || echo ""
+ fi
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (createdMs && createdMs.toLong() < cutoffMs) {
+ filtered.add(clusterName)
+ } else {
+ echo "Skipping recent cluster: ${clusterName} (created within last ${maxAgeHours}h)"
+ }
+ }
+ clustersToDelete = filtered
+ }
+
+ if (!clustersToDelete) {
+ echo 'No clusters to delete after applying filters.'
+ return
+ }
+
+ // Delete clusters in parallel
+ def parallelStages = [:]
+ clustersToDelete.each { clusterName ->
+ parallelStages["Delete ${clusterName}"] = {
+ deleteCluster(
+ clusterName: clusterName,
+ region: region,
+ r53ZoneName: r53ZoneName
+ )
+ }
+ }
+ parallel parallelStages
+}
+
+/**
+ * Clean up orphaned VPCs and failed CloudFormation stacks.
+ *
+ * Finds:
+ * - VPCs with eksctl-pmm-ha-test-* tags but no matching EKS cluster
+ * - CloudFormation stacks in DELETE_FAILED or ROLLBACK_COMPLETE state
+ *
+ * @param region AWS region (default: us-east-2)
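+ *
+ * Example (as called from the cleanup pipeline's CLEANUP_ORPHANS stage):
+ *   pmmHaEks.cleanupOrphans(region: env.REGION)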
+ */
+def cleanupOrphans(Map config = [:]) {
+ def region = config.region ?: 'us-east-2'
+ def clusterPrefix = 'pmm-ha-test-'
+
+ // Get list of active EKS clusters
+ def activeClusters = sh(
+ script: """
+ aws eks list-clusters --region ${region} \\
+ --query "clusters[?starts_with(@, '${clusterPrefix}')]" \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim().split(/\s+/).findAll { it }
+
+ echo "Active EKS clusters: ${activeClusters}"
+
+ // Find orphaned VPCs (VPCs without matching EKS cluster)
+ def orphanedVpcs = sh(
+ script: """
+ aws ec2 describe-vpcs --region ${region} \\
+ --filters "Name=tag:Name,Values=eksctl-${clusterPrefix}*-cluster/VPC" \\
+ --query 'Vpcs[*].[VpcId,Tags[?Key==`Name`].Value|[0]]' \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (orphanedVpcs) {
+ orphanedVpcs.split('\n').each { line ->
+ def parts = line.split('\t')
+ if (parts.size() >= 2) {
+ def vpcId = parts[0]
+ def vpcName = parts[1]
+ // Extract cluster name from VPC name (eksctl-pmm-ha-test-XX-cluster/VPC)
+ def matcher = vpcName =~ /eksctl-(${clusterPrefix}\d+)-cluster/
+ if (matcher) {
+ def clusterName = matcher[0][1]
+ if (!activeClusters.contains(clusterName)) {
+ echo "Found orphaned VPC: ${vpcId} (${vpcName}) - cluster ${clusterName} does not exist"
+ // Delete VPC using eksctl (handles all dependencies)
+ sh """
+ eksctl delete cluster --name ${clusterName} --region ${region} --wait=false 2>/dev/null || true
+ """
+ }
+ }
+ }
+ }
+ } else {
+ echo 'No orphaned VPCs found.'
+ }
+
+ // Find and delete failed CloudFormation stacks
+ def failedStacks = sh(
+ script: """
+ aws cloudformation list-stacks --region ${region} \\
+ --stack-status-filter DELETE_FAILED ROLLBACK_COMPLETE \\
+ --query "StackSummaries[?contains(StackName, '${clusterPrefix}')].StackName" \\
+ --output text 2>/dev/null || echo ''
+ """,
+ returnStdout: true
+ ).trim()
+
+ if (failedStacks) {
+ failedStacks.split(/\s+/).each { stackName ->
+ echo "Deleting failed stack: ${stackName}"
+ sh "aws cloudformation delete-stack --region ${region} --stack-name ${stackName} || true"
+ }
+ } else {
+ echo 'No failed CloudFormation stacks found.'
+ }
+}