From 9ac1b3060c7412a2a75785bee5e7978d3719cca6 Mon Sep 17 00:00:00 2001 From: fdhaussy Date: Tue, 19 Aug 2025 15:59:34 +0200 Subject: [PATCH] feat(): add EIP creation/release in gitlab runner lifecycle --- main.tf | 10 +- modules/terminate-agent-hook/iam.tf | 27 ++++ .../lambda/lambda_function.py | 123 ++++++++++++++++++ outputs.tf | 5 - .../instance-docker-autoscaler-policy.json | 64 ++++----- policies/instance-eip.json | 4 + template/eip.tftpl | 73 +++++++++-- template/gitlab-runner.tftpl | 83 ++++++++++++ 8 files changed, 337 insertions(+), 52 deletions(-) diff --git a/main.tf b/main.tf index 5b46fa501..d925163af 100644 --- a/main.tf +++ b/main.tf @@ -50,7 +50,7 @@ locals { file_yum_update = file("${path.module}/template/yum_update.tftpl") template_eip = templatefile("${path.module}/template/eip.tftpl", { - eip = join(",", [for eip in aws_eip.gitlab_runner : eip.public_ip]) + eip_tags = join(" ", [for k, v in local.tags : "Key=${k},Value=${v}"]) }) template_gitlab_runner = templatefile("${path.module}/template/gitlab-runner.tftpl", @@ -73,6 +73,7 @@ locals { secure_parameter_store_gitlab_runner_registration_token_name = var.runner_gitlab_registration_token_secure_parameter_store_name secure_parameter_store_runner_token_key = local.secure_parameter_store_runner_token_key secure_parameter_store_runner_sentry_dsn = local.secure_parameter_store_runner_sentry_dsn + use_eip = var.runner_instance.use_eip secure_parameter_store_gitlab_token_name = var.runner_gitlab.access_token_secure_parameter_store_name secure_parameter_store_region = data.aws_region.current.name gitlab_runner_registration_token = var.runner_gitlab_registration_config.registration_token @@ -357,13 +358,6 @@ resource "aws_iam_instance_profile" "instance" { tags = local.tags } -resource "aws_eip" "gitlab_runner" { - # checkov:skip=CKV2_AWS_19:We can't use NAT gateway here as we are contacted from the outside. - count = var.runner_instance.use_eip ? 1 : 0 - - tags = local.tags -} - # We wait for 5 minutes until we set an EC2 instance to status `InService` so it has time to provision itself and it's configured capacity. resource "aws_autoscaling_lifecycle_hook" "wait_for_gitlab_runner" { name = "${var.environment}-wait-for-gitlab-runner-up" diff --git a/modules/terminate-agent-hook/iam.tf b/modules/terminate-agent-hook/iam.tf index 16daede6b..498c9dcb3 100644 --- a/modules/terminate-agent-hook/iam.tf +++ b/modules/terminate-agent-hook/iam.tf @@ -161,6 +161,20 @@ data "aws_iam_policy_document" "spot_request_housekeeping" { } } +data "aws_iam_policy_document" "eip_cleanup" { + statement { + sid = "EIPCleanup" + + effect = "Allow" + actions = [ + "ec2:DescribeAddresses", + "ec2:DisassociateAddress", + "ec2:ReleaseAddress" + ] + resources = ["*"] + } +} + resource "aws_iam_policy" "lambda" { name = "${var.name_iam_objects}-${var.name}-lambda" path = "/" @@ -187,6 +201,19 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" { policy_arn = aws_iam_policy.spot_request_housekeeping.arn } +resource "aws_iam_policy" "eip_cleanup" { + name = "${var.name_iam_objects}-${var.name}-eip-cleanup" + path = "/" + policy = data.aws_iam_policy_document.eip_cleanup.json + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "eip_cleanup" { + role = aws_iam_role.lambda.name + policy_arn = aws_iam_policy.eip_cleanup.arn +} + resource "aws_iam_role_policy_attachment" "aws_lambda_vpc_access_execution_role" { role = aws_iam_role.lambda.name policy_arn = "arn:${data.aws_partition.current.partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" diff --git a/modules/terminate-agent-hook/lambda/lambda_function.py b/modules/terminate-agent-hook/lambda/lambda_function.py index 23936f90c..905e1828c 100644 --- a/modules/terminate-agent-hook/lambda/lambda_function.py +++ b/modules/terminate-agent-hook/lambda/lambda_function.py @@ -223,6 +223,126 @@ def remove_unused_ssh_key_pairs(client, executor_name_part): })) +def cleanup_orphaned_eips(ec2_client, executor_name_part): + """ + Clean up orphaned EIPs from terminated instances. + :param ec2_client: the boto3 EC2 client + :param executor_name_part: used to filter EIPs by Environment tag to match this value + """ + print(json.dumps({ + "Level": "info", + "Message": f"Checking for orphaned EIPs for agent {executor_name_part}" + })) + + try: + # Find all EIPs (we'll filter by tag content below) + eips_response = ec2_client.describe_addresses() + + eips_to_cleanup = [] + + for eip in eips_response.get("Addresses", []): + allocation_id = eip["AllocationId"] + instance_id = eip.get("InstanceId") + + # First check if this EIP belongs to our environment + eip_tags = {tag["Key"]: tag["Value"] for tag in eip.get("Tags", [])} + if not ("Environment" in eip_tags and executor_name_part in eip_tags["Environment"]): + continue # Skip EIPs not belonging to our environment + + if instance_id: + # Check if the associated instance still exists and is terminated + try: + instance_response = ec2_client.describe_instances(InstanceIds=[instance_id]) + instance_state = instance_response["Reservations"][0]["Instances"][0]["State"]["Name"] + + if instance_state == "terminated": + eips_to_cleanup.append({ + "allocation_id": allocation_id, + "instance_id": instance_id, + "public_ip": eip.get("PublicIp", "unknown"), + "reason": f"associated instance {instance_id} is terminated" + }) + except ClientError as error: + if 'InvalidInstanceID.NotFound' in str(error): + # Instance no longer exists + eips_to_cleanup.append({ + "allocation_id": allocation_id, + "instance_id": instance_id, + "public_ip": eip.get("PublicIp", "unknown"), + "reason": f"associated instance {instance_id} no longer exists" + }) + else: + print(json.dumps({ + "Level": "warning", + "Message": f"Could not check instance {instance_id} for EIP {allocation_id}", + "Exception": str(error) + })) + else: + # EIP is not associated with any instance and belongs to our environment + eips_to_cleanup.append({ + "allocation_id": allocation_id, + "instance_id": "none", + "public_ip": eip.get("PublicIp", "unknown"), + "reason": "unassociated EIP with matching Environment tag" + }) + + # Clean up identified orphaned EIPs + for eip_info in eips_to_cleanup: + try: + print(json.dumps({ + "Level": "info", + "AllocationId": eip_info["allocation_id"], + "PublicIp": eip_info["public_ip"], + "Message": f"Releasing orphaned EIP: {eip_info['reason']}" + })) + + # Disassociate first if still associated + if eip_info["instance_id"] != "none": + try: + ec2_client.disassociate_address(AllocationId=eip_info["allocation_id"]) + except ClientError as disassociate_error: + print(json.dumps({ + "Level": "warning", + "Message": f"Failed to disassociate EIP {eip_info['allocation_id']}", + "Exception": str(disassociate_error) + })) + + # Release the EIP + ec2_client.release_address(AllocationId=eip_info["allocation_id"]) + + print(json.dumps({ + "Level": "info", + "AllocationId": eip_info["allocation_id"], + "Message": "Successfully released orphaned EIP" + })) + + except ClientError as error: + print(json.dumps({ + "Level": "error", + "AllocationId": eip_info["allocation_id"], + "Message": f"Failed to release orphaned EIP", + "Exception": str(error) + })) + + if not eips_to_cleanup: + print(json.dumps({ + "Level": "info", + "Message": "No orphaned EIPs found to clean up" + })) + else: + print(json.dumps({ + "Level": "info", + "Message": f"Cleaned up {len(eips_to_cleanup)} orphaned EIP(s)" + })) + + except ClientError as error: + print(json.dumps({ + "Level": "error", + "Message": "Failed to describe EIPs for cleanup", + "Exception": str(error) + })) + + # context not used: this is the interface for a AWS Lambda function defined by AWS # pylint: disable=unused-argument def handler(event, context): @@ -269,6 +389,9 @@ def handler(event, context): remove_unused_ssh_key_pairs(client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) + # Clean up orphaned EIPs from terminated instances + cleanup_orphaned_eips(ec2_client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE']) + return "Housekeeping done" diff --git a/outputs.tf b/outputs.tf index b2010d1ed..3af27ab58 100644 --- a/outputs.tf +++ b/outputs.tf @@ -43,11 +43,6 @@ output "runner_sg_id" { value = var.runner_worker.type == "docker-autoscaler" ? aws_security_group.docker_autoscaler[0].id : (var.runner_worker.type == "docker+machine" ? aws_security_group.docker_machine[0].id : null) } -output "runner_eip" { - description = "EIP of the Gitlab Runner" - value = length(aws_eip.gitlab_runner) > 0 ? aws_eip.gitlab_runner[0].public_ip : null -} - output "runner_launch_template_name" { description = "The name of the runner's launch template." value = aws_launch_template.gitlab_runner_instance.name diff --git a/policies/instance-docker-autoscaler-policy.json b/policies/instance-docker-autoscaler-policy.json index 4d3707d90..5deddf69a 100644 --- a/policies/instance-docker-autoscaler-policy.json +++ b/policies/instance-docker-autoscaler-policy.json @@ -1,34 +1,36 @@ { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "autoscaling:SetDesiredCapacity", - "autoscaling:TerminateInstanceInAutoScalingGroup" - ], - "Resource": "${autoscaler_asg_arn}" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeAutoScalingGroups", - "ec2:DescribeInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:GetPasswordData", - "ec2-instance-connect:SendSSHPublicKey" - ], - "Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*", - "Condition": { - "StringEquals": { - "ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}" - } - } + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:SetDesiredCapacity", + "autoscaling:TerminateInstanceInAutoScalingGroup" + ], + "Resource": "${autoscaler_asg_arn}" + }, + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "ec2:DescribeInstances", + "autoscaling:CompleteLifecycleAction", + "autoscaling:DescribeLifecycleHooks" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:GetPasswordData", + "ec2-instance-connect:SendSSHPublicKey" + ], + "Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*", + "Condition": { + "StringEquals": { + "ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}" } - ] + } + } + ] } diff --git a/policies/instance-eip.json b/policies/instance-eip.json index 062bda806..71ebfa096 100644 --- a/policies/instance-eip.json +++ b/policies/instance-eip.json @@ -4,7 +4,11 @@ { "Effect": "Allow", "Action": [ + "ec2:AllocateAddress", "ec2:AssociateAddress", + "ec2:DisassociateAddress", + "ec2:ReleaseAddress", + "ec2:CreateTags", "ec2:Describe*" ], "Resource": "*" diff --git a/template/eip.tftpl b/template/eip.tftpl index 602d171e2..ca4e0259e 100644 --- a/template/eip.tftpl +++ b/template/eip.tftpl @@ -1,10 +1,67 @@ -echo 'installing additional software for assigning EIP' +echo 'Setting up dynamic EIP management' -yum install python3 -y -curl --fail --retry 6 -O https://bootstrap.pypa.io/get-pip.py -python3 get-pip.py --user -export PATH=~/.local/bin:$PATH +# Get instance metadata (token already available from user-data) +INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-id) +REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region') -pip install aws-ec2-assign-elastic-ip -export AWS_DEFAULT_REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | grep region | awk -F\" '{print $4}') -/usr/local/bin/aws-ec2-assign-elastic-ip --valid-ips ${eip} \ No newline at end of file +export AWS_DEFAULT_REGION=$REGION + +# Create directory for storing EIP allocation ID +mkdir -p /var/lib/ec2-eip + +max_retries=5 +retry_count=0 +wait_time=10 + +while [ $retry_count -lt $max_retries ]; do + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Attempting to allocate EIP (attempt $((retry_count + 1))/$max_retries)" + + EIP_RESULT=$(aws ec2 allocate-address --domain vpc --query 'AllocationId' --output text 2>&1) + EIP_EXIT_CODE=$? + + if [ $EIP_EXIT_CODE -eq 0 ] && [[ "$EIP_RESULT" =~ ^eipalloc- ]]; then + ALLOCATION_ID="$EIP_RESULT" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully allocated EIP with allocation ID: $ALLOCATION_ID" + + TAG_RESULT=$(aws ec2 create-tags --resources "$ALLOCATION_ID" --tags ${eip_tags} 2>&1) + TAG_EXIT_CODE=$? + if [ $TAG_EXIT_CODE -eq 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully tagged EIP $ALLOCATION_ID" + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Warning: Failed to tag EIP $ALLOCATION_ID: $TAG_RESULT" + fi + + ASSOC_RESULT=$(aws ec2 associate-address --instance-id "$INSTANCE_ID" --allocation-id "$ALLOCATION_ID" 2>&1) + ASSOC_EXIT_CODE=$? + if [ $ASSOC_EXIT_CODE -eq 0 ]; then + echo "$ALLOCATION_ID" > /var/lib/ec2-eip/allocation-id + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: EIP allocation completed successfully" + break + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to associate EIP $ALLOCATION_ID with instance $INSTANCE_ID: $ASSOC_RESULT" + + # Clean up the allocated EIP if association failed + CLEANUP_RESULT=$(aws ec2 release-address --allocation-id "$ALLOCATION_ID" 2>&1) + if [ $? -eq 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully cleaned up failed EIP allocation" + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Warning: Failed to cleanup EIP $ALLOCATION_ID: $CLEANUP_RESULT" + fi + fi + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to allocate EIP (exit code: $EIP_EXIT_CODE): $EIP_RESULT" + fi + + retry_count=$((retry_count + 1)) + if [ $retry_count -lt $max_retries ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Waiting $wait_time seconds before retry..." + sleep $wait_time + wait_time=$((wait_time * 2)) + fi +done + +if [ $retry_count -eq $max_retries ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to allocate and associate EIP after $max_retries attempts" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: EIP allocation failed - GitLab Runner will not have an EIP associated" + exit 1 +fi \ No newline at end of file diff --git a/template/gitlab-runner.tftpl b/template/gitlab-runner.tftpl index 0d09c42d6..6adea5ea5 100644 --- a/template/gitlab-runner.tftpl +++ b/template/gitlab-runner.tftpl @@ -325,6 +325,84 @@ imds_token=$(curl -sSf -X PUT "http://169.254.169.254/latest/api/token" -H "X-aw instance_id=$(curl -sSf -H "X-aws-ec2-metadata-token: $imds_token" "http://169.254.169.254/latest/meta-data/instance-id") region=$(curl -sSf -H "X-aws-ec2-metadata-token: $imds_token" "http://169.254.169.254/latest/meta-data/placement/region") +%{ if use_eip } +release_eip() { + # File generated in eip.tftpl + allocation_id_file="/var/lib/ec2-eip/allocation-id" + if [ -f "$allocation_id_file" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Releasing EIP before lifecycle completion..." + allocation_id=$(cat "$allocation_id_file") + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Found stored EIP allocation ID: $allocation_id" + + # Get current association to verify it's ours + associated_instance=$(aws --region $region ec2 describe-addresses --allocation-ids "$allocation_id" --query 'Addresses[0].InstanceId' --output text 2>/dev/null || echo "") + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: EIP association check: EIP=$allocation_id, CurrentInstance=$instance_id, AssociatedInstance='$associated_instance'" + + if [ "$associated_instance" = "$instance_id" ] || [ "$associated_instance" = "None" ] || [ -z "$associated_instance" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Disassociating EIP $allocation_id" + association_id=$(aws --region $region ec2 describe-addresses --allocation-ids "$allocation_id" --query 'Addresses[0].AssociationId' --output text 2>/dev/null) + if [ "$association_id" != "None" ] && [ -n "$association_id" ]; then + DISASSOC_RESULT=$(aws --region $region ec2 disassociate-address --association-id "$association_id" 2>&1) + else + DISASSOC_RESULT="EIP not associated" + fi + DISASSOC_EXIT_CODE=$? + if [ $DISASSOC_EXIT_CODE -ne 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Warning: Failed to disassociate EIP (exit code: $DISASSOC_EXIT_CODE): $DISASSOC_RESULT" + fi + + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Releasing EIP $allocation_id" + RELEASE_RESULT=$(aws --region $region ec2 release-address --allocation-id "$allocation_id" 2>&1) + RELEASE_EXIT_CODE=$? + if [ $RELEASE_EXIT_CODE -eq 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Successfully released EIP $allocation_id" + rm -f "$allocation_id_file" + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Error: Failed to release EIP $allocation_id (exit code: $RELEASE_EXIT_CODE): $RELEASE_RESULT" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Warning: Failed to release EIP $allocation_id, continuing with lifecycle completion" + fi + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Warning: EIP $allocation_id is associated with different instance $associated_instance, not releasing" + rm -f "$allocation_id_file" # Clean up stale file + fi + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: No stored EIP allocation ID found, checking for associated EIP" + + # Try to find EIP associated with this instance + allocation_id=$(aws --region $region ec2 describe-addresses --filters "Name=instance-id,Values=$instance_id" --query 'Addresses[0].AllocationId' --output text 2>/dev/null || echo "None") + + if [ "$allocation_id" != "None" ] && [ -n "$allocation_id" ]; then + echo "Found EIP $allocation_id associated with this instance" + + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Disassociating EIP $allocation_id" + # Get association ID first, then disassociate + association_id=$(aws --region $region ec2 describe-addresses --allocation-ids "$allocation_id" --query 'Addresses[0].AssociationId' --output text 2>/dev/null) + if [ "$association_id" != "None" ] && [ -n "$association_id" ]; then + DISASSOC_RESULT=$(aws --region $region ec2 disassociate-address --association-id "$association_id" 2>&1) + else + DISASSOC_RESULT="EIP not associated" + fi + DISASSOC_EXIT_CODE=$? + if [ $DISASSOC_EXIT_CODE -ne 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Warning: Failed to disassociate EIP (exit code: $DISASSOC_EXIT_CODE): $DISASSOC_RESULT" + fi + + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Releasing EIP $allocation_id" + RELEASE_RESULT=$(aws --region $region ec2 release-address --allocation-id "$allocation_id" 2>&1) + RELEASE_EXIT_CODE=$? + if [ $RELEASE_EXIT_CODE -eq 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Successfully released EIP $allocation_id" + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Error: Failed to release EIP $allocation_id (exit code: $RELEASE_EXIT_CODE): $RELEASE_RESULT" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-RELEASE: Warning: Failed to release EIP $allocation_id, continuing with lifecycle completion" + fi + else + echo "No EIP found associated with this instance" + fi + fi +} +%{ endif } + # Function to send complete-lifecycle-action send_complete_lifecycle_action() { echo "Sending complete-lifecycle-action for instance $instance_id..." @@ -382,6 +460,11 @@ monitor_process() { status=$(systemctl is-active gitlab-runner.service) if [ "$status" = "inactive" ] || [ "$status" = "failed" ]; then echo "GitLab Runner service $status. Proceeding with cleanup." + +%{ if use_eip } + release_eip +%{ endif } + send_complete_lifecycle_action else echo "GitLab Runner Service still running, sleeping..."