Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ locals {
file_yum_update = file("${path.module}/template/yum_update.tftpl")

template_eip = templatefile("${path.module}/template/eip.tftpl", {
eip = join(",", [for eip in aws_eip.gitlab_runner : eip.public_ip])
eip_tags = join(" ", [for k, v in local.tags : "Key=${k},Value=${v}"])
})

template_gitlab_runner = templatefile("${path.module}/template/gitlab-runner.tftpl",
Expand All @@ -73,6 +73,7 @@ locals {
secure_parameter_store_gitlab_runner_registration_token_name = var.runner_gitlab_registration_token_secure_parameter_store_name
secure_parameter_store_runner_token_key = local.secure_parameter_store_runner_token_key
secure_parameter_store_runner_sentry_dsn = local.secure_parameter_store_runner_sentry_dsn
use_eip = var.runner_instance.use_eip
secure_parameter_store_gitlab_token_name = var.runner_gitlab.access_token_secure_parameter_store_name
secure_parameter_store_region = data.aws_region.current.name
gitlab_runner_registration_token = var.runner_gitlab_registration_config.registration_token
Expand Down Expand Up @@ -357,13 +358,6 @@ resource "aws_iam_instance_profile" "instance" {
tags = local.tags
}

# Elastic IP for the GitLab Runner. Exactly one is created when `var.runner_instance.use_eip` is
# true, none otherwise (see `count`).
resource "aws_eip" "gitlab_runner" {
# checkov:skip=CKV2_AWS_19:We can't use NAT gateway here as we are contacted from the outside.
count = var.runner_instance.use_eip ? 1 : 0

tags = local.tags
}

# We wait for 5 minutes until we set an EC2 instance to status `InService` so it has time to provision itself and its configured capacity.
resource "aws_autoscaling_lifecycle_hook" "wait_for_gitlab_runner" {
name = "${var.environment}-wait-for-gitlab-runner-up"
Expand Down
27 changes: 27 additions & 0 deletions modules/terminate-agent-hook/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,20 @@ data "aws_iam_policy_document" "spot_request_housekeeping" {
}
}

# IAM policy document allowing Elastic IPs to be listed, disassociated and released.
# NOTE(review): `resources = ["*"]` applies the disassociate/release actions to every EIP the
# principal can see — confirm this is intended, or restrict it (e.g. with a resource-tag condition).
data "aws_iam_policy_document" "eip_cleanup" {
statement {
sid = "EIPCleanup"

effect = "Allow"
actions = [
"ec2:DescribeAddresses",
"ec2:DisassociateAddress",
"ec2:ReleaseAddress"
]
resources = ["*"]
}
}

resource "aws_iam_policy" "lambda" {
name = "${var.name_iam_objects}-${var.name}-lambda"
path = "/"
Expand All @@ -187,6 +201,19 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" {
policy_arn = aws_iam_policy.spot_request_housekeeping.arn
}

# Managed IAM policy built from the `eip_cleanup` policy document; named after the module's
# `name_iam_objects` and `name` variables with an `-eip-cleanup` suffix.
resource "aws_iam_policy" "eip_cleanup" {
name = "${var.name_iam_objects}-${var.name}-eip-cleanup"
path = "/"
policy = data.aws_iam_policy_document.eip_cleanup.json

tags = var.tags
}

# Attaches the EIP cleanup policy to the Lambda's IAM role so the function may release EIPs.
resource "aws_iam_role_policy_attachment" "eip_cleanup" {
role = aws_iam_role.lambda.name
policy_arn = aws_iam_policy.eip_cleanup.arn
}

resource "aws_iam_role_policy_attachment" "aws_lambda_vpc_access_execution_role" {
role = aws_iam_role.lambda.name
policy_arn = "arn:${data.aws_partition.current.partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
Expand Down
123 changes: 123 additions & 0 deletions modules/terminate-agent-hook/lambda/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,126 @@ def remove_unused_ssh_key_pairs(client, executor_name_part):
}))


def cleanup_orphaned_eips(ec2_client, executor_name_part):
    """
    Release Elastic IPs of this environment that are no longer attached to a live instance.

    An EIP is treated as orphaned when its ``Environment`` tag contains ``executor_name_part`` and
    either it has no association at all, or the instance it is associated with is terminated or
    no longer exists. EIPs that are associated with something other than an instance (they carry
    an association or network interface but no instance ID) are in use and are left untouched.

    Every AWS ``ClientError`` is logged as a JSON line and swallowed, so a failure here does not
    abort the remaining housekeeping. Nothing is returned.

    :param ec2_client: the boto3 EC2 client
    :param executor_name_part: used to filter EIPs by Environment tag to match this value
    """
    print(json.dumps({
        "Level": "info",
        "Message": f"Checking for orphaned EIPs for agent {executor_name_part}"
    }))

    # Only the listing call is guarded here; every later AWS call has its own handler.
    try:
        addresses = ec2_client.describe_addresses().get("Addresses", [])
    except ClientError as error:
        print(json.dumps({
            "Level": "error",
            "Message": "Failed to describe EIPs for cleanup",
            "Exception": str(error)
        }))
        return

    eips_to_cleanup = []

    for eip in addresses:
        allocation_id = eip.get("AllocationId")
        if not allocation_id:
            # Without an allocation ID the address cannot be released via release_address below.
            continue

        # Only touch EIPs that belong to our environment.
        eip_tags = {tag["Key"]: tag["Value"] for tag in eip.get("Tags", [])}
        environment = eip_tags.get("Environment")
        if environment is None or executor_name_part not in environment:
            continue

        instance_id = eip.get("InstanceId")
        association_id = eip.get("AssociationId")

        if instance_id:
            # Associated with an instance: orphaned only if that instance is gone or terminated.
            try:
                instance_response = ec2_client.describe_instances(InstanceIds=[instance_id])
                instance_state = instance_response["Reservations"][0]["Instances"][0]["State"]["Name"]
            except ClientError as error:
                error_code = error.response.get("Error", {}).get("Code")
                if error_code != "InvalidInstanceID.NotFound":
                    print(json.dumps({
                        "Level": "warning",
                        "Message": f"Could not check instance {instance_id} for EIP {allocation_id}",
                        "Exception": str(error)
                    }))
                    continue
                reason = f"associated instance {instance_id} no longer exists"
            else:
                if instance_state != "terminated":
                    continue
                reason = f"associated instance {instance_id} is terminated"
        elif association_id or eip.get("NetworkInterfaceId"):
            # Associated with something that is not an instance: the address is still in use.
            continue
        else:
            reason = "unassociated EIP with matching Environment tag"

        eips_to_cleanup.append({
            "allocation_id": allocation_id,
            "association_id": association_id,
            "public_ip": eip.get("PublicIp", "unknown"),
            "reason": reason
        })

    released_count = 0

    for eip_info in eips_to_cleanup:
        print(json.dumps({
            "Level": "info",
            "AllocationId": eip_info["allocation_id"],
            "PublicIp": eip_info["public_ip"],
            "Message": f"Releasing orphaned EIP: {eip_info['reason']}"
        }))

        # Disassociate first if still associated. disassociate_address identifies the association
        # by AssociationId (or PublicIp); it does not accept an AllocationId.
        if eip_info["association_id"]:
            try:
                ec2_client.disassociate_address(AssociationId=eip_info["association_id"])
            except ClientError as disassociate_error:
                # Best effort: still attempt the release, which reports its own failure.
                print(json.dumps({
                    "Level": "warning",
                    "Message": f"Failed to disassociate EIP {eip_info['allocation_id']}",
                    "Exception": str(disassociate_error)
                }))

        try:
            ec2_client.release_address(AllocationId=eip_info["allocation_id"])
        except ClientError as error:
            print(json.dumps({
                "Level": "error",
                "AllocationId": eip_info["allocation_id"],
                "Message": "Failed to release orphaned EIP",
                "Exception": str(error)
            }))
            continue

        released_count += 1
        print(json.dumps({
            "Level": "info",
            "AllocationId": eip_info["allocation_id"],
            "Message": "Successfully released orphaned EIP"
        }))

    if not eips_to_cleanup:
        print(json.dumps({
            "Level": "info",
            "Message": "No orphaned EIPs found to clean up"
        }))
    else:
        # Report what was actually released, not merely what was attempted.
        print(json.dumps({
            "Level": "info",
            "Message": f"Cleaned up {released_count} orphaned EIP(s)"
        }))


# context not used: this is the interface for an AWS Lambda function defined by AWS
# pylint: disable=unused-argument
def handler(event, context):
Expand Down Expand Up @@ -269,6 +389,9 @@ def handler(event, context):

remove_unused_ssh_key_pairs(client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])

# Clean up orphaned EIPs from terminated instances
cleanup_orphaned_eips(ec2_client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])

return "Housekeeping done"


Expand Down
5 changes: 0 additions & 5 deletions outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,6 @@ output "runner_sg_id" {
value = var.runner_worker.type == "docker-autoscaler" ? aws_security_group.docker_autoscaler[0].id : (var.runner_worker.type == "docker+machine" ? aws_security_group.docker_machine[0].id : null)
}

# Public IP of the first `aws_eip.gitlab_runner`, or null when no EIP resource exists.
output "runner_eip" {
description = "EIP of the Gitlab Runner"
value = length(aws_eip.gitlab_runner) > 0 ? aws_eip.gitlab_runner[0].public_ip : null
}

output "runner_launch_template_name" {
description = "The name of the runner's launch template."
value = aws_launch_template.gitlab_runner_instance.name
Expand Down
64 changes: 33 additions & 31 deletions policies/instance-docker-autoscaler-policy.json
Original file line number Diff line number Diff line change
@@ -1,34 +1,36 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"autoscaling:SetDesiredCapacity",
"autoscaling:TerminateInstanceInAutoScalingGroup"
],
"Resource": "${autoscaler_asg_arn}"
},
{
"Effect": "Allow",
"Action": [
"autoscaling:DescribeAutoScalingGroups",
"ec2:DescribeInstances"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"ec2:GetPasswordData",
"ec2-instance-connect:SendSSHPublicKey"
],
"Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*",
"Condition": {
"StringEquals": {
"ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}"
}
}
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"autoscaling:SetDesiredCapacity",
"autoscaling:TerminateInstanceInAutoScalingGroup"
],
"Resource": "${autoscaler_asg_arn}"
},
{
"Effect": "Allow",
"Action": [
"autoscaling:DescribeAutoScalingGroups",
"ec2:DescribeInstances",
"autoscaling:CompleteLifecycleAction",
"autoscaling:DescribeLifecycleHooks"
],
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"ec2:GetPasswordData",
"ec2-instance-connect:SendSSHPublicKey"
],
"Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*",
"Condition": {
"StringEquals": {
"ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}"
}
]
}
}
]
}
4 changes: 4 additions & 0 deletions policies/instance-eip.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
{
"Effect": "Allow",
"Action": [
"ec2:AllocateAddress",
"ec2:AssociateAddress",
"ec2:DisassociateAddress",
"ec2:ReleaseAddress",
"ec2:CreateTags",
"ec2:Describe*"
],
"Resource": "*"
Expand Down
73 changes: 65 additions & 8 deletions template/eip.tftpl
Original file line number Diff line number Diff line change
@@ -1,10 +1,67 @@
echo 'installing additional software for assigning EIP'
echo 'Setting up dynamic EIP management'

yum install python3 -y
curl --fail --retry 6 -O https://bootstrap.pypa.io/get-pip.py
python3 get-pip.py --user
export PATH=~/.local/bin:$PATH
# Get instance metadata (token already available from user-data)
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-id)
REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region')

pip install aws-ec2-assign-elastic-ip
export AWS_DEFAULT_REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | grep region | awk -F\" '{print $4}')
/usr/local/bin/aws-ec2-assign-elastic-ip --valid-ips ${eip}
export AWS_DEFAULT_REGION=$REGION

# Create directory for storing EIP allocation ID
mkdir -p /var/lib/ec2-eip

# Allocate, tag and associate a fresh EIP for this instance, retrying on failure.
# Retry settings: at most 5 attempts; the wait starts at 10 seconds and doubles after each failed attempt.
max_retries=5
retry_count=0
wait_time=10

while [ $retry_count -lt $max_retries ]; do
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Attempting to allocate EIP (attempt $((retry_count + 1))/$max_retries)"

# stderr is merged into the captured output so the failure text can be logged below.
EIP_RESULT=$(aws ec2 allocate-address --domain vpc --query 'AllocationId' --output text 2>&1)
EIP_EXIT_CODE=$?

# Accept the result only if the CLI succeeded AND the output looks like an allocation ID.
if [ $EIP_EXIT_CODE -eq 0 ] && [[ "$EIP_RESULT" =~ ^eipalloc- ]]; then
ALLOCATION_ID="$EIP_RESULT"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully allocated EIP with allocation ID: $ALLOCATION_ID"

# ${eip_tags} is filled in when this template is rendered; a tagging failure is only logged
# and the script still goes on to associate the EIP.
TAG_RESULT=$(aws ec2 create-tags --resources "$ALLOCATION_ID" --tags ${eip_tags} 2>&1)
TAG_EXIT_CODE=$?
if [ $TAG_EXIT_CODE -eq 0 ]; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully tagged EIP $ALLOCATION_ID"
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Warning: Failed to tag EIP $ALLOCATION_ID: $TAG_RESULT"
fi

ASSOC_RESULT=$(aws ec2 associate-address --instance-id "$INSTANCE_ID" --allocation-id "$ALLOCATION_ID" 2>&1)
ASSOC_EXIT_CODE=$?
if [ $ASSOC_EXIT_CODE -eq 0 ]; then
# Persist the allocation ID on disk, then leave the loop before retry_count is incremented.
echo "$ALLOCATION_ID" > /var/lib/ec2-eip/allocation-id
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: EIP allocation completed successfully"
break
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to associate EIP $ALLOCATION_ID with instance $INSTANCE_ID: $ASSOC_RESULT"

# Clean up the allocated EIP if association failed
CLEANUP_RESULT=$(aws ec2 release-address --allocation-id "$ALLOCATION_ID" 2>&1)
if [ $? -eq 0 ]; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Successfully cleaned up failed EIP allocation"
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Warning: Failed to cleanup EIP $ALLOCATION_ID: $CLEANUP_RESULT"
fi
fi
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to allocate EIP (exit code: $EIP_EXIT_CODE): $EIP_RESULT"
fi

# Reached only after a failed attempt (success leaves via `break` above).
retry_count=$((retry_count + 1))
if [ $retry_count -lt $max_retries ]; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Waiting $wait_time seconds before retry..."
sleep $wait_time
wait_time=$((wait_time * 2))
fi
done

# retry_count only reaches max_retries when every attempt failed, because a successful attempt
# breaks out before the counter is incremented.
if [ $retry_count -eq $max_retries ]; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: Error: Failed to allocate and associate EIP after $max_retries attempts"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: EIP allocation failed - GitLab Runner will not have an EIP associated"
exit 1
fi
Loading