-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_controller.py
More file actions
345 lines (286 loc) · 11.8 KB
/
lambda_controller.py
File metadata and controls
345 lines (286 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import json
import os
import shutil
import tarfile
import tempfile
import time
import zipfile

import boto3
from botocore.exceptions import ClientError
# Module-level AWS service clients, created once at import time so warm
# Lambda invocations reuse them. Region/credentials come from the runtime's
# default provider chain.
ecr = boto3.client('ecr')
s3 = boto3.client('s3')
ecs = boto3.client('ecs')
def get_container_details(ecr_name, tag):
    """Search ECR for a container image with a specific tag.

    Args:
        ecr_name: name of the ECR repository to query.
        tag: image tag to look for.

    Returns:
        The ECR image-detail dict for the matching image, or None when no
        image with that tag exists in the repository.

    Raises:
        ClientError: any ECR failure other than the image being absent
            (e.g. the repository itself not existing).
    """
    print('get_container_details() is checking on: ', ecr_name, tag)
    try:
        response = ecr.describe_images(
            repositoryName = ecr_name,
            imageIds = [
                {
                    'imageTag': tag
                }
            ]
        )
    except ecr.exceptions.ImageNotFoundException:
        # A missing image is an expected outcome, not an error.
        return None
    except ClientError as e:
        print("get_container_details() had an error: ", e)
        raise e
    # Log label fixed: previously printed "reponse".
    print("get_container_details() response: ", response)
    print('get_container_details() found the container: ', ecr_name, tag)
    return response['imageDetails'][0]
def move_code(s3_bucket, code_location):
    """Repackage a .zip object in S3 as a .tar.gz object in the same bucket.

    At the time of writing, terraform's archive provider can only emit .zip
    (https://github.com/hashicorp/terraform-provider-archive/pull/277#issuecomment-2073420171),
    while kaniko requires an S3 build context in .tar.gz format
    (https://github.com/GoogleContainerTools/kaniko/blob/main/README.md#kaniko-build-contexts).
    This function bridges that gap for the kaniko runner in ECS.

    Args:
        s3_bucket: bucket holding the .zip and receiving the .tar.gz.
        code_location: key of the .zip object in the bucket.

    Returns:
        The 's3://bucket/key' string of the uploaded .tar.gz.

    Raises:
        ClientError: if the S3 upload fails.
    """
    print('move_code() is starting for: ', s3_bucket, code_location)
    # Remove the '.zip' suffix explicitly. The previous rstrip('.zip')
    # stripped *characters* from the set {., z, i, p}, mangling names like
    # 'api.zip' into 'a'.
    if code_location.endswith('.zip'):
        base = code_location[:-len('.zip')]
    else:
        base = code_location
    tar_file = base + '.tar.gz'
    zip_location = '/tmp/' + code_location
    tar_location = '/tmp/' + tar_file
    # Download the zip archive from S3.
    with open(zip_location, 'wb') as f:
        s3.download_fileobj(s3_bucket, code_location, f)
    # Extract into a dedicated directory. Extracting straight into /tmp
    # (as before) made the os.walk() below sweep the downloaded zip, the
    # half-written tarball, and any leftovers from warm Lambda invocations
    # into the build context.
    extract_dir = tempfile.mkdtemp(dir='/tmp')
    try:
        with zipfile.ZipFile(zip_location, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        # Re-archive every extracted file, keeping paths relative to the
        # extraction root.
        with tarfile.open(tar_location, mode='w:gz') as tar_ref:
            for root, dirs, files in os.walk(extract_dir):
                for file in files:
                    full_path = os.path.join(root, file)
                    tar_ref.add(full_path,
                                arcname=os.path.relpath(full_path, extract_dir))
        # Upload the .tar.gz back to S3.
        try:
            s3.upload_file(tar_location, s3_bucket, tar_file)
        except ClientError as e:
            print('move_code() ClientError: ', e)
            raise e
    finally:
        # /tmp is small and persists across warm invocations; clean up.
        shutil.rmtree(extract_dir, ignore_errors=True)
    print('move_code() success')
    return 's3://' + s3_bucket + '/' + tar_file
def run_build_task(task_definition_arn, cluster_name, subnet_id, security_group_id,
                   s3_code_path, repository_uri, image_tag, image_tags_additional):
    """Run the Kaniko build task in ECS.

    Builds the Kaniko command line (one --destination per requested tag),
    starts a single Fargate task, and returns the started task's ARN.

    Raises:
        ClientError: if ECS refuses to start the task.
    """
    # --context points Kaniko at the .tar.gz build context in S3; each
    # --destination is a tag the finished image is pushed to.
    command = [
        '--context',
        s3_code_path,
        '--destination',
        repository_uri + ':' + image_tag
    ]
    for tag in image_tags_additional:
        command.extend([
            '--destination',
            repository_uri + ':' + tag
        ])
    print('run_build_task() starting')
    try:
        response = ecs.run_task(
            taskDefinition = task_definition_arn,
            cluster = cluster_name,
            count = 1,
            launchType = 'FARGATE',
            networkConfiguration = {
                'awsvpcConfiguration': {
                    'subnets': [subnet_id],
                    'securityGroups': [security_group_id],
                    # NOTE(review): presumably enabled so the task can pull
                    # the Kaniko image and reach S3/ECR without a NAT
                    # gateway — confirm against the VPC setup.
                    'assignPublicIp': "ENABLED",
                },
            },
            overrides = {
                'containerOverrides': [
                    {
                        'name': 'kaniko',
                        'command': command,
                    },
                ]
            }
        )
    except ClientError as e:
        # Log label fixed: messages previously said 'run_task()', which is
        # the boto3 call, not this function — confusing when grepping logs.
        print('run_build_task() ClientError: ', e)
        raise e
    task_arn = response['tasks'][0]['taskArn']
    print('run_build_task() success: ', task_arn)
    return task_arn
def container_waiter(task_arn, cluster_name, repository_name, image_tag, context):
    """Wait for a tagged image to appear in ECR after an ECS build task.

    Polls the ECS task until it is stopping/stopped, then queries the ECR
    repository for an image with ``image_tag``.

    Returns:
        True if the image shows up before the lambda nearly runs out of
        time, False otherwise.
    """
    print('container_waiter() started')
    i = 0
    # Keep polling until fewer than 10 seconds (10000 ms) of execution
    # time remain, leaving headroom for the caller's final checks.
    while context.get_remaining_time_in_millis() > 10000:
        print("container_waiter() sleeping to start loop: ", i)
        i += 1
        # Sleep at the top of the loop to give ECS a fighting chance to
        # start the task before we start querying it.
        time.sleep(3)
        # Get task information.
        resp = ecs.describe_tasks(
            cluster = cluster_name,
            tasks = [
                task_arn,
            ]
        )
        # If no tasks are described, restart the loop.
        if len(resp['tasks']) == 0:
            print('container_waiter() empty response on task_arn, looping')
            continue
        # Only query ECR once the task is winding down or done.
        # Bug fixed: 'STOPPED' was previously misspelled 'STOPPPED', so a
        # finished task never matched and the waiter always timed out.
        status = resp['tasks'][0]['lastStatus']
        if status not in [
            'DEACTIVATING',
            'STOPPING',
            'DEPROVISIONING',
            'STOPPED',
            'DELETED'
        ]:
            print('container_waiter() status not ending/ed, looping: ', status)
            continue
        # See whether the image has landed in ECR yet; keep looping if not.
        container_details = get_container_details(repository_name, image_tag)
        if container_details is not None:
            print('container_waiter() success!')
            return True
    # If we've exited the loop, the image hasn't shown up in the allotted time.
    # (A joining space was previously missing between the two fragments.)
    error = ('The image hasn\'t shown up in the allotted time. Check to see if '
             'the lambda needs to be extended or if the container build errored.')
    print('container_waiter() failure: ' + error)
    return False
def lambda_handler(event, context):
    """The main controller invoked by lambda to build containers!

    Invoked by lambda to build containers in ECS using Kaniko. ``event``
    must carry the keys read below (repo_name, repository_uri, image_tag,
    image_tags_additional, s3_bucket, code_location, task_definition_arn,
    cluster_name, subnet_id, security_group_id); ``context`` is the AWS
    context object exposing get_remaining_time_in_millis().

    Returns:
        A dict with statusCode/repositoryName/imageTag/imageDigest.

    Raises:
        Exception: if the image never appears in ECR.
    """
    print('lambda_controller started.')
    print('event payload: ', json.dumps(event))
    # Gather info from invocation payload.
    repository_name = event['repo_name']
    repository_uri = event['repository_uri']
    image_tag = event['image_tag']
    image_tags_additional = event['image_tags_additional']
    s3_bucket = event['s3_bucket']
    # Location of the zip archive as passed from terraform -> s3.
    code_location = event['code_location']
    task_definition_arn = event['task_definition_arn']
    cluster_name = event['cluster_name']
    subnet_id = event['subnet_id']
    security_group_id = event['security_group_id']
    # Idempotency: skip the build entirely if the image already exists.
    container_details = get_container_details(repository_name, image_tag)
    if container_details is not None:
        print("Container found before building, so exiting cleanly.")
        return {
            'statusCode': 200,
            'repositoryName': container_details['repositoryName'],
            'imageTag': container_details['imageTags'][0],
            'imageDigest': container_details['imageDigest'],
        }
    # Get zip archive from S3 and remake it into a .tar.gz that kaniko can use.
    s3_tar_path = move_code(s3_bucket, code_location)
    # Run the Kaniko task definition in ECS.
    task_arn = run_build_task(
        task_definition_arn, cluster_name, subnet_id, security_group_id,
        s3_tar_path, repository_uri, image_tag, image_tags_additional
    )
    # Watch the task for completion before the lambda exits.
    container_waiter_result = container_waiter(
        task_arn, cluster_name, repository_name, image_tag, context
    )
    if not container_waiter_result:
        print("Error getting container details: ", container_waiter_result)
        # Intentionally continuing: the image may still have appeared, so
        # do one final check before declaring failure.
    # Check that the container details exist in the ECR Repository.
    container_details = get_container_details(repository_name, image_tag)
    if container_details is None:
        # Typo fixed in the error message ("Contianer" -> "Container").
        raise Exception("Container_exists check failed: {}".format(
            container_details)
        )
    print("Container found. Exiting cleanly! („• ֊ •„)")
    return {
        'statusCode': 200,
        'repositoryName': container_details['repositoryName'],
        'imageTag': container_details['imageTags'][0],
        'imageDigest': container_details['imageDigest'],
    }
if __name__ == "__main__":
    # Local-development entry point. Lambda invokes lambda_handler()
    # directly, but when this script is run by hand — with the environment
    # variables below set and matching AWS credentials available to the
    # SDK's auth chain — it exercises the handler without deploying to
    # lambda.
    repo_name = os.getenv('REPO_NAME')
    s3_bucket = os.getenv('S3_BUCKET')
    cluster_name = os.getenv('CLUSTER_NAME')
    task_arn = os.getenv('TASK_ARN')
    image_tag = os.getenv('IMAGE_TAG')
    repository_uri = os.getenv('REPOSITORY_URI')
    code_location = os.getenv('CODE_LOCATION')
    subnet_id = os.getenv('SUBNET_ID')
    security_group_id = os.getenv('SECURITY_GROUP_ID')
    # Build the payload as a dict instead of str.format()-ing a JSON
    # template: the old template's literal '{'/'}' braces (the "tf" object)
    # were not escaped as '{{'/'}}', so .format() raised ValueError before
    # any payload was produced.
    event = {
        "repo_name": repo_name,
        "s3_bucket": s3_bucket,
        "cluster_name": cluster_name,
        "task_arn": task_arn,
        # lambda_handler() reads 'task_definition_arn', not 'task_arn'.
        # NOTE(review): assuming TASK_ARN holds the task-definition ARN —
        # confirm against the terraform invocation payload.
        "task_definition_arn": task_arn,
        # lambda_handler() also requires this key; default to no extra tags.
        "image_tags_additional": [],
        "image_tag": image_tag,
        "repository_uri": repository_uri,
        "code_location": code_location,
        "subnet_id": subnet_id,
        "security_group_id": security_group_id,
        "key2": "value2",
        "tf": {
            "action": "create",
            "prev_input": {
                "key1": "value1",
                "key2": "value2"
            }
        }
    }

    # A very limited stand-in for the AWS Lambda context object
    # https://docs.aws.amazon.com/lambda/latest/dg/python-context.html
    # replicating only the get_remaining_time_in_millis() function.
    class Context(object):
        def __init__(self):
            self.time_start = time.time()
            self.total_duration = 5  # seconds
            self.time_end = time.time() + self.total_duration
            print("Context init, current time: ", time.time())
            print("Context init, end time: ", self.time_end)

        def get_remaining_time_in_millis(self):
            # Milliseconds remaining before the simulated deadline.
            return round((self.time_end - time.time()) * 1000, 2)

    context = Context()
    print(lambda_handler(event, context))