From 58592538d9e56601665c8b5ffccba99e64f20826 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 31 Aug 2025 11:54:49 -0700 Subject: [PATCH 1/5] bringing down the API Signed-off-by: Irving Popovetsky --- terraform/alb.tf | 5 ++ terraform/apps.tf | 89 ++++++++++++++++---------------- terraform/asg.tf | 2 +- terraform/pybot/main.tf | 8 +++ terraform/python_backend/main.tf | 7 +++ 5 files changed, 66 insertions(+), 45 deletions(-) diff --git a/terraform/alb.tf b/terraform/alb.tf index 8277560..bd553e6 100644 --- a/terraform/alb.tf +++ b/terraform/alb.tf @@ -39,6 +39,11 @@ resource "aws_security_group" "lb_security_group" { resource "aws_lb" "ecs" { name_prefix = "oc" security_groups = [aws_security_group.lb_security_group.id] + access_logs { + bucket = "oc-alb-logs" + enabled = true + prefix = "2025" + } load_balancer_type = "application" internal = false diff --git a/terraform/apps.tf b/terraform/apps.tf index c2a7955..6fb6ef7 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -19,58 +19,58 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { ################################################################################ # Backend Prod -module "python_backend_prod" { - source = "./python_backend" +# module "python_backend_prod" { +# source = "./python_backend" - env = "prod" - vpc_id = data.aws_vpc.use2.id - logs_group = aws_cloudwatch_log_group.ecslogs.name - ecs_cluster_id = module.ecs.cluster_id - task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "master" -} +# env = "prod" +# vpc_id = data.aws_vpc.use2.id +# logs_group = aws_cloudwatch_log_group.ecslogs.name +# ecs_cluster_id = module.ecs.cluster_id +# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn +# image_tag = "master" +# } -resource "aws_lb_listener_rule" "python_backend_prod" { - listener_arn = aws_lb_listener.default_https.arn +# resource "aws_lb_listener_rule" "python_backend_prod" { +# listener_arn = aws_lb_listener.default_https.arn - action { - type = "forward" - target_group_arn = module.python_backend_prod.lb_tg_arn - } +# action { +# type = "forward" +# target_group_arn = module.python_backend_prod.lb_tg_arn +# } - condition { - host_header { - values = ["backend.operationcode.org", "api.operationcode.org"] - } - } -} +# condition { +# host_header { +# values = ["backend.operationcode.org", "api.operationcode.org"] +# } +# } +# } # Backend Staging -module "python_backend_staging" { - source = "./python_backend" +# module "python_backend_staging" { +# source = "./python_backend" - env = "staging" - vpc_id = data.aws_vpc.use2.id - logs_group = aws_cloudwatch_log_group.ecslogs.name - ecs_cluster_id = module.ecs.cluster_id - task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "staging" -} +# env = "staging" +# vpc_id = data.aws_vpc.use2.id +# logs_group = aws_cloudwatch_log_group.ecslogs.name +# ecs_cluster_id = module.ecs.cluster_id +# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn +# image_tag = "staging" +# } -resource "aws_lb_listener_rule" "python_backend_staging" { - listener_arn = aws_lb_listener.default_https.arn +# resource "aws_lb_listener_rule" "python_backend_staging" { +# listener_arn = aws_lb_listener.default_https.arn - action { - type = "forward" - target_group_arn = module.python_backend_staging.lb_tg_arn - } +# action { +# type = "forward" +# target_group_arn = module.python_backend_staging.lb_tg_arn +# } - condition { - host_header { - values = ["backend-staging.operationcode.org", "api.staging.operationcode.org"] - } - } -} +# condition { +# host_header { +# values = ["backend-staging.operationcode.org", "api.staging.operationcode.org"] +# } +# } +# } # Redirector for shut down sites resource "aws_lb_listener_rule" "shutdown_sites_redirector" { @@ -91,9 +91,10 @@ resource "aws_lb_listener_rule" "shutdown_sites_redirector" { host_header { values = [ "resources.operationcode.org", - "resources.staging.operationcode.org", "resources-staging.operationcode.org", - "pybot.staging.operationcode.org", + "api.operationcode.org", + "backend-staging.operationcode.org", + "api.staging.operationcode.org", ] } } diff --git a/terraform/asg.tf b/terraform/asg.tf index 8b7fce0..62e523c 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -37,7 +37,7 @@ module "autoscaling" { { delete_on_termination = true device_index = 0 - associate_public_ip_address = false + associate_public_ip_address = true security_groups = [module.autoscaling_sg.security_group_id] } ] diff --git a/terraform/pybot/main.tf b/terraform/pybot/main.tf index ce9f4f7..24fe40a 100644 --- a/terraform/pybot/main.tf +++ b/terraform/pybot/main.tf @@ -52,6 +52,14 @@ resource "aws_ecs_task_definition" "pybot" { } } + # healthCheck = { + # command = ["CMD-SHELL", "wget -q http://localhost:5000/health || exit 1"] + # interval = 30 + # timeout = 5 + # retries = 3 + # startPeriod = 60 + # } + secrets = local.secrets_env mountPoints = [] diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index 0d3f79d..df7d5c2 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -53,6 +53,13 @@ resource "aws_ecs_task_definition" "python_backend" { } } + # healthCheck = { + # command = ["CMD-SHELL", "wget -q http://localhost:8000/healthz || exit 1"] + # interval = 30 + # timeout = 5 + # retries = 3 + # startPeriod = 60 + # } environment = [ { From dec0ddcbafb13a7d02d793fba4f708dc52935b19 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 31 Aug 2025 12:44:58 -0700 Subject: [PATCH 2/5] Try to reduce spot availability issues Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 2 +- terraform/asg.tf | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 6fb6ef7..6a80234 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -93,7 +93,7 @@ resource "aws_lb_listener_rule" "shutdown_sites_redirector" { "resources.operationcode.org", "resources-staging.operationcode.org", "api.operationcode.org", - "backend-staging.operationcode.org", + "backend.operationcode.org", "api.staging.operationcode.org", ] } diff --git a/terraform/asg.tf b/terraform/asg.tf index 62e523c..8993b71 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -10,12 +10,39 @@ module "autoscaling" { version = "~> 6.5" name = "${local.name}-spot" - instance_type = "t3.small" min_size = 1 - max_size = 2 - desired_capacity = 1 - instance_market_options = { - market_type = "spot" + max_size = 4 + desired_capacity = 2 + + # Enable mixed instances policy + use_mixed_instances_policy = true + + # Mixed Instances Policy for better availability + mixed_instances_policy = { + instances_distribution = { + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 0 + spot_allocation_strategy = "capacity-optimized" + } + + override = [ + { + instance_type = "t3.small" + weighted_capacity = "2" + }, + { + instance_type = "t3a.small" + weighted_capacity = "2" + }, + { + instance_type = "t3.micro" + weighted_capacity = "1" + }, + { + instance_type = "t3a.micro" + weighted_capacity = "1" + } + ] } image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_ami.value)["image_id"] From a05b488c7217ac55279419c1b345a8dc0409475a Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Mon, 1 Sep 2025 12:37:34 -0700 Subject: [PATCH 3/5] Switch to arm due to amd64 spot unreliability Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 2 +- terraform/asg.tf | 37 +++++++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 6a80234..9ec62b1 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -200,7 +200,7 @@ module "pybot_prod" { logs_group = aws_cloudwatch_log_group.ecslogs.name ecs_cluster_id = module.ecs.cluster_id task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "master" + image_tag = "latest" } resource "aws_lb_listener_rule" "pybot_prod" { diff --git a/terraform/asg.tf b/terraform/asg.tf index 8993b71..0a32938 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -1,7 +1,8 @@ # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html#ecs-optimized-ami-linux data "aws_ssm_parameter" "ecs_optimized_ami" { - name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended" + # name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended" + name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/arm64/recommended" } # https://registry.terraform.io/modules/terraform-aws-modules/autoscaling/aws/latest @@ -10,7 +11,7 @@ module "autoscaling" { version = "~> 6.5" name = "${local.name}-spot" - min_size = 1 + min_size = 2 max_size = 4 desired_capacity = 2 @@ -27,22 +28,34 @@ module "autoscaling" { override = [ { - instance_type = "t3.small" - weighted_capacity = "2" - }, - { - instance_type = "t3a.small" - weighted_capacity = "2" - }, - { - instance_type = "t3.micro" + instance_type = "t4g.small" weighted_capacity = "1" }, { - instance_type = "t3a.micro" + instance_type = "t4g.micro" weighted_capacity = "1" } ] + + #amd64 options + # override = [ + # { + # instance_type = "t3.small" + # weighted_capacity = "2" + # }, + # { + # instance_type = "t3a.small" + # weighted_capacity = "2" + # }, + # { + # instance_type = "t3.micro" + # weighted_capacity = "1" + # }, + # { + # instance_type = "t3a.micro" + # weighted_capacity = "1" + # } + # ] } image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_ami.value)["image_id"] From 6719605abb8b69786023dca42e62cf6a33c936d3 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Mon, 1 Sep 2025 16:04:41 -0700 Subject: [PATCH 4/5] Update back-end image to the latest one in ECR if needed Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 4 ++-- terraform/python_backend/main.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 9ec62b1..5d905ed 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -27,7 +27,7 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { # logs_group = aws_cloudwatch_log_group.ecslogs.name # ecs_cluster_id = module.ecs.cluster_id # task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "master" +# image_tag = "latest" # } # resource "aws_lb_listener_rule" "python_backend_prod" { @@ -54,7 +54,7 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { # logs_group = aws_cloudwatch_log_group.ecslogs.name # ecs_cluster_id = module.ecs.cluster_id # task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "staging" +# image_tag = "latest" # } # resource "aws_lb_listener_rule" "python_backend_staging" { diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index df7d5c2..2409aef 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -33,7 +33,7 @@ resource "aws_ecs_task_definition" "python_backend" { container_definitions = jsonencode([ { name = "python_backend_${var.env}" - image = "operationcode/back-end:${var.image_tag}" + image = "633607774026.dkr.ecr.us-east-2.amazonaws.com/back-end:${var.image_tag}" essential = true portMappings = [ From db0899ba333180ea3c8da77fe591ae40a07c93aa Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sat, 6 Sep 2025 08:10:12 -0700 Subject: [PATCH 5/5] API still up for now Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 44 +++++++++++++++----------------- terraform/asg.tf | 15 +++++++++++ terraform/pybot/main.tf | 16 ++++++------ terraform/python_backend/main.tf | 14 +++++----- 4 files changed, 51 insertions(+), 38 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 5d905ed..1596c56 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -19,31 +19,31 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { ################################################################################ # Backend Prod -# module "python_backend_prod" { -# source = "./python_backend" +module "python_backend_prod" { + source = "./python_backend" -# env = "prod" -# vpc_id = data.aws_vpc.use2.id -# logs_group = aws_cloudwatch_log_group.ecslogs.name -# ecs_cluster_id = module.ecs.cluster_id -# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "latest" -# } + env = "prod" + vpc_id = data.aws_vpc.use2.id + logs_group = aws_cloudwatch_log_group.ecslogs.name + ecs_cluster_id = module.ecs.cluster_id + task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn + image_tag = "latest" +} -# resource "aws_lb_listener_rule" "python_backend_prod" { -# listener_arn = aws_lb_listener.default_https.arn +resource "aws_lb_listener_rule" "python_backend_prod" { + listener_arn = aws_lb_listener.default_https.arn -# action { -# type = "forward" -# target_group_arn = module.python_backend_prod.lb_tg_arn -# } + action { + type = "forward" + target_group_arn = module.python_backend_prod.lb_tg_arn + } -# condition { -# host_header { -# values = ["backend.operationcode.org", "api.operationcode.org"] -# } -# } -# } + condition { + host_header { + values = ["backend.operationcode.org", "api.operationcode.org"] + } + } +} # Backend Staging # module "python_backend_staging" { @@ -92,8 +92,6 @@ resource "aws_lb_listener_rule" "shutdown_sites_redirector" { values = [ "resources.operationcode.org", "resources-staging.operationcode.org", - "api.operationcode.org", - "backend.operationcode.org", "api.staging.operationcode.org", ] } diff --git a/terraform/asg.tf b/terraform/asg.tf index 0a32938..e3412b6 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -107,6 +107,21 @@ module "autoscaling" { # reduce cloudwatch costs enable_monitoring = false + # Enable essential autoscaling metrics + enabled_metrics = [ + "GroupDesiredCapacity", + "GroupInServiceCapacity", + "GroupInServiceInstances", + "GroupMaxSize", + "GroupMinSize", + "GroupPendingCapacity", + "GroupPendingInstances", + "GroupTerminatingCapacity", + "GroupTerminatingInstances", + "GroupTotalCapacity", + "GroupTotalInstances" + ] + tags = local.tags } diff --git a/terraform/pybot/main.tf b/terraform/pybot/main.tf index 24fe40a..7469130 100644 --- a/terraform/pybot/main.tf +++ b/terraform/pybot/main.tf @@ -11,7 +11,7 @@ locals { # CHANGEME once infra scales up cpu = var.env == "prod" ? 256 : 256 - memory = var.env == "prod" ? 512 : 256 + memory = var.env == "prod" ? 256 : 128 count = var.env == "prod" ? 1 : 1 @@ -52,13 +52,13 @@ resource "aws_ecs_task_definition" "pybot" { } } - # healthCheck = { - # command = ["CMD-SHELL", "wget -q http://localhost:5000/health || exit 1"] - # interval = 30 - # timeout = 5 - # retries = 3 - # startPeriod = 60 - # } + healthCheck = { + command = ["CMD-SHELL", "wget -q -O /dev/null http://localhost:5000/health"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } secrets = local.secrets_env diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index 2409aef..b8c72f8 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -53,13 +53,13 @@ resource "aws_ecs_task_definition" "python_backend" { } } - # healthCheck = { - # command = ["CMD-SHELL", "wget -q http://localhost:8000/healthz || exit 1"] - # interval = 30 - # timeout = 5 - # retries = 3 - # startPeriod = 60 - # } + healthCheck = { + command = ["CMD-SHELL", "wget -q -O /dev/null http://localhost:8000/healthz"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } environment = [ {