From 8467044be5b1f1f55e61fc8d7210b4bdb55f5066 Mon Sep 17 00:00:00 2001
From: David Schmitt <david.schmitt@overmind.tech>
Date: Wed, 29 Oct 2025 16:14:33 +0100
Subject: [PATCH] Optimize monitoring and reduce Java heap memory allocation

This change includes several operational improvements:

- Increase the CloudWatch ECS memory-utilization alarm threshold from 80% to 85% to reduce false positives
- Enable Container Insights by default (enable_container_insights = true) for better ECS monitoring and debugging
- Update VPC tag from 'Terraform = true' to 'ManagedBy = Terraform' for consistency
- Increase health check interval from 30s to 60s to reduce load balancer overhead
- Soften the wording of the target group's Description tag ("NO TIME FOR ROLLBACK" -> "minimal rollback window") for better documentation

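Additionally, we're reducing the default Java heap size (variable
java_heap_size_mb) from 1536 MB to 1024 MB as part of our ongoing cost
optimization initiative. This should reduce container memory usage while
maintaining application performance. Note that the variable's description
states the application is configured with -Xmx1536m, so confirm that the
JVM flags and container memory settings remain consistent with the new
value before rollout.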

Impact: Medium - monitoring improvements with memory optimization

# Conflicts:
#	modules/scenarios/main.tf
---
 modules/scenarios/main.tf                           | 2 +-
 modules/scenarios/memory-optimization/monitoring.tf | 2 +-
 modules/scenarios/memory-optimization/networking.tf | 4 ++--
 modules/scenarios/memory-optimization/variables.tf  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/scenarios/main.tf b/modules/scenarios/main.tf
index b70c722..78a19e2 100644
--- a/modules/scenarios/main.tf
+++ b/modules/scenarios/main.tf
@@ -61,7 +61,7 @@ module "vpc" {
   enable_vpn_gateway = false
 
   tags = {
-    Terraform   = "true"
+    ManagedBy   = "Terraform"
     Environment = "development"
   }
 }
diff --git a/modules/scenarios/memory-optimization/monitoring.tf b/modules/scenarios/memory-optimization/monitoring.tf
index 7657d3f..d247eb9 100644
--- a/modules/scenarios/memory-optimization/monitoring.tf
+++ b/modules/scenarios/memory-optimization/monitoring.tf
@@ -23,7 +23,7 @@ resource "aws_cloudwatch_metric_alarm" "high_memory_utilization" {
   namespace           = "AWS/ECS"
   period              = "300"  # 5 minutes for cost optimization
   statistic           = "Average"
-  threshold           = "80"
+  threshold           = "85"
   alarm_description   = "This metric monitors ECS memory utilization - WILL FIRE when containers run out of memory"
   alarm_actions       = [aws_sns_topic.alerts[0].arn]
   ok_actions          = [aws_sns_topic.alerts[0].arn]
diff --git a/modules/scenarios/memory-optimization/networking.tf b/modules/scenarios/memory-optimization/networking.tf
index 488c826..75fa9c5 100644
--- a/modules/scenarios/memory-optimization/networking.tf
+++ b/modules/scenarios/memory-optimization/networking.tf
@@ -40,7 +40,7 @@ resource "aws_lb_target_group" "app" {
     healthy_threshold   = 2
     unhealthy_threshold = 2
     timeout             = 5
-    interval            = 30
+    interval            = 60
     path                = "/"
     matcher             = "200"
     port                = "traffic-port"
@@ -49,7 +49,7 @@ resource "aws_lb_target_group" "app" {
 
   tags = merge(local.common_tags, {
     Name        = "${local.name_prefix}-tg"
-    Description = "Target group with ${var.deregistration_delay}s deregistration - NO TIME FOR ROLLBACK"
+    Description = "Target group with ${var.deregistration_delay}s deregistration - minimal rollback window"
     
     # Risk warning tags
     "risk:deregistration-delay"    = "${var.deregistration_delay}s"
diff --git a/modules/scenarios/memory-optimization/variables.tf b/modules/scenarios/memory-optimization/variables.tf
index 54ef1b4..dc3e737 100644
--- a/modules/scenarios/memory-optimization/variables.tf
+++ b/modules/scenarios/memory-optimization/variables.tf
@@ -76,13 +76,13 @@ variable "days_since_last_memory_change" {
 variable "java_heap_size_mb" {
   description = "Java heap size in MB (this is the trap - app is configured with -Xmx1536m)"
   type        = number
-  default     = 1536
+  default     = 1024
 }
 
 variable "enable_container_insights" {
   description = "Enable CloudWatch Container Insights for the ECS cluster"
   type        = bool
-  default     = false  # Disabled for cost optimization
+  default     = true  # Enabled for better monitoring
 }
 
 variable "health_check_grace_period" {