From 62b9cae1738cb2940faede7a9ad695fce7fa2f15 Mon Sep 17 00:00:00 2001 From: shunskkkk Date: Fri, 12 Sep 2025 10:32:06 +0900 Subject: [PATCH 1/4] Add execute permission to entrypoint.sh for containerized environments This change ensures the entrypoint script has proper execute permissions, preventing exec format errors when running in container environments like ECS. Signed-off-by: shunskkkk --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 1a1efb2c55..c8dcf41aa2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,5 +19,6 @@ WORKDIR /usr/src/app COPY --from=build /usr/src/app/api/build/libs/marquez-*.jar /usr/src/app COPY marquez.dev.yml marquez.dev.yml COPY docker/entrypoint.sh entrypoint.sh +RUN chmod +x /usr/src/app/entrypoint.sh EXPOSE 5000 5001 ENTRYPOINT ["/usr/src/app/entrypoint.sh"] From bfe2b1adb71e3af528f6ad1b9839cc09c884424e Mon Sep 17 00:00:00 2001 From: shunskkkk Date: Fri, 12 Sep 2025 10:33:27 +0900 Subject: [PATCH 2/4] Add environment variable support for database configuration Support POSTGRES_DB, POSTGRES_USER, and POSTGRES_PASSWORD environment variables while maintaining backward compatibility with default values. This enables flexible database configuration in containerized and cloud environments. 
Signed-off-by: shunskkkk --- marquez.dev.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/marquez.dev.yml b/marquez.dev.yml index 48f4e21598..03ddb48492 100644 --- a/marquez.dev.yml +++ b/marquez.dev.yml @@ -9,9 +9,9 @@ server: db: driverClass: org.postgresql.Driver - url: jdbc:postgresql://${POSTGRES_HOST:-localhost}:${POSTGRES_PORT:-5432}/marquez - user: marquez - password: marquez + url: jdbc:postgresql://${POSTGRES_HOST:-localhost}:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-marquez} + user: ${POSTGRES_USER:-marquez} + password: ${POSTGRES_PASSWORD:-marquez} migrateOnStartup: true From 14e7ec75eaeb338f1779e0dc8ff280678ef9f27d Mon Sep 17 00:00:00 2001 From: shunskkkk Date: Fri, 12 Sep 2025 16:05:48 +0900 Subject: [PATCH 3/4] Add AWS ECS deployment with Terraform Signed-off-by: shunskkkk --- CHANGELOG.md | 4 + deploy/README.md | 17 + deploy/aws/ecs-terraform/README.md | 342 ++++++++ deploy/aws/ecs-terraform/ecs/service-api.json | 36 + deploy/aws/ecs-terraform/ecs/service-web.json | 31 + .../ecs/task-definition-api.json | 97 +++ .../ecs/task-definition-web.json | 51 ++ .../aws/ecs-terraform/scripts/build-images.sh | 64 ++ deploy/aws/ecs-terraform/scripts/deploy.sh | 167 ++++ deploy/aws/ecs-terraform/terraform/README.md | 188 +++++ .../aws/ecs-terraform/terraform/cloudfront.tf | 283 +++++++ deploy/aws/ecs-terraform/terraform/main.tf | 738 ++++++++++++++++++ .../terraform/terraform.tfvars.example | 42 + .../aws/ecs-terraform/terraform/variables.tf | 108 +++ .../ecs-terraform/terraform/vpc-endpoints.tf | 120 +++ deploy/aws/ecs-terraform/terraform/vpc.tf | 192 +++++ 16 files changed, 2480 insertions(+) create mode 100644 deploy/README.md create mode 100644 deploy/aws/ecs-terraform/README.md create mode 100644 deploy/aws/ecs-terraform/ecs/service-api.json create mode 100644 deploy/aws/ecs-terraform/ecs/service-web.json create mode 100644 deploy/aws/ecs-terraform/ecs/task-definition-api.json create mode 100644 
deploy/aws/ecs-terraform/ecs/task-definition-web.json create mode 100644 deploy/aws/ecs-terraform/scripts/build-images.sh create mode 100644 deploy/aws/ecs-terraform/scripts/deploy.sh create mode 100644 deploy/aws/ecs-terraform/terraform/README.md create mode 100644 deploy/aws/ecs-terraform/terraform/cloudfront.tf create mode 100644 deploy/aws/ecs-terraform/terraform/main.tf create mode 100644 deploy/aws/ecs-terraform/terraform/terraform.tfvars.example create mode 100644 deploy/aws/ecs-terraform/terraform/variables.tf create mode 100644 deploy/aws/ecs-terraform/terraform/vpc-endpoints.tf create mode 100644 deploy/aws/ecs-terraform/terraform/vpc.tf diff --git a/CHANGELOG.md b/CHANGELOG.md index ca14da1c02..9abbc0417d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased](https://github.com/MarquezProject/marquez/compare/0.50.0...HEAD) +### Added + +* Deployment: Add AWS ECS deployment configuration with Terraform support for production-ready infrastructure + ## [0.50.0](https://github.com/MarquezProject/marquez/compare/0.49.0...0.50.0) - 2024-10-23 ### Added diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000000..0bb3853553 --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,17 @@ +# Marquez Deployment Options + +This directory contains deployment configurations for Marquez across different platforms and cloud providers. + +## Available Deployments + +### AWS + +#### ECS with Terraform (`aws/ecs-terraform/`) +Complete infrastructure-as-code deployment using: +- **Amazon ECS Fargate** for serverless container orchestration +- **RDS PostgreSQL** for managed database +- **Application Load Balancer** for traffic distribution +- **CloudFront** for HTTPS and global content delivery +- **Terraform** for infrastructure provisioning + +See [aws/ecs-terraform/README.md](aws/ecs-terraform/README.md) for detailed instructions. 
\ No newline at end of file diff --git a/deploy/aws/ecs-terraform/README.md b/deploy/aws/ecs-terraform/README.md new file mode 100644 index 0000000000..e3b403521d --- /dev/null +++ b/deploy/aws/ecs-terraform/README.md @@ -0,0 +1,342 @@ +# Marquez ECS Deployment Guide + +This guide provides instructions for deploying Marquez on AWS ECS with RDS PostgreSQL. + +## Deployment Order + +To ensure ECS services start successfully, follow this deployment order: + +1. **Create ECR Repositories First** - Use `terraform apply -target` to create only ECR repositories +2. **Push Docker Images** - Build and push images to the created ECR repositories +3. **Deploy Full Infrastructure** - Run `terraform apply` to create all remaining resources + +This approach ensures Docker images are available when ECS services start, preventing `CannotPullContainerError`. + +## Architecture Overview + +The deployment consists of: +- **ECS Fargate** for running containerized services +- **RDS PostgreSQL** for data persistence +- **Application Load Balancer** for traffic distribution +- **Amazon OpenSearch** (optional) for advanced search capabilities +- **ECR** for container image storage + +## Prerequisites + +1. AWS CLI configured with appropriate credentials +2. Docker installed locally +3. Terraform >= 1.0 (for infrastructure provisioning) +4. 
jq (for JSON processing in scripts) + +## Directory Structure + +``` +deploy/aws/ecs-terraform/ +├── ecs/ +│ ├── task-definition-api.json # ECS task definition for API +│ ├── task-definition-web.json # ECS task definition for Web UI +│ ├── service-api.json # ECS service definition for API +│ └── service-web.json # ECS service definition for Web UI +├── terraform/ +│ ├── main.tf # Core ECS, RDS, and ALB resources +│ ├── vpc.tf # VPC and networking configuration +│ ├── vpc-endpoints.tf # VPC endpoints for AWS services +│ ├── cloudfront.tf # CloudFront CDN configuration +│ ├── variables.tf # Variable definitions +│ ├── terraform.tfvars.example # Example configuration +│ └── environments/ # Environment-specific configs +│ ├── sandbox.tfvars +│ └── production.tfvars +├── scripts/ +│ ├── deploy.sh # Main deployment script +│ └── build-images.sh # Docker image build script +└── README.md # This file +``` + +## Quick Start + +### 1. Create ECR Repositories and Push Docker Images + +First, create only the ECR repositories and push Docker images: + +```bash +cd deploy/aws/ecs-terraform/terraform + +# Initialize Terraform +terraform init + +# Create terraform.tfvars with your configuration +cat > terraform.tfvars <:5001/healthcheck` +- Web: `http://:3000/` + +## Troubleshooting + +### Common Issues + +1. **ECS Tasks Fail with "CannotPullContainerError"** + - ECR repositories are created by Terraform but images must be pushed manually + - Follow Step 2 to build and push Docker images to ECR + - Ensure you're using `--platform linux/amd64` when building images + - Verify ECR login is successful before pushing + +2. **Database Connection Failed** + - Check RDS security group allows traffic from ECS tasks + - Verify database credentials in Secrets Manager + - Ensure RDS instance is running + +3. 
**ECS Service Won't Start** + - Check CloudWatch logs for error messages + - Verify ECR images exist and are accessible + - Check task definition memory/CPU allocation + - Ensure images are built for the correct platform (linux/amd64) + +4. **ALB Health Check Failing** + - Verify security group rules + - Check application startup time + - Review health check configuration + +### Useful Commands + +```bash +# View ECS service status +aws ecs describe-services \ + --cluster marquez-production \ + --services marquez-api marquez-web + +# View recent logs +aws logs tail /ecs/marquez-api --follow + +# Force new deployment +aws ecs update-service \ + --cluster marquez-production \ + --service marquez-api \ + --force-new-deployment + +# Check RDS status +aws rds describe-db-instances \ + --db-instance-identifier marquez-production +``` + +## Cost Optimization + +1. Use Fargate Spot for non-production environments +2. Implement proper auto-scaling policies +3. Use RDS reserved instances for production +4. Set a retention period on CloudWatch log groups (and S3 lifecycle policies on any logs exported to S3) +5. Consider using Amazon OpenSearch Serverless for search functionality + +## Backup and Recovery + +1. **RDS Automated Backups**: 30-day retention configured +2. **Manual Snapshots**: Create before major changes +3. **Point-in-Time Recovery**: Available within backup retention period + +```bash +# Create manual snapshot +aws rds create-db-snapshot \ + --db-instance-identifier marquez-production \ + --db-snapshot-identifier marquez-manual-$(date +%Y%m%d) +``` + +## Updating Marquez + +1. Build new Docker images with updated code +2. Push to ECR with new tag +3. Update ECS task definitions with new image tag +4. 
Deploy using the deployment script + +```bash +# Update to new version +IMAGE_TAG=v1.1.0 ./deploy/aws/ecs-terraform/scripts/deploy.sh +``` + +## Clean Up + +To remove all resources: + +```bash +cd deploy/aws/ecs-terraform/terraform +terraform destroy +``` + +**Warning**: This will delete all resources including the RDS database. Ensure you have backups if needed. \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/ecs/service-api.json b/deploy/aws/ecs-terraform/ecs/service-api.json new file mode 100644 index 0000000000..aed143f565 --- /dev/null +++ b/deploy/aws/ecs-terraform/ecs/service-api.json @@ -0,0 +1,36 @@ +{ + "serviceName": "marquez-api", + "taskDefinition": "marquez-api", + "desiredCount": 2, + "launchType": "FARGATE", + "networkConfiguration": { + "awsvpcConfiguration": { + "subnets": ["${PRIVATE_SUBNET_1}", "${PRIVATE_SUBNET_2}"], + "securityGroups": ["${API_SECURITY_GROUP}"], + "assignPublicIp": "DISABLED" + } + }, + "loadBalancers": [ + { + "targetGroupArn": "${API_TARGET_GROUP_ARN}", + "containerName": "marquez-api", + "containerPort": 5000 + }, + { + "targetGroupArn": "${API_ADMIN_TARGET_GROUP_ARN}", + "containerName": "marquez-api", + "containerPort": 5001 + } + ], + "healthCheckGracePeriodSeconds": 60, + "deploymentConfiguration": { + "maximumPercent": 200, + "minimumHealthyPercent": 100, + "deploymentCircuitBreaker": { + "enable": true, + "rollback": true + } + }, + "enableECSManagedTags": true, + "propagateTags": "SERVICE" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/ecs/service-web.json b/deploy/aws/ecs-terraform/ecs/service-web.json new file mode 100644 index 0000000000..7f327765de --- /dev/null +++ b/deploy/aws/ecs-terraform/ecs/service-web.json @@ -0,0 +1,31 @@ +{ + "serviceName": "marquez-web", + "taskDefinition": "marquez-web", + "desiredCount": 2, + "launchType": "FARGATE", + "networkConfiguration": { + "awsvpcConfiguration": { + "subnets": ["${PRIVATE_SUBNET_1}", "${PRIVATE_SUBNET_2}"], + 
"securityGroups": ["${WEB_SECURITY_GROUP}"], + "assignPublicIp": "DISABLED" + } + }, + "loadBalancers": [ + { + "targetGroupArn": "${WEB_TARGET_GROUP_ARN}", + "containerName": "marquez-web", + "containerPort": 3000 + } + ], + "healthCheckGracePeriodSeconds": 60, + "deploymentConfiguration": { + "maximumPercent": 200, + "minimumHealthyPercent": 100, + "deploymentCircuitBreaker": { + "enable": true, + "rollback": true + } + }, + "enableECSManagedTags": true, + "propagateTags": "SERVICE" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/ecs/task-definition-api.json b/deploy/aws/ecs-terraform/ecs/task-definition-api.json new file mode 100644 index 0000000000..6e11316d0c --- /dev/null +++ b/deploy/aws/ecs-terraform/ecs/task-definition-api.json @@ -0,0 +1,97 @@ +{ + "family": "marquez-api", + "networkMode": "awsvpc", + "requiresCompatibilities": ["FARGATE"], + "cpu": "1024", + "memory": "2048", + "taskRoleArn": "${TASK_ROLE_ARN}", + "executionRoleArn": "${EXECUTION_ROLE_ARN}", + "containerDefinitions": [ + { + "name": "marquez-api", + "image": "${ECR_REPOSITORY_URI}:${IMAGE_TAG}", + "essential": true, + "portMappings": [ + { + "containerPort": 5000, + "protocol": "tcp" + }, + { + "containerPort": 5001, + "protocol": "tcp" + } + ], + "environment": [ + { + "name": "MARQUEZ_PORT", + "value": "5000" + }, + { + "name": "MARQUEZ_ADMIN_PORT", + "value": "5001" + }, + { + "name": "POSTGRES_HOST", + "value": "${RDS_ENDPOINT}" + }, + { + "name": "POSTGRES_PORT", + "value": "5432" + }, + { + "name": "POSTGRES_DB", + "value": "marquez" + }, + { + "name": "POSTGRES_USER", + "value": "marquez" + }, + { + "name": "MIGRATE_ON_STARTUP", + "value": "true" + }, + { + "name": "SEARCH_ENABLED", + "value": "${SEARCH_ENABLED}" + }, + { + "name": "SEARCH_HOST", + "value": "${OPENSEARCH_ENDPOINT}" + }, + { + "name": "SEARCH_PORT", + "value": "443" + } + ], + "secrets": [ + { + "name": "POSTGRES_PASSWORD", + "valueFrom": "${DB_PASSWORD_SECRET_ARN}" + }, + { + "name": 
"SEARCH_USERNAME", + "valueFrom": "${OPENSEARCH_USERNAME_SECRET_ARN}" + }, + { + "name": "SEARCH_PASSWORD", + "valueFrom": "${OPENSEARCH_PASSWORD_SECRET_ARN}" + } + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/marquez-api", + "awslogs-region": "${AWS_REGION}", + "awslogs-stream-prefix": "ecs" + } + }, + "healthCheck": { + "command": ["CMD-SHELL", "curl -f http://localhost:5001/healthcheck || exit 1"], + "interval": 30, + "timeout": 5, + "retries": 3, + "startPeriod": 60 + } + } + ] +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/ecs/task-definition-web.json b/deploy/aws/ecs-terraform/ecs/task-definition-web.json new file mode 100644 index 0000000000..e02a66e748 --- /dev/null +++ b/deploy/aws/ecs-terraform/ecs/task-definition-web.json @@ -0,0 +1,51 @@ +{ + "family": "marquez-web", + "networkMode": "awsvpc", + "requiresCompatibilities": ["FARGATE"], + "cpu": "512", + "memory": "1024", + "taskRoleArn": "${TASK_ROLE_ARN}", + "executionRoleArn": "${EXECUTION_ROLE_ARN}", + "containerDefinitions": [ + { + "name": "marquez-web", + "image": "${ECR_REPOSITORY_URI_WEB}:${IMAGE_TAG}", + "essential": true, + "portMappings": [ + { + "containerPort": 3000, + "protocol": "tcp" + } + ], + "environment": [ + { + "name": "MARQUEZ_HOST", + "value": "${API_ENDPOINT}" + }, + { + "name": "MARQUEZ_PORT", + "value": "5000" + }, + { + "name": "REACT_APP_ADVANCED_SEARCH", + "value": "true" + } + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/marquez-web", + "awslogs-region": "${AWS_REGION}", + "awslogs-stream-prefix": "ecs" + } + }, + "healthCheck": { + "command": ["CMD-SHELL", "curl -f http://localhost:3000 || exit 1"], + "interval": 30, + "timeout": 5, + "retries": 3, + "startPeriod": 60 + } + } + ] +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/scripts/build-images.sh b/deploy/aws/ecs-terraform/scripts/build-images.sh new file mode 100644 index 
0000000000..e8f3b46f83 --- /dev/null +++ b/deploy/aws/ecs-terraform/scripts/build-images.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Copyright 2018-2025 contributors to the Marquez project +# SPDX-License-Identifier: Apache-2.0 +# +set -e + +# Build script for local testing and CI/CD pipelines + +# Configuration (both overridable from the environment; an empty REGISTRY skips registry tagging) +IMAGE_TAG=${IMAGE_TAG:-"latest"} +REGISTRY=${REGISTRY:-""} + +# Colors for output +GREEN='\033[0;32m' +NC='\033[0m' + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +# Build the API image from the current directory (the project root once main has changed into it); also tag it for REGISTRY when one is set +build_api() { + echo_info "Building Marquez API image..." + docker build -f Dockerfile -t "marquez-api:${IMAGE_TAG}" . + + if [ -n "${REGISTRY}" ]; then + docker tag "marquez-api:${IMAGE_TAG}" "${REGISTRY}/marquez-api:${IMAGE_TAG}" + echo_info "Tagged as ${REGISTRY}/marquez-api:${IMAGE_TAG}" + fi +} + +# Build the Web image from the web/ directory; also tag it for REGISTRY when one is set +build_web() { + echo_info "Building Marquez Web image..." + cd web + docker build -f Dockerfile -t "marquez-web:${IMAGE_TAG}" . + cd .. + + if [ -n "${REGISTRY}" ]; then + docker tag "marquez-web:${IMAGE_TAG}" "${REGISTRY}/marquez-web:${IMAGE_TAG}" + echo_info "Tagged as ${REGISTRY}/marquez-web:${IMAGE_TAG}" + fi +} + +# Main +main() { + echo_info "Starting build process..." + echo_info "Image tag: ${IMAGE_TAG}" + + # Navigate to project root (four levels up from deploy/aws/ecs-terraform/scripts) + cd "$(dirname "$0")/../../../.." + + # Build images + build_api + build_web + + echo_info "Build completed successfully!" 
+ echo_info "Images built:" + echo " - marquez-api:${IMAGE_TAG}" + echo " - marquez-web:${IMAGE_TAG}" +} + +main \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/scripts/deploy.sh b/deploy/aws/ecs-terraform/scripts/deploy.sh new file mode 100644 index 0000000000..92fcea70a9 --- /dev/null +++ b/deploy/aws/ecs-terraform/scripts/deploy.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# +# Copyright 2018-2025 contributors to the Marquez project +# SPDX-License-Identifier: Apache-2.0 +# +set -e + +# Configuration +AWS_REGION=${AWS_REGION:-"us-east-1"} +AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) +ECR_API_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/marquez-api" +ECR_WEB_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/marquez-web" +ENVIRONMENT=${ENVIRONMENT:-"production"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +echo_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +echo_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to build and push Docker images +build_and_push() { + local service=$1 + local dockerfile=$2 + local ecr_repo=$3 + + echo_info "Building ${service} Docker image..." + + if [ "$service" = "api" ]; then + docker build -f ${dockerfile} -t ${service}:${IMAGE_TAG} . + else + cd web && docker build -f ${dockerfile} -t ${service}:${IMAGE_TAG} . && cd .. + fi + + echo_info "Tagging ${service} image for ECR..." + docker tag ${service}:${IMAGE_TAG} ${ecr_repo}:${IMAGE_TAG} + + echo_info "Pushing ${service} image to ECR..." + docker push ${ecr_repo}:${IMAGE_TAG} + + echo_info "${service} image pushed successfully!" +} + +# Function to update ECS service +update_ecs_service() { + local service_name=$1 + local cluster_name="marquez-${ENVIRONMENT}" + + echo_info "Updating ECS service ${service_name}..." 
+ + aws ecs update-service \ + --cluster ${cluster_name} \ + --service ${service_name} \ + --force-new-deployment \ + --region ${AWS_REGION} \ + > /dev/null + + echo_info "Waiting for service ${service_name} to stabilize..." + aws ecs wait services-stable \ + --cluster ${cluster_name} \ + --services ${service_name} \ + --region ${AWS_REGION} + + echo_info "Service ${service_name} updated successfully!" +} + +# Main deployment flow +main() { + echo_info "Starting Marquez ECS deployment..." + echo_info "Environment: ${ENVIRONMENT}" + echo_info "AWS Region: ${AWS_REGION}" + echo_info "Image Tag: ${IMAGE_TAG}" + + # Login to ECR + echo_info "Logging in to ECR..." + aws ecr get-login-password --region ${AWS_REGION} | \ + docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com + + # Build and push API image + build_and_push "api" "Dockerfile" "${ECR_API_REPO}" + + # Build and push Web image + build_and_push "web" "Dockerfile" "${ECR_WEB_REPO}" + + # Update ECS services + update_ecs_service "marquez-api" + update_ecs_service "marquez-web" + + echo_info "Deployment completed successfully!" + + # Get ALB DNS name + ALB_DNS=$(aws elbv2 describe-load-balancers \ + --names "marquez-${ENVIRONMENT}" \ + --query "LoadBalancers[0].DNSName" \ + --output text \ + --region ${AWS_REGION}) + + echo_info "Application is available at: http://${ALB_DNS}" + echo_info "API endpoint: http://${ALB_DNS}/api" + echo_info "Web UI: http://${ALB_DNS}" +} + +# Check required tools +check_requirements() { + local requirements=("docker" "aws" "jq") + + for cmd in "${requirements[@]}"; do + if ! command -v ${cmd} &> /dev/null; then + echo_error "${cmd} is not installed. Please install it first." + exit 1 + fi + done + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + echo_error "AWS credentials are not configured. Please configure them first." 
+ exit 1 + fi +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --environment|-e) + ENVIRONMENT="$2" + shift 2 + ;; + --region|-r) + AWS_REGION="$2" + shift 2 + ;; + --tag|-t) + IMAGE_TAG="$2" + shift 2 + ;; + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " -e, --environment Environment name (default: production)" + echo " -r, --region AWS region (default: us-east-1)" + echo " -t, --tag Docker image tag (default: latest)" + echo " -h, --help Show this help message" + exit 0 + ;; + *) + echo_error "Unknown option: $1" + exit 1 + ;; + esac +done + +# Run deployment +check_requirements +main \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/README.md b/deploy/aws/ecs-terraform/terraform/README.md new file mode 100644 index 0000000000..6c7eb2044c --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/README.md @@ -0,0 +1,188 @@ +# Marquez ECS Deployment with Terraform + +This Terraform configuration deploys Marquez on AWS ECS with RDS PostgreSQL in a dedicated VPC. + +## Architecture + +- **VPC**: Isolated network with public, private, and database subnets +- **ECS Fargate**: Serverless container orchestration +- **RDS PostgreSQL**: Managed database with configurable HA options +- **ALB**: Application Load Balancer for traffic distribution +- **VPC Endpoints**: Private connectivity to AWS services +- **NAT Gateway**: Outbound internet access for private subnets + +## Prerequisites + +1. AWS CLI configured with appropriate credentials +2. Terraform >= 1.0 +3. 
Docker for building container images + +## Directory Structure + +``` +terraform/ +├── main.tf # Main configuration (ECS, RDS, ALB) +├── vpc.tf # VPC and networking resources +├── vpc-endpoints.tf # VPC endpoints for AWS services +├── variables.tf # Variable definitions +├── terraform.tfvars.example # Example configuration +└── environments/ # Environment-specific configurations + ├── sandbox.tfvars + └── production.tfvars +``` + +## Deployment Steps + +### 1. Initialize Terraform + +```bash +cd deploy/aws/ecs-terraform/terraform +terraform init +``` + +### 2. Configure Environment Variables + +Copy the example configuration and update with your values: + +```bash +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars with your configuration +``` + +Or use environment-specific configuration: + +```bash +terraform plan -var-file=environments/sandbox.tfvars +``` + +### 3. Set Database Password + +Export the database password as an environment variable: + +```bash +export TF_VAR_db_password="your-secure-password" +``` + +### 4. Plan Deployment + +Review the resources that will be created: + +```bash +# For sandbox environment +terraform plan -var-file=environments/sandbox.tfvars + +# For production environment +terraform plan -var-file=environments/production.tfvars +``` + +### 5. Apply Configuration + +Deploy the infrastructure: + +```bash +# For sandbox environment +terraform apply -var-file=environments/sandbox.tfvars + +# For production environment +terraform apply -var-file=environments/production.tfvars +``` + +### 6. 
Build and Push Docker Images + +After infrastructure is created, build and push Docker images: + +```bash +# Get ECR repository URLs from Terraform output +API_REPO=$(terraform output -raw ecr_repository_api) +WEB_REPO=$(terraform output -raw ecr_repository_web) + +# Login to ECR +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $API_REPO + +# Build and push API image (from the repository root) +cd ../../../../ +docker build -f Dockerfile -t $API_REPO:latest . +docker push $API_REPO:latest + +# Build and push Web image +docker build -f web/Dockerfile -t $WEB_REPO:latest ./web +docker push $WEB_REPO:latest +``` + +### 7. Update ECS Services + +Force new deployment to use the pushed images: + +```bash +aws ecs update-service --cluster marquez-sandbox --service marquez-sandbox-api --force-new-deployment +aws ecs update-service --cluster marquez-sandbox --service marquez-sandbox-web --force-new-deployment +``` + +## Environment Configurations + +### Sandbox +- Cost-optimized configuration +- Single NAT Gateway +- Minimal resources (db.t3.micro, 1 ECS task) +- No deletion protection + +### Production +- High availability configuration +- Multi-AZ deployment +- Multiple NAT Gateways +- Larger resources (db.t3.medium+, 3+ ECS tasks) +- Deletion protection enabled + +## Key Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `environment` | Environment name (sandbox, dev, staging, production) | sandbox | +| `vpc_cidr` | CIDR block for VPC | 10.1.0.0/16 | +| `nat_gateway_count` | Number of NAT Gateways | 1 | +| `database_instance_class` | RDS instance class | db.t3.micro | +| `ecs_service_desired_count` | Number of ECS tasks | 1 | +| `enable_deletion_protection` | Enable deletion protection | false | + +## Outputs + +After deployment, Terraform will output: +- `alb_dns_name`: URL to access Marquez +- `ecr_repository_api`: ECR repository for API image +- `ecr_repository_web`: ECR repository for Web image +- 
`rds_endpoint`: RDS database endpoint +- `vpc_id`: Created VPC ID + +## Cost Optimization Tips + +1. **Sandbox/Dev**: Use single NAT Gateway (`nat_gateway_count = 1`) +2. **Off-hours**: Use AWS Instance Scheduler to stop RDS instances +3. **ECS Tasks**: Scale down during non-business hours +4. **VPC Endpoints**: Only create required endpoints + +## Cleanup + +To destroy all resources: + +```bash +terraform destroy -var-file=environments/sandbox.tfvars +``` + +**Warning**: This will delete all resources including the database. Ensure you have backups if needed. + +## Troubleshooting + +### ECS Tasks Failing to Start +- Check CloudWatch Logs for container errors +- Verify ECR images are pushed correctly +- Ensure VPC endpoints are configured for private subnets + +### Database Connection Issues +- Verify security group rules +- Check RDS subnet group configuration +- Ensure database password is set correctly + +### ALB Health Check Failures +- Verify target group health check settings +- Check application startup time and adjust `health_check_grace_period_seconds` +- Review ECS task logs for application errors \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/cloudfront.tf b/deploy/aws/ecs-terraform/terraform/cloudfront.tf new file mode 100644 index 0000000000..adbbb8f785 --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/cloudfront.tf @@ -0,0 +1,283 @@ +# CloudFront Distribution for HTTPS access +resource "aws_cloudfront_distribution" "marquez" { + enabled = true + is_ipv6_enabled = true + default_root_object = "/" + comment = "${local.name_prefix} CloudFront Distribution" + price_class = var.environment == "production" ? 
"PriceClass_All" : "PriceClass_100" + + origin { + domain_name = aws_alb.marquez.dns_name + origin_id = "${local.name_prefix}-alb" + + custom_origin_config { + http_port = 80 + https_port = 443 + origin_protocol_policy = "http-only" + origin_ssl_protocols = ["TLSv1.2"] + } + + custom_header { + name = "X-CloudFront-Origin" + value = local.name_prefix + } + } + + # Default behavior for web UI + default_cache_behavior { + allowed_methods = ["DELETE", "GET", "HEAD", "OPTIONS", "PATCH", "POST", "PUT"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = true + headers = ["Host", "Origin", "Accept", "Accept-Language", "Accept-Encoding", "Referer", "User-Agent", "CloudFront-Forwarded-Proto"] + + cookies { + forward = "all" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 0 + max_ttl = 0 + compress = true + } + + # API cache behavior + ordered_cache_behavior { + path_pattern = "/api/*" + allowed_methods = ["DELETE", "GET", "HEAD", "OPTIONS", "PATCH", "POST", "PUT"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = true + headers = ["*"] + + cookies { + forward = "all" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 0 + max_ttl = 0 + compress = true + } + + # Lineage API cache behavior + ordered_cache_behavior { + path_pattern = "/lineage/*" + allowed_methods = ["DELETE", "GET", "HEAD", "OPTIONS", "PATCH", "POST", "PUT"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = true + headers = ["*"] + + cookies { + forward = "all" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 0 + max_ttl = 0 + compress = true + } + + # Health check and metrics endpoints + ordered_cache_behavior { + path_pattern = "/healthcheck" + 
allowed_methods = ["GET", "HEAD"] + cached_methods = ["GET", "HEAD"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 0 + max_ttl = 0 + compress = true + } + + ordered_cache_behavior { + path_pattern = "/metrics" + allowed_methods = ["GET", "HEAD"] + cached_methods = ["GET", "HEAD"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 0 + max_ttl = 0 + compress = true + } + + # Static assets caching + ordered_cache_behavior { + path_pattern = "*.css" + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + ordered_cache_behavior { + path_pattern = "*.js" + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + ordered_cache_behavior { + path_pattern = "*.jpg" + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + ordered_cache_behavior { + path_pattern = "*.png" 
+ allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + ordered_cache_behavior { + path_pattern = "*.gif" + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + ordered_cache_behavior { + path_pattern = "*.svg" + allowed_methods = ["GET", "HEAD", "OPTIONS"] + cached_methods = ["GET", "HEAD", "OPTIONS"] + target_origin_id = "${local.name_prefix}-alb" + + forwarded_values { + query_string = false + cookies { + forward = "none" + } + } + + viewer_protocol_policy = "redirect-to-https" + min_ttl = 0 + default_ttl = 86400 + max_ttl = 31536000 + compress = true + } + + restrictions { + geo_restriction { + restriction_type = "none" + } + } + + viewer_certificate { + cloudfront_default_certificate = true + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-cloudfront" + } + ) +} + +# Output for CloudFront domain +output "cloudfront_domain_name" { + value = aws_cloudfront_distribution.marquez.domain_name + description = "CloudFront distribution domain name (HTTPS)" +} + +output "cloudfront_distribution_id" { + value = aws_cloudfront_distribution.marquez.id + description = "CloudFront distribution ID" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/main.tf b/deploy/aws/ecs-terraform/terraform/main.tf new file mode 100644 index 0000000000..c96bcf2f05 --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/main.tf @@ -0,0 +1,738 
@@ +terraform { + required_version = ">= 1.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} + +provider "aws" { + region = var.region +} + +# Additional Variables for sensitive data +variable "db_password" { + description = "Password for RDS database" + type = string + sensitive = true +} + +variable "opensearch_enabled" { + description = "Enable OpenSearch integration" + type = bool + default = false +} + +# ECR Repositories +resource "aws_ecr_repository" "marquez_api" { + name = "${local.name_prefix}-api" + image_tag_mutability = "MUTABLE" + + image_scanning_configuration { + scan_on_push = true + } + + tags = local.common_tags +} + +resource "aws_ecr_repository" "marquez_web" { + name = "${local.name_prefix}-web" + image_tag_mutability = "MUTABLE" + + image_scanning_configuration { + scan_on_push = true + } + + tags = local.common_tags +} + +# ECS Cluster +resource "aws_ecs_cluster" "marquez" { + name = local.name_prefix + + setting { + name = "containerInsights" + value = "enabled" + } + + tags = local.common_tags +} + +# CloudWatch Log Groups +resource "aws_cloudwatch_log_group" "marquez_api" { + name = "/ecs/${local.name_prefix}-api" + retention_in_days = 30 + + tags = local.common_tags +} + +resource "aws_cloudwatch_log_group" "marquez_web" { + name = "/ecs/${local.name_prefix}-web" + retention_in_days = 30 + + tags = local.common_tags +} + +# IAM Roles +resource "aws_iam_role" "ecs_task_execution" { + name = "${local.name_prefix}-ecs-task-execution" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + } + ] + }) + + tags = local.common_tags +} + +resource "aws_iam_role_policy_attachment" "ecs_task_execution" { + role = aws_iam_role.ecs_task_execution.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +resource "aws_iam_role_policy" 
"ecs_secrets" { + name = "ecs-secrets-policy" + role = aws_iam_role.ecs_task_execution.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "secretsmanager:GetSecretValue" + ] + Resource = [ + aws_secretsmanager_secret.db_password.arn + ] + } + ] + }) +} + +resource "aws_iam_role" "ecs_task" { + name = "${local.name_prefix}-ecs-task" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + } + ] + }) + + tags = local.common_tags +} + +# Secrets Manager +resource "aws_secretsmanager_secret" "db_password" { + name = "${local.name_prefix}-db-password" + + tags = local.common_tags +} + +resource "aws_secretsmanager_secret_version" "db_password" { + secret_id = aws_secretsmanager_secret.db_password.id + secret_string = var.db_password +} + +# Security Groups +resource "aws_security_group" "alb" { + name_prefix = "${local.name_prefix}-alb-" + vpc_id = aws_vpc.main.id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-alb-sg" + } + ) +} + +resource "aws_security_group" "ecs_tasks" { + name_prefix = "${local.name_prefix}-ecs-tasks-" + vpc_id = aws_vpc.main.id + + ingress { + from_port = 0 + to_port = 65535 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-ecs-tasks-sg" + } + ) +} + +resource "aws_security_group" "rds" { + name_prefix = "${local.name_prefix}-rds-" + vpc_id = aws_vpc.main.id 
+ + ingress { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_groups = [aws_security_group.ecs_tasks.id] + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-rds-sg" + } + ) +} + +# RDS PostgreSQL +resource "aws_db_instance" "marquez" { + identifier = local.name_prefix + engine = "postgres" + engine_version = "14" + instance_class = var.database_instance_class + + allocated_storage = var.database_allocated_storage + max_allocated_storage = var.database_allocated_storage * 10 + storage_type = "gp3" + storage_encrypted = true + + db_name = "marquez" + username = "marquez" + password = var.db_password + + vpc_security_group_ids = [aws_security_group.rds.id] + db_subnet_group_name = aws_db_subnet_group.main.name + + backup_retention_period = var.backup_retention_period + backup_window = "03:00-04:00" + maintenance_window = "sun:04:00-sun:05:00" + + multi_az = var.database_multi_az + deletion_protection = var.enable_deletion_protection + skip_final_snapshot = !var.enable_deletion_protection + final_snapshot_identifier = var.enable_deletion_protection ? 
"${local.name_prefix}-final-${formatdate("YYYY-MM-DD-hhmm", timestamp())}" : null + + enabled_cloudwatch_logs_exports = ["postgresql"] + + tags = merge( + local.common_tags, + { + Name = local.name_prefix + } + ) +} + +# Application Load Balancer +resource "aws_alb" "marquez" { + name = local.name_prefix + internal = false + load_balancer_type = "application" + security_groups = [aws_security_group.alb.id] + subnets = aws_subnet.public[*].id + + enable_deletion_protection = var.enable_deletion_protection + enable_http2 = true + enable_cross_zone_load_balancing = true + + tags = local.common_tags +} + +# Target Groups +resource "aws_alb_target_group" "api" { + name = "${local.name_prefix}-api" + port = 5000 + protocol = "HTTP" + vpc_id = aws_vpc.main.id + target_type = "ip" + + health_check { + enabled = true + path = "/healthcheck" + port = 5001 + healthy_threshold = 2 + unhealthy_threshold = 2 + timeout = 5 + interval = 30 + matcher = "200" + } + + deregistration_delay = 30 + + tags = local.common_tags +} + +resource "aws_alb_target_group" "api_admin" { + name = "${local.name_prefix}-api-admin" + port = 5001 + protocol = "HTTP" + vpc_id = aws_vpc.main.id + target_type = "ip" + + health_check { + enabled = true + path = "/healthcheck" + healthy_threshold = 2 + unhealthy_threshold = 2 + timeout = 5 + interval = 30 + matcher = "200" + } + + deregistration_delay = 30 + + tags = local.common_tags +} + +resource "aws_alb_target_group" "web" { + name = "${local.name_prefix}-web" + port = 3000 + protocol = "HTTP" + vpc_id = aws_vpc.main.id + target_type = "ip" + + health_check { + enabled = true + path = "/" + healthy_threshold = 2 + unhealthy_threshold = 2 + timeout = 5 + interval = 30 + matcher = "200" + } + + deregistration_delay = 30 + + tags = local.common_tags +} + +# ALB Listeners +resource "aws_alb_listener" "http" { + load_balancer_arn = aws_alb.marquez.arn + port = "80" + protocol = "HTTP" + + default_action { + type = "forward" + target_group_arn = 
aws_alb_target_group.web.arn + } +} + +# Note: HTTPS listener requires ACM certificate +# resource "aws_alb_listener" "https" { +# load_balancer_arn = aws_alb.marquez.arn +# port = "443" +# protocol = "HTTPS" +# ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01" +# certificate_arn = var.certificate_arn +# +# default_action { +# type = "forward" +# target_group_arn = aws_alb_target_group.web.arn +# } +# } + +# Listener Rules (for HTTP during development) +resource "aws_alb_listener_rule" "api" { + listener_arn = aws_alb_listener.http.arn + priority = 100 + + action { + type = "forward" + target_group_arn = aws_alb_target_group.api.arn + } + + condition { + path_pattern { + values = ["/api/*", "/lineage/*"] + } + } +} + +resource "aws_alb_listener_rule" "api_admin" { + listener_arn = aws_alb_listener.http.arn + priority = 101 + + action { + type = "forward" + target_group_arn = aws_alb_target_group.api_admin.arn + } + + condition { + path_pattern { + values = ["/healthcheck", "/metrics"] + } + } +} + +resource "aws_alb_listener_rule" "web" { + listener_arn = aws_alb_listener.http.arn + priority = 102 + + action { + type = "forward" + target_group_arn = aws_alb_target_group.web.arn + } + + condition { + path_pattern { + values = ["/*"] + } + } +} + +# ECS Task Definitions +resource "aws_ecs_task_definition" "marquez_api" { + family = "${local.name_prefix}-api" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.ecs_task_cpu + memory = var.ecs_task_memory + execution_role_arn = aws_iam_role.ecs_task_execution.arn + task_role_arn = aws_iam_role.ecs_task.arn + + container_definitions = jsonencode([ + { + name = "marquez-api" + image = "${aws_ecr_repository.marquez_api.repository_url}:latest" + essential = true + + portMappings = [ + { + containerPort = 5000 + protocol = "tcp" + }, + { + containerPort = 5001 + protocol = "tcp" + } + ] + + environment = [ + { + name = "MARQUEZ_PORT" + value = "5000" + }, + { + name = "MARQUEZ_ADMIN_PORT" + value = 
"5001" + }, + { + name = "POSTGRES_HOST" + value = aws_db_instance.marquez.address + }, + { + name = "POSTGRES_PORT" + value = "5432" + }, + { + name = "POSTGRES_DB" + value = "marquez" + }, + { + name = "POSTGRES_USER" + value = "marquez" + }, + { + name = "MIGRATE_ON_STARTUP" + value = "true" + }, + { + name = "SEARCH_ENABLED" + value = tostring(var.opensearch_enabled) + } + ] + + secrets = [ + { + name = "POSTGRES_PASSWORD" + valueFrom = aws_secretsmanager_secret.db_password.arn + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.marquez_api.name + awslogs-region = var.region + awslogs-stream-prefix = "ecs" + } + } + + healthCheck = { + command = ["CMD-SHELL", "curl -f http://localhost:5001/healthcheck || exit 1"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } + } + ]) + + tags = local.common_tags +} + +resource "aws_ecs_task_definition" "marquez_web" { + family = "${local.name_prefix}-web" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.ecs_task_cpu + memory = var.ecs_task_memory + execution_role_arn = aws_iam_role.ecs_task_execution.arn + task_role_arn = aws_iam_role.ecs_task.arn + + container_definitions = jsonencode([ + { + name = "marquez-web" + image = "${aws_ecr_repository.marquez_web.repository_url}:latest" + essential = true + + portMappings = [ + { + containerPort = 3000 + protocol = "tcp" + } + ] + + environment = [ + { + name = "MARQUEZ_HOST" + value = aws_alb.marquez.dns_name + }, + { + name = "MARQUEZ_PORT" + value = "80" + }, + { + name = "WEB_PORT" + value = "3000" + }, + { + name = "REACT_APP_ADVANCED_SEARCH" + value = "true" + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-group = aws_cloudwatch_log_group.marquez_web.name + awslogs-region = var.region + awslogs-stream-prefix = "ecs" + } + } + + healthCheck = { + command = ["CMD-SHELL", "wget --spider -q http://localhost:3000 || exit 1"] # wget, not curl: the 
Alpine-based web image ships without curl + interval = 30 + 
timeout = 5 + retries = 3 + startPeriod = 60 + } + } + ]) + + tags = local.common_tags +} + +# ECS Services +resource "aws_ecs_service" "marquez_api" { + name = "${local.name_prefix}-api" + cluster = aws_ecs_cluster.marquez.id + task_definition = aws_ecs_task_definition.marquez_api.arn + desired_count = var.ecs_service_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.ecs_tasks.id] + assign_public_ip = false + } + + load_balancer { + target_group_arn = aws_alb_target_group.api.arn + container_name = "marquez-api" + container_port = 5000 + } + + load_balancer { + target_group_arn = aws_alb_target_group.api_admin.arn + container_name = "marquez-api" + container_port = 5001 + } + + deployment_maximum_percent = 200 + deployment_minimum_healthy_percent = 100 + health_check_grace_period_seconds = 60 + + deployment_circuit_breaker { + enable = true + rollback = true + } + + depends_on = [ + aws_alb_listener_rule.api, aws_alb_listener_rule.api_admin, # these rules (not the listener default action) attach the api/api_admin target groups to the ALB + aws_db_instance.marquez + ] + + tags = local.common_tags +} + +resource "aws_ecs_service" "marquez_web" { + name = "${local.name_prefix}-web" + cluster = aws_ecs_cluster.marquez.id + task_definition = aws_ecs_task_definition.marquez_web.arn + desired_count = var.ecs_service_desired_count + launch_type = "FARGATE" + + network_configuration { + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.ecs_tasks.id] + assign_public_ip = false + } + + load_balancer { + target_group_arn = aws_alb_target_group.web.arn + container_name = "marquez-web" + container_port = 3000 + } + + deployment_maximum_percent = 200 + deployment_minimum_healthy_percent = 100 + health_check_grace_period_seconds = 60 + + deployment_circuit_breaker { + enable = true + rollback = true + } + + depends_on = [ + aws_alb_listener.http, + 
aws_ecs_service.marquez_api + ] + + tags = local.common_tags +} + +# Auto Scaling +resource "aws_appautoscaling_target" "api" { + max_capacity = 
var.ecs_service_desired_count * 5 + min_capacity = var.ecs_service_desired_count + resource_id = "service/${aws_ecs_cluster.marquez.name}/${aws_ecs_service.marquez_api.name}" + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" +} + +resource "aws_appautoscaling_policy" "api_cpu" { + name = "${local.name_prefix}-api-cpu" + policy_type = "TargetTrackingScaling" + resource_id = aws_appautoscaling_target.api.resource_id + scalable_dimension = aws_appautoscaling_target.api.scalable_dimension + service_namespace = aws_appautoscaling_target.api.service_namespace + + target_tracking_scaling_policy_configuration { + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + target_value = 70 + } +} + +resource "aws_appautoscaling_target" "web" { + max_capacity = var.ecs_service_desired_count * 5 + min_capacity = var.ecs_service_desired_count + resource_id = "service/${aws_ecs_cluster.marquez.name}/${aws_ecs_service.marquez_web.name}" + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" +} + +resource "aws_appautoscaling_policy" "web_cpu" { + name = "${local.name_prefix}-web-cpu" + policy_type = "TargetTrackingScaling" + resource_id = aws_appautoscaling_target.web.resource_id + scalable_dimension = aws_appautoscaling_target.web.scalable_dimension + service_namespace = aws_appautoscaling_target.web.service_namespace + + target_tracking_scaling_policy_configuration { + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + target_value = 70 + } +} + +# Outputs +output "alb_dns_name" { + value = aws_alb.marquez.dns_name + description = "DNS name of the load balancer" +} + +output "ecr_repository_api" { + value = aws_ecr_repository.marquez_api.repository_url + description = "ECR repository URL for API" +} + +output "ecr_repository_web" { + value = aws_ecr_repository.marquez_web.repository_url + description = "ECR repository URL for Web" +} + 
+output "rds_endpoint" { + value = aws_db_instance.marquez.address + description = "RDS instance endpoint" +} + +output "vpc_id" { + value = aws_vpc.main.id + description = "VPC ID" +} + +output "environment" { + value = var.environment + description = "Environment name" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/terraform.tfvars.example b/deploy/aws/ecs-terraform/terraform/terraform.tfvars.example new file mode 100644 index 0000000000..211ab332c0 --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/terraform.tfvars.example @@ -0,0 +1,42 @@ +# Terraform Variables Configuration Example +# Copy this file to terraform.tfvars and update with your values + +# Environment Configuration +environment = "sandbox" # Options: sandbox, dev, staging, production +project_name = "marquez" +region = "us-east-1" + +# Network Configuration +vpc_cidr = "10.1.0.0/16" # Use different CIDR for each environment +availability_zones_count = 2 # Number of AZs to use (2 for dev, 3+ for production) + +# NAT Gateway Configuration +enable_nat_gateway = true +nat_gateway_count = 1 # 1 for sandbox/dev (cost optimization), 2 for production (HA) + +# Database Configuration +database_instance_class = "db.t3.micro" # db.t3.micro for sandbox, db.t3.small+ for production +database_allocated_storage = 20 # 20GB for sandbox, 100GB+ for production +database_multi_az = false # false for sandbox/dev, true for production +backup_retention_period = 1 # 1 day for sandbox, 7-30 days for production + +# ECS Configuration +ecs_task_cpu = "512" # 512 for sandbox, 1024+ for production +ecs_task_memory = "1024" # 1024 for sandbox, 2048+ for production +ecs_service_desired_count = 1 # 1 for sandbox, 2+ for production + +# Security Configuration +enable_deletion_protection = false # false for sandbox/dev, true for production + +# Database Password (sensitive - use environment variable or AWS Secrets Manager) +# db_password = "your-secure-password-here" + +# OpenSearch Integration 
+opensearch_enabled = false # Enable if using OpenSearch for search functionality + +# Tags +tags = { + Owner = "your-team" + CostCenter = "engineering" + Terraform = "true" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/variables.tf b/deploy/aws/ecs-terraform/terraform/variables.tf new file mode 100644 index 0000000000..8848162c0c --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/variables.tf @@ -0,0 +1,108 @@ +variable "environment" { + description = "Environment name (sandbox, dev, staging, production)" + type = string + default = "sandbox" +} + +variable "project_name" { + description = "Project name" + type = string + default = "marquez" +} + +variable "region" { + description = "AWS region" + type = string + default = "us-east-1" +} + +variable "vpc_cidr" { + description = "CIDR block for VPC" + type = string + default = "10.1.0.0/16" +} + +variable "availability_zones_count" { + description = "Number of availability zones to use" + type = number + default = 2 +} + +variable "enable_nat_gateway" { + description = "Enable NAT Gateway for private subnets" + type = bool + default = true +} + +variable "nat_gateway_count" { + description = "Number of NAT Gateways (1 for single NAT, 2 for HA)" + type = number + default = 1 # Use 1 for sandbox/dev, 2 for production +} + +variable "database_instance_class" { + description = "RDS instance class" + type = string + default = "db.t3.micro" +} + +variable "database_allocated_storage" { + description = "RDS allocated storage in GB" + type = number + default = 20 +} + +variable "database_multi_az" { + description = "Enable Multi-AZ for RDS" + type = bool + default = false # false for sandbox/dev, true for production +} + +variable "ecs_task_cpu" { + description = "CPU units for ECS task" + type = string + default = "512" +} + +variable "ecs_task_memory" { + description = "Memory for ECS task in MB" + type = string + default = "1024" +} + +variable "ecs_service_desired_count" { + 
description = "Desired number of ECS service tasks" + type = number + default = 1 # 1 for sandbox, 2+ for production +} + +variable "enable_deletion_protection" { + description = "Enable deletion protection for RDS and ALB" + type = bool + default = false # false for sandbox/dev, true for production +} + +variable "backup_retention_period" { + description = "RDS backup retention period in days" + type = number + default = 1 # 1 for sandbox, 7+ for production +} + +variable "tags" { + description = "Common tags to apply to all resources" + type = map(string) + default = {} +} + +locals { + common_tags = merge( + { + Environment = var.environment + Project = var.project_name + ManagedBy = "terraform" + }, + var.tags + ) + + name_prefix = "${var.project_name}-${var.environment}" +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/vpc-endpoints.tf b/deploy/aws/ecs-terraform/terraform/vpc-endpoints.tf new file mode 100644 index 0000000000..f1c6035b46 --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/vpc-endpoints.tf @@ -0,0 +1,120 @@ +# VPC Endpoints for AWS Services +# These endpoints allow ECS tasks in private subnets to access AWS services without internet access + +# Security Group for VPC Endpoints +resource "aws_security_group" "vpc_endpoints" { + name_prefix = "${local.name_prefix}-vpc-endpoints-" + vpc_id = aws_vpc.main.id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + security_groups = [aws_security_group.ecs_tasks.id] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-vpc-endpoints-sg" + } + ) +} + +# Interface Endpoint for Secrets Manager +resource "aws_vpc_endpoint" "secrets_manager" { + vpc_id = aws_vpc.main.id + service_name = "com.amazonaws.${var.region}.secretsmanager" + vpc_endpoint_type = "Interface" + private_dns_enabled = true + + subnet_ids = aws_subnet.private[*].id + 
security_group_ids = [aws_security_group.vpc_endpoints.id] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-secrets-manager-endpoint" + } + ) +} + +# Interface Endpoint for ECR API +resource "aws_vpc_endpoint" "ecr_api" { + vpc_id = aws_vpc.main.id + service_name = "com.amazonaws.${var.region}.ecr.api" + vpc_endpoint_type = "Interface" + private_dns_enabled = true + + subnet_ids = aws_subnet.private[*].id + security_group_ids = [aws_security_group.vpc_endpoints.id] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-ecr-api-endpoint" + } + ) +} + +# Interface Endpoint for ECR Docker Registry +resource "aws_vpc_endpoint" "ecr_dkr" { + vpc_id = aws_vpc.main.id + service_name = "com.amazonaws.${var.region}.ecr.dkr" + vpc_endpoint_type = "Interface" + private_dns_enabled = true + + subnet_ids = aws_subnet.private[*].id + security_group_ids = [aws_security_group.vpc_endpoints.id] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-ecr-dkr-endpoint" + } + ) +} + +# Gateway Endpoint for S3 (required for ECR) +resource "aws_vpc_endpoint" "s3" { + vpc_id = aws_vpc.main.id + service_name = "com.amazonaws.${var.region}.s3" + vpc_endpoint_type = "Gateway" + + route_table_ids = concat( + aws_route_table.private[*].id, + [aws_route_table.database.id] + ) + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-s3-endpoint" + } + ) +} + +# Interface Endpoint for CloudWatch Logs +resource "aws_vpc_endpoint" "logs" { + vpc_id = aws_vpc.main.id + service_name = "com.amazonaws.${var.region}.logs" + vpc_endpoint_type = "Interface" + private_dns_enabled = true + + subnet_ids = aws_subnet.private[*].id + security_group_ids = [aws_security_group.vpc_endpoints.id] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-logs-endpoint" + } + ) +} \ No newline at end of file diff --git a/deploy/aws/ecs-terraform/terraform/vpc.tf b/deploy/aws/ecs-terraform/terraform/vpc.tf new file mode 
100644 index 0000000000..f7cf5828e1 --- /dev/null +++ b/deploy/aws/ecs-terraform/terraform/vpc.tf @@ -0,0 +1,192 @@ +# VPC Configuration +resource "aws_vpc" "main" { + cidr_block = var.vpc_cidr + enable_dns_hostnames = true + enable_dns_support = true + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-vpc" + } + ) +} + +# Internet Gateway +resource "aws_internet_gateway" "main" { + vpc_id = aws_vpc.main.id + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-igw" + } + ) +} + +# Elastic IPs for NAT Gateways +resource "aws_eip" "nat" { + count = var.enable_nat_gateway ? var.nat_gateway_count : 0 + domain = "vpc" + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-nat-eip-${count.index + 1}" + } + ) +} + +# Public Subnets (for NAT Gateway and ALB) +resource "aws_subnet" "public" { + count = var.availability_zones_count + vpc_id = aws_vpc.main.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index) + availability_zone = data.aws_availability_zones.available.names[count.index] + map_public_ip_on_launch = true + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-public-subnet-${data.aws_availability_zones.available.zone_ids[count.index]}" + Type = "public" + } + ) +} + +# Private Subnets (for ECS Tasks) +resource "aws_subnet" "private" { + count = var.availability_zones_count + vpc_id = aws_vpc.main.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index + 10) + availability_zone = data.aws_availability_zones.available.names[count.index] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-private-subnet-${data.aws_availability_zones.available.zone_ids[count.index]}" + Type = "private" + } + ) +} + +# Database Subnets (isolated for RDS) +resource "aws_subnet" "database" { + count = var.availability_zones_count + vpc_id = aws_vpc.main.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index + 20) + availability_zone = 
data.aws_availability_zones.available.names[count.index] + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-database-subnet-${data.aws_availability_zones.available.zone_ids[count.index]}" + Type = "database" + } + ) +} + +# NAT Gateways +resource "aws_nat_gateway" "main" { + count = var.enable_nat_gateway ? var.nat_gateway_count : 0 + allocation_id = aws_eip.nat[count.index].id + subnet_id = aws_subnet.public[count.index].id + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-nat-gateway-${count.index + 1}" + } + ) + + depends_on = [aws_internet_gateway.main] +} + +# Route Table for Public Subnets +resource "aws_route_table" "public" { + vpc_id = aws_vpc.main.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.main.id + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-public-rt" + Type = "public" + } + ) +} + +# Route Tables for Private Subnets +resource "aws_route_table" "private" { + count = var.enable_nat_gateway ? 
var.nat_gateway_count : 0 + vpc_id = aws_vpc.main.id + + route { + cidr_block = "0.0.0.0/0" + nat_gateway_id = aws_nat_gateway.main[count.index].id + } + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-private-rt-${count.index + 1}" + Type = "private" + } + ) +} + +# Route Table for Database Subnets (no internet access) +resource "aws_route_table" "database" { + vpc_id = aws_vpc.main.id + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-database-rt" + Type = "database" + } + ) +} + +# Route Table Associations for Public Subnets +resource "aws_route_table_association" "public" { + count = var.availability_zones_count + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} + +# Route Table Associations for Private Subnets +resource "aws_route_table_association" "private" { + count = var.availability_zones_count + subnet_id = aws_subnet.private[count.index].id + route_table_id = var.enable_nat_gateway ? aws_route_table.private[min(count.index, var.nat_gateway_count - 1)].id : aws_route_table.database.id +} + +# Route Table Associations for Database Subnets +resource "aws_route_table_association" "database" { + count = var.availability_zones_count + subnet_id = aws_subnet.database[count.index].id + route_table_id = aws_route_table.database.id +} + +# DB Subnet Group for RDS +resource "aws_db_subnet_group" "main" { + name = "${local.name_prefix}-db-subnet-group" + subnet_ids = aws_subnet.database[*].id + + tags = merge( + local.common_tags, + { + Name = "${local.name_prefix}-db-subnet-group" + } + ) +} + +# Data source for availability zones +data "aws_availability_zones" "available" { + state = "available" +} \ No newline at end of file From 19e45b5ba46f79f1d871f9ef6f2f25f360863a77 Mon Sep 17 00:00:00 2001 From: shunskkkk Date: Thu, 23 Oct 2025 11:32:04 +0900 Subject: [PATCH 4/4] Fix web container health check to use wget instead of curl Alpine Linux doesn't include curl by default, 
causing health checks to fail. Use wget which is available in Alpine Linux. Signed-off-by: shunskkkk --- deploy/aws/ecs-terraform/ecs/task-definition-web.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/aws/ecs-terraform/ecs/task-definition-web.json b/deploy/aws/ecs-terraform/ecs/task-definition-web.json index e02a66e748..e3d6122e5a 100644 --- a/deploy/aws/ecs-terraform/ecs/task-definition-web.json +++ b/deploy/aws/ecs-terraform/ecs/task-definition-web.json @@ -40,7 +40,7 @@ } }, "healthCheck": { - "command": ["CMD-SHELL", "curl -f http://localhost:3000 || exit 1"], + "command": ["CMD-SHELL", "wget --spider -q http://localhost:3000 || exit 1"], "interval": 30, "timeout": 5, "retries": 3,