From f84972e12781a9f3a01a432965d6b53c59d9c212 Mon Sep 17 00:00:00 2001 From: Varun Saravanan Date: Mon, 13 May 2024 16:45:39 -0400 Subject: [PATCH] refactor: update ephemeral spark example to latest deployment style --- examples/ephemeral-spark/ec2-key-pair.tf | 9 +- examples/ephemeral-spark/emr-buckets.tf | 8 +- .../ephemeral-spark/emr-hbase-config.json | 42 ++++++++++ .../ephemeral-spark/emr-spark-config.json | 12 +++ examples/ephemeral-spark/ephemeral-spark.tf | 41 +++++++-- examples/ephemeral-spark/hbase-cluster.tf | 74 +++++++++-------- examples/ephemeral-spark/label.tf | 8 ++ examples/ephemeral-spark/local.tfvars | 12 --- examples/ephemeral-spark/locals.tf | 3 + examples/ephemeral-spark/network.tf | 19 +++++ examples/ephemeral-spark/opensearch.tf | 19 ++--- examples/ephemeral-spark/outputs.tf | 6 +- .../{versions.tf => provider.tf} | 0 examples/ephemeral-spark/rds.tf | 29 ++++--- examples/ephemeral-spark/scale.tf | 25 ++++++ examples/ephemeral-spark/tamr-config.tf | 41 +++++---- examples/ephemeral-spark/tamr-config.yml | 61 ++++++++++++++ examples/ephemeral-spark/tamr-vm.tf | 77 ++++++++++++----- examples/ephemeral-spark/variables.tf | 83 ------------------- 19 files changed, 356 insertions(+), 213 deletions(-) create mode 100644 examples/ephemeral-spark/emr-hbase-config.json create mode 100644 examples/ephemeral-spark/emr-spark-config.json create mode 100644 examples/ephemeral-spark/label.tf delete mode 100644 examples/ephemeral-spark/local.tfvars create mode 100644 examples/ephemeral-spark/locals.tf create mode 100644 examples/ephemeral-spark/network.tf rename examples/ephemeral-spark/{versions.tf => provider.tf} (100%) create mode 100644 examples/ephemeral-spark/scale.tf create mode 100644 examples/ephemeral-spark/tamr-config.yml delete mode 100644 examples/ephemeral-spark/variables.tf diff --git a/examples/ephemeral-spark/ec2-key-pair.tf b/examples/ephemeral-spark/ec2-key-pair.tf index ac4d630..61ddf24 100644 --- a/examples/ephemeral-spark/ec2-key-pair.tf 
+++ b/examples/ephemeral-spark/ec2-key-pair.tf @@ -6,6 +6,13 @@ resource "tls_private_key" "emr_private_key" { module "emr_key_pair" { source = "terraform-aws-modules/key-pair/aws" version = "1.0.0" - key_name = "${var.name_prefix}-key" + key_name = "${local.name_prefix}-key" public_key = tls_private_key.emr_private_key.public_key_openssh } + +# Create a pem file with restricted permissions +resource "local_sensitive_file" "emr_private_key_file" { + content = tls_private_key.emr_private_key.private_key_pem + filename = "./${local.name_prefix}-key.pem" + file_permission = "0600" +} diff --git a/examples/ephemeral-spark/emr-buckets.tf b/examples/ephemeral-spark/emr-buckets.tf index f643623..466baf3 100644 --- a/examples/ephemeral-spark/emr-buckets.tf +++ b/examples/ephemeral-spark/emr-buckets.tf @@ -1,7 +1,7 @@ # Set up logs bucket with read/write permissions module "s3-logs" { - source = "git::git@github.com:Datatamer/terraform-aws-s3.git?ref=1.3.2" - bucket_name = "${var.name_prefix}-logs" + source = "git::git@github.com:Datatamer/terraform-aws-s3.git?ref=1.3.3" + bucket_name = "${local.name_prefix}-logs" read_write_actions = [ "s3:PutObject", "s3:GetObject", @@ -18,8 +18,8 @@ module "s3-logs" { # Set up root directory bucket module "s3-data" { - source = "git::git@github.com:Datatamer/terraform-aws-s3.git?ref=1.3.2" - bucket_name = "${var.name_prefix}-data" + source = "git::git@github.com:Datatamer/terraform-aws-s3.git?ref=1.3.3" + bucket_name = "${local.name_prefix}-data" read_write_actions = [ "s3:GetBucketLocation", "s3:GetBucketCORS", diff --git a/examples/ephemeral-spark/emr-hbase-config.json b/examples/ephemeral-spark/emr-hbase-config.json new file mode 100644 index 0000000..f6d6606 --- /dev/null +++ b/examples/ephemeral-spark/emr-hbase-config.json @@ -0,0 +1,42 @@ +[ + { + "Classification":"emrfs-site", + "Properties":{ + "fs.s3.consistent":"false", + "fs.s3.maxConnections":"50000", + "fs.s3.enableServerSideEncryption": "true", + 
"fs.s3a.enableServerSideEncryption":"true", + "fs.s3.create.allowFileNameEndsWithFolderSuffix": "true" + } + }, + { + "Classification": "hbase-site", + "Properties": { + "hbase.rootdir": "s3://${emr_hbase_s3_bucket_root_dir}/hbase-data/", + "hbase.client.scanner.timeout.period":"600000", + "hbase.hstore.blockingStoreFiles":"200", + "hbase.hregion.memstore.block.multiplier":"8", + "hbase.hregion.memstore.flush.size":"536870912", + "hbase.rpc.timeout":"600000", + "hbase.zookeeper.property.tickTime":"3000", + "zookeeper.session.timeout":"60000" + } + }, + { + "Classification": "hbase", + "Properties": { + "hbase.emr.storageMode":"s3" + } + }, + { + "Classification": "hbase-env", + "Properties": {}, + "Configurations": [{ + "Classification": "export", + "Properties": { + "HBASE_MASTER_OPTS": "-Xmx26624m", + "HBASE_REGIONSERVER_OPTS": "-Xmx26624m" + } + }] + } +] diff --git a/examples/ephemeral-spark/emr-spark-config.json b/examples/ephemeral-spark/emr-spark-config.json new file mode 100644 index 0000000..1594d17 --- /dev/null +++ b/examples/ephemeral-spark/emr-spark-config.json @@ -0,0 +1,12 @@ +[ + { + "Classification":"emrfs-site", + "Properties":{ + "fs.s3.consistent":"false", + "fs.s3.maxConnections":"50000", + "fs.s3.enableServerSideEncryption": "true", + "fs.s3a.enableServerSideEncryption":"true", + "fs.s3.create.allowFileNameEndsWithFolderSuffix": "true" + } + } +] diff --git a/examples/ephemeral-spark/ephemeral-spark.tf b/examples/ephemeral-spark/ephemeral-spark.tf index 50b7d0e..c237906 100644 --- a/examples/ephemeral-spark/ephemeral-spark.tf +++ b/examples/ephemeral-spark/ephemeral-spark.tf @@ -1,10 +1,10 @@ # Ephemeral Spark cluster module "ephemeral-spark-sgs" { source = "git::git@github.com:Datatamer/terraform-aws-emr.git//modules/aws-emr-sgs?ref=9.0.0" - vpc_id = var.vpc_id - emr_managed_sg_name = format("%s-%s", var.name_prefix, "Ephem-Spark-Internal") + vpc_id = local.vpc_id + emr_managed_sg_name = format("%s-%s", local.name_prefix, 
"Ephem-Spark-Internal") emr_service_access_sg_ids = module.aws-emr-sg-service-access.security_group_ids - tags = merge(var.tags, var.emr_tags) + tags = module.tags.tags } module "ephemeral-spark-iam" { @@ -14,10 +14,33 @@ module "ephemeral-spark-iam" { module.s3-logs.rw_policy_arn, module.s3-data.rw_policy_arn ] - vpc_id = var.vpc_id - emr_service_iam_policy_name = "${var.name_prefix}-spark-service-policy" - emr_service_role_name = "${var.name_prefix}-spark-service-role" - emr_ec2_instance_profile_name = "${var.name_prefix}-spark-emr-instance-profile" - emr_ec2_role_name = "${var.name_prefix}-spark-ec2-role" - tags = var.tags + vpc_id = local.vpc_id + emr_service_iam_policy_name = "${local.name_prefix}-spark-service-policy" + emr_service_role_name = "${local.name_prefix}-spark-service-role" + emr_ec2_instance_profile_name = "${local.name_prefix}-spark-emr-instance-profile" + emr_ec2_role_name = "${local.name_prefix}-spark-ec2-role" + tags = module.tags.tags +} + +module "ephemeral-spark-config" { + source = "git::git@github.com:Datatamer/terraform-aws-emr.git//modules/aws-emr-config?ref=9.0.0" + create_static_cluster = false + emr_config_file_path = "${path.module}/emr-spark-config.json" + bucket_name_for_root_directory = module.s3-data.bucket_name + + utility_script_bucket_key = "ephemeral-spark-util/upload_hbase_config.sh" + hadoop_config_path = "ephemeral-spark-config/hadoop/conf/" + hbase_config_path = "ephemeral-spark-config/hbase/conf.dist/" +} + +module "ephemeral-spark-sg-service-access" { + source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" + vpc_id = local.vpc_id + ingress_ports = module.aws-vm-sg-ports.ingress_ports + ingress_cidr_blocks = local.ingress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks + sg_name_prefix = format("%s-%s", local.name_prefix, "spark-emr-service-access") + egress_protocol = "all" + ingress_protocol = "tcp" + tags = module.tags.tags } diff --git 
a/examples/ephemeral-spark/hbase-cluster.tf b/examples/ephemeral-spark/hbase-cluster.tf index c4baeef..a7e07f4 100644 --- a/examples/ephemeral-spark/hbase-cluster.tf +++ b/examples/ephemeral-spark/hbase-cluster.tf @@ -7,16 +7,19 @@ module "emr-hbase" { # Configurations create_static_cluster = true - release_label = "emr-6.6.0" # hbase 2.4.4 + release_label = "emr-6.8.0" applications = local.applications - emr_config_file_path = "${path.module}/../emr.json" - bucket_path_to_logs = "logs/${var.name_prefix}-hbase/" - tags = merge(var.tags, var.emr_tags) - abac_valid_tags = var.emr_abac_valid_tags + emr_config_file_path = "${path.module}/emr-hbase-config.json" + bucket_path_to_logs = "logs/${local.name_prefix}-hbase/" + tags = module.tags.tags + + utility_script_bucket_key = "emr-hbase-util/upload_hbase_config.sh" + hadoop_config_path = "emr-hbase-config/hadoop/conf/" + hbase_config_path = "emr-hbase-config/hbase/conf.dist/" # Networking - subnet_id = var.compute_subnet_id - vpc_id = var.vpc_id + subnet_id = local.compute_subnet_id + vpc_id = local.vpc_id # Security Group IDs emr_managed_master_sg_ids = module.aws-emr-sg-master.security_group_ids emr_managed_core_sg_ids = module.aws-emr-sg-core.security_group_ids @@ -32,22 +35,22 @@ module "emr-hbase" { key_pair_name = module.emr_key_pair.key_pair_key_name # Names - cluster_name = "${var.name_prefix}-HBase-Cluster" - emr_service_role_name = "${var.name_prefix}-hbase-service-role" - emr_ec2_role_name = "${var.name_prefix}-hbase-ec2-role" - emr_ec2_instance_profile_name = "${var.name_prefix}-hbase-emr-instance-profile" - emr_service_iam_policy_name = "${var.name_prefix}-hbase-service-policy" - master_instance_fleet_name = "${var.name_prefix}-HBaseMasterInstanceGroup" - core_instance_fleet_name = "${var.name_prefix}-HBaseCoreInstanceGroup" - emr_managed_sg_name = "${var.name_prefix}-EMR-Managed" + cluster_name = "${local.name_prefix}-HBase-Cluster" + emr_service_role_name = "${local.name_prefix}-hbase-service-role" + emr_ec2_role_name 
= "${local.name_prefix}-hbase-ec2-role" + emr_ec2_instance_profile_name = "${local.name_prefix}-hbase-emr-instance-profile" + emr_service_iam_policy_name = "${local.name_prefix}-hbase-service-policy" + master_instance_fleet_name = "${local.name_prefix}-HBaseMasterInstanceGroup" + core_instance_fleet_name = "${local.name_prefix}-HBaseCoreInstanceGroup" + emr_managed_sg_name = "${local.name_prefix}-EMR-Managed" # Scale - master_instance_on_demand_count = 1 - core_instance_on_demand_count = 4 - master_instance_type = "m6g.xlarge" - core_instance_type = "r6g.xlarge" - master_ebs_size = 50 - core_ebs_size = 200 + master_instance_on_demand_count = local.hbase_master_instance_on_demand_count + core_instance_on_demand_count = local.hbase_core_instance_on_demand_count + master_instance_type = local.hbase_master_instance_type + core_instance_type = local.hbase_core_instance_type + master_ebs_size = local.hbase_master_ebs_size + core_ebs_size = local.hbase_core_ebs_size } module "sg-ports-emr" { @@ -58,37 +61,38 @@ module "sg-ports-emr" { module "aws-emr-sg-master" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = var.ingress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks ingress_security_groups = concat(module.aws-sg-vm.security_group_ids, [module.ephemeral-spark-sgs.emr_managed_sg_id]) - egress_cidr_blocks = var.egress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_ports = module.sg-ports-emr.ingress_master_ports - sg_name_prefix = format("%s-%s", var.name_prefix, "emr-master") + sg_name_prefix = format("%s-%s", local.name_prefix, "emr-master") egress_protocol = "all" ingress_protocol = "tcp" - tags = merge(var.tags, var.emr_tags) + tags = module.tags.tags } module "aws-emr-sg-core" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = 
var.ingress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks ingress_security_groups = concat(module.aws-sg-vm.security_group_ids, [module.ephemeral-spark-sgs.emr_managed_sg_id]) - egress_cidr_blocks = var.egress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_ports = module.sg-ports-emr.ingress_core_ports - sg_name_prefix = format("%s-%s", var.name_prefix, "emr-core") + sg_name_prefix = format("%s-%s", local.name_prefix, "emr-core") egress_protocol = "all" ingress_protocol = "tcp" - tags = merge(var.tags, var.emr_tags) + tags = module.tags.tags } module "aws-emr-sg-service-access" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = var.ingress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_ports = module.sg-ports-emr.ingress_service_access_ports - sg_name_prefix = format("%s-%s", var.name_prefix, "emr-service-access") + sg_name_prefix = format("%s-%s", local.name_prefix, "emr-service-access") egress_protocol = "all" ingress_protocol = "tcp" - tags = merge(var.tags, var.emr_tags) + tags = module.tags.tags } diff --git a/examples/ephemeral-spark/label.tf b/examples/ephemeral-spark/label.tf new file mode 100644 index 0000000..0721e1b --- /dev/null +++ b/examples/ephemeral-spark/label.tf @@ -0,0 +1,8 @@ +module "tags" { + department = "" + environment = "" + owner = "" + product = "" + customer = "" + name = "" +} diff --git a/examples/ephemeral-spark/local.tfvars b/examples/ephemeral-spark/local.tfvars deleted file mode 100644 index dc53bac..0000000 --- a/examples/ephemeral-spark/local.tfvars +++ /dev/null @@ -1,12 +0,0 @@ -name_prefix = "tamr-config-test" -ingress_cidr_blocks = [] # Add VPN CIDR here and any other CIDRs to allow ingress from -ami_id = "ami-0" # Replace me -license_key = "example-license-key" # Replace me -vpc_id = 
"vpc-example" -application_subnet_id = "subnet-us-east-1a" # Replace me -compute_subnet_id = "subnet-us-east-1b" # Replace me -data_subnet_ids = [ - "subnet-us-east-1a", - "subnet-us-east-1b", -] # Replace me with subnet IDs in different AZs -s3_bucket_logging = "example-log-bucket" # Replace me with an existing bucket diff --git a/examples/ephemeral-spark/locals.tf b/examples/ephemeral-spark/locals.tf new file mode 100644 index 0000000..74ee131 --- /dev/null +++ b/examples/ephemeral-spark/locals.tf @@ -0,0 +1,3 @@ +locals { + name_prefix = "" # Enter a name prefix here to apply to all resources for the deployment +} diff --git a/examples/ephemeral-spark/network.tf b/examples/ephemeral-spark/network.tf new file mode 100644 index 0000000..a2d380d --- /dev/null +++ b/examples/ephemeral-spark/network.tf @@ -0,0 +1,19 @@ +locals { + vpc_id = "" # enter a valid vpc id + + # Fill with valid subnets for ec2 and rds instances + ec2-private-a = "" + ec2-private-b = "" + ec2-private-c = "" + rds-private-a = "" + rds-private-b = "" + rds-private-c = "" + + compute_subnet_id = local.ec2-private-a + data_subnet_ids = [local.ec2-private-a, local.ec2-private-b] + + # Fill with corresponding cidr blocks + ingress_cidr_blocks = [""] + egress_cidr_blocks = [""] + +} diff --git a/examples/ephemeral-spark/opensearch.tf b/examples/ephemeral-spark/opensearch.tf index 368df5b..b91338b 100644 --- a/examples/ephemeral-spark/opensearch.tf +++ b/examples/ephemeral-spark/opensearch.tf @@ -2,14 +2,15 @@ module "tamr-opensearch-cluster" { source = "git::git@github.com:Datatamer/terraform-aws-opensearch?ref=6.0.0" # Names - domain_name = "${var.name_prefix}-opensearch" + domain_name = "${local.name_prefix}-opensearch" # In-transit encryption options node_to_node_encryption_enabled = true enforce_https = true # Networking - subnet_ids = [var.data_subnet_ids[0]] + vpc_id = local.vpc_id + subnet_ids = [local.data_subnet_ids[0]] security_group_ids = module.aws-sg-opensearch.security_group_ids } @@ 
-18,19 +19,15 @@ module "sg-ports-opensearch" { source = "git::git@github.com:Datatamer/terraform-aws-es.git//modules/es-ports?ref=5.0.0" } -data "aws_subnet" "application_subnet" { - id = var.application_subnet_id -} - module "aws-sg-opensearch" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = var.ingress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks ingress_security_groups = concat(module.aws-sg-vm.security_group_ids, [module.ephemeral-spark-sgs.emr_managed_sg_id]) - egress_cidr_blocks = var.egress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_ports = module.sg-ports-opensearch.ingress_ports - sg_name_prefix = format("%s-%s", var.name_prefix, "-os") - tags = var.tags + sg_name_prefix = format("%s-%s", local.name_prefix, "-os") + tags = module.tags.tags ingress_protocol = "tcp" egress_protocol = "all" } diff --git a/examples/ephemeral-spark/outputs.tf b/examples/ephemeral-spark/outputs.tf index 0c27ba2..3c446ff 100644 --- a/examples/ephemeral-spark/outputs.tf +++ b/examples/ephemeral-spark/outputs.tf @@ -32,12 +32,16 @@ output "ephemeral-spark-iam" { value = module.ephemeral-spark-iam } +output "ephemeral-spark-config" { + value = module.ephemeral-spark-config +} + output "ephemeral-spark-sgs" { value = module.ephemeral-spark-sgs } output "tamr-config" { - value = module.tamr-config.tamr_config_file + value = module.tamr-config.rendered sensitive = true } diff --git a/examples/ephemeral-spark/versions.tf b/examples/ephemeral-spark/provider.tf similarity index 100% rename from examples/ephemeral-spark/versions.tf rename to examples/ephemeral-spark/provider.tf diff --git a/examples/ephemeral-spark/rds.tf b/examples/ephemeral-spark/rds.tf index 83c4d82..d66a20c 100644 --- a/examples/ephemeral-spark/rds.tf +++ b/examples/ephemeral-spark/rds.tf @@ -5,36 +5,41 @@ resource "random_password" "rds-password" { } module 
"rds-postgres" { - source = "git::git@github.com:Datatamer/terraform-aws-rds-postgres.git?ref=4.0.1" + source = "git::git@github.com:Datatamer/terraform-aws-rds-postgres.git?ref=4.1.0" - identifier_prefix = "${var.name_prefix}-" + identifier_prefix = "${local.name_prefix}-" username = "tamr" password = random_password.rds-password.result - subnet_group_name = "${var.name_prefix}-subnet-group" + subnet_group_name = "${local.name_prefix}-subnet-group" postgres_name = "tamr0" - parameter_group_name = "${var.name_prefix}-rds-postgres-pg" + parameter_group_name = "${local.name_prefix}-rds-postgres-pg" # Network requirement: DB subnet group needs a subnet in at least two AZs - rds_subnet_ids = var.data_subnet_ids + rds_subnet_ids = local.data_subnet_ids + multi_az = false security_group_ids = module.rds-postgres-sg.security_group_ids - tags = var.tags + tags = module.tags.tags + + allocated_storage = 20 + max_allocated_storage = 1000 + instance_class = "db.r6g.large" } module "sg-ports-rds" { - source = "git::git@github.com:Datatamer/terraform-aws-rds-postgres.git//modules/rds-postgres-ports?ref=4.0.1" + source = "git::git@github.com:Datatamer/terraform-aws-rds-postgres.git//modules/rds-postgres-ports?ref=4.1.0" } module "rds-postgres-sg" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = var.ingress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks ingress_security_groups = module.aws-sg-vm.security_group_ids - egress_cidr_blocks = var.egress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_ports = module.sg-ports-rds.ingress_ports - sg_name_prefix = var.name_prefix + sg_name_prefix = local.name_prefix egress_protocol = "all" ingress_protocol = "tcp" - tags = var.tags + tags = module.tags.tags } diff --git a/examples/ephemeral-spark/scale.tf b/examples/ephemeral-spark/scale.tf new file mode 100644 index 0000000..23c27fb --- /dev/null 
+++ b/examples/ephemeral-spark/scale.tf @@ -0,0 +1,25 @@ +locals { + # tamr configs: + spark_driver_memory = "24GB" + spark_executor_instances = "63" + spark_executor_memory = "27GB" + spark_executor_cores = "4" + ec2_instance_type = "r6i.2xlarge" + ec2_volume_size = 250 + + # spark + master_instance_type = "r6g.xlarge" + core_instance_type = "r6g.4xlarge" + core_group_instance_count = 16 + core_ebs_size = 250 + master_ebs_size = 50 + master_group_instance_count = 1 + + # hbase + hbase_master_instance_on_demand_count = 1 + hbase_core_instance_on_demand_count = 24 + hbase_master_instance_type = "r6g.xlarge" + hbase_core_instance_type = "r6g.xlarge" + hbase_master_ebs_size = 50 + hbase_core_ebs_size = 300 +} diff --git a/examples/ephemeral-spark/tamr-config.tf b/examples/ephemeral-spark/tamr-config.tf index edd2ac5..f3f1f72 100644 --- a/examples/ephemeral-spark/tamr-config.tf +++ b/examples/ephemeral-spark/tamr-config.tf @@ -1,14 +1,13 @@ module "tamr-config" { - # source = "git::git@github.com:Datatamer/terraform-aws-tamr-config?ref=2.4.5" + # source = "git::git@github.com:Datatamer/terraform-aws-tamr-config?ref=2.6.0" source = "../.." 
- config_template_path = "tamr-config.yml.tmpl" - rendered_config_path = "./rendered-config.yml" + config_template_path = "${path.module}/tamr-config.yml" ephemeral_spark_configured = true additional_templated_variables = { - "TAMR_LICENSE_KEY" : var.license_key + "TAMR_ES_ENABLED" : "false", } - emr_tags = var.emr_tags + emr_tags = module.tags.tags # Backup tamr_backup_emr_cluster_id = module.emr-hbase.tamr_emr_cluster_id @@ -30,29 +29,30 @@ module "tamr-config" { # Spark spark_emr_cluster_id = "" - spark_cluster_log_uri = "s3n://${module.s3-logs.bucket_name}/${var.path_to_spark_logs}" - spark_driver_memory = "10G" - spark_executor_instances = 7 - spark_executor_memory = "12G" - spark_executor_cores = 2 + spark_cluster_log_uri = "s3n://${module.s3-logs.bucket_name}/" + spark_driver_memory = local.spark_driver_memory + spark_executor_instances = local.spark_executor_instances + spark_executor_memory = local.spark_executor_memory + spark_executor_cores = local.spark_executor_cores tamr_data_path = "tamr/unify-data" - tamr_spark_properties_override = "{'spark.driver.maxResultSize':'4g'}" + tamr_spark_properties_override = "{'spark.dynamicAllocation.enabled':'true','spark.driver.memoryOverhead':'3072','spark.executor.memoryOverhead':'3072','spark.executor.extraJavaOptions':'-Djdk.nio.maxCachedBufferSize=262144','spark.driver.maxResultSize':'8g','spark.task.maxFailures':'25','spark.stage.maxConsecutiveAttempts':'3','spark.sql.shuffle.partitions':'1100','spark.default.parallelism':'900','spark.sql.broadcastTimeout':'30000'}" # Ephemeral Spark - emr_release_label = "emr-5.29.0" # spark 2.4.4 + emr_release_label = "emr-6.5.0" emr_instance_profile_name = module.ephemeral-spark-iam.emr_ec2_instance_profile_name emr_service_role_name = module.ephemeral-spark-iam.emr_service_role_name emr_key_pair_name = module.emr_key_pair.key_pair_key_name - emr_subnet_id = var.compute_subnet_id - master_instance_type = "m4.large" + emr_subnet_id = local.compute_subnet_id + 
master_instance_type = local.master_instance_type master_ebs_volumes_count = 1 - master_ebs_size = 50 + master_ebs_size = local.master_ebs_size master_ebs_type = "gp2" core_ebs_volumes_count = 1 - core_ebs_size = 200 + core_ebs_size = local.core_ebs_size core_ebs_type = "gp2" - core_group_instance_count = 4 - core_instance_type = "r5.xlarge" + core_group_instance_count = local.core_group_instance_count + core_instance_type = local.core_instance_type + emr_cluster_name_prefix = local.name_prefix emr_managed_master_sg_id = module.ephemeral-spark-sgs.emr_managed_sg_id # emr_managed_master_sg_id = "" # you may leave this blank and AWS creates one automatically @@ -61,15 +61,12 @@ module "tamr-config" { # emr_managed_core_sg_id = "" # you may leave this blank and AWS creates one automatically emr_additional_core_sg_id = join(",", module.aws-emr-sg-master.security_group_ids) emr_service_access_sg_id = module.aws-emr-sg-service-access.security_group_ids[0] - - # Data Movement - apps_dms_enabled = false } # Upload the Tamr configuration to S3 resource "aws_s3_bucket_object" "upload_tamr_config" { bucket = module.s3-data.bucket_name key = "tamr/tamr-config.yml" - content = module.tamr-config.tamr_config_file + content = module.tamr-config.rendered server_side_encryption = "AES256" } diff --git a/examples/ephemeral-spark/tamr-config.yml b/examples/ephemeral-spark/tamr-config.yml new file mode 100644 index 0000000..f08a2bd --- /dev/null +++ b/examples/ephemeral-spark/tamr-config.yml @@ -0,0 +1,61 @@ +--- +# RDS +TAMR_PERSISTENCE_DB_USER: ${rds_pg_username} +TAMR_PERSISTENCE_DB_URL: jdbc:postgresql://${rds_pg_hostname}:${rds_pg_db_port}/${rds_pg_dbname} +TAMR_PERSISTENCE_DB_PASS: ${rds_pg_password} +TAMR_PERSISTENCE_DB_PORT: ${rds_pg_db_port} + +# HBase +TAMR_REMOTE_HBASE_ENABLED: true +TAMR_HBASE_REMOTE_DOWNLOAD_ENABLED: true +TAMR_CONNECTION_INFO_TYPE: hbase-site +TAMR_HBASE_NAMESPACE: ${hbase_namespace} +TAMR_HBASE_COMPRESSION: snappy +TAMR_HBASE_CONFIG_URIS: 
s3://${tamr_data_bucket}/${hbase_config_path}hbase-site.xml + +# HBase - Properties +TAMR_HBASE_STORAGE_MODE: ${hbase_storage_mode} +TAMR_HBASE_NUMBER_OF_REGIONS: ${hbase_number_of_regions} +TAMR_HBASE_NUMBER_OF_SALT_VALUES: ${hbase_number_of_salt_values} + +# Spark +TAMR_REMOTE_SPARK_ENABLED: true +TAMR_JOB_SPARK_CLUSTER : emr +TAMR_JOB_EMR_CLUSTER_ID: ${spark_emr_cluster_id} +TAMR_DATASET_EMR_LOG_URI: ${spark_cluster_log_uri} + +# Spark - Scale +TAMR_JOB_SPARK_DRIVER_MEM: ${spark_driver_memory} +TAMR_JOB_SPARK_EXECUTOR_INSTANCES: ${spark_executor_instances} +TAMR_JOB_SPARK_EXECUTOR_MEM: ${spark_executor_memory} +TAMR_JOB_SPARK_EXECUTOR_CORES: ${spark_executor_cores} +TAMR_JOB_SPARK_CONFIG_OVERRIDES: '${tamr_spark_config_override}' +TAMR_JOB_SPARK_PROPS: '${tamr_spark_properties_override}' + +# Elasticsearch +TAMR_REMOTE_ES_ENABLED: true +TAMR_ES_APIHOST: ${es_domain_endpoint}:443 +TAMR_ES_HEALTH_CHECK_METADATA: false +TAMR_ES_SSL_ENABLED: true + +# FileSystem +TAMR_UNIFY_DATA_DIR: s3://${tamr_data_bucket}/${tamr_data_path} +TAMR_FS_URI: s3://${tamr_data_bucket} + +# ESP +TAMR_STORAGE_PROVIDERS: '${tamr_external_storage_providers}' + +# Misc +TAMR_BIGQUERY_ENABLED: false + +# Backup Config +TAMR_FILE_BASED_HBASE_BACKUP_ENABLED: ${tamr_file_based_hbase_backup_enabled} +TAMR_BACKUP_AWS_CLI_ENABLED: ${tamr_backup_aws_cli_enabled} +TAMR_UNIFY_BACKUP_ES: ${tamr_unify_backup_es} +TAMR_UNIFY_BACKUP_AWS_ROLE_BASED_ACCESS: ${tamr_unify_backup_aws_role_based_access} +TAMR_UNIFY_BACKUP_URI: s3://${tamr_data_bucket}/${tamr_unify_backup_path} +TAMR_BACKUP_S3DISTCP_ENABLED: ${tamr_backup_s3distcp_enabled} +TAMR_BACKUP_EMR_CLUSTER_ID: ${tamr_backup_emr_cluster_id} + +# Core Connect +TAMR_CONNECT_DEFAULT_CLOUD_PROVIDER: S3 diff --git a/examples/ephemeral-spark/tamr-vm.tf b/examples/ephemeral-spark/tamr-vm.tf index 3a85b90..0bb15b2 100644 --- a/examples/ephemeral-spark/tamr-vm.tf +++ b/examples/ephemeral-spark/tamr-vm.tf @@ -1,32 +1,28 @@ locals { - ami_id = var.ami_id != "" ? 
var.ami_id : data.aws_ami.tamr-vm.id + ami_id = "ami-06a1f46caddb5669e" # Tamr BYOL AMI from the marketplace. Replace if needed. } -data "aws_ami" "tamr-vm" { - most_recent = true - owners = ["679593333241"] - name_regex = "ami-[a-z0-9]*-with-tamr-v202[0-9]*-[0-9]*gb-[0-9]*-no-license-.*" - filter { - name = "product-code" - values = ["832nkbrayw00cnivlh6nbbi6p"] - } +data "aws_subnet" "application_subnet" { + id = local.compute_subnet_id } module "tamr-vm" { - source = "git::git@github.com:Datatamer/terraform-aws-tamr-vm.git?ref=5.0.0" + source = "git::git@github.com:Datatamer/terraform-aws-tamr-vm.git?ref=5.1.0" ami = local.ami_id - instance_type = "r5.2xlarge" + instance_type = local.ec2_instance_type + volume_size = local.ec2_volume_size key_name = module.emr_key_pair.key_pair_key_name - subnet_id = var.application_subnet_id + subnet_id = local.compute_subnet_id security_group_ids = module.aws-sg-vm.security_group_ids availability_zone = data.aws_subnet.application_subnet.availability_zone - aws_role_name = "${var.name_prefix}-tamr-ec2-role" - aws_instance_profile_name = "${var.name_prefix}-tamrvm-instance-profile" - aws_emr_creator_policy_name = "${var.name_prefix}-emr-creator-policy" + aws_role_name = "${local.name_prefix}-tamr-ec2-role" + aws_instance_profile_name = "${local.name_prefix}-tamrvm-instance-profile" + aws_emr_creator_policy_name = "${local.name_prefix}-emr-creator-policy" additional_policy_arns = [ module.s3-logs.rw_policy_arn, - module.s3-data.rw_policy_arn + module.s3-data.rw_policy_arn, + aws_iam_policy.ssm_policy.arn ] tamr_emr_cluster_ids = [] # leave empty when using ephemeral-spark tamr_emr_role_arns = [ @@ -35,22 +31,57 @@ module "tamr-vm" { module.ephemeral-spark-iam.emr_service_role_arn, module.ephemeral-spark-iam.emr_ec2_role_arn ] - emr_abac_valid_tags = var.emr_abac_valid_tags + tags = module.tags.tags } module "aws-vm-sg-ports" { - source = 
"git::git@github.com:Datatamer/terraform-aws-tamr-vm.git//modules/aws-security-groups?ref=5.0.0" + source = "git::git@github.com:Datatamer/terraform-aws-tamr-vm.git//modules/aws-security-groups?ref=5.1.0" } module "aws-sg-vm" { source = "git::git@github.com:Datatamer/terraform-aws-security-groups.git?ref=1.0.1" - vpc_id = var.vpc_id - ingress_cidr_blocks = var.ingress_cidr_blocks - egress_cidr_blocks = var.egress_cidr_blocks + vpc_id = local.vpc_id + ingress_cidr_blocks = local.ingress_cidr_blocks + egress_cidr_blocks = local.egress_cidr_blocks ingress_protocol = "tcp" egress_protocol = "all" ingress_ports = module.aws-vm-sg-ports.ingress_ports - sg_name_prefix = format("%s-%s", var.name_prefix, "tamr-vm") - tags = var.tags + sg_name_prefix = format("%s-%s", local.name_prefix, "tamr-vm") + tags = module.tags.tags +} + +resource "aws_iam_policy" "ssm_policy" { + name = "${local.name_prefix}-ssm-agent-policy" + policy = data.aws_iam_policy_document.ssm_policy.json + tags = module.tags.tags +} + +data "aws_iam_policy_document" "ssm_policy" { + version = "2012-10-17" + statement { + effect = "Allow" + actions = [ + "ssm:DescribeAssociation", + "ssm:DescribeDocument", + "ssm:GetDeployablePatchSnapshotForInstance", + "ssm:GetDocument", + "ssm:GetManifest", + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:ListAssociations", + "ssm:ListInstanceAssociations", + "ssm:PutComplianceItems", + "ssm:PutConfigurePackageResult", + "ssm:PutInventory", + "ssm:UpdateAssociationStatus", + "ssm:UpdateInstanceAssociationStatus", + "ssm:UpdateInstanceInformation", + "ssmmessages:CreateControlChannel", + "ssmmessages:CreateDataChannel", + "ssmmessages:OpenControlChannel", + "ssmmessages:OpenDataChannel" + ] + resources = ["*"] + } } diff --git a/examples/ephemeral-spark/variables.tf b/examples/ephemeral-spark/variables.tf deleted file mode 100644 index d28c535..0000000 --- a/examples/ephemeral-spark/variables.tf +++ /dev/null @@ -1,83 +0,0 @@ -variable "name_prefix" { - type = string - 
description = "A prefix to add to the names of all created resources." - default = "tamr-config-ephemeral" -} - -variable "path_to_spark_logs" { - type = string - description = "Path in logs bucket to store spark logs. E.g. tamr/spark-logs" - default = "" -} - -variable "ingress_cidr_blocks" { - type = list(string) - description = "List of CIDR blocks from which ingress to ElasticSearch domain, Tamr VM, Tamr Postgres instance are allowed (i.e. VPN CIDR)" - default = [] -} - -variable "ami_id" { - type = string - description = "AMI to use for Tamr EC2 instance" -} - -variable "license_key" { - type = string - description = "Tamr license key" -} - -variable "vpc_id" { - type = string - description = "VPC ID of deployment" -} - -variable "application_subnet_id" { - type = string - description = "Subnet ID for Tamr VM" -} - -variable "compute_subnet_id" { - type = string - description = "Subnet ID for EMR cluster" -} - -variable "data_subnet_ids" { - type = list(string) - description = "List of at least 2 subnet IDs in different AZs" -} - -variable "egress_cidr_blocks" { - type = list(string) - description = "List of CIDR blocks from which ingress to ElasticSearch domain, Tamr VM, Tamr Postgres instance are allowed (i.e. VPN CIDR)" - default = ["0.0.0.0/0"] -} - -variable "tags" { - type = map(string) - description = "Map of tags to add to resources." - default = {} -} - -variable "emr_tags" { - type = map(string) - description = "Map of tags to add to EMR resources. They must contain abac_valid_tags at minimum" - default = {} -} - -variable "emr_abac_valid_tags" { - type = map(list(string)) - description = "Valid tags for maintaining resources when using ABAC IAM Policies with Tag Conditions. Make sure `emr_tags` contain the values specified here and that your Subnet is tagged as well" - default = {} -} - -variable "create_new_service_role" { - default = "false" - type = bool - description = "Whether to create a new IAM service linked role for ES. 
This only needs to happen once per account. If false, linked_service_role is required" -} - -variable "s3_bucket_logging" { - description = "The name of an existing S3 bucket where to store S3 server access logs." - type = string - default = "" -}