
Commit b51118e

Merge pull request #22 from Datatamer/spark-configurable
DEV-16800 - Exposes Spark & Dataproc configuration
2 parents 40a50e4 + 8480b92

5 files changed: +103 −5 lines

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -1,5 +1,8 @@
 # GCP Tamr Wrapper
 
+## v2.1.0 - February 9th 2023
+* exposes spark & dataproc settings from the Tamr configuration module
+
 ## v2.0.4 - August 12th 2022
 * corrects IAM module version
 

README.md

Lines changed: 11 additions & 0 deletions

@@ -43,8 +43,19 @@ No provider.
 | additional\_admin\_users | list of additional entities to give admin permissions to provisioned resources | `list(string)` | `[]` | no |
 | additional\_read\_users | list of additional entities to give read only permissions to provisioned resources | `list(string)` | `[]` | no |
 | bucket\_locations | Location for the gcs buckets, default is `US` | `string` | `"US"` | no |
+| dataproc\_cluster\_master\_disk\_size | Size of the disk to use on the dataproc master node | `number` | `1000` | no |
+| dataproc\_cluster\_master\_instance\_type | Instance type to use as the dataproc master | `string` | `"n1-highmem-4"` | no |
+| dataproc\_cluster\_worker\_machine\_type | Machine type of the default worker pool | `string` | `"n1-standard-16"` | no |
+| dataproc\_cluster\_worker\_num\_instances | Number of default workers to use | `number` | `4` | no |
+| dataproc\_cluster\_worker\_num\_local\_ssds | Number of local SSDs to attach to each worker node | `number` | `2` | no |
+| dataproc\_image\_version | Dataproc image version | `string` | `"1.4"` | no |
 | force\_destroy | force destroy potentially persistent resources, like bigtable/gcs | `bool` | `false` | no |
 | labels | Labels to attach to created resources | `map(string)` | `{}` | no |
+| spark\_driver\_memory | Amount of memory Spark should allocate to the Spark driver | `string` | `"12G"` | no |
+| spark\_executor\_cores | Number of cores Spark should allocate to each Spark executor | `number` | `5` | no |
+| spark\_executor\_instances | Number of Spark executor instances | `number` | `12` | no |
+| spark\_executor\_memory | Amount of memory Spark should allocate to each Spark executor | `string` | `"13G"` | no |
+| spark\_properties\_override | JSON blob of Spark properties to override; if not set, a default set of properties that should work for most use cases is used | `string` | `""` | no |
 | sql\_disk\_size | size of the disk to use on the tamr sql instance | `number` | `10` | no |
 | sql\_disk\_type | The disk type to use on the cloud SQL instance. should be either PD\_SSD or PD\_STANDARD | `string` | `"PD_SSD"` | no |
 | sql\_tier | the machine type to use for the sql instance | `string` | `"db-custom-2-4096"` | no |
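
Since every new input above is optional and defaults to the previous hard-coded behavior, existing consumers are unaffected; overriding them is a plain variable assignment on the module call. A minimal sketch, assuming the wrapper is consumed from this repository's git address and that the required project/networking inputs are wired elsewhere (both assumptions, not shown in this diff):

module "tamr" {
  # hypothetical source pin; substitute the real address and tag for this wrapper
  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-wrapper.git?ref=v2.1.0"

  # ... required inputs (project, region, zone, subnetwork, ...) omitted ...

  # Dataproc sizing overrides (defaults: 4 x n1-standard-16 workers, 2 local SSDs each)
  dataproc_cluster_worker_machine_type   = "n1-highmem-16"
  dataproc_cluster_worker_num_instances  = 6
  dataproc_cluster_worker_num_local_ssds = 2

  # Spark resource overrides (defaults: 12 executors x 5 cores x 13G, 12G driver)
  spark_executor_instances = 18
  spark_executor_cores     = 5
  spark_executor_memory    = "13G"
  spark_driver_memory      = "12G"
}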

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.0.2
+2.1.0

main.tf

Lines changed: 16 additions & 4 deletions

@@ -89,7 +89,7 @@ module "tamr_vm" {
 }
 
 module "config" {
-  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-config.git?ref=v1.0.2"
+  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-config.git?ref=v2.1.0"
 
   # tamr VM
   tamr_instance_zone = var.zone
@@ -106,12 +106,24 @@ module "config" {
   tamr_dataproc_bucket = module.gcs_buckets.dataproc_bucket_name
   tamr_dataproc_region = var.region
   # dataproc_cluster_config
-  tamr_dataproc_cluster_subnetwork_uri = local.subnetwork
-  tamr_dataproc_cluster_zone           = var.zone
+  tamr_dataproc_cluster_subnetwork_uri        = local.subnetwork
+  tamr_dataproc_cluster_zone                  = var.zone
+  tamr_dataproc_cluster_master_instance_type  = var.dataproc_cluster_master_instance_type
+  tamr_dataproc_cluster_master_disk_size      = var.dataproc_cluster_master_disk_size
+  tamr_dataproc_cluster_worker_machine_type   = var.dataproc_cluster_worker_machine_type
+  tamr_dataproc_cluster_worker_num_instances  = var.dataproc_cluster_worker_num_instances
+  tamr_dataproc_cluster_worker_num_local_ssds = var.dataproc_cluster_worker_num_local_ssds
+  tamr_dataproc_image_version                 = var.dataproc_image_version
+  # spark
+  tamr_spark_driver_memory       = var.spark_driver_memory
+  tamr_spark_executor_memory     = var.spark_executor_memory
+  tamr_spark_executor_cores      = var.spark_executor_cores
+  tamr_spark_executor_instances  = var.spark_executor_instances
+  tamr_spark_properties_override = var.spark_properties_override
   # cloud sql
   tamr_cloud_sql_location = var.region
   tamr_cloud_sql_name     = module.cloud_sql.instance_name
   tamr_sql_password       = module.cloud_sql.tamr_password
   # filesystem
   tamr_filesystem_bucket = module.gcs_buckets.tamr_bucket_name
-}
+}
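
One caveat when overriding these inputs independently: the Spark defaults are sized against the Dataproc defaults. Assuming executors land only on the default worker pool, 4 n1-standard-16 workers supply 64 vCPUs and about 240 GB of RAM (16 vCPUs / 60 GB each), while the default 12 executors consume 12 × 5 = 60 cores and 12 × 13G = 156G of executor memory, leaving headroom for the driver and YARN overhead. If you shrink or grow the worker pool, re-derive the executor count and memory with the same arithmetic.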

variables.tf

Lines changed: 72 additions & 0 deletions

@@ -73,7 +73,79 @@ variable "tamr_bigtable_max_nodes" {
   type        = number
   description = "Max number of nodes to scale up to"
 }
+
+#
+# Dataproc
+#
+variable "dataproc_cluster_master_instance_type" {
+  default     = "n1-highmem-4"
+  type        = string
+  description = "Instance type to use as the dataproc master"
+}
+
+variable "dataproc_cluster_master_disk_size" {
+  default     = 1000
+  type        = number
+  description = "Size of the disk to use on the dataproc master node"
+}
+
+variable "dataproc_cluster_worker_machine_type" {
+  default     = "n1-standard-16"
+  type        = string
+  description = "Machine type of the default worker pool"
+}
+
+variable "dataproc_cluster_worker_num_instances" {
+  default     = 4
+  type        = number
+  description = "Number of default workers to use"
+}
+
+variable "dataproc_cluster_worker_num_local_ssds" {
+  default     = 2
+  type        = number
+  description = "Number of local SSDs to attach to each worker node"
+}
+
+variable "dataproc_image_version" {
+  default     = "1.4"
+  type        = string
+  description = "Dataproc image version"
+}
+
+#
+# spark settings
 #
+variable "spark_driver_memory" {
+  default     = "12G"
+  type        = string
+  description = "Amount of memory Spark should allocate to the Spark driver"
+}
+
+variable "spark_executor_memory" {
+  default     = "13G"
+  type        = string
+  description = "Amount of memory Spark should allocate to each Spark executor"
+}
+
+variable "spark_executor_cores" {
+  default     = 5
+  type        = number
+  description = "Number of cores Spark should allocate to each Spark executor"
+}
+
+variable "spark_executor_instances" {
+  default     = 12
+  type        = number
+  description = "Number of Spark executor instances"
+}
+
+variable "spark_properties_override" {
+  default     = ""
+  type        = string
+  description = "JSON blob of Spark properties to override; if not set, a default set of properties that should work for most use cases is used"
+}
+
 # GCS
 #
 variable "bucket_locations" {
