
Commit b51118e

Merge pull request #22 from Datatamer/spark-configurable
DEV-16800 - Exposes Spark & Dataproc configuration
2 parents 40a50e4 + 8480b92

5 files changed: +103 −5 lines

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -1,5 +1,8 @@
 # GCP Tamr Wrapper
 
+## v2.1.0 - February 9th 2023
+* exposes spark & dataproc settings from the Tamr configuration module
+
 ## v2.0.4 - August 12th 2022
 * corrects IAM module version
 

README.md

Lines changed: 11 additions & 0 deletions

@@ -43,8 +43,19 @@ No provider.
 | additional\_admin\_users | list of additional entities to give admin permissions to provisioned resources | `list(string)` | `[]` | no |
 | additional\_read\_users | list of additional entities to give read only permissions to provisioned resources | `list(string)` | `[]` | no |
 | bucket\_locations | Location for the gcs buckets, default is `US` | `string` | `"US"` | no |
+| dataproc\_cluster\_master\_disk\_size | Size of the disk to use on the dataproc master node | `number` | `1000` | no |
+| dataproc\_cluster\_master\_instance\_type | Instance type to use as the dataproc master | `string` | `"n1-highmem-4"` | no |
+| dataproc\_cluster\_worker\_machine\_type | Machine type of the default worker pool | `string` | `"n1-standard-16"` | no |
+| dataproc\_cluster\_worker\_num\_instances | Number of default workers to use | `number` | `4` | no |
+| dataproc\_cluster\_worker\_num\_local\_ssds | Number of local SSDs to attach to each worker node | `number` | `2` | no |
+| dataproc\_image\_version | Dataproc image version | `string` | `"1.4"` | no |
 | force\_destroy | force destroy potentially persistent resources, like bigtable/gcs | `bool` | `false` | no |
 | labels | Labels to attach to created resources | `map(string)` | `{}` | no |
+| spark\_driver\_memory | Amount of memory Spark should allocate to the Spark driver | `string` | `"12G"` | no |
+| spark\_executor\_cores | Number of cores Spark should allocate to each Spark executor | `number` | `5` | no |
+| spark\_executor\_instances | Number of Spark executor instances | `number` | `12` | no |
+| spark\_executor\_memory | Amount of memory Spark should allocate to each Spark executor | `string` | `"13G"` | no |
+| spark\_properties\_override | JSON blob of Spark properties to override; if not set, a default set of properties that should work for most use cases is used | `string` | `""` | no |
 | sql\_disk\_size | size of the disk to use on the tamr sql instance | `number` | `10` | no |
 | sql\_disk\_type | The disk type to use on the cloud SQL instance. should be either PD\_SSD or PD\_STANDARD | `string` | `"PD_SSD"` | no |
 | sql\_tier | the machine type to use for the sql instance | `string` | `"db-custom-2-4096"` | no |
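
Since every new input above is optional and defaults to the previous hard-coded behavior, existing consumers are unaffected; overriding them is a plain variable assignment on the module call. A minimal sketch, assuming the wrapper is consumed from this repository's git address and that the required project/networking inputs are wired elsewhere (both assumptions, not shown in this diff):

module "tamr" {
  # hypothetical source pin; substitute the real address and tag for this wrapper
  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-wrapper.git?ref=v2.1.0"

  # ... required inputs (project, region, zone, subnetwork, ...) omitted ...

  # Dataproc sizing overrides (defaults: 4 x n1-standard-16 workers, 2 local SSDs each)
  dataproc_cluster_worker_machine_type   = "n1-highmem-16"
  dataproc_cluster_worker_num_instances  = 6
  dataproc_cluster_worker_num_local_ssds = 2

  # Spark resource overrides (defaults: 12 executors x 5 cores x 13G, 12G driver)
  spark_executor_instances = 18
  spark_executor_cores     = 5
  spark_executor_memory    = "13G"
  spark_driver_memory      = "12G"
}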

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.0.2
+2.1.0

main.tf

Lines changed: 16 additions & 4 deletions

@@ -89,7 +89,7 @@ module "tamr_vm" {
 }
 
 module "config" {
-  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-config.git?ref=v1.0.2"
+  source = "git::git@github.com:Datatamer/terraform-gcp-tamr-config.git?ref=v2.1.0"
 
   # tamr VM
   tamr_instance_zone = var.zone
@@ -106,12 +106,24 @@ module "config" {
   tamr_dataproc_bucket = module.gcs_buckets.dataproc_bucket_name
   tamr_dataproc_region = var.region
   # dataproc_cluster_config
-  tamr_dataproc_cluster_subnetwork_uri = local.subnetwork
-  tamr_dataproc_cluster_zone           = var.zone
+  tamr_dataproc_cluster_subnetwork_uri        = local.subnetwork
+  tamr_dataproc_cluster_zone                  = var.zone
+  tamr_dataproc_cluster_master_instance_type  = var.dataproc_cluster_master_instance_type
+  tamr_dataproc_cluster_master_disk_size      = var.dataproc_cluster_master_disk_size
+  tamr_dataproc_cluster_worker_machine_type   = var.dataproc_cluster_worker_machine_type
+  tamr_dataproc_cluster_worker_num_instances  = var.dataproc_cluster_worker_num_instances
+  tamr_dataproc_cluster_worker_num_local_ssds = var.dataproc_cluster_worker_num_local_ssds
+  tamr_dataproc_image_version                 = var.dataproc_image_version
+  # spark
+  tamr_spark_driver_memory       = var.spark_driver_memory
+  tamr_spark_executor_memory     = var.spark_executor_memory
+  tamr_spark_executor_cores      = var.spark_executor_cores
+  tamr_spark_executor_instances  = var.spark_executor_instances
+  tamr_spark_properties_override = var.spark_properties_override
   # cloud sql
   tamr_cloud_sql_location = var.region
   tamr_cloud_sql_name     = module.cloud_sql.instance_name
   tamr_sql_password       = module.cloud_sql.tamr_password
   # filesystem
   tamr_filesystem_bucket = module.gcs_buckets.tamr_bucket_name
-}
+}
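
One caveat when overriding these inputs independently: the Spark defaults are sized against the Dataproc defaults. Assuming executors land only on the default worker pool, 4 n1-standard-16 workers supply 64 vCPUs and about 240 GB of RAM (16 vCPUs / 60 GB each), while the default 12 executors consume 12 × 5 = 60 cores and 12 × 13G = 156G of executor memory, leaving headroom for the driver and YARN overhead. If you shrink or grow the worker pool, re-derive the executor count and memory with the same arithmetic.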

variables.tf

Lines changed: 72 additions & 0 deletions

@@ -73,7 +73,79 @@ variable "tamr_bigtable_max_nodes" {
   type        = number
   description = "Max number of nodes to scale up to"
 }
+
+#
+# Dataproc
+#
+variable "dataproc_cluster_master_instance_type" {
+  default     = "n1-highmem-4"
+  type        = string
+  description = "Instance type to use as the dataproc master"
+}
+
+variable "dataproc_cluster_master_disk_size" {
+  default     = 1000
+  type        = number
+  description = "Size of the disk to use on the dataproc master node"
+}
+
+variable "dataproc_cluster_worker_machine_type" {
+  default     = "n1-standard-16"
+  type        = string
+  description = "Machine type of the default worker pool"
+}
+
+variable "dataproc_cluster_worker_num_instances" {
+  default     = 4
+  type        = number
+  description = "Number of default workers to use"
+}
+
+variable "dataproc_cluster_worker_num_local_ssds" {
+  default     = 2
+  type        = number
+  description = "Number of local SSDs to attach to each worker node"
+}
+
+variable "dataproc_image_version" {
+  default     = "1.4"
+  type        = string
+  description = "Dataproc image version"
+}
+
+#
+# spark settings
 #
+variable "spark_driver_memory" {
+  default     = "12G"
+  type        = string
+  description = "Amount of memory Spark should allocate to the Spark driver"
+}
+
+variable "spark_executor_memory" {
+  default     = "13G"
+  type        = string
+  description = "Amount of memory Spark should allocate to each Spark executor"
+}
+
+variable "spark_executor_cores" {
+  default     = 5
+  type        = number
+  description = "Number of cores Spark should allocate to each Spark executor"
+}
+
+variable "spark_executor_instances" {
+  default     = 12
+  type        = number
+  description = "Number of Spark executor instances"
+}
+
+variable "spark_properties_override" {
+  default     = ""
+  type        = string
+  description = "JSON blob of Spark properties to override; if not set, a default set of properties that should work for most use cases is used"
+}
+
 # GCS
 #
 variable "bucket_locations" {
