-
Notifications
You must be signed in to change notification settings - Fork 374
AN-751 predefinedMachineType runtime attribute
#7817
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
696b193
e2f5bbb
83d50e5
2b6b199
29a0441
775b1ed
9153f00
2eca2f1
192172a
68ce96e
678e29d
c95c551
a5af024
9359832
11791ce
a21c860
dd6fa13
19966c6
721bd0c
67dc13d
aa8dbc4
5f799c8
c5e9025
750328b
112dedd
dbe3b17
63b5f72
f9a948e
3ede6e9
ee131a4
1029f9f
47fd950
8c82dcf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "minimal_hello_world.machine_type": "e2-medium" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| name: e2-medium | ||
| testFormat: workflowsuccess | ||
| backends: [GCPBATCH] | ||
|
|
||
| files { | ||
| workflow: gcp_machine_type.wdl | ||
| inputs: e2-medium.json | ||
| } | ||
|
|
||
| # `e2-medium` is the cheapest machine that works decently in Batch, costing 20% less | ||
| # than the next alternative. May be suitable for a variety of "I just need a VM" tasks. | ||
| # https://cloud.google.com/compute/docs/general-purpose-machines#sharedcore | ||
| metadata { | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.predefinedMachineType": "e2-medium" | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.preemptible": "0" | ||
| "outputs.minimal_hello_world.actual_machine_type": ~~"machineTypes/e2-medium" | ||
| "outputs.minimal_hello_world.is_preemptible": "FALSE" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "minimal_hello_world.machine_type": "banana" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| name: gcp_machine_type | ||
| testFormat: workflowsuccess | ||
| backends: [GCPBATCH] | ||
|
|
||
| files { | ||
| workflow: gcp_machine_type.wdl | ||
| } | ||
|
|
||
| metadata { | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.predefinedMachineType": "n2-standard-2" | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.preemptible": "0" | ||
| "outputs.minimal_hello_world.actual_machine_type": ~~"machineTypes/n2-standard-2" | ||
| "outputs.minimal_hello_world.is_preemptible": "FALSE" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,61 @@ | ||
| version 1.0 | ||
|
|
||
| workflow minimal_hello_world { | ||
| input { | ||
| String image = "rockylinux/rockylinux:10" | ||
| String machine_type = "n2-standard-2" | ||
| Int preemptible = 0 | ||
| String zones = "northamerica-northeast1-a northamerica-northeast1-b northamerica-northeast1-c" | ||
| } | ||
|
|
||
| call hello_world { | ||
| input: | ||
| image = image, | ||
| machine_type = machine_type, | ||
| preemptible = preemptible, | ||
| zones = zones | ||
| } | ||
|
|
||
| output { | ||
| String stdout = hello_world.stdout | ||
| String actual_machine_type = hello_world.actual_machine_type | ||
| String is_preemptible = hello_world.is_preemptible | ||
| } | ||
| } | ||
|
|
||
| task hello_world { | ||
|
|
||
| input { | ||
| String image | ||
| String machine_type | ||
| Int preemptible | ||
| String zones | ||
| } | ||
|
|
||
| # Check machine specs by querying instance metadata | ||
| # https://cloud.google.com/compute/docs/metadata/predefined-metadata-keys#instance-metadata | ||
| command <<< | ||
| cat /etc/os-release | ||
| uname -a | ||
| cat /proc/cpuinfo | ||
| curl --header "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type > actual_machine_type.txt | ||
| curl --header "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/scheduling/preemptible > is_preemptible.txt | ||
| >>> | ||
|
|
||
| runtime { | ||
| docker: image | ||
| predefinedMachineType: machine_type | ||
| preemptible: preemptible | ||
| zones: zones | ||
| } | ||
|
|
||
| meta { | ||
| volatile: true | ||
| } | ||
|
|
||
| output { | ||
| String stdout = read_string(stdout()) | ||
| String actual_machine_type = read_string("actual_machine_type.txt") | ||
| String is_preemptible = read_string("is_preemptible.txt") | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| name: gcp_machine_type_fail | ||
| testFormat: workflowfailure | ||
| backends: [GCPBATCH] | ||
|
|
||
| files { | ||
| workflow: gcp_machine_type.wdl | ||
| inputs: fail_inputs.json | ||
| } | ||
|
|
||
| # Batch rejects the task and Cromwell fails it in an orderly manner | ||
| metadata { | ||
| "failures.0.causedBy.0.message": ~~"GCP Batch task exited with Success(0). " | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| name: gcp_machine_type_gpu | ||
| testFormat: workflowsuccess | ||
| backends: [GCPBATCH] | ||
|
|
||
| # Creates a `g2-standard-4` VM: 1 NVIDIA L4 GPU, 4 vCPUs, 16GB RAM | ||
| # This is the cheapest machine type under the new type-based GPU model, replacing the older machine type + gpu type scheme. | ||
| # For more information, see https://broadworkbench.atlassian.net/browse/AN-758 | ||
|
|
||
| files { | ||
| workflow: gcp_machine_type.wdl | ||
| inputs: gpu_inputs.json | ||
| } | ||
|
|
||
| metadata { | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.predefinedMachineType": "g2-standard-4" | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.preemptible": "0" | ||
| "outputs.minimal_hello_world.actual_machine_type": ~~"machineTypes/g2-standard-4" | ||
| "outputs.minimal_hello_world.is_preemptible": "FALSE" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| name: gcp_machine_type_preemptible | ||
| testFormat: workflowsuccess | ||
| backends: [GCPBATCH] | ||
|
|
||
| files { | ||
| workflow: gcp_machine_type.wdl | ||
| inputs: preemptible_inputs.json | ||
| } | ||
|
|
||
| metadata { | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.predefinedMachineType": "n2-standard-2" | ||
| "calls.minimal_hello_world.hello_world.runtimeAttributes.preemptible": "5" | ||
| "outputs.minimal_hello_world.actual_machine_type": ~~"machineTypes/n2-standard-2" | ||
| "outputs.minimal_hello_world.is_preemptible": "TRUE" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| { | ||
| "minimal_hello_world.machine_type": "g2-standard-4", | ||
| "minimal_hello_world.zones": "us-east4-a us-east4-c" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "minimal_hello_world.preemptible": 5 | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -81,7 +81,7 @@ object GcpBatchAsyncBackendJobExecutionActor { | |
|
|
||
| new Exception( | ||
| s"Task $jobTag failed. $returnCodeMessage GCP Batch task exited with ${errorCode}(${errorCode.code}). ${message}" | ||
| ) | ||
| ) with NoStackTrace | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why no stack trace? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When we deliberately create exceptions in the program flow, my opinion is that they should never have a stack trace, as it clutters the log and is not relevant for debugging. A second-order issue is that users often diligently copy-paste entire stack traces, rendering Slack threads and Zendesk cases unreadable. After: Before: |
||
| } | ||
|
|
||
| // GCS path regexes comments: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,7 +25,7 @@ import com.google.cloud.batch.v1.{ | |
| import com.google.protobuf.Duration | ||
| import cromwell.backend.google.batch.io.GcpBatchAttachedDisk | ||
| import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration | ||
| import cromwell.backend.google.batch.models.{GcpBatchRequest, VpcAndSubnetworkProjectLabelValues} | ||
| import cromwell.backend.google.batch.models.{GcpBatchRequest, MachineType, VpcAndSubnetworkProjectLabelValues} | ||
| import cromwell.backend.google.batch.runnable._ | ||
| import cromwell.backend.google.batch.util.{BatchUtilityConversions, GcpBatchMachineConstraints} | ||
| import cromwell.core.labels.{Label, Labels} | ||
|
|
@@ -256,14 +256,33 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe | |
| isBackground = _.getBackground | ||
| ) | ||
|
|
||
| /** | ||
| * The "compute resource" concept is a suggestion to Batch regarding how many jobs can fit on a single VM. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure I 100% understand. Why do we supply the compute resource if it is not used and leads to UI confusion? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because otherwise Google displays default values that are even more wrong. If we make a one-line change to
In the future we could enhance the code to calculate a CPU and memory size for each There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, gotcha. Yeah, definitely a follow-up thing to do, if you can add that to the cost ticket |
||
| * The Cromwell backend currently creates VMs at a 1:1 ratio with jobs, so the compute resource is effectively ignored. | ||
| * | ||
| * That said, it has a cosmetic effect in the Batch web UI, where it drives the "Cores" and "Memory" readouts. | ||
| * The machine type is the "real" VM shape; one can set bogus cores/memory in the compute resource, | ||
| * and it will have no effect other than the display. | ||
| */ | ||
| val computeResource = createComputeResource(cpuCores, memory, gcpBootDiskSizeMb) | ||
| val taskSpec = createTaskSpec(sortedRunnables, computeResource, durationInSeconds, allVolumes) | ||
| val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec) | ||
| val machineType = GcpBatchMachineConstraints.machineType(runtimeAttributes.memory, | ||
| runtimeAttributes.cpu, | ||
| cpuPlatformOption = runtimeAttributes.cpuPlatform, | ||
| jobLogger = jobLogger | ||
| ) | ||
|
|
||
| val machineType = runtimeAttributes.machine match { | ||
| case Some(m: MachineType) => | ||
| // Allow users to select predefined machine types, such as `n2-standard-4`. | ||
| // Overrides CPU count and memory attributes. | ||
| // We still pass platform when machine is specified, it is the user's responsibility to select a valid type/platform combination | ||
| m.machineType | ||
| case None => | ||
| // CPU platform drives selection of machine type, but is not encoded in the `machineType` return value itself | ||
| GcpBatchMachineConstraints.machineType(runtimeAttributes.memory, | ||
| runtimeAttributes.cpu, | ||
| cpuPlatformOption = runtimeAttributes.cpuPlatform, | ||
| jobLogger = jobLogger | ||
| ) | ||
| } | ||
|
|
||
| val instancePolicy = | ||
| createInstancePolicy(cpuPlatform = cpuPlatform, spotModel, accelerators, allDisks, machineType = machineType) | ||
| val locationPolicy = LocationPolicy.newBuilder.addAllAllowedLocations(zones.asJava).build | ||
|
|
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Disable no-op Scaladoc generation: We write the HTML docs to disk on the CI instance and then throw them away.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(My block comment in GcpBatchRequestFactoryImpl.scala is invalid Scaladoc, which is how I found out we did Scaladoc)