From 84ca918db8a0c61bdba889fd71cd49b69df46c10 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Wed, 28 Aug 2024 09:29:37 -0400 Subject: [PATCH 1/2] WX-1810 WX-1830 n1/n2/n2d machine types, cpuPlatform on GCPBATCH (#7518) --- CHANGELOG.md | 2 + build.sbt | 1 + .../standardTestCases/papi_cpu_platform.test | 3 +- .../actors/BatchApiRunCreationClient.scala | 2 +- .../batch/api/GcpBatchRequestFactory.scala | 3 +- .../api/GcpBatchRequestFactoryImpl.scala | 18 +++++++-- .../models/GcpBatchRuntimeAttributes.scala | 1 + .../util/GcpBatchMachineConstraints.scala | 7 ++-- ...tchAsyncBackendJobExecutionActorSpec.scala | 2 +- .../util/GcpBatchMachineConstraintsSpec.scala | 39 ++++++++++++------- 10 files changed, 54 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0645cabcaa..3e9aacb801a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ be found [here](https://cromwell.readthedocs.io/en/stable/backends/HPC/#optional ### GCP Batch - The `genomics` configuration entry was renamed to `batch`, see [ReadTheDocs](https://cromwell.readthedocs.io/en/stable/backends/GCPBatch/) for more information. +- Fixes a bug with not being able to recover jobs on Cromwell restart. +- Fixes machine type selection to match the Google Cloud Life Sciences backend, including default n1 non shared-core machine types and correct handling of `cpuPlatform` to select n2 or n2d machine types as appropriate. - Fixes the preemption error handling, now, the correct error message is printed, this also handles the other potential exit codes. - Fixes pulling Docker image metadata from private GCR repositories. - Fixed `google_project` and `google_compute_service_account` workflow options not taking effect when using GCP Batch backend diff --git a/build.sbt b/build.sbt index b69784f5698..2ca612c5ef2 100644 --- a/build.sbt +++ b/build.sbt @@ -237,6 +237,7 @@ lazy val googlePipelinesV2Beta = (project in backendRoot / "google" / "pipelines lazy val googleBatch = (project in backendRoot / "google" / "batch") .withLibrarySettings("cromwell-google-batch-backend") + .dependsOn(core) .dependsOn(backend) .dependsOn(gcsFileSystem) .dependsOn(drsFileSystem) diff --git a/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test b/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test index 7b38c3a25d7..030c36e2b31 100644 --- a/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test +++ b/centaur/src/main/resources/standardTestCases/papi_cpu_platform.test @@ -1,6 +1,7 @@ name: papi_cpu_platform testFormat: workflowsuccess -backends: [Papiv2] +backendsMode: any +backends: [Papiv2, GCPBATCH] files { workflow: papi_cpu_platform/papi_cpu_platform.wdl diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala index e31879e77b2..4dec231d462 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/BatchApiRunCreationClient.scala @@ -53,7 +53,7 @@ trait BatchApiRunCreationClient { this: Actor with ActorLogging with BatchInstru backendSingletonActor ! BatchApiRequestManager.BatchRunCreationRequest( request.workflowId, self, - requestFactory.submitRequest(request) + requestFactory.submitRequest(request, jobLogger) ) val newPromise = Promise[StandardAsyncJob]() runCreationClientPromise = Option(newPromise) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala index ef38c8972c1..fe912d340f9 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactory.scala @@ -6,13 +6,14 @@ import cromwell.backend.google.batch.io.GcpBatchAttachedDisk import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.VirtualPrivateCloudConfiguration import cromwell.backend.google.batch.models._ import cromwell.backend.google.batch.monitoring.{CheckpointingConfiguration, MonitoringImage} +import cromwell.core.logging.JobLogger import cromwell.core.path.Path import wom.runtime.WomOutputRuntimeExtractor import scala.concurrent.duration.FiniteDuration trait GcpBatchRequestFactory { - def submitRequest(data: GcpBatchRequest): CreateJobRequest + def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest def queryRequest(jobName: JobName): GetJobRequest diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala index 64a97c34b79..745c39ca587 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala @@ -22,7 +22,8 @@ import cromwell.backend.google.batch.io.GcpBatchAttachedDisk import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.GcsTransferConfiguration import cromwell.backend.google.batch.models.{GcpBatchRequest, VpcAndSubnetworkProjectLabelValues} import cromwell.backend.google.batch.runnable._ -import cromwell.backend.google.batch.util.BatchUtilityConversions +import cromwell.backend.google.batch.util.{BatchUtilityConversions, GcpBatchMachineConstraints} +import cromwell.core.logging.JobLogger import scala.jdk.CollectionConverters._ @@ -74,7 +75,8 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe private def createInstancePolicy(cpuPlatform: String, spotModel: ProvisioningModel, accelerators: Option[Accelerator.Builder], - attachedDisks: List[AttachedDisk] + attachedDisks: List[AttachedDisk], + machineType: String ): InstancePolicy.Builder = { // set GPU count to 0 if not included in workflow @@ -82,6 +84,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe val instancePolicy = InstancePolicy.newBuilder .setProvisioningModel(spotModel) + .setMachineType(machineType) .addAllDisks(attachedDisks.asJava) .setMinCpuPlatform(cpuPlatform) .buildPartial() @@ -154,7 +157,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe } } - override def submitRequest(data: GcpBatchRequest): CreateJobRequest = { + override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = { val runtimeAttributes = data.gcpBatchParameters.runtimeAttributes val createParameters = data.createParameters @@ -224,7 +227,14 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe val computeResource = createComputeResource(cpuCores, memory, gcpBootDiskSizeMb) val taskSpec = createTaskSpec(sortedRunnables, computeResource, retryCount, durationInSeconds, allVolumes) val taskGroup: TaskGroup = createTaskGroup(taskCount, taskSpec) - val instancePolicy = createInstancePolicy(cpuPlatform, spotModel, accelerators, allDisks) + val machineType = GcpBatchMachineConstraints.machineType(runtimeAttributes.memory, + runtimeAttributes.cpu, + cpuPlatformOption = runtimeAttributes.cpuPlatform, + googleLegacyMachineSelection = false, + jobLogger = jobLogger + ) + val instancePolicy = + createInstancePolicy(cpuPlatform = cpuPlatform, spotModel, accelerators, allDisks, machineType = machineType) val locationPolicy = LocationPolicy.newBuilder.addAllowedLocations(zones).build val allocationPolicy = createAllocationPolicy(data, locationPolicy, instancePolicy.build, networkPolicy, gcpSa, accelerators) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala index 550d2c79866..4609af772d7 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala @@ -77,6 +77,7 @@ object GcpBatchRuntimeAttributes { private val cpuPlatformValidationInstance = new StringRuntimeAttributesValidation(CpuPlatformKey).optional // via `gcloud compute zones describe us-central1-a` val CpuPlatformIntelCascadeLakeValue = "Intel Cascade Lake" + val CpuPlatformIntelIceLakeValue = "Intel Ice Lake" val CpuPlatformAMDRomeValue = "AMD Rome" val UseDockerImageCacheKey = "useDockerImageCache" diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala index 2bbf835eeef..bbb7e940297 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala @@ -6,9 +6,9 @@ import cromwell.backend.google.batch.models.{ N2CustomMachineType, N2DCustomMachineType } +import cromwell.core.logging.JobLogger import eu.timepit.refined.api.Refined import eu.timepit.refined.numeric.Positive -import org.slf4j.Logger import wdl4s.parser.MemoryUnit import wom.format.MemorySize @@ -17,16 +17,17 @@ object GcpBatchMachineConstraints { cpu: Int Refined Positive, cpuPlatformOption: Option[String], googleLegacyMachineSelection: Boolean, - jobLogger: Logger + jobLogger: JobLogger ): String = if (googleLegacyMachineSelection) { s"predefined-$cpu-${memory.to(MemoryUnit.MB).amount.intValue()}" } else { - // If someone requests Intel Cascade Lake as their CPU platform then switch the machine type to n2. + // If someone requests Intel Cascade Lake or Intel Ice Lake as their CPU platform then switch the machine type to n2. // Similarly, CPU platform of AMD Rome corresponds to the machine type n2d. val customMachineType = cpuPlatformOption match { case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) => N2CustomMachineType + case Some(GcpBatchRuntimeAttributes.CpuPlatformIntelIceLakeValue) => N2CustomMachineType case Some(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) => N2DCustomMachineType case _ => N1CustomMachineType } diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala index d3bcb87c016..0903c1babb2 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActorSpec.scala @@ -131,7 +131,7 @@ class GcpBatchAsyncBackendJobExecutionActorSpec val runtimeAttributesBuilder = GcpBatchRuntimeAttributes.runtimeAttributesBuilder(configuration) val requestFactory: GcpBatchRequestFactory = new GcpBatchRequestFactory { - override def submitRequest(data: GcpBatchRequest): CreateJobRequest = null + override def submitRequest(data: GcpBatchRequest, jobLogger: JobLogger): CreateJobRequest = null override def queryRequest(jobName: JobName): GetJobRequest = null diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala index 417b1e829d1..c6b66d36d2f 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala @@ -1,24 +1,27 @@ package cromwell.backend.google.batch.util import common.assertion.CromwellTimeoutSpec +import common.mock.MockSugar.mock import cromwell.backend.google.batch.models.GcpBatchRuntimeAttributes +import cromwell.core.logging.JobLogger import eu.timepit.refined.numeric.Positive import eu.timepit.refined.refineMV import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import org.scalatest.prop.TableDrivenPropertyChecks._ import org.scalatest.prop.Tables.Table -import org.slf4j.helpers.NOPLogger import wdl4s.parser.MemoryUnit import wom.format.MemorySize class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpec with Matchers { behavior of "MachineConstraints" - private val n2Option = Option(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) + private val n2OptionCascadeLake = Option(GcpBatchRuntimeAttributes.CpuPlatformIntelCascadeLakeValue) private val n2dOption = Option(GcpBatchRuntimeAttributes.CpuPlatformAMDRomeValue) + private val n2OptionIceLake = Option(GcpBatchRuntimeAttributes.CpuPlatformIntelIceLakeValue) + it should "generate valid machine types" in { val validTypes = Table( ("memory", "cpu", "cpuPlatformOption", "googleLegacyMachineSelection", "machineTypeString"), @@ -41,7 +44,6 @@ class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpe // Same tests as above but with legacy machine type selection (cpu and memory as specified. No 'custom machine // requirement' adjustments are expected this time, except float->int) - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1024"), (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, true, "predefined-3-4096"), (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, true, "predefined-1-1024"), @@ -53,15 +55,26 @@ class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpe (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, true, "predefined-33-2048"), // Same tests but with cascade lake (n2) - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2Option, false, "n2-custom-4-4096"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2Option, false, "n2-custom-4-4096"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2Option, false, "n2-custom-16-16384"), - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2Option, false, "n2-custom-2-14080"), - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2Option, false, "n2-custom-2-2048"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2Option, false, "n2-custom-36-36864"), + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionCascadeLake, false, "n2-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionCascadeLake, false, "n2-custom-4-4096"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionCascadeLake, false, "n2-custom-16-16384"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionCascadeLake, false, "n2-custom-36-36864"), + + // Same tests, but with ice lake. Should produce same results as cascade lake since they're both n2. + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionIceLake, false, "n2-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionIceLake, false, "n2-custom-4-4096"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionIceLake, false, "n2-custom-16-16384"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionIceLake, false, "n2-custom-36-36864"), // Same tests but with AMD Rome (n2d) #cpu > 16 are in increments of 16 (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), @@ -83,7 +96,7 @@ class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpe cpu = cpu, cpuPlatformOption = cpuPlatformOption, googleLegacyMachineSelection = googleLegacyMachineSelection, - jobLogger = NOPLogger.NOP_LOGGER + jobLogger = mock[JobLogger] ) shouldBe expected } } From a3f20e92c59a88eba1f1b0590d69ebe84b4c74e2 Mon Sep 17 00:00:00 2001 From: javiergaitan Date: Thu, 3 Oct 2024 22:46:48 +0000 Subject: [PATCH 2/2] feat: [GCP Batch] Support passing standard machine types to the Google backend --- .../api/GcpBatchRequestFactoryImpl.scala | 1 + .../models/GcpBatchCustomMachineType.scala | 2 + .../models/GcpBatchRuntimeAttributes.scala | 17 ++- .../util/GcpBatchMachineConstraints.scala | 8 +- .../GcpBatchRuntimeAttributesSpec.scala | 3 +- .../util/GcpBatchMachineConstraintsSpec.scala | 125 +++++++++++------- 6 files changed, 101 insertions(+), 55 deletions(-) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala index 745c39ca587..83411477b5d 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/api/GcpBatchRequestFactoryImpl.scala @@ -230,6 +230,7 @@ class GcpBatchRequestFactoryImpl()(implicit gcsTransferConfiguration: GcsTransfe val machineType = GcpBatchMachineConstraints.machineType(runtimeAttributes.memory, runtimeAttributes.cpu, cpuPlatformOption = runtimeAttributes.cpuPlatform, + standardMachineTypeOption = runtimeAttributes.standardMachineType, googleLegacyMachineSelection = false, jobLogger = jobLogger ) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala index cf5db4df9e9..706b1161c82 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchCustomMachineType.scala @@ -11,6 +11,8 @@ import wom.format.MemorySize import scala.math.{log, pow} +case class StandardMachineType(machineType: String) {} + /** * Adjusts memory and cpu for custom machine types. * diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala index 4609af772d7..eef637c4203 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributes.scala @@ -49,7 +49,8 @@ final case class GcpBatchRuntimeAttributes(cpu: Int Refined Positive, continueOnReturnCode: ContinueOnReturnCode, noAddress: Boolean, useDockerImageCache: Option[Boolean], - checkpointFilename: Option[String] + checkpointFilename: Option[String], + standardMachineType: Option[String] ) object GcpBatchRuntimeAttributes { @@ -85,6 +86,8 @@ object GcpBatchRuntimeAttributes { UseDockerImageCacheKey ).optional + val StandardMachineTypeKey = "standardMachineType" + val CheckpointFileKey = "checkpointFile" private val checkpointFileValidationInstance = new StringRuntimeAttributesValidation(CheckpointFileKey).optional @@ -98,6 +101,8 @@ object GcpBatchRuntimeAttributes { ) private def cpuPlatformValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] = cpuPlatformValidationInstance + private def standardMachineTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[String] = + new StringRuntimeAttributesValidation(StandardMachineTypeKey).optional private def gpuTypeValidation(runtimeConfig: Option[Config]): OptionalRuntimeAttributesValidation[GpuType] = GpuTypeValidation.optional @@ -171,7 +176,8 @@ object GcpBatchRuntimeAttributes { bootDiskSizeValidation(runtimeConfig), useDockerImageCacheValidation(runtimeConfig), checkpointFileValidationInstance, - dockerValidation + dockerValidation, + standardMachineTypeValidation(runtimeConfig) ) } @@ -228,6 +234,10 @@ object GcpBatchRuntimeAttributes { useDockerImageCacheValidation(runtimeAttrsConfig).key, validatedRuntimeAttributes ) + val standardMachineType: Option[String] = RuntimeAttributesValidation.extractOption( + standardMachineTypeValidation(runtimeAttrsConfig).key, + validatedRuntimeAttributes + ) new GcpBatchRuntimeAttributes( cpu = cpu, @@ -243,7 +253,8 @@ object GcpBatchRuntimeAttributes { continueOnReturnCode = continueOnReturnCode, noAddress = noAddress, useDockerImageCache = useDockerImageCache, - checkpointFilename = checkpointFileName + checkpointFilename = checkpointFileName, + standardMachineType = standardMachineType ) } diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala index bbb7e940297..7a2f205588e 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraints.scala @@ -4,7 +4,8 @@ import cromwell.backend.google.batch.models.{ GcpBatchRuntimeAttributes, N1CustomMachineType, N2CustomMachineType, - N2DCustomMachineType + N2DCustomMachineType, + StandardMachineType } import cromwell.core.logging.JobLogger import eu.timepit.refined.api.Refined @@ -16,10 +17,13 @@ object GcpBatchMachineConstraints { def machineType(memory: MemorySize, cpu: Int Refined Positive, cpuPlatformOption: Option[String], + standardMachineTypeOption: Option[String], googleLegacyMachineSelection: Boolean, jobLogger: JobLogger ): String = - if (googleLegacyMachineSelection) { + if (standardMachineTypeOption.exists(_.trim.nonEmpty)) { + StandardMachineType(standardMachineTypeOption.get).machineType + } else if (googleLegacyMachineSelection) { s"predefined-$cpu-${memory.to(MemoryUnit.MB).amount.intValue()}" } else { // If someone requests Intel Cascade Lake or Intel Ice Lake as their CPU platform then switch the machine type to n2. diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala index 647f211a67b..4788519ea4c 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/models/GcpBatchRuntimeAttributesSpec.scala @@ -286,7 +286,8 @@ trait GcpBatchRuntimeAttributesSpecsMixin { continueOnReturnCode = ContinueOnReturnCodeSet(Set(0)), noAddress = false, useDockerImageCache = None, - checkpointFilename = None + checkpointFilename = None, + standardMachineType = None ) def assertBatchRuntimeAttributesSuccessfulCreation(runtimeAttributes: Map[String, WomValue], diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala index c6b66d36d2f..3ae55a0eaff 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/util/GcpBatchMachineConstraintsSpec.scala @@ -24,77 +24,104 @@ class GcpBatchMachineConstraintsSpec extends AnyFlatSpec with CromwellTimeoutSpe it should "generate valid machine types" in { val validTypes = Table( - ("memory", "cpu", "cpuPlatformOption", "googleLegacyMachineSelection", "machineTypeString"), + ("memory", "cpu", "cpuPlatformOption", "standardMachineTypeOption", "googleLegacyMachineSelection", "machineTypeString"), // Already ok tuple - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1024"), + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, None, false, "custom-1-1024"), // CPU must be even (except if it's 1) - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, false, "custom-4-4096"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, None, false, "custom-4-4096"), // Memory must be a multiple of 256 - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, false, "custom-1-1024"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, None, false, "custom-1-1024"), // Memory / cpu ratio must be > 0.9GB, increase memory - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, false, "custom-4-3840"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, false, "custom-16-14848"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, None, false, "custom-4-3840"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, None, false, "custom-16-14848"), // Memory / cpu ratio must be < 6.5GB, increase CPU - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, false, "custom-4-14080"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, None, false, "custom-4-14080"), // Memory should be an int - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1536"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, false, "custom-1-1024"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, None, false, "custom-1-1536"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, None, false, "custom-1-1024"), // Increase to a cpu selection not valid for n2 below - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, false, "custom-34-31488"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, None, false, "custom-34-31488"), // Same tests as above but with legacy machine type selection (cpu and memory as specified. No 'custom machine // requirement' adjustments are expected this time, except float->int) - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1024"), - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, true, "predefined-3-4096"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, true, "predefined-1-1024"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, true, "predefined-4-1024"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, true, "predefined-16-14336"), - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, true, "predefined-1-13977"), - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1520"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, true, "predefined-1-1024"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, true, "predefined-33-2048"), + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), None, None, true, "predefined-1-1024"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), None, None, true, "predefined-3-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), None, None, true, "predefined-1-1024"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), None, None, true, "predefined-4-1024"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), None, None, true, "predefined-16-14336"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), None, None, true, "predefined-1-13977"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), None, None, true, "predefined-1-1520"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), None, None, true, "predefined-1-1024"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, None, true, "predefined-33-2048"), // Same tests but with cascade lake (n2) - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionCascadeLake, false, "n2-custom-4-4096"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionCascadeLake, false, "n2-custom-4-4096"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionCascadeLake, false, "n2-custom-16-16384"), - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-14080"), - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, false, "n2-custom-2-2048"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionCascadeLake, false, "n2-custom-36-36864"), + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, None, false, "n2-custom-2-2048"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionCascadeLake, None, false, "n2-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, None, false, "n2-custom-2-2048"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionCascadeLake, None, false, "n2-custom-4-4096"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionCascadeLake, None, false, "n2-custom-16-16384"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionCascadeLake, None, false, "n2-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, None, false, "n2-custom-2-2048"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionCascadeLake, None, false, "n2-custom-2-2048"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionCascadeLake, None, false, "n2-custom-36-36864"), // Same tests, but with ice lake. Should produce same results as cascade lake since they're both n2. - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionIceLake, false, "n2-custom-4-4096"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionIceLake, false, "n2-custom-4-4096"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionIceLake, false, "n2-custom-16-16384"), - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-14080"), - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, false, "n2-custom-2-2048"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionIceLake, false, "n2-custom-36-36864"), + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, None, false, "n2-custom-2-2048"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2OptionIceLake, None, false, "n2-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, None, false, "n2-custom-2-2048"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2OptionIceLake, None, false, "n2-custom-4-4096"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2OptionIceLake, None, false, "n2-custom-16-16384"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2OptionIceLake, None, false, "n2-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, None, false, "n2-custom-2-2048"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2OptionIceLake, None, false, "n2-custom-2-2048"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2OptionIceLake, None, false, "n2-custom-36-36864"), // Same tests but with AMD Rome (n2d) #cpu > 16 are in increments of 16 - (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), - (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2dOption, false, "n2d-custom-4-4096"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), - (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2dOption, false, "n2d-custom-4-2048"), - (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2dOption, false, "n2d-custom-16-14336"), - (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-14080"), - (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1536"), - (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2dOption, false, "n2d-custom-2-1024"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2dOption, false, "n2d-custom-48-24576"), - (MemorySize(2, MemoryUnit.GB), refineMV[Positive](81), n2dOption, false, "n2d-custom-96-49152"), - (MemorySize(256, MemoryUnit.GB), refineMV[Positive](128), n2dOption, false, "n2d-custom-96-262144") + (MemorySize(1024, MemoryUnit.MB), refineMV[Positive](1), n2dOption, None, false, "n2d-custom-2-1024"), + (MemorySize(4, MemoryUnit.GB), refineMV[Positive](3), n2dOption, None, false, "n2d-custom-4-4096"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](1), n2dOption, None, false, "n2d-custom-2-1024"), + (MemorySize(1, MemoryUnit.GB), refineMV[Positive](4), n2dOption, None, false, "n2d-custom-4-2048"), + (MemorySize(14, MemoryUnit.GB), refineMV[Positive](16), n2dOption, None, false, "n2d-custom-16-14336"), + (MemorySize(13.65, MemoryUnit.GB), refineMV[Positive](1), n2dOption, None, false, "n2d-custom-2-14080"), + (MemorySize(1520.96, MemoryUnit.MB), refineMV[Positive](1), n2dOption, None, false, "n2d-custom-2-1536"), + (MemorySize(1024.0, MemoryUnit.MB), refineMV[Positive](1), n2dOption, None, false, "n2d-custom-2-1024"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), n2dOption, None, false, "n2d-custom-48-24576"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](81), n2dOption, None, false, "n2d-custom-96-49152"), + (MemorySize(256, MemoryUnit.GB), refineMV[Positive](128), n2dOption, None, false, "n2d-custom-96-262144"), + + // Test Standard Machine types + // General-purpose machine family + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("n1-standard-2"), false, "n1-standard-2"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("n1-highmem-2"), false, "n1-highmem-2"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("n1-highcpu-4"), false, "n1-highcpu-4"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("f1-micro"), false, "f1-micro"), + + // Accelerator-optimized machine family + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a2-highgpu-1g"), false, "a2-highgpu-1g"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a3-megagpu-8g"), false, "a3-megagpu-8g"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("g2-standard-4"), false, "g2-standard-4"), + + // Other machine families + // Storage-optimized + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("z3-highmem-88"), false, "z3-highmem-88"), + // Compute-optimized + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("h3-standard-88"), false, "h3-standard-88"), + // Memory-optimized + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("m3-ultramem-128"), false, "m3-ultramem-128"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a2-highgpu-1g"), false, "a2-highgpu-1g"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a2-highgpu-1g"), false, "a2-highgpu-1g"), + + // Standard machine type overrides legacy selection + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a2-highgpu-1g"), true, "a2-highgpu-1g"), + (MemorySize(2, MemoryUnit.GB), refineMV[Positive](33), None, Option("a2-highgpu-1g"), false, "a2-highgpu-1g") ) - forAll(validTypes) { (memory, cpu, cpuPlatformOption, googleLegacyMachineSelection, expected) => + forAll(validTypes) { (memory, cpu, cpuPlatformOption, standardMachineTypeOption, googleLegacyMachineSelection, expected) => GcpBatchMachineConstraints.machineType( memory = memory, cpu = cpu, cpuPlatformOption = cpuPlatformOption, + standardMachineTypeOption = standardMachineTypeOption, googleLegacyMachineSelection = googleLegacyMachineSelection, jobLogger = mock[JobLogger] ) shouldBe expected