From 7d1384de6dd34ec5c8923eeef0393dc39d3e69c8 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 27 Jun 2025 23:15:51 -0400 Subject: [PATCH] Use Papi or Batch log for retry with more memory. --- .../scala/cromwell/backend/io/JobPaths.scala | 2 + .../StandardAsyncExecutionActor.scala | 170 ++++++++++-------- .../gcpbatch/retry_with_more_memory.wdl | 3 + ...cpBatchAsyncBackendJobExecutionActor.scala | 16 +- .../GcpBatchConfigurationAttributes.scala | 25 ++- .../batch/models/GcpBatchJobPaths.scala | 7 + .../batch/util/MemoryRetryCheckMode.scala | 34 ++++ ...atchBackendLifecycleActorFactorySpec.scala | 4 +- ...BatchBackendCacheHitCopyingActorSpec.scala | 4 +- .../common/PipelinesApiJobPaths.scala | 2 + 10 files changed, 180 insertions(+), 87 deletions(-) create mode 100644 supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/MemoryRetryCheckMode.scala diff --git a/backend/src/main/scala/cromwell/backend/io/JobPaths.scala b/backend/src/main/scala/cromwell/backend/io/JobPaths.scala index 05ad6a56dc1..b614c5d40a7 100644 --- a/backend/src/main/scala/cromwell/backend/io/JobPaths.scala +++ b/backend/src/main/scala/cromwell/backend/io/JobPaths.scala @@ -80,6 +80,8 @@ trait JobPaths { lazy val dockerCid = callExecutionRoot.resolve(dockerCidFilename) lazy val returnCode = callExecutionRoot.resolve(returnCodeFilename) lazy val memoryRetryRC = callExecutionRoot.resolve(memoryRetryRCFilename) + // Path to to an existing file that contains the error text of the job if it failed due to memory constraints. + lazy val memoryRetryError = Option(standardPaths.error) // This is a `def` because `standardPaths` is a `var` that may be reassigned during the calculation of // standard output and error file names. diff --git a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala index 0197c2a647c..c416d21a3c2 100644 --- a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala @@ -1416,7 +1416,7 @@ trait StandardAsyncExecutionActor ): Future[ExecutionHandle] = { // Returns true if the task has written an RC file that indicates OOM, false otherwise - def memoryRetryRC: Future[Boolean] = { + def memoryRetryRC: Future[(Boolean, Option[Path])] = { def readFile(path: Path, maxBytes: Option[Int]): Future[String] = asyncIo.contentAsStringAsync(path, maxBytes, failOnOverflow = false) @@ -1438,23 +1438,37 @@ trait StandardAsyncExecutionActor } } - def checkMemoryRetryStderr(errorKeys: List[String], maxBytes: Int): Future[Boolean] = - readFile(jobPaths.standardPaths.error, Option(maxBytes)) map { errorContent => + def checkMemoryRetryStderr(memoryRetryError: Path, errorKeys: List[String], maxBytes: Int): Future[Boolean] = + readFile(memoryRetryError, Option(maxBytes)) map { errorContent => errorKeys.exists(errorContent.contains) } - asyncIo.existsAsync(jobPaths.memoryRetryRC) flatMap { - case true => checkMemoryRetryRC() - case false => - (memoryRetryErrorKeys, memoryRetryStderrLimit) match { - case (Some(keys), Some(limit)) => - asyncIo.existsAsync(jobPaths.standardPaths.error) flatMap { - case true => checkMemoryRetryStderr(keys, limit) - case false => Future.successful(false) - } - case _ => Future.successful(false) - } - } + def checkMemoryRetryError(): Future[Boolean] = + (memoryRetryErrorKeys, memoryRetryStderrLimit, jobPaths.memoryRetryError) match { + case (Some(keys), Some(limit), Some(memoryRetryError)) => + for { + memoryRetryErrorExists <- asyncIo.existsAsync(memoryRetryError) + memoryRetryErrorFound <- + if (memoryRetryErrorExists) + checkMemoryRetryStderr(memoryRetryError, keys, limit) + else + Future.successful(false) + } yield memoryRetryErrorFound + case _ => Future.successful(false) + } + + // For backwards behavioral compatibility, check for the old memory retry RC file first. That file used to catch + // the errors from the standard error file, but now sometimes the error is written to a separate log file. + // If it exists, check its contents. If it doesn't find an OOM code, check the new memory retry error file. + for { + memoryRetryRCExists <- asyncIo.existsAsync(jobPaths.memoryRetryRC) + memoryRetryRCErrorFound <- if (memoryRetryRCExists) checkMemoryRetryRC() else Future.successful(false) + memoryRetryErrorFound <- if (memoryRetryRCErrorFound) Future.successful(true) else checkMemoryRetryError() + memoryErrorPathOption = + if (memoryRetryRCErrorFound) Option(jobPaths.standardPaths.error) + else if (memoryRetryErrorFound) jobPaths.memoryRetryError + else None + } yield (memoryRetryErrorFound, memoryErrorPathOption) } val stderr = jobPaths.standardPaths.error @@ -1465,74 +1479,76 @@ trait StandardAsyncExecutionActor // Only check stderr size if we need to, otherwise this results in a lot of unnecessary I/O that // may fail due to race conditions on quickly-executing jobs. stderrSize <- if (failOnStdErr) asyncIo.sizeAsync(stderr) else Future.successful(0L) - outOfMemoryDetected <- memoryRetryRC - } yield (stderrSize, returnCodeAsString, outOfMemoryDetected) - - stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, outOfMemoryDetected) => - val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt) - - if (isDone(status)) { - tryReturnCodeAsInt match { - case Success(returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 => - val executionHandle = Future.successful( - FailedNonRetryableExecutionHandle(StderrNonEmpty(jobDescriptor.key.tag, stderrSize, stderrAsOption), - Option(returnCodeAsInt), - None + (outOfMemoryDetected, outOfMemoryPathOption) <- memoryRetryRC + } yield (stderrSize, returnCodeAsString, outOfMemoryDetected, outOfMemoryPathOption) + + stderrSizeAndReturnCodeAndMemoryRetry flatMap { + case (stderrSize, returnCodeAsString, outOfMemoryDetected, outOfMemoryPathOption) => + val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt) + + if (isDone(status)) { + tryReturnCodeAsInt match { + case Success(returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 => + val executionHandle = Future.successful( + FailedNonRetryableExecutionHandle(StderrNonEmpty(jobDescriptor.key.tag, stderrSize, stderrAsOption), + Option(returnCodeAsInt), + None + ) ) - ) - retryElseFail(executionHandle) - case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) => - handleExecutionSuccess(status, oldHandle, returnCodeAsInt) - // It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case; - // if it was caused by OOM killer, want to handle as OOM and not job abort. - case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested => - val executionHandle = Future.successful( - FailedNonRetryableExecutionHandle( - RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), - Option(returnCodeAsInt), - None + retryElseFail(executionHandle) + case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) => + handleExecutionSuccess(status, oldHandle, returnCodeAsInt) + // It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case; + // if it was caused by OOM killer, want to handle as OOM and not job abort. + case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested => + val executionHandle = Future.successful( + FailedNonRetryableExecutionHandle( + RetryWithMoreMemory(jobDescriptor.key.tag, outOfMemoryPathOption, memoryRetryErrorKeys, log), + Option(returnCodeAsInt), + None + ) ) - ) - retryElseFail(executionHandle, - MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) - ) - case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => - Future.successful(AbortedExecutionHandle) - case Success(returnCodeAsInt) => - val executionHandle = Future.successful( - FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), - Option(returnCodeAsInt), - None + retryElseFail(executionHandle, + MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) ) - ) - retryElseFail(executionHandle) - case Failure(_) => - Future.successful( - FailedNonRetryableExecutionHandle( - ReturnCodeIsNotAnInt(jobDescriptor.key.tag, returnCodeAsString, stderrAsOption), - kvPairsToSave = None + case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => + Future.successful(AbortedExecutionHandle) + case Success(returnCodeAsInt) => + val executionHandle = Future.successful( + FailedNonRetryableExecutionHandle( + WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), + Option(returnCodeAsInt), + None + ) ) - ) - } - } else { - tryReturnCodeAsInt match { - case Success(returnCodeAsInt) - if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) => - val executionHandle = Future.successful( - FailedNonRetryableExecutionHandle( - RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), - Option(returnCodeAsInt), - None + retryElseFail(executionHandle) + case Failure(_) => + Future.successful( + FailedNonRetryableExecutionHandle( + ReturnCodeIsNotAnInt(jobDescriptor.key.tag, returnCodeAsString, stderrAsOption), + kvPairsToSave = None + ) ) - ) - retryElseFail(executionHandle, - MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) - ) - case _ => - val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption) - retryElseFail(failureStatus) + } + } else { + tryReturnCodeAsInt match { + case Success(returnCodeAsInt) + if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) => + val executionHandle = Future.successful( + FailedNonRetryableExecutionHandle( + RetryWithMoreMemory(jobDescriptor.key.tag, outOfMemoryPathOption, memoryRetryErrorKeys, log), + Option(returnCodeAsInt), + None + ) + ) + retryElseFail(executionHandle, + MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) + ) + case _ => + val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption) + retryElseFail(failureStatus) + } } - } } recoverWith { case exception => if (isDone(status)) Future.successful(FailedNonRetryableExecutionHandle(exception, kvPairsToSave = None)) else { diff --git a/centaur/src/main/resources/standardTestCases/retry_with_more_memory/gcpbatch/retry_with_more_memory.wdl b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/gcpbatch/retry_with_more_memory.wdl index 2c50ed34c86..8fc6b801807 100644 --- a/centaur/src/main/resources/standardTestCases/retry_with_more_memory/gcpbatch/retry_with_more_memory.wdl +++ b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/gcpbatch/retry_with_more_memory.wdl @@ -1,6 +1,9 @@ version 1.0 task imitate_oom_error { + meta { + volatile: true + } command { echo "$MEM_SIZE $MEM_UNIT" diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala index fe46d15e0ad..9977ec6d498 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/actors/GcpBatchAsyncBackendJobExecutionActor.scala @@ -27,7 +27,12 @@ import cromwell.backend.google.batch.models.RunStatus.TerminalRunStatus import cromwell.backend.google.batch.models._ import cromwell.backend.google.batch.monitoring.{BatchInstrumentation, CheckpointingConfiguration, MonitoringImage} import cromwell.backend.google.batch.runnable.WorkflowOptionKeys -import cromwell.backend.google.batch.util.{GcpBatchReferenceFilesMappingOperations, RuntimeOutputMapping} +import cromwell.backend.google.batch.util.{ + GcpBatchReferenceFilesMappingOperations, + MemoryRetryRunnable, + MemoryRetryStandard, + RuntimeOutputMapping +} import cromwell.backend.standard._ import cromwell.core._ import cromwell.core.io.IoCommandBuilder @@ -633,7 +638,14 @@ class GcpBatchAsyncBackendJobExecutionActor(override val standardParams: Standar // if the `memory_retry_multiplier` is not present in the workflow options there is no need to check whether or // not the `stderr` file contained memory retry error keys - val retryWithMoreMemoryKeys: Option[List[String]] = memoryRetryFactor.flatMap(_ => memoryRetryErrorKeys) + // If the retry detection is processed using the stderr then do not try retry with more memory in the backend. + // This keeps the backend jobs from logging the memory retry error keys in the very verbose Google Cloud Batch + // logs, tripping up the standard memory retry detection. + val retryWithMoreMemoryKeys: Option[List[String]] = + batchConfiguration.batchAttributes.memoryRetryCheckMode match { + case MemoryRetryRunnable => memoryRetryFactor.flatMap(_ => memoryRetryErrorKeys) + case MemoryRetryStandard => None + } val targetLogFile = batchAttributes.logsPolicy match { case GcpBatchLogsPolicy.CloudLogging => None diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala index 59a8eb5c63b..0d03d3f97d9 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchConfigurationAttributes.scala @@ -22,7 +22,7 @@ import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.{ GcsTransferConfiguration, VirtualPrivateCloudConfiguration } -import cromwell.backend.google.batch.util.GcpBatchReferenceFilesMappingOperations +import cromwell.backend.google.batch.util.{GcpBatchReferenceFilesMappingOperations, MemoryRetryCheckMode} import cromwell.cloudsupport.gcp.GoogleConfiguration import cromwell.cloudsupport.gcp.auth.GoogleAuthMode import cromwell.docker.DockerMirroring @@ -58,7 +58,8 @@ case class GcpBatchConfigurationAttributes( referenceFileToDiskImageMappingOpt: Option[Map[String, GcpBatchReferenceFilesDisk]], checkpointingInterval: FiniteDuration, logsPolicy: GcpBatchLogsPolicy, - maxTransientErrorRetries: Int + maxTransientErrorRetries: Int, + memoryRetryCheckMode: MemoryRetryCheckMode ) object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOperations with StrictLogging { @@ -89,6 +90,7 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper val DefaultGcsTransferAttempts: Refined[Int, Positive] = refineMV[Positive](3) val checkpointingIntervalKey = "checkpointing-interval" + val memoryRetryCheckModeKey = "memory-retry-check-mode" private val batchKeys = CommonBackendConfigurationAttributes.commonValidConfigurationAttributeKeys ++ Set( "project", @@ -128,7 +130,8 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper "virtual-private-cloud.subnetwork-label-key", "virtual-private-cloud.auth", "reference-disk-localization-manifests", - checkpointingIntervalKey + checkpointingIntervalKey, + memoryRetryCheckModeKey ) private val deprecatedBatchKeys: Map[String, String] = Map( @@ -311,6 +314,13 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper val maxTransientErrorRetries: Int = backendConfig.as[Option[Int]]("max-transient-error-retries").getOrElse(10) + val memoryRetryCheckMode: ErrorOr[MemoryRetryCheckMode] = + MemoryRetryCheckMode + .tryParse( + backendConfig.getOrElse(memoryRetryCheckModeKey, MemoryRetryCheckMode.DefaultMode.name) + ) + .toErrorOr + def authGoogleConfigForBatchConfigurationAttributes( project: String, bucket: String, @@ -327,7 +337,8 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper virtualPrivateCloudConfiguration: VirtualPrivateCloudConfiguration, batchRequestTimeoutConfiguration: BatchRequestTimeoutConfiguration, referenceDiskLocalizationManifestFilesOpt: Option[List[ManifestFile]], - logsPolicy: GcpBatchLogsPolicy + logsPolicy: GcpBatchLogsPolicy, + memoryRetryCheckMode: MemoryRetryCheckMode ): ErrorOr[GcpBatchConfigurationAttributes] = (googleConfig.auth(batchName), googleConfig.auth(gcsName)) mapN { (batchAuth, gcsAuth) => val generatedReferenceFilesMappingOpt = referenceDiskLocalizationManifestFilesOpt map { @@ -354,7 +365,8 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper referenceFileToDiskImageMappingOpt = generatedReferenceFilesMappingOpt, checkpointingInterval = checkpointingInterval, logsPolicy = logsPolicy, - maxTransientErrorRetries = maxTransientErrorRetries + maxTransientErrorRetries = maxTransientErrorRetries, + memoryRetryCheckMode = memoryRetryCheckMode ) } @@ -373,7 +385,8 @@ object GcpBatchConfigurationAttributes extends GcpBatchReferenceFilesMappingOper virtualPrivateCloudConfiguration, batchRequestTimeoutConfigurationValidation, referenceDiskLocalizationManifestFiles, - logsPolicy + logsPolicy, + memoryRetryCheckMode ) flatMapN authGoogleConfigForBatchConfigurationAttributes match { case Valid(r) => r case Invalid(f) => diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala index 13417a0d05b..c12fdbeb548 100644 --- a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/models/GcpBatchJobPaths.scala @@ -2,6 +2,7 @@ package cromwell.backend.google.batch.models import cromwell.backend.BackendJobDescriptorKey import cromwell.backend.google.batch.runnable.GcpBatchMetadataKeys +import cromwell.backend.google.batch.util.{MemoryRetryRunnable, MemoryRetryStandard} import cromwell.backend.io.JobPaths import cromwell.core.path.Path import cromwell.services.metadata.CallMetadataKeys @@ -48,6 +49,12 @@ case class GcpBatchJobPaths(override val workflowPaths: GcpBatchWorkflowPaths, case _ => None } + override lazy val memoryRetryError: Option[Path] = + workflowPaths.gcpBatchConfiguration.batchAttributes.memoryRetryCheckMode match { + case MemoryRetryRunnable => None + case MemoryRetryStandard => maybeBatchLogPath + } + override lazy val customMetadataPaths = { val backendLogsMetadata = maybeBatchLogPath map { p: Path => Map(CallMetadataKeys.BackendLogsPrefix + ":log" -> p) diff --git a/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/MemoryRetryCheckMode.scala b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/MemoryRetryCheckMode.scala new file mode 100644 index 00000000000..4f8ba85de66 --- /dev/null +++ b/supportedBackends/google/batch/src/main/scala/cromwell/backend/google/batch/util/MemoryRetryCheckMode.scala @@ -0,0 +1,34 @@ +package cromwell.backend.google.batch.util + +import scala.util.{Failure, Success, Try} + +sealed trait MemoryRetryCheckMode { + val name: String + + override def toString: String = name +} + +case object MemoryRetryRunnable extends MemoryRetryCheckMode { + override val name: String = "Runnable" +} + +case object MemoryRetryStandard extends MemoryRetryCheckMode { + override val name: String = "Standard" +} + +object MemoryRetryCheckMode { + val DefaultMode = MemoryRetryRunnable + private val AllModes: Seq[MemoryRetryCheckMode] = Seq(MemoryRetryRunnable, MemoryRetryStandard) + + def tryParse(mode: String): Try[MemoryRetryCheckMode] = + AllModes + .find(_.name.equalsIgnoreCase(mode)) + .map(Success(_)) + .getOrElse( + Failure( + new Exception( + s"Invalid memory retry check mode: '$mode', supported modes are: ${AllModes.map(_.name).mkString("'", "', '", "'")}" + ) + ) + ) +} diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala index db97d33d1b8..5dee959f532 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/actors/GcpBatchBackendLifecycleActorFactorySpec.scala @@ -2,6 +2,7 @@ package cromwell.backend.google.batch.actors import cromwell.backend.google.batch.GcpBatchBackendLifecycleActorFactory import cromwell.backend.google.batch.models.{GcpBatchConfigurationAttributes, GcpBatchLogsPolicy} +import cromwell.backend.google.batch.util.MemoryRetryRunnable import eu.timepit.refined.numeric.Positive import eu.timepit.refined.refineV import org.scalatest.flatspec.AnyFlatSpecLike @@ -36,7 +37,8 @@ class GcpBatchBackendLifecycleActorFactorySpec extends AnyFlatSpecLike with Matc referenceFileToDiskImageMappingOpt = None, checkpointingInterval = 1 second, logsPolicy = GcpBatchLogsPolicy.CloudLogging, - maxTransientErrorRetries = 10 + maxTransientErrorRetries = 10, + memoryRetryCheckMode = MemoryRetryRunnable ) GcpBatchBackendLifecycleActorFactory.robustBuildAttributes(() => attributes) shouldBe attributes diff --git a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActorSpec.scala b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActorSpec.scala index 77d1a0ba3c2..2bbf7b6ad88 100644 --- a/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActorSpec.scala +++ b/supportedBackends/google/batch/src/test/scala/cromwell/backend/google/batch/callcaching/BatchBackendCacheHitCopyingActorSpec.scala @@ -8,6 +8,7 @@ import cromwell.backend.BackendCacheHitCopyingActor.{CopyingOutputsFailedRespons import cromwell.backend.BackendJobExecutionActor.JobSucceededResponse import cromwell.backend.google.batch.models.GcpBatchConfigurationAttributes.VirtualPrivateCloudConfiguration import cromwell.backend.google.batch.models._ +import cromwell.backend.google.batch.util.MemoryRetryRunnable import cromwell.backend.io.JobPaths import cromwell.backend.standard.StandardValidatedRuntimeAttributesBuilder import cromwell.backend.standard.callcaching.CopyingActorBlacklistCacheSupport.HasFormatting @@ -436,7 +437,8 @@ class BatchBackendCacheHitCopyingActorSpec referenceFileToDiskImageMappingOpt = None, checkpointingInterval = 10.minutes, logsPolicy = GcpBatchLogsPolicy.CloudLogging, - maxTransientErrorRetries = 10 + maxTransientErrorRetries = 10, + memoryRetryCheckMode = MemoryRetryRunnable ) val batchConfiguration = mockWithDefaults[GcpBatchConfiguration] diff --git a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala index ec4af5344cf..fb01578bc67 100644 --- a/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala +++ b/supportedBackends/google/pipelines/common/src/main/scala/cromwell/backend/google/pipelines/common/PipelinesApiJobPaths.scala @@ -41,6 +41,8 @@ case class PipelinesApiJobPaths(override val workflowPaths: PipelinesApiWorkflow val jesMonitoringScriptFilename: String = s"${PipelinesApiJobPaths.JesMonitoringKey}.sh" val jesMonitoringImageScriptFilename: String = s"${PipelinesApiJobPaths.JesMonitoringImageKey}.sh" + override lazy val memoryRetryError: Option[Path] = Option(jesLogPath) + override lazy val customMetadataPaths = Map( CallMetadataKeys.BackendLogsPrefix + ":log" -> jesLogPath ) ++ (