diff --git a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala index 8f8312023f1..975b1a85d2b 100644 --- a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala @@ -29,6 +29,7 @@ import cromwell.backend.validation._ import cromwell.core._ import cromwell.core.io.{AsyncIoActorClient, DefaultIoCommandBuilder, IoCommandBuilder} import cromwell.core.path.Path +import cromwell.core.retry._ import cromwell.services.keyvalue.KeyValueServiceActor._ import cromwell.services.keyvalue.KvClient import cromwell.services.metadata.CallMetadataKeys @@ -268,6 +269,25 @@ trait StandardAsyncExecutionActor } } + lazy val maxRetriesMode: MaxRetriesMode = { + val maxRetriesModeOption: Option[MaxRetriesMode] = + jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MaxRetriesMode) flatMap { value: String => + MaxRetriesMode.tryParse(value) match { + case Success(v) => Option(v) + case Failure(e) => + // should not happen, this case should have been screened for and fast-failed during workflow materialization. + log.error( + e, + s"Programmer error: unexpected failure attempting to convert value for workflow option " + + s"'${WorkflowOptions.MaxRetriesMode.name}' to MaxRetriesMode." + ) + Option(MaxRetriesMode.DefaultMode) + } + } + + maxRetriesModeOption.getOrElse(MaxRetriesMode.DefaultMode) + } + lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty /** @@ -1101,8 +1121,17 @@ trait StandardAsyncExecutionActor failedRetryableOrNonRetryable match { case failedNonRetryable: FailedNonRetryableExecutionHandle if previousFailedRetries < maxRetries => - // The user asked us to retry finitely for them, possibly with a memory modification - evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry) + maxRetriesMode match { + case AllErrors => + // The user asked us to retry finitely for them, possibly with a memory modification + evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry) + case KnownErrors if memoryRetry.oomDetected => + // The user asked us to retry finitely for them, with a memory modification + evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry) + case _ => + // No reason to retry + Future.successful(failedNonRetryable) + } case failedNonRetryable: FailedNonRetryableExecutionHandle => // No reason to retry Future.successful(failedNonRetryable) diff --git a/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_allerrors.options b/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_allerrors.options new file mode 100644 index 00000000000..867fc91aaef --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_allerrors.options @@ -0,0 +1,3 @@ +{ + "max_retries_mode" : "AllErrors" +} diff --git a/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_knownerrors.options b/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_knownerrors.options new file mode 100644 index 00000000000..7aa7f56d1a7 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/max_retries/max_retries_mode_knownerrors.options @@ -0,0 +1,3 @@ +{ + "max_retries_mode" : "KnownErrors" +} diff --git a/centaur/src/main/resources/standardTestCases/max_retries_mode_allerrors.test b/centaur/src/main/resources/standardTestCases/max_retries_mode_allerrors.test new file mode 100644 index 00000000000..db461cdf345 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/max_retries_mode_allerrors.test @@ -0,0 +1,11 @@ +name: max_retries_mode_allerrors +testFormat: workflowfailure + +files { + workflow: max_retries/max_retries.wdl + options: max_retries/max_retries_mode_allerrors.options +} + +metadata { + "failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:2 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details." +} diff --git a/centaur/src/main/resources/standardTestCases/max_retries_mode_knownerrors.test b/centaur/src/main/resources/standardTestCases/max_retries_mode_knownerrors.test new file mode 100644 index 00000000000..c2c38c0045c --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/max_retries_mode_knownerrors.test @@ -0,0 +1,11 @@ +name: max_retries_mode_knownerrors +testFormat: workflowfailure + +files { + workflow: max_retries/max_retries.wdl + options: max_retries/max_retries_mode_knownerrors.options +} + +metadata { + "failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:1 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details." +} diff --git a/core/src/main/scala/cromwell/core/WorkflowOptions.scala b/core/src/main/scala/cromwell/core/WorkflowOptions.scala index 78615a5147f..f58fa9ddb23 100644 --- a/core/src/main/scala/cromwell/core/WorkflowOptions.scala +++ b/core/src/main/scala/cromwell/core/WorkflowOptions.scala @@ -77,6 +77,7 @@ object WorkflowOptions { case object WorkflowFailureMode extends WorkflowOption("workflow_failure_mode") case object UseReferenceDisks extends WorkflowOption("use_reference_disks") case object MemoryRetryMultiplier extends WorkflowOption("memory_retry_multiplier") + case object MaxRetriesMode extends WorkflowOption("max_retries_mode") case object WorkflowCallbackUri extends WorkflowOption("workflow_callback_uri") private lazy val WorkflowOptionsConf = ConfigFactory.load.getConfig("workflow-options") diff --git a/core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala b/core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala new file mode 100644 index 00000000000..9f847022de2 --- /dev/null +++ b/core/src/main/scala/cromwell/core/retry/MaxRetriesMode.scala @@ -0,0 +1,17 @@ +package cromwell.core.retry + +import scala.util.{Failure, Success, Try} + +sealed trait MaxRetriesMode +case object AllErrors extends MaxRetriesMode +case object KnownErrors extends MaxRetriesMode + +object MaxRetriesMode { + val DefaultMode = AllErrors + private val AllModes = Seq(AllErrors, KnownErrors) + + def tryParse(mode: String): Try[MaxRetriesMode] = + AllModes find { _.toString.equalsIgnoreCase(mode) } map { Success(_) } getOrElse Failure( + new Exception(s"Invalid max retries mode: '$mode', supported modes are: ${AllModes.mkString("'", "', '", "'")}") + ) +} diff --git a/docs/RuntimeAttributes.md b/docs/RuntimeAttributes.md index 4461d5da433..e5e2f82cf09 100644 --- a/docs/RuntimeAttributes.md +++ b/docs/RuntimeAttributes.md @@ -247,7 +247,7 @@ runtime { *Default: _0_* -This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job, that is it cannot be used to increase memory in out-of-memory situations. +This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job. If using the Google backend, it's important to note that The `maxRetries` count is independent from the [preemptible](#preemptible) count. For example, the task below can be retried up to 6 times if it's preempted 3 times AND the command execution fails 3 times. diff --git a/docs/wf_options/Overview.md b/docs/wf_options/Overview.md index 84a44fbc865..9bb19a5d414 100644 --- a/docs/wf_options/Overview.md +++ b/docs/wf_options/Overview.md @@ -146,3 +146,12 @@ Example `options.json`: "memory_retry_multiplier" : 1.1 } ``` + +## Max Retries Mode + +The `max_retries_mode` workflow options sets the behavior of retrying failed jobs when the [`maxRetries` runtime +attribute](../RuntimeAttributes.md#maxretries) is specified. + +The possible values are `AllErrors` or `KnownErrors`. If set to `AllErrors`, the job will be retried for any error. If +set to `KnownErrors`, the job will only be retried for errors that are known to be retryable, such as increasing memory +in out-of-memory situations. The default value is `AllErrors`. diff --git a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala index 45165cad5f5..6c300c3d4bb 100644 --- a/engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala +++ b/engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala @@ -27,6 +27,7 @@ import cromwell.core.io.AsyncIo import cromwell.core.labels.{Label, Labels} import cromwell.core.logging.WorkflowLogging import cromwell.core.path.{PathBuilder, PathBuilderFactory} +import cromwell.core.retry._ import cromwell.engine._ import cromwell.engine.backend.CromwellBackends import cromwell.engine.workflow.WorkflowProcessingEventPublishing._ @@ -182,6 +183,19 @@ object MaterializeWorkflowDescriptorActor { s"'$optionName' is specified in workflow options but value is not of expected Double type: ${e.getMessage}".invalidNel } } + + def validateMaxRetriesMode(workflowOptions: WorkflowOptions): ErrorOr[MaxRetriesMode] = { + val modeString: Try[String] = workflowOptions.get(WorkflowOptions.MaxRetriesMode) match { + case Success(value) => Success(value) + case Failure(_: OptionNotFoundException) => Success(MaxRetriesMode.DefaultMode.toString) + case Failure(e) => Failure(e) + } + + modeString flatMap MaxRetriesMode.tryParse match { + case Success(mode) => mode.validNel + case Failure(t) => t.getMessage.invalidNel + } + } } // TODO WOM: need to decide where to draw the line between language specific initialization and WOM @@ -495,12 +509,15 @@ class MaterializeWorkflowDescriptorActor(override val serviceRegistryActor: Acto val memoryRetryMultiplierValidation: ErrorOr[Unit] = validateMemoryRetryMultiplier(workflowOptions) + val maxRetriesModeValidation: ErrorOr[MaxRetriesMode] = validateMaxRetriesMode(workflowOptions) + (failureModeValidation, backendAssignmentsValidation, callCachingModeValidation, useReferenceDisksValidation, - memoryRetryMultiplierValidation - ) mapN { case (failureMode, backendAssignments, callCachingMode, _, _) => + memoryRetryMultiplierValidation, + maxRetriesModeValidation + ) mapN { case (failureMode, backendAssignments, callCachingMode, _, _, _) => val callable = womNamespace.executable.entryPoint val backendDescriptor = BackendWorkflowDescriptor(id, callable, diff --git a/server/src/test/scala/cromwell/engine/workflow/lifecycle/MaterializeWorkflowDescriptorActorSpec.scala b/server/src/test/scala/cromwell/engine/workflow/lifecycle/MaterializeWorkflowDescriptorActorSpec.scala index cf845af5d30..78ecd387944 100644 --- a/server/src/test/scala/cromwell/engine/workflow/lifecycle/MaterializeWorkflowDescriptorActorSpec.scala +++ b/server/src/test/scala/cromwell/engine/workflow/lifecycle/MaterializeWorkflowDescriptorActorSpec.scala @@ -84,6 +84,15 @@ class MaterializeWorkflowDescriptorActorSpec private val invalidMemoryRetryOptions5 = WorkflowOptions.fromJsonString(""" { "memory_retry_multiplier": true } """).get + private val validMaxRetriesModeOptions1 = + WorkflowOptions.fromJsonString(""" { "max_retries_mode": "AllErrors" } """).get + private val validMaxRetriesModeOptions2 = + WorkflowOptions.fromJsonString(""" { "max_retries_mode": "KnownErrors" } """).get + private val invalidMaxRetriesModeOptions1 = + WorkflowOptions.fromJsonString(""" { "max_retries_mode": "invalid value" } """).get + private val invalidMaxRetriesModeOptions2 = + WorkflowOptions.fromJsonString(""" { "max_retries_mode": true } """).get + before {} after { @@ -699,6 +708,27 @@ class MaterializeWorkflowDescriptorActorSpec } } + "accept valid max_retries_mode" in { + List(validMaxRetriesModeOptions1, validMaxRetriesModeOptions2, validOptions) map { options => + MaterializeWorkflowDescriptorActor.validateMaxRetriesMode(options) match { + case Valid(_) => // good! + case Invalid(_) => fail(s"max_retries_mode validation for $options failed but should have passed!") + } + } + } + + "reject invalid max_retries_mode" in { + List(invalidMaxRetriesModeOptions1, invalidMaxRetriesModeOptions2) map { options => + MaterializeWorkflowDescriptorActor.validateMaxRetriesMode(options) match { + case Invalid(errorsList) => + errorsList.head should startWith( + "Invalid max retries mode" + ) + case Valid(_) => fail(s"max_retries_mode validation for $options succeeded but should have failed!") + } + } + } + "fail materialization if memory_retry_multiplier is invalid" in { val materializeWfActor = system.actorOf( MaterializeWorkflowDescriptorActor.props(NoBehaviorActor,