Skip to content

Commit

Permalink
Add WDL input to set number of retries. (#247)
Browse files Browse the repository at this point in the history
  • Loading branch information
kshakir authored Oct 31, 2023
1 parent da9fabb commit e358888
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ task run_check_pytorch_cuda_status {
Int? hardware_boot_disk_size_GB = 20
String? hardware_zones = "us-east1-d us-east1-c us-central1-a us-central1-c us-west1-b"
String? hardware_gpu_type = "nvidia-tesla-t4"
Int? hardware_premptible_tries = 2
Int? hardware_max_retries = 0
String? nvidia_driver_version = "470.82.01" # need >=465.19.01 for CUDA 11.3
}
command {
Expand All @@ -28,7 +30,8 @@ task run_check_pytorch_cuda_status {
gpuCount: 1
gpuType: "${hardware_gpu_type}"
nvidiaDriverVersion: "${nvidia_driver_version}"
maxRetries: 0
preemptible: hardware_premptible_tries
maxRetries: hardware_max_retries
}
}
Expand Down
5 changes: 4 additions & 1 deletion wdl/cellbender_remove_background.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ task run_cellbender_remove_background_gpu {
Int? hardware_disk_size_GB = 50
Int? hardware_boot_disk_size_GB = 20
Int? hardware_preemptible_tries = 2
Int? hardware_max_retries = 0
Int? hardware_cpu_count = 4
Int? hardware_memory_GB = 32
String? hardware_gpu_type = "nvidia-tesla-t4"
Expand Down Expand Up @@ -186,7 +187,7 @@ task run_cellbender_remove_background_gpu {
nvidiaDriverVersion: "${nvidia_driver_version}"
preemptible: hardware_preemptible_tries
checkpointFile: "ckpt.tar.gz"
maxRetries: 0 # can be used in case of a PAPI error code 2 failure to install GPU drivers
maxRetries: hardware_max_retries # can be used in case of a PAPI error code 2 failure to install GPU drivers
}
meta {
author: "Stephen Fleming"
Expand Down Expand Up @@ -214,6 +215,8 @@ task run_cellbender_remove_background_gpu {
{help: "Optional file only used by CellBender developers or those trying to benchmark CellBender remove-background on simulated data. Normally, this input would not be supplied."}
hardware_preemptible_tries :
{help: "If nonzero, CellBender will be run on a preemptible instance, at a lower cost. If preempted, your run will not start from scratch, but will start from a checkpoint that is saved by CellBender and recovered by Cromwell. For example, if hardware_preemptible_tries is 2, your run will attempt twice using preemptible instances, and if the job is preempted both times before completing, it will finish on a non-preemptible machine. The cost savings is significant. The potential drawback is that preemption wastes time."}
hardware_max_retries :
{help: "If nonzero, CellBender will be retried when it exits without success. If one also sets the memory_retry_multiplier workflow option, and the exit happens to be detected as an out-of-memory error, then the retry will also increase the memory allocated to the next run. The potential benefit is that one can start CellBender with less memory, and memory will be increased only when needed. The potential drawback is that the job will be retried even if the error is not a memory error."}
checkpoint_mins :
{help: "Time in minutes between creation of checkpoint files. Bear in mind that Cromwell copies checkpoints to a bucket every ten minutes."}
hardware_gpu_type :
Expand Down

0 comments on commit e358888

Please sign in to comment.