From f51776d3ed292c06ecc47298abec5ce906f8c45d Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sun, 20 Jun 2021 17:15:42 +0200 Subject: [PATCH] Update DDP backend `if dist.is_nccl_available()` (#3705) --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index fbda7320839a..19bd97faca1f 100644 --- a/train.py +++ b/train.py @@ -539,7 +539,7 @@ def main(opt): assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) - dist.init_process_group(backend="gloo", timeout=timedelta(seconds=60)) + dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60)) assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'