From 07632b06ada3fde83cf3313f3934906cfc1b1841 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 1 May 2024 15:23:06 -0700 Subject: [PATCH] [Core][Distributed] fix pynccl del error (#4508) --- vllm/distributed/device_communicators/pynccl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 9434867e1b12..f21fcd262d81 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -200,6 +200,10 @@ def from_torch(cls, op: ReduceOp) -> int: ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p ] +# be cautious! ncclCommDestroy is a collective call: it will block until all +# processes in the communicator have called this function. +# because Python object destruction can happen in random order, +# it is better not to call it at all. # equivalent to c declaration: # ncclResult_t ncclCommDestroy(ncclComm_t comm); _c_ncclCommDestroy = nccl.ncclCommDestroy @@ -278,11 +282,3 @@ def all_reduce(self, ncclDataTypeEnum.from_torch(tensor.dtype), ncclRedOpTypeEnum.from_torch(op), self.comm, ctypes.c_void_p(stream.cuda_stream))) - - def __del__(self): - # `dist` module might have been already destroyed - if hasattr(dist, 'destroy_process_group'): - dist.destroy_process_group() - # function might have been already destroyed - if _c_ncclCommDestroy is not None: - _c_ncclCommDestroy(self.comm)