Description
There is a node with 8 GPUs, and I can't train my model on an arbitrary set of 4 of them; it only works when the GPU ids are 0,1,2,3.
How can I use any combination of the 8 GPUs? Thanks.
-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/home/lab-chen.qi/anaconda3/envs/torch17/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/lab-chen.qi/sc/resweightv1/tiny_imagenet_multi.py", line 223, in main_worker
    torch.cuda.set_device(gpu)
  File "/home/lab-chen.qi/anaconda3/envs/torch17/lib/python3.7/site-packages/torch/cuda/__init__.py", line 263, in set_device
    torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: invalid device ordinal
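I think the error itself can be reproduced outside of mp.spawn. A minimal sketch of what I mean (assuming CUDA_VISIBLE_DEVICES is set before the first CUDA call, as in my script):

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,4'  # expose 4 of the 8 physical GPUs

import torch
# the 4 visible GPUs are re-indexed as ordinals 0..3 inside this process
print(torch.cuda.device_count())  # 4
torch.cuda.set_device(3)          # ok: 3 is a valid ordinal
torch.cuda.set_device(4)          # RuntimeError: CUDA error: invalid device ordinal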
Some of my code:
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.utils.data.distributed
import torch.multiprocessing as mp
import argparse
import os
parser = argparse.ArgumentParser(description='multi process')
parser.add_argument('--gpu-id', type=str, default='0,1,2,4')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=0, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://localhost:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
args = parser.parse_args()


def main():
    global args
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    # args.gpu = list(map(int, args.gpu_id.split(',')))
    # state = {k: v for k, v in args._get_kwargs()}
    # ngpus_per_node = torch.cuda.device_count()  # len(args.gpu)
    ngpus_per_node = len(args.gpu_id.split(','))
    # print(os.environ['CUDA_VISIBLE_DEVICES'])
    # print('visible gpus', ngpus_per_node)
    args.nprocs = ngpus_per_node
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))


# Random seed
# best_acc = 0  # best test accuracy
def main_worker(local_rank, ngpus_per_node, args):
    # global best_acc
    # start from epoch 0 or last checkpoint epoch
    # if not os.path.isdir(args.checkpoint):
    #     mkdir_p(args.checkpoint)
    # import pdb
    # pdb.set_trace()
    gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
    gpu = int(gpus[local_rank])
    args.gpu = gpu
    best_acc = 0
    # print(best_acc)
    args.rank = args.rank * ngpus_per_node + local_rank  # args.gpu[gpu]
    print('rank: {} / {}'.format(args.rank, args.world_size))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(gpu)


if __name__ == '__main__':
    main()
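From the traceback, my guess is that main_worker passes the physical GPU id (e.g. 4) to torch.cuda.set_device, while only the re-indexed ordinals 0..3 exist once CUDA_VISIBLE_DEVICES='0,1,2,4' is set. A minimal sketch of the change I am considering (using local_rank directly as the device ordinal; all other arguments are the same as in my script above):

def main_worker(local_rank, ngpus_per_node, args):
    # CUDA_VISIBLE_DEVICES already restricts the process to the chosen GPUs,
    # and they are re-indexed as 0..ngpus_per_node-1, so local_rank itself
    # should be the device ordinal to use.
    args.gpu = local_rank
    args.rank = args.rank * ngpus_per_node + local_rank
    print('rank: {} / {}'.format(args.rank, args.world_size))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(local_rank)

I am not sure this is the right approach, which is why I am asking here.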
I also tried the suggestion in Lightning-AI/pytorch-lightning#3791, but it doesn't work either.