diff --git a/.github/workflows/main_distributed.yaml b/.github/workflows/main_distributed.yaml
index b70da3617a..78afef687b 100644
--- a/.github/workflows/main_distributed.yaml
+++ b/.github/workflows/main_distributed.yaml
@@ -17,10 +17,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.8
+    - name: Set up Python 3.10
       uses: actions/setup-python@v2
       with:
-        python-version: 3.8
+        python-version: "3.10"
     - name: Install PyTorch
       uses: astral-sh/setup-uv@v6
     - name: Run Tests
diff --git a/distributed/tensor_parallelism/README.md b/distributed/tensor_parallelism/README.md
index b49d1672e8..ec61071e65 100644
--- a/distributed/tensor_parallelism/README.md
+++ b/distributed/tensor_parallelism/README.md
@@ -12,5 +12,5 @@ https://pytorch.org/docs/stable/distributed.tensor.parallel.html
 
 ```
 pip install -r requirements.txt
-python example.py
+torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 ```
diff --git a/distributed/tensor_parallelism/fsdp_tp_example.py b/distributed/tensor_parallelism/fsdp_tp_example.py
index dbab48c1b8..87935f10f0 100644
--- a/distributed/tensor_parallelism/fsdp_tp_example.py
+++ b/distributed/tensor_parallelism/fsdp_tp_example.py
@@ -77,10 +77,11 @@
 # create a sharding plan based on the given world_size.
 dp_size = _world_size // tp_size
+device_type = torch.accelerator.current_accelerator().type
 
 # Create a device mesh with 2 dimensions.
 # First dim is the data parallel dimension
 # Second dim is the tensor parallel dimension.
-device_mesh = init_device_mesh("cuda", (dp_size, tp_size), mesh_dim_names=("dp", "tp"))
+device_mesh = init_device_mesh(device_type, (dp_size, tp_size), mesh_dim_names=("dp", "tp"))
 
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")
 tp_mesh = device_mesh["tp"]
@@ -92,10 +93,10 @@
 # to mimic the behavior of the dataloader.
 dp_rank = dp_mesh.get_local_rank()
 
-# create model and move it to GPU - init"cuda"_mesh has already mapped GPU ids.
+# create model and move it to GPU - init_device_mesh has already mapped GPU ids.
 simple_llama2_config = ModelArgs(dim=256, n_layers=2, n_heads=16, vocab_size=32000)
-model = Transformer.from_model_args(simple_llama2_config).to("cuda")
+model = Transformer.from_model_args(simple_llama2_config).to(device_type)
 
 # init model weights
 model.init_weights()
 
@@ -170,7 +171,7 @@
 
 for i in range(num_iterations):
     # seeding with dp_rank to ensure identical inputs for TP groups
     torch.manual_seed(i + dp_rank)
-    inp = torch.randint(32000, (8, 256), device="cuda")
+    inp = torch.randint(32000, (8, 256), device=device_type)
     output = sharded_model(inp)
     output.sum().backward()
diff --git a/distributed/tensor_parallelism/log_utils.py b/distributed/tensor_parallelism/log_utils.py
index f16d46526d..d103df892b 100644
--- a/distributed/tensor_parallelism/log_utils.py
+++ b/distributed/tensor_parallelism/log_utils.py
@@ -17,6 +17,6 @@ def rank_log(_rank, logger, msg):
 
 def verify_min_gpu_count(min_gpus: int = 2) -> bool:
     """ verification that we have at least 2 gpus to run dist examples """
-    has_cuda = torch.cuda.is_available()
-    gpu_count = torch.cuda.device_count()
-    return has_cuda and gpu_count >= min_gpus
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
diff --git a/distributed/tensor_parallelism/requirements.txt b/distributed/tensor_parallelism/requirements.txt
index 80fad36bf2..4f47924993 100644
--- a/distributed/tensor_parallelism/requirements.txt
+++ b/distributed/tensor_parallelism/requirements.txt
@@ -1,6 +1,3 @@
 # Python dependencies required for running the example
---pre
---extra-index-url https://download.pytorch.org/whl/nightly/cu118
---extra-index-url https://download.pytorch.org/whl/nightly/cu121
-torch >= 2.3.0.dev0; sys_platform == "linux"
+torch >= 2.7.1; sys_platform == "linux"
 
diff --git a/distributed/tensor_parallelism/sequence_parallel_example.py b/distributed/tensor_parallelism/sequence_parallel_example.py
index 3324d28d4a..b145fbc95e 100644
--- a/distributed/tensor_parallelism/sequence_parallel_example.py
+++ b/distributed/tensor_parallelism/sequence_parallel_example.py
@@ -1,3 +1,5 @@
+# The following is an example command to run this code
+# torchrun --nnodes 1 --nproc-per-node 4 sequence_parallel_example.py
 import os
 import sys
 import torch
@@ -63,9 +65,10 @@ def forward(self, x):
 """
 
 logger = get_logger()
+device_type = torch.accelerator.current_accelerator().type
 # create a device mesh based on the given world_size.
 device_mesh = init_device_mesh(
-    device_type="cuda", mesh_shape=(int(os.environ["WORLD_SIZE"]),)
+    device_type=device_type, mesh_shape=(int(os.environ["WORLD_SIZE"]),)
 )
 
 _rank = device_mesh.get_rank()
@@ -75,7 +78,7 @@ def forward(self, x):
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")
 
 # create model and move it to GPU. Init_device_mesh has already assigned gpu ids...
-model = ToyModel().to("cuda")
+model = ToyModel().to(device_type)
 
 # Custom parallelization plan for the model
 sp_model = parallelize_module(
@@ -100,7 +103,7 @@ def forward(self, x):
 
 for i in range(num_iters):
     # For SP, input can be different across all ranks.
-    inp = torch.rand(20, 10, device="cuda")
+    inp = torch.rand(20, 10, device=device_type)
     output = sp_model(inp)
     output.sum().backward()
     optimizer.step()
diff --git a/distributed/tensor_parallelism/tensor_parallel_example.py b/distributed/tensor_parallelism/tensor_parallel_example.py
index 0b9c884507..b96f982f0c 100755
--- a/distributed/tensor_parallelism/tensor_parallel_example.py
+++ b/distributed/tensor_parallelism/tensor_parallel_example.py
@@ -1,3 +1,5 @@
+# The following is an example command to run this code
+# torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 import os
 import sys
 import torch
@@ -76,8 +78,8 @@ def forward(self, x):
 
 # create a device mesh based on the given world_size.
 _world_size = int(os.environ["WORLD_SIZE"])
-
-device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(_world_size,))
+device_type = torch.accelerator.current_accelerator().type
+device_mesh = init_device_mesh(device_type=device_type, mesh_shape=(_world_size,))
 
 _rank = device_mesh.get_rank()
 
@@ -88,8 +90,8 @@ def forward(self, x):
 
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")
 
-# create model and move it to GPU - init"cuda"_mesh has already mapped GPU ids.
-tp_model = ToyModel().to("cuda")
+# create model and move it to GPU - init_device_mesh has already mapped GPU ids.
+tp_model = ToyModel().to(device_type)
 
 # Custom parallelization plan for the model
 
@@ -116,7 +118,7 @@ def forward(self, x):
     # For TP, input needs to be same across all TP ranks.
     # Setting the random seed is to mimic the behavior of dataloader.
     torch.manual_seed(i)
-    inp = torch.rand(20, 10, device="cuda")
+    inp = torch.rand(20, 10, device=device_type)
     output = tp_model(inp)
     output.sum().backward()
     optimizer.step()
diff --git a/runtime.txt b/runtime.txt
index cc1923a40b..bd28b9c5c2 100644
--- a/runtime.txt
+++ b/runtime.txt
@@ -1 +1 @@
-3.8
+3.9
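One caveat on the `torch.accelerator` migration above: `torch.accelerator.current_accelerator()` returns `None` when PyTorch has no accelerator backend available (e.g. a CPU-only build), so the bare `.type` access added in these examples assumes an accelerator is present; `verify_min_gpu_count` in log_utils.py guards the same assumption at runtime. Below is a minimal sketch of a defensive fallback, where the helper name `get_device_type` is illustrative and not part of this diff:

```
import torch

def get_device_type() -> str:
    """Return the accelerator type if one is available, else fall back to CPU.

    torch.accelerator.current_accelerator() returns None when PyTorch was
    built without an accelerator backend, so guard before touching .type.
    """
    acc = torch.accelerator.current_accelerator()
    return acc.type if acc is not None else "cpu"

device_type = get_device_type()  # e.g. "cuda", "xpu", or "cpu"
```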