[FSDP2] cast scale to float32 in precompute (#835)
Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
weifengpy committed Sep 11, 2024
1 parent b4d0768 commit 85d03de
Showing 1 changed file with 2 additions and 2 deletions.
torchao/float8/fsdp_utils.py (2 additions, 2 deletions)

```diff
@@ -59,7 +59,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
         return
 
     # inf-norm is equivalent to max(abs(w))
-    max_weights = torch._foreach_norm(weights, ord=math.inf, dtype=torch.float32)  # Partial
+    max_weights = torch._foreach_norm(weights, ord=math.inf)  # Partial
     amax_tensor = torch.stack(max_weights)  # Partial
     # clamp is dispatched through DTensor
     # it will issue a single all-reduce
@@ -69,7 +69,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
         scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)
     local_scale_tensor = scale_tensor.to_local()
     for i, float8_linear in enumerate(float8_linears):
-        float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i]
+        float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i].to(torch.float32)
 
 
 # FSDP pads its local tensor on dim-0. The subclass should be preserved such
```

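For readers skimming the diff, below is a minimal, self-contained sketch of the scale math these hunks touch, run on plain local tensors rather than the DTensor-wrapped FSDP2 weights the real function operates on. The `EPS` value, the bf16 weight dtype, and the `finfo(float8_e4m3fn).max / amax` scale formula are assumptions for illustration (that formula sits between the two hunks and is not shown above). The point it demonstrates: once `dtype=torch.float32` is dropped from `_foreach_norm`, the amax/scale math may run in the weight's own dtype, so the final scale is explicitly cast to float32 before it is stored.

```python
# Minimal sketch (assumed setup): per-weight float8 scale precompute on plain
# tensors, mirroring the math in precompute_float8_dynamic_scale_for_fsdp.
# In the real code the weights are FSDP2 DTensors and the clamp dispatches
# through DTensor (issuing a single all-reduce); here everything is local.
import math

import torch

EPS = 1e-12  # assumed epsilon; the library defines its own EPS constant

# stand-ins for the (possibly bf16) local weight shards
weights = [torch.randn(128, 256, dtype=torch.bfloat16) for _ in range(3)]

# inf-norm is equivalent to max(abs(w)); computed in the weight dtype now
# that the dtype= kwarg was dropped from _foreach_norm by this commit
max_weights = torch._foreach_norm(weights, ord=math.inf)
amax_tensor = torch.stack(max_weights)

amax_tensor = torch.clamp(amax_tensor, EPS)
# assumed standard float8 dynamic-scale formula (not shown in the hunks above)
scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor
if amax_tensor.dtype is torch.float16:
    scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)

# the fix: whatever dtype the amax/scale math ran in, store the scale as float32
precomputed_scales = [scale_tensor[i].to(torch.float32) for i in range(len(weights))]
assert all(s.dtype is torch.float32 for s in precomputed_scales)
```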