
Chao/xccl ut #12

Open: wants to merge 34 commits into base main from chao/xccl_ut.
Changes from all commits (34 commits):
b13b64d  enable XPU tests (zhangxiaoli73, Jan 9, 2025)
314b78e  update (Chao1Han, Jan 17, 2025)
88ed5d2  enable some XPU distributed tests (zhangxiaoli73, Jan 17, 2025)
b89bedf  add more changes for XPU (zhangxiaoli73, Jan 22, 2025)
3d97844  correct set index (Chao1Han, Jan 23, 2025)
9d89a1f  enable xpu tests (zhangxiaoli73, Jan 23, 2025)
fd57737  enable all tests on XPU (zhangxiaoli73, Jan 24, 2025)
c679690  enable TP (zhangxiaoli73, Jan 24, 2025)
8c306ce  add comm test (Chao1Han, Jan 24, 2025)
e601f57  enable FSDP2 (zhangxiaoli73, Feb 8, 2025)
d02fb2a  add pileline (Chao1Han, Feb 8, 2025)
0d682f6  update ddp (Chao1Han, Feb 10, 2025)
0f99a76  update (Chao1Han, Feb 10, 2025)
b794de0  update comm ut (Chao1Han, Feb 10, 2025)
c776d8a  fix some changes (zhangxiaoli73, Feb 10, 2025)
baed9bd  cuda to xpu (zhangxiaoli73, Feb 10, 2025)
cc24e89  fix fake pg and skip gloo test (Chao1Han, Feb 11, 2025)
a5f1ca3  enable tests (zhangxiaoli73, Feb 13, 2025)
954d63d  make changes (zhangxiaoli73, Feb 13, 2025)
7451b9d  fix fsdp (Chao1Han, Feb 13, 2025)
ce0ebf0  change (Chao1Han, Feb 14, 2025)
52c7074  change (Chao1Han, Feb 14, 2025)
213177f  change (Chao1Han, Feb 14, 2025)
7763c87  try to skip FSDPTestMultiThread for xpu (Chao1Han, Feb 19, 2025)
07ac0d1  update (Chao1Han, Feb 20, 2025)
5bb00c9  Merge remote-tracking branch 'upstream/main' into chao/xccl_ut (Chao1Han, Feb 20, 2025)
f78a563  update (Chao1Han, Feb 25, 2025)
96e78b0  update dtensor (Chao1Han, Feb 27, 2025)
55e5ddc  update xpu commit (Chao1Han, Mar 14, 2025)
b8fbdc1  update (Chao1Han, Mar 19, 2025)
64dae28  update fsdp test (Chao1Han, Mar 27, 2025)
c4b22e6  update (Chao1Han, Mar 28, 2025)
fe87579  update (Chao1Han, Mar 28, 2025)
da5ea88  hardcode world_size to 4 (Chao1Han, Apr 11, 2025)
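Taken together, these commits move the FSDP2 and DTensor distributed tests from CUDA-only assumptions to Intel XPU, and the file diffs below are mostly mechanical cuda -> xpu substitutions. As a hedged illustration of the underlying pattern only (not code from this PR; pick_device is a hypothetical helper name), a device-agnostic script would typically select its accelerator like this:

    import torch

    def pick_device() -> torch.device:
        """Prefer XPU when present, then CUDA, then CPU (illustrative helper)."""
        # hasattr guard: torch.xpu only exists in builds/versions with XPU support
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return torch.device("xpu")
        if torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    if __name__ == "__main__":
        device = pick_device()
        x = torch.rand((4, 8), device=device)
        print(f"running on {device}, checksum={x.sum().item():.4f}")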
25 changes: 12 additions & 13 deletions test/distributed/_composable/fsdp/test_fully_shard_autograd.py
@@ -21,7 +21,7 @@
     FSDPTestMultiThread,
     MLP,
 )
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_XPU
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
@@ -31,7 +31,7 @@
 class TestFullyShardAutograd(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(4, torch.cuda.device_count())
+        return min(4, torch.xpu.device_count())

     def _reduce_1d_partial_grads(
         self, module: nn.Module, group: Optional[dist.ProcessGroup] = None
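The hunk above derives the test's world size from the number of visible XPU devices, capped at 4 so machines with fewer cards still run a smaller configuration. A standalone, hedged sketch of the same idea (the helper name is illustrative, not from the PR):

    import torch

    def xpu_world_size(cap: int = 4) -> int:
        # torch.xpu.device_count() is 0 when no XPU is visible; the test
        # harness would then skip the multi-process test entirely.
        return min(cap, torch.xpu.device_count())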
@@ -58,7 +58,7 @@ def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]):
         local_batch_size = 2
         global_batch_size, dim = (self.world_size * local_batch_size, 24)
         model = DoubleLinear(dim=dim, use_second_linear=True)
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         fully_shard(model.lin1, reshard_after_forward=reshard_after_forward)
         fully_shard(model, reshard_after_forward=reshard_after_forward)
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
@@ -68,7 +68,7 @@ def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]):
         for iter_idx in range(10):
             # Use all forward outputs in the loss/backward for the first half
             # of the iterations and only the 1st forward output for the rest
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device="xpu")
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -104,7 +104,7 @@ def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]):
         local_batch_size, dim = (2, 24)
         global_batch_size = self.world_size * local_batch_size
         model = DoubleLinear(dim=dim, use_second_linear=False)
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         fully_shard(model.lin1, reshard_after_forward=reshard_after_forward)
         fully_shard(model.lin2, reshard_after_forward=reshard_after_forward)
         fully_shard(model, reshard_after_forward=reshard_after_forward)
@@ -113,7 +113,7 @@ def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]):

         torch.manual_seed(1)  # same on all ranks
         for iter_idx in range(10):
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device="xpu")
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -214,7 +214,7 @@ def forward(self, x: torch.Tensor):
             Module(dim),
             FromContainerType(container_type),
         )
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         for module in model:
             fully_shard(module)
         fully_shard(model)
@@ -223,7 +223,7 @@ def forward(self, x: torch.Tensor):

         torch.manual_seed(1)  # same on all ranks
         for iter_idx in range(10):
-            global_inp = torch.rand((global_batch_size, dim), device="cuda")
+            global_inp = torch.rand((global_batch_size, dim), device="xpu")
             local_inp = global_inp[
                 self.rank * local_batch_size : (self.rank + 1) * local_batch_size
             ].detach()
@@ -245,7 +245,6 @@ class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread):
     def world_size(self) -> int:
         return 2

-    @unittest.skipIf(not TEST_CUDA, "no cuda")
     def test_post_acc_grad_hook_runs(self):
         param_name_to_hook_count = collections.defaultdict(int)

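The hunk above drops the CUDA-only skip from the multi-thread hook test, while the first hunk of this file imports TEST_XPU alongside run_tests. One way such a guard could be made device-aware is sketched below; this is a hedged illustration, not code from the diff (requires_accelerator is a hypothetical name):

    import unittest

    from torch.testing._internal.common_utils import TEST_CUDA, TEST_XPU

    # Skip when neither a CUDA nor an XPU device is visible.
    requires_accelerator = unittest.skipIf(
        not (TEST_CUDA or TEST_XPU), "no CUDA or XPU device available"
    )

    class SmokeTest(unittest.TestCase):
        @requires_accelerator
        def test_device_visible(self):
            self.assertTrue(TEST_CUDA or TEST_XPU)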
@@ -260,7 +259,7 @@ def hook(param_name: str, param: torch.Tensor) -> None:
             param_hook = functools.partial(hook, param_name)
             param.register_post_accumulate_grad_hook(param_hook)

-        inp = torch.randn((2, 8), device="cuda")
+        inp = torch.randn((2, 8), device="xpu")
         model(inp).sum().backward()
         param_names = {param_name for param_name, _ in model.named_parameters()}
         self.assertEqual(param_names, set(param_name_to_hook_count.keys()))
@@ -271,7 +270,7 @@ def hook(param_name: str, param: torch.Tensor) -> None:
 class TestFullyShardPostAccGradHookMultiProcess(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(torch.cuda.device_count(), 2)
+        return min(torch.xpu.device_count(), 2)

     @skip_if_lt_x_gpu(2)
     def test_post_acc_grad_hook_optim_parity(self):
@@ -283,7 +282,7 @@ def test_post_acc_grad_hook_optim_parity(self):
         model_args = ModelArgs(dropout_p=0.0)
         model = Transformer(model_args)

-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         for module in itertools.chain(ref_model.layers, [ref_model]):
             fully_shard(module)
         optim_kwargs = {"lr": 1e-2, "foreach": False}
@@ -312,7 +311,7 @@ def optim_hook(param: nn.Parameter) -> None:
             param.register_post_accumulate_grad_hook(optim_hook)

         torch.manual_seed(42 + self.rank)
-        inp = torch.randint(0, model_args.vocab_size, (2, 16), device="cuda")
+        inp = torch.randint(0, model_args.vocab_size, (2, 16), device="xpu")
         for _ in range(10):
             ref_loss = ref_model(inp).sum()
             ref_loss.backward()
(Second changed file: FSDP2 clip-grad-norm tests; filename and change counts not shown in this view.)
@@ -33,9 +33,9 @@ def _test_clip_grad_norm(
         dp_mesh: Optional[DeviceMesh] = None,
     ):
         vector_norm_fn = functools.partial(torch.linalg.vector_norm, ord=norm_type)
-        dp_mesh = dp_mesh or init_device_mesh("cuda", (self.world_size,))
+        dp_mesh = dp_mesh or init_device_mesh("xpu", (self.world_size,))
         torch.manual_seed(42 + dp_mesh.get_local_rank() + 1)
-        for _ in range(10):
+        for iter_idx in range(10):
             ref_optim.zero_grad()
             ref_model(inp).sum().backward()
             optim.zero_grad()
@@ -91,22 +91,22 @@ def _test_clip_grad_norm(
 class TestClipGradNormWorldSize2(_TestClipGradNormBase):
     @property
     def world_size(self) -> int:
-        return min(torch.cuda.device_count(), 2)
+        return min(torch.xpu.device_count(), 2)

     @skip_if_lt_x_gpu(2)
     def test_clip_grad_norm_1d(self):
         for norm_type in (2, 1, float("inf")):
             torch.manual_seed(42)
             model_args = ModelArgs(dropout_p=0.0)
             model = Transformer(model_args)
-            ref_model = replicate(copy.deepcopy(model).cuda())
+            ref_model = replicate(copy.deepcopy(model).xpu())
             ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
             for module in model.modules():
                 if isinstance(module, TransformerBlock):
                     fully_shard(module)
             fully_shard(model)
             optim = torch.optim.Adam(model.parameters(), lr=1e-2)
-            inp = torch.randint(0, model.model_args.vocab_size, (3, 16), device="cuda")
+            inp = torch.randint(0, model.model_args.vocab_size, (3, 16), device="xpu")
             self._test_clip_grad_norm(
                 1, norm_type, ref_model, ref_optim, model, optim, inp
             )
@@ -115,14 +115,14 @@ def test_clip_grad_norm_1d(self):
 class TestClipGradNormWorldSize4(_TestClipGradNormBase):
     @property
     def world_size(self) -> int:
-        return min(torch.cuda.device_count(), 4)
+        return min(torch.xpu.device_count(), 4)

     @skip_if_lt_x_gpu(4)
     def test_clip_grad_norm_2d(self):
         for norm_type in (2, 1, 3, float("inf")):
             dp_size = 2
             global_mesh = init_device_mesh(
-                "cuda",
+                "xpu",
                 (dp_size, self.world_size // dp_size),
                 mesh_dim_names=("dp", "tp"),
             )
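For reference, the same 2D ("dp", "tp") mesh can be built on XPU outside the test harness. A hedged sketch, assuming at least 4 XPU devices and that the script is launched with torchrun so the default process group can form:

    import torch
    import torch.distributed as dist
    from torch.distributed.device_mesh import init_device_mesh

    # Launch with: torchrun --nproc-per-node=4 this_script.py
    if torch.xpu.device_count() >= 4:
        # init_device_mesh sets up the default process group if needed
        mesh = init_device_mesh("xpu", (2, 2), mesh_dim_names=("dp", "tp"))
        dp_mesh, tp_mesh = mesh["dp"], mesh["tp"]
        print(dist.get_rank(), dp_mesh, tp_mesh)
        dist.destroy_process_group()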
@@ -132,7 +132,7 @@ def test_clip_grad_norm_2d(self):
             # has some more significant numeric differences from the TP
             model = MLPStack(16, with_seq_parallel=True)
             ref_model = replicate(
-                copy.deepcopy(model).cuda(), process_group=dp_mesh.get_group()
+                copy.deepcopy(model).xpu(), process_group=dp_mesh.get_group()
             )
             ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
             model.parallelize(
@@ -142,7 +142,7 @@
                 reshard_after_forward=True,
             )
             optim = torch.optim.Adam(model.parameters(), lr=1e-2)
-            inp = torch.randn(2, 16, device="cuda")
+            inp = torch.randn(2, 16, device="xpu")
             self._test_clip_grad_norm(
                 0.5, norm_type, ref_model, ref_optim, model, optim, inp, dp_mesh
             )