diff --git a/test/xpu/distributed/test_c10d_ops_xccl.py b/test/xpu/distributed/test_c10d_ops_xccl.py
index 5221404b9..4235fb399 100644
--- a/test/xpu/distributed/test_c10d_ops_xccl.py
+++ b/test/xpu/distributed/test_c10d_ops_xccl.py
@@ -41,6 +41,8 @@
 TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2
 
+print("11111111111111111111111111111111111\n", flush=True)
+
 
 class ProcessGroupXCCLOpTest(MultiProcContinousTest):
     @classmethod
@@ -60,6 +62,7 @@ def rank_to_GPU(self):
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_empty_tensors(self):
+        print("222222222222222222222222222222\n", flush=True)
         pg = self.pg
         local_device_idx = self.rank_to_GPU[self.rank][0]
@@ -97,6 +100,7 @@ def test_empty_tensors(self):
     @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_broadcast_ops(self, dtype: torch.dtype):
+        print("3333333333333333333333333333\n", flush=True)
         pg = self.pg
 
         def broadcast(xs, rootRank, rootTensor):
@@ -138,6 +142,7 @@ def broadcast(xs, rootRank, rootTensor):
     @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_allreduce_ops(self, dtype: torch.dtype):
+        print("44444444444444444444444444444444\n", flush=True)
         device_count = torch.xpu.device_count()
         pg = self.pg
         local_device_id = self.rank_to_GPU[self.rank][0]
@@ -197,6 +202,7 @@ def allreduce(tensors, op):
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_alltoall_ops_with_xpufree_race(self):
+        print("55555555555555555555555555\n", flush=True)
         pg = self.pg
         opts = c10d.AllToAllOptions()
         local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}"
@@ -922,20 +928,28 @@ def test_all_to_all_single_none(self):
     )
 
 
+print("##########before instantiate\n", flush=True)
 instantiate_parametrized_tests(ProcessGroupXCCLOpTest)
+print("##########after instantiate\n", flush=True)
 
 if __name__ == "__main__":
-    rank = int(os.getenv("RANK", -1))
+    rank = int(os.getenv("RANK", -1))  # noqa: UP032
     world_size = int(os.getenv("WORLD_SIZE", 2))
+    print(f"########## world_size {world_size} {rank}", flush=True)
 
     if rank != -1:
         # Launched with torchrun or other multi-proc launchers. Directly run the test.
+        print("\nbefore process group", flush=True)
         ProcessGroupXCCLOpTest.run_rank(rank, world_size)
+        print("\nafter process group", flush=True)
     else:
         # Launched as a single process. Spawn subprocess to run the tests.
         # Also need a rendezvous file for `init_process_group` purpose.
+
+        print("\nbefore multiprocess spawn", flush=True)
         rdvz_file = tempfile.NamedTemporaryFile(delete=False).name
         torch.multiprocessing.spawn(
             ProcessGroupXCCLOpTest.run_rank,
             nprocs=world_size,
             args=(world_size, rdvz_file),
         )
+        print("\nafter multiprocess spawn", flush=True)
diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py
index 0625a6993..ad3feb28a 100644
--- a/test/xpu/distributed/test_c10d_xccl.py
+++ b/test/xpu/distributed/test_c10d_xccl.py
@@ -91,12 +91,15 @@ def simple_reduce_tests(rank, world_size):
 
 TEST_MULTIXPU = torch.xpu.device_count() > 1
 
+print("22222222222222222222\n", flush=True)
+
 
 class RendezvousEnvTest(TestCase):
     @retry_on_connect_failures
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test")
     def test_common_errors(self):
+        print("333333333333333\n", flush=True)
         vars = {
             "WORLD_SIZE": "1",
             "RANK": "0",
@@ -233,6 +236,7 @@ def rank_to_GPU(self):
         torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs"
     )
     def test_close_multi_pg_unordered(self):
+        print("4444444444444444444444\n", flush=True)
         pg = self._create_process_group_xccl()
         device = self.rank_to_GPU[self.rank][0]
         t = torch.rand(10, 10, device=device)
@@ -556,5 +560,7 @@ class SetDeviceMethod(Enum):
     COLLECTIVE_ARGUMENT = auto()  # broadcast_object_list(device=)
 
+print("111111111111111111111111111111111\n", flush=True)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
index 0fa5bb337..f7df42091 100644
--- a/test/xpu/run_distributed.py
+++ b/test/xpu/run_distributed.py
@@ -20,9 +20,14 @@ def run(test_command):
     return result.returncode
 
 
+print("before testing distributed/test_c10d_ops_xccl.py\n", flush=True)
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+print("after testing distributed/test_c10d_ops_xccl.py\n", flush=True)
+
+print("before testing distributed/test_c10d_xccl.py\n", flush=True)
 test_command = ["python", "distributed/test_c10d_xccl.py"]
+print("before testing distributed/test_c10d_xccl.py\n", flush=True)
 res += run(test_command)
 
 # run pytest with skiplist
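Note (not part of the diff above): the prints added to the `__main__` hunk of test_c10d_ops_xccl.py bracket its two launch paths, either a torchrun-style launcher that sets `RANK`/`WORLD_SIZE` in the environment, or a single process that spawns one worker per rank with a file for rendezvous. Below is a minimal, hedged sketch of that dual launch path; `run_rank` is a hypothetical stand-in for `ProcessGroupXCCLOpTest.run_rank`, not the real test entry point.

# Minimal sketch of the launch logic instrumented above.
# Assumption: `run_rank` is a hypothetical stand-in for ProcessGroupXCCLOpTest.run_rank.
import os
import tempfile

import torch


def run_rank(rank, world_size, rdvz_file=None):
    # Placeholder worker; the real test class initializes the XCCL process
    # group for this rank and runs the collective-op tests.
    print(f"[rank {rank}/{world_size}] rendezvous file: {rdvz_file}", flush=True)


if __name__ == "__main__":
    rank = int(os.getenv("RANK", -1))  # set by torchrun and similar launchers
    world_size = int(os.getenv("WORLD_SIZE", 2))

    if rank != -1:
        # Multi-process launcher path: each launched process already owns a rank.
        run_rank(rank, world_size)
    else:
        # Single-process path: spawn one subprocess per rank and hand them a
        # shared file usable for file-based rendezvous in init_process_group.
        rdvz_file = tempfile.NamedTemporaryFile(delete=False).name
        torch.multiprocessing.spawn(
            run_rank,  # spawn invokes run_rank(local_rank, *args)
            nprocs=world_size,
            args=(world_size, rdvz_file),
        )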