diff --git a/test/xpu/distributed/test_c10d_ops_xccl.py b/test/xpu/distributed/test_c10d_ops_xccl.py
index 5221404b9..4235fb399 100644
--- a/test/xpu/distributed/test_c10d_ops_xccl.py
+++ b/test/xpu/distributed/test_c10d_ops_xccl.py
@@ -41,6 +41,8 @@
 TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2
 
+print("11111111111111111111111111111111111\n", flush=True)
+
 
 class ProcessGroupXCCLOpTest(MultiProcContinousTest):
     @classmethod
@@ -60,6 +62,7 @@ def rank_to_GPU(self):
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_empty_tensors(self):
+        print("222222222222222222222222222222\n", flush=True)
         pg = self.pg
         local_device_idx = self.rank_to_GPU[self.rank][0]
@@ -97,6 +100,7 @@ def test_empty_tensors(self):
     @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_broadcast_ops(self, dtype: torch.dtype):
+        print("3333333333333333333333333333\n", flush=True)
         pg = self.pg
 
         def broadcast(xs, rootRank, rootTensor):
@@ -138,6 +142,7 @@ def broadcast(xs, rootRank, rootTensor):
     @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_allreduce_ops(self, dtype: torch.dtype):
+        print("44444444444444444444444444444444\n", flush=True)
         device_count = torch.xpu.device_count()
         pg = self.pg
         local_device_id = self.rank_to_GPU[self.rank][0]
@@ -197,6 +202,7 @@ def allreduce(tensors, op):
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
     def test_alltoall_ops_with_xpufree_race(self):
+        print("55555555555555555555555555\n", flush=True)
         pg = self.pg
         opts = c10d.AllToAllOptions()
         local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}"
@@ -922,20 +928,28 @@ def test_all_to_all_single_none(self):
     )
 
 
+print("##########before instantiate\n", flush=True)
 instantiate_parametrized_tests(ProcessGroupXCCLOpTest)
+print("##########after instantiate\n", flush=True)
 
 if __name__ == "__main__":
-    rank = int(os.getenv("RANK", -1))
+    rank = int(os.getenv("RANK", -1))  # noqa: UP032
     world_size = int(os.getenv("WORLD_SIZE", 2))
+    print(f"########## world_size {world_size} {rank}", flush=True)
 
     if rank != -1:
         # Launched with torchrun or other multi-proc launchers. Directly run the test.
+        print("\nbefore process group", flush=True)
         ProcessGroupXCCLOpTest.run_rank(rank, world_size)
+        print("\nafter process group", flush=True)
     else:
         # Launched as a single process. Spawn subprocess to run the tests.
         # Also need a rendezvous file for `init_process_group` purpose.
+
+        print("\nbefore multiprocess spawn", flush=True)
         rdvz_file = tempfile.NamedTemporaryFile(delete=False).name
         torch.multiprocessing.spawn(
             ProcessGroupXCCLOpTest.run_rank,
             nprocs=world_size,
             args=(world_size, rdvz_file),
         )
+        print("\nafter multiprocess spawn", flush=True)
diff --git a/test/xpu/distributed/test_c10d_xccl.py b/test/xpu/distributed/test_c10d_xccl.py
index 0625a6993..ad3feb28a 100644
--- a/test/xpu/distributed/test_c10d_xccl.py
+++ b/test/xpu/distributed/test_c10d_xccl.py
@@ -91,12 +91,15 @@ def simple_reduce_tests(rank, world_size):
 
 TEST_MULTIXPU = torch.xpu.device_count() > 1
 
+print("22222222222222222222\n", flush=True)
+
 
 class RendezvousEnvTest(TestCase):
     @retry_on_connect_failures
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test")
     def test_common_errors(self):
+        print("333333333333333\n", flush=True)
         vars = {
             "WORLD_SIZE": "1",
             "RANK": "0",
@@ -233,6 +236,7 @@ def rank_to_GPU(self):
         torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs"
     )
     def test_close_multi_pg_unordered(self):
+        print("4444444444444444444444\n", flush=True)
         pg = self._create_process_group_xccl()
         device = self.rank_to_GPU[self.rank][0]
         t = torch.rand(10, 10, device=device)
@@ -556,5 +560,7 @@ class SetDeviceMethod(Enum):
     COLLECTIVE_ARGUMENT = auto()  # broadcast_object_list(device=)
 
+print("111111111111111111111111111111111\n", flush=True)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py
index 0fa5bb337..f7df42091 100644
--- a/test/xpu/run_distributed.py
+++ b/test/xpu/run_distributed.py
@@ -20,9 +20,14 @@ def run(test_command):
     return result.returncode
 
 
+print("before testing distributed/test_c10d_ops_xccl.py\n", flush=True)
 test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
 res += run(test_command)
+print("after testing distributed/test_c10d_ops_xccl.py\n", flush=True)
+
+print("before testing distributed/test_c10d_xccl.py\n", flush=True)
 test_command = ["python", "distributed/test_c10d_xccl.py"]
+print("before testing distributed/test_c10d_xccl.py\n", flush=True)
 res += run(test_command)
 
 # run pytest with skiplist
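Note (not part of the diff above): the prints added to the `__main__` hunk of test_c10d_ops_xccl.py bracket its two launch paths, either a torchrun-style launcher that sets `RANK`/`WORLD_SIZE` in the environment, or a single process that spawns one worker per rank with a file for rendezvous. Below is a minimal, hedged sketch of that dual launch path; `run_rank` is a hypothetical stand-in for `ProcessGroupXCCLOpTest.run_rank`, not the real test entry point.

# Minimal sketch of the launch logic instrumented above.
# Assumption: `run_rank` is a hypothetical stand-in for ProcessGroupXCCLOpTest.run_rank.
import os
import tempfile

import torch


def run_rank(rank, world_size, rdvz_file=None):
    # Placeholder worker; the real test class initializes the XCCL process
    # group for this rank and runs the collective-op tests.
    print(f"[rank {rank}/{world_size}] rendezvous file: {rdvz_file}", flush=True)


if __name__ == "__main__":
    rank = int(os.getenv("RANK", -1))  # set by torchrun and similar launchers
    world_size = int(os.getenv("WORLD_SIZE", 2))

    if rank != -1:
        # Multi-process launcher path: each launched process already owns a rank.
        run_rank(rank, world_size)
    else:
        # Single-process path: spawn one subprocess per rank and hand them a
        # shared file usable for file-based rendezvous in init_process_group.
        rdvz_file = tempfile.NamedTemporaryFile(delete=False).name
        torch.multiprocessing.spawn(
            run_rank,  # spawn invokes run_rank(local_rank, *args)
            nprocs=world_size,
            args=(world_size, rdvz_file),
        )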