check distributed test hang issue #1676

Open · wants to merge 1 commit into main
16 changes: 15 additions & 1 deletion test/xpu/distributed/test_c10d_ops_xccl.py
@@ -41,6 +41,8 @@

TEST_MULTIGPU = TEST_XPU and torch.xpu.device_count() >= 2

print("11111111111111111111111111111111111\n", flush=True)


class ProcessGroupXCCLOpTest(MultiProcContinousTest):
@classmethod
@@ -60,6 +62,7 @@ def rank_to_GPU(self):
@requires_xccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
def test_empty_tensors(self):
print("222222222222222222222222222222\n", flush=True)
pg = self.pg
local_device_idx = self.rank_to_GPU[self.rank][0]

@@ -97,6 +100,7 @@ def test_empty_tensors(self):
@parametrize("dtype", [torch.float32, torch.cfloat])
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
def test_broadcast_ops(self, dtype: torch.dtype):
print("3333333333333333333333333333\n", flush=True)
pg = self.pg

def broadcast(xs, rootRank, rootTensor):
@@ -138,6 +142,7 @@ def broadcast(xs, rootRank, rootTensor):
@parametrize("dtype", [torch.float32, torch.cfloat])
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
def test_allreduce_ops(self, dtype: torch.dtype):
print("44444444444444444444444444444444\n", flush=True)
device_count = torch.xpu.device_count()
pg = self.pg
local_device_id = self.rank_to_GPU[self.rank][0]
@@ -197,6 +202,7 @@ def allreduce(tensors, op):
@requires_xccl()
@skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
def test_alltoall_ops_with_xpufree_race(self):
print("55555555555555555555555555\n", flush=True)
pg = self.pg
opts = c10d.AllToAllOptions()
local_device = f"xpu:{self.rank_to_GPU[self.rank][0]}"
@@ -922,20 +928,28 @@ def test_all_to_all_single_none(self):
)


print("##########before instantiate\n", flush=True)
instantiate_parametrized_tests(ProcessGroupXCCLOpTest)
print("##########after instantiate\n", flush=True)
if __name__ == "__main__":
rank = int(os.getenv("RANK", -1))
rank = int(os.getenv("RANK", -1)) # noqa: UP032
world_size = int(os.getenv("WORLD_SIZE", 2))

print(f"########## world_size {world_size} {rank}", flush=True)
if rank != -1:
# Launched with torchrun or other multi-proc launchers. Directly run the test.
print("\nbefore process group", flush=True)
ProcessGroupXCCLOpTest.run_rank(rank, world_size)
print("\nafter process group", flush=True)
else:
# Launched as a single process. Spawn subprocess to run the tests.
# Also need a rendezvous file for `init_process_group` purpose.

print("\nbefore multiprocess spawn", flush=True)
rdvz_file = tempfile.NamedTemporaryFile(delete=False).name
torch.multiprocessing.spawn(
ProcessGroupXCCLOpTest.run_rank,
nprocs=world_size,
args=(world_size, rdvz_file),
)
print("\nafter multiprocess spawn", flush=True)
6 changes: 6 additions & 0 deletions test/xpu/distributed/test_c10d_xccl.py
@@ -91,12 +91,15 @@ def simple_reduce_tests(rank, world_size):

TEST_MULTIXPU = torch.xpu.device_count() > 1

print("22222222222222222222\n", flush=True)


class RendezvousEnvTest(TestCase):
@retry_on_connect_failures
@requires_xccl()
@skip_but_pass_in_sandcastle_if(not TEST_XPU, "No GPUs available, skipping test")
def test_common_errors(self):
print("333333333333333\n", flush=True)
vars = {
"WORLD_SIZE": "1",
"RANK": "0",
@@ -233,6 +236,7 @@ def rank_to_GPU(self):
torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs"
)
def test_close_multi_pg_unordered(self):
print("4444444444444444444444\n", flush=True)
pg = self._create_process_group_xccl()
device = self.rank_to_GPU[self.rank][0]
t = torch.rand(10, 10, device=device)
@@ -556,5 +560,7 @@ class SetDeviceMethod(Enum):
COLLECTIVE_ARGUMENT = auto() # broadcast_object_list(device=)


print("111111111111111111111111111111111\n", flush=True)

if __name__ == "__main__":
run_tests()
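
Editor's note, not part of the diff above: the numbered markers ("1111…", "2222…") are emitted by every rank, so once several processes interleave on the same stdout the output does not say which process reached which point. A small prefix helper is one way to make each marker attributable; the helper name and where it is called from are illustrative assumptions.

import os

def debug_print(msg: str) -> None:
    # Tag each message with the launcher-provided RANK (if any) and the PID,
    # so interleaved lines from different processes can be told apart.
    rank = os.getenv("RANK", "?")
    print(f"[rank {rank} pid {os.getpid()}] {msg}", flush=True)

# Usage, mirroring the markers in the diff:
# debug_print("entering test_common_errors")
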
5 changes: 5 additions & 0 deletions test/xpu/run_distributed.py
@@ -20,9 +20,14 @@ def run(test_command):
return result.returncode


print("before testing distributed/test_c10d_ops_xccl.py\n", flush=True)
test_command = ["python", "distributed/test_c10d_ops_xccl.py"]
res += run(test_command)
print("after testing distributed/test_c10d_ops_xccl.py\n", flush=True)

print("before testing distributed/test_c10d_xccl.py\n", flush=True)
test_command = ["python", "distributed/test_c10d_xccl.py"]
print("before testing distributed/test_c10d_xccl.py\n", flush=True)
res += run(test_command)

# run pytest with skiplist
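
Editor's note, not part of the diff above: the body of run() is elided in this diff (only "return result.returncode" is visible), so the sketch below is an assumption about its shape rather than the script's actual implementation. Giving the subprocess call a timeout would turn a hang in one test file into a visible failure instead of stalling the whole run_distributed.py sweep; the 1800-second value is illustrative.

import subprocess

def run_with_timeout(test_command: list[str], timeout_s: int = 1800) -> int:
    # Print the same before/after breadcrumbs as the PR, then run the test file,
    # treating a timeout as a non-zero return code rather than an indefinite hang.
    print(f"before testing {' '.join(test_command)}", flush=True)
    try:
        result = subprocess.run(test_command, timeout=timeout_s)
        code = result.returncode
    except subprocess.TimeoutExpired:
        print(f"TIMED OUT after {timeout_s}s: {' '.join(test_command)}", flush=True)
        code = 1
    print(f"after testing {' '.join(test_command)}", flush=True)
    return code
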