Skip to content

Commit 0924fe1

Browse files
committed
add debuggability for baby pg
Summary: - running multiple processes has a few limitations - we can't get GPU profiles from subprocesses - the results can differ because CUDA uses a different context that can't run concurrently; this makes it hard to tell whether something is wrong with the code or whether it's an artefact of the CUDA context - use multiprocessing.dummy to use threads instead of processes Test Plan: using the patch with baby nccl, we can get overlapping communication and computation <img width="1539" alt="image" src="https://github.com/user-attachments/assets/39152858-1373-4318-8646-398141db3072" /> we cannot get the overlap when using multiple processes, indicating it has something to do with the CUDA context <img width="1537" alt="image" src="https://github.com/user-attachments/assets/6b823d8e-a152-4678-a7e4-b6b8d6b6bb54" />
1 parent b642f46 commit 0924fe1

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
Multiprocessing Dummy Context
9+
=============================
10+
11+
This module provides a context-like interface for multiprocessing.dummy,
12+
which is a wrapper around the threading module that provides a multiprocessing-like
13+
interface but uses threads instead of processes.
14+
15+
This allows code that uses multiprocessing.get_context() to work with
16+
multiprocessing.dummy by providing a compatible interface.
17+
"""
18+
19+
import multiprocessing.dummy as mp
20+
21+
22+
class DummyContext:
    """
    A context-like class for multiprocessing.dummy that mimics the interface
    of a context returned by multiprocessing.get_context().

    Each factory method forwards to the object of the same name in
    ``multiprocessing.dummy`` (imported in this module as ``mp``).
    """

    def __init__(self, method=None):
        """
        Initialize the dummy context.

        Args:
            method: Ignored, only for compatibility with multiprocessing.get_context()
        """
        pass

    def Process(self, *args, **kwargs):
        """
        Create a Process using multiprocessing.dummy.Process.

        ``multiprocessing.Process`` accepts a keyword-only ``daemon`` argument,
        but the constructor of ``multiprocessing.dummy.Process`` does not.
        To keep call sites written for a real context working, ``daemon`` is
        removed from ``kwargs`` and applied as an attribute on the created
        object instead.

        Args:
            *args: Positional arguments forwarded to multiprocessing.dummy.Process.
            **kwargs: Keyword arguments forwarded to multiprocessing.dummy.Process,
                except ``daemon`` which is handled here.

        Returns:
            The (not yet started) multiprocessing.dummy.Process object.
        """
        # ``None`` means "not specified": leave the default daemon flag alone.
        daemon = kwargs.pop("daemon", None)
        process = mp.Process(*args, **kwargs)
        if daemon is not None:
            process.daemon = daemon
        return process

    def Pipe(self, duplex=True):
        """
        Create a Pipe using multiprocessing.dummy.Pipe.

        Args:
            duplex: Forwarded unchanged to multiprocessing.dummy.Pipe.
        """
        return mp.Pipe(duplex)

    def Queue(self, maxsize=0):
        """
        Create a Queue using multiprocessing.dummy.Queue.

        Args:
            maxsize: Forwarded unchanged to multiprocessing.dummy.Queue.
        """
        return mp.Queue(maxsize)

    def Event(self):
        """
        Create an Event using multiprocessing.dummy.Event.
        """
        return mp.Event()

    def Lock(self):
        """
        Create a Lock using multiprocessing.dummy.Lock.
        """
        return mp.Lock()

    def RLock(self):
        """
        Create an RLock using multiprocessing.dummy.RLock.
        """
        return mp.RLock()

    def Semaphore(self, value=1):
        """
        Create a Semaphore using multiprocessing.dummy.Semaphore.

        Args:
            value: Forwarded unchanged to multiprocessing.dummy.Semaphore.
        """
        return mp.Semaphore(value)

    def BoundedSemaphore(self, value=1):
        """
        Create a BoundedSemaphore using multiprocessing.dummy.BoundedSemaphore.

        Args:
            value: Forwarded unchanged to multiprocessing.dummy.BoundedSemaphore.
        """
        return mp.BoundedSemaphore(value)

    def Condition(self, lock=None):
        """
        Create a Condition using multiprocessing.dummy.Condition.

        Args:
            lock: Forwarded unchanged to multiprocessing.dummy.Condition.
        """
        return mp.Condition(lock)

    def Manager(self):
        """
        Create a Manager using multiprocessing.dummy.Manager.
        """
        return mp.Manager()
96+
97+
98+
def get_context(method=None):
    """
    Build a thread-backed replacement for a multiprocessing context.

    Mirrors the call signature of multiprocessing.get_context() but hands
    back a DummyContext, whose factories forward to multiprocessing.dummy.
    It can be installed on multiprocessing.dummy like so:

    ```
    import multiprocessing.dummy as mp
    from torchft.multiprocessing_dummy_context import get_context
    mp.get_context = get_context
    ```

    Args:
        method: Passed on to DummyContext, which ignores it; accepted only
            for compatibility with multiprocessing.get_context()

    Returns:
        A new DummyContext instance
    """
    context = DummyContext(method)
    return context

0 commit comments

Comments
 (0)