Skip to content

Commit 5506fc1

Browse files
allenwang28facebook-github-bot
authored andcommitted
Fix a few open source tests (#616)
Summary: Changes: - test_python_actors was failing due to rust bindings. This moves RDMA manager creation into a classmethod - test_allocator was failing due to an aggressive regex. Relaxed this. - test_rdma had a typo in `needs_rdma` - in test_mailbox.py, for whatever reason the cast was taking a really long time to complete. The logic is the same without the cast. - fixes some logic in `HAS_TENSOR_ENGINE` for deciding on whether or not to start RDMA manager actor. - Disable `test_actor_mesh.py` tests. For some reason, after D78292490, this test stopped cleaning up properly which affects downstream tests when running sequentially. Specifically, `builtins/test_log.py` and `builtins/test_random.py`. Syncing with pzhan9, there's currently an issue where unclean shutdown can lead to fatal error that's being actively worked on. We should re-enable these tests once that is resolved. remaining failures: - test_actor_error::test_supervigion_with_sending_error - test_controller:: test_timeout_warning Differential Revision: D78747980
1 parent 354b66c commit 5506fc1

File tree

8 files changed

+77
-65
lines changed

8 files changed

+77
-65
lines changed

monarch_rdma/extension/lib.rs

Lines changed: 56 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -231,76 +231,76 @@ impl PyRdmaManager {
231231
fn device(&self) -> &str {
232232
&self.device
233233
}
234-
}
235-
236-
/// Creates an RDMA manager actor on the given ProcMesh.
237-
/// Returns the actor mesh if RDMA is supported, None otherwise.
238-
#[pyfunction]
239-
fn create_rdma_manager_blocking<'py>(
240-
py: Python<'py>,
241-
proc_mesh: &PyProcMesh,
242-
) -> PyResult<Option<PyRdmaManager>> {
243-
if !ibverbs_supported() {
244-
tracing::info!("rdma is not enabled on this hardware");
245-
return Ok(None);
246-
}
247234

248-
// TODO - make this configurable
249-
let config = IbverbsConfig::default();
250-
tracing::debug!("rdma is enabled, using device {}", config.device);
235+
/// Creates an RDMA manager actor on the given ProcMesh.
236+
/// Returns the actor mesh if RDMA is supported, None otherwise.
237+
#[classmethod]
238+
fn create_rdma_manager_blocking<'py>(
239+
_cls: &Bound<'_, PyType>,
240+
py: Python<'py>,
241+
proc_mesh: &PyProcMesh,
242+
) -> PyResult<Option<PyRdmaManager>> {
243+
if !ibverbs_supported() {
244+
tracing::info!("rdma is not enabled on this hardware");
245+
return Ok(None);
246+
}
251247

252-
let tracked_proc_mesh = proc_mesh.try_inner()?;
253-
let device = config.device.to_string();
248+
// TODO - make this configurable
249+
let config = IbverbsConfig::default();
250+
tracing::debug!("rdma is enabled, using device {}", config.device);
254251

255-
let actor_mesh = signal_safe_block_on(py, async move {
256-
tracked_proc_mesh
257-
.spawn("rdma_manager", &config)
258-
.await
259-
.map_err(|err| PyException::new_err(err.to_string()))
260-
})??;
252+
let tracked_proc_mesh = proc_mesh.try_inner()?;
253+
let device = config.device.to_string();
261254

262-
Ok(Some(PyRdmaManager {
263-
inner: actor_mesh,
264-
device,
265-
}))
266-
}
255+
let actor_mesh = signal_safe_block_on(py, async move {
256+
tracked_proc_mesh
257+
.spawn("rdma_manager", &config)
258+
.await
259+
.map_err(|err| PyException::new_err(err.to_string()))
260+
})??;
267261

268-
/// Creates an RDMA manager actor on the given ProcMesh (async version).
269-
/// Returns the actor mesh if RDMA is supported, None otherwise.
270-
#[pyfunction]
271-
fn create_rdma_manager_nonblocking<'py>(
272-
py: Python<'py>,
273-
proc_mesh: &PyProcMesh,
274-
) -> PyResult<Bound<'py, PyAny>> {
275-
if !ibverbs_supported() {
276-
tracing::info!("rdma is not enabled on this hardware");
277-
return Ok(py.None().into_bound(py));
262+
Ok(Some(PyRdmaManager {
263+
inner: actor_mesh,
264+
device,
265+
}))
278266
}
279267

280-
// TODO - make this configurable
281-
let config = IbverbsConfig::default();
282-
tracing::debug!("rdma is enabled, using device {}", config.device);
268+
/// Creates an RDMA manager actor on the given ProcMesh (async version).
269+
/// Returns the actor mesh if RDMA is supported, None otherwise.
270+
#[classmethod]
271+
fn create_rdma_manager_nonblocking<'py>(
272+
_cls: &Bound<'_, PyType>,
273+
py: Python<'py>,
274+
proc_mesh: &PyProcMesh,
275+
) -> PyResult<Bound<'py, PyAny>> {
276+
if !ibverbs_supported() {
277+
tracing::info!("rdma is not enabled on this hardware");
278+
return Ok(py.None().into_bound(py));
279+
}
283280

284-
let tracked_proc_mesh = proc_mesh.try_inner()?;
285-
let device = config.device.to_string();
281+
// TODO - make this configurable
282+
let config = IbverbsConfig::default();
283+
tracing::debug!("rdma is enabled, using device {}", config.device);
286284

287-
pyo3_async_runtimes::tokio::future_into_py(py, async move {
288-
let actor_mesh = tracked_proc_mesh
289-
.spawn::<RdmaManagerActor>("rdma_manager", &config)
290-
.await
291-
.map_err(|err| PyException::new_err(err.to_string()))?;
285+
let tracked_proc_mesh = proc_mesh.try_inner()?;
286+
let device = config.device.to_string();
292287

293-
Ok(Some(PyRdmaManager {
294-
inner: actor_mesh,
295-
device,
296-
}))
297-
})
288+
pyo3_async_runtimes::tokio::future_into_py(py, async move {
289+
let actor_mesh = tracked_proc_mesh
290+
.spawn::<RdmaManagerActor>("rdma_manager", &config)
291+
.await
292+
.map_err(|err| PyException::new_err(err.to_string()))?;
293+
294+
Ok(Some(PyRdmaManager {
295+
inner: actor_mesh,
296+
device,
297+
}))
298+
})
299+
}
298300
}
299301

300302
pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> {
301303
module.add_class::<PyRdmaBuffer>()?;
302304
module.add_class::<PyRdmaManager>()?;
303-
module.add_function(wrap_pyfunction!(create_rdma_manager_blocking, module)?)?;
304-
module.add_function(wrap_pyfunction!(create_rdma_manager_nonblocking, module)?)?;
305305
Ok(())
306306
}

python/monarch/_rust_bindings/rdma/__init__.pyi

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@ class _RdmaMemoryRegionView:
1616
class _RdmaManager:
1717
device: str
1818
def __repr__(self) -> str: ...
19+
@classmethod
20+
def create_rdma_manager_blocking(proc_mesh: Any) -> Optional[_RdmaManager]: ...
21+
@classmethod
22+
async def create_rdma_manager_nonblocking(
23+
proc_mesh: Any,
24+
) -> Optional[_RdmaManager]: ...
1925

20-
def create_rdma_manager_blocking(proc_mesh: Any) -> Optional[_RdmaManager]: ...
21-
async def create_rdma_manager_nonblocking(proc_mesh: Any) -> Optional[_RdmaManager]: ...
2226
@final
2327
class _RdmaBuffer:
2428
name: str

python/monarch/_src/actor/proc_mesh.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,14 @@
6464
HAS_TENSOR_ENGINE = False
6565
try:
6666
from monarch._rust_bindings.rdma import ( # type: ignore[import]
67+
_RdmaBuffer,
6768
_RdmaManager,
68-
create_rdma_manager_blocking,
6969
)
7070

71-
HAS_TENSOR_ENGINE = True
71+
# type: ignore[16]
72+
HAS_TENSOR_ENGINE = _RdmaBuffer.rdma_supported()
7273
except ImportError:
7374
logging.warning("RDMA is not available on this platform")
74-
pass
7575

7676

7777
if TYPE_CHECKING:
@@ -118,7 +118,9 @@ def __init__(
118118
with fake_sync_state():
119119
if _mock_shape is None and HAS_TENSOR_ENGINE:
120120
# type: ignore[21]
121-
self._rdma_manager = create_rdma_manager_blocking(self._proc_mesh)
121+
self._rdma_manager = _RdmaManager.create_rdma_manager_blocking(
122+
self._proc_mesh
123+
)
122124
if not _is_initializing_debugger and _mock_shape is None:
123125
self._debug_manager = self.spawn(
124126
_DEBUG_MANAGER_ACTOR_NAME, DebugManager, debug_client()

python/tests/_monarch/test_actor_mesh.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ async def handle(
6868
)
6969

7070

71+
@pytest.mark.oss_skip
7172
async def test_bind_and_pickling() -> None:
7273
proc_mesh = await allocate()
7374
actor_mesh = await proc_mesh.spawn_nonblocking("test", MyActor)
@@ -115,13 +116,15 @@ async def verify_cast(
115116
await asyncio.wait_for(receiver.recv_task().into_future(), timeout=1)
116117

117118

119+
@pytest.mark.oss_skip
118120
@pytest.mark.timeout(30)
119121
async def test_cast_handle() -> None:
120122
proc_mesh = await allocate()
121123
actor_mesh = await proc_mesh.spawn_nonblocking("test", MyActor)
122124
await verify_cast(actor_mesh, proc_mesh.client, list(range(3 * 8 * 8)))
123125

124126

127+
@pytest.mark.oss_skip
125128
@pytest.mark.timeout(30)
126129
async def test_cast_ref() -> None:
127130
proc_mesh = await allocate()
@@ -187,13 +190,15 @@ async def verify_slice(
187190
assert again_shape.ranks() == selected_ranks, f"left is {sliced_shape.ranks()}"
188191

189192

193+
@pytest.mark.oss_skip
190194
@pytest.mark.timeout(30)
191195
async def test_slice_actor_mesh_handle() -> None:
192196
proc_mesh = await allocate()
193197
actor_mesh = await proc_mesh.spawn_nonblocking("test", MyActor)
194198
await verify_slice(actor_mesh, proc_mesh.client)
195199

196200

201+
@pytest.mark.oss_skip
197202
@pytest.mark.timeout(30)
198203
async def test_slice_actor_mesh_ref() -> None:
199204
proc_mesh = await allocate()

python/tests/_monarch/test_mailbox.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,5 +178,5 @@ def my_reduce(state: str, update: str) -> str:
178178
)
179179

180180
messge = await asyncio.wait_for(receiver.recv_task().into_future(), timeout=5)
181-
value = cast(str, pickle.loads(messge.message))
181+
value = pickle.loads(messge.message)
182182
assert "[reduced](start+msg0)" in value

python/tests/test_allocator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def dummy(self) -> None:
281281

282282
with self.assertRaisesRegex(
283283
Exception,
284-
r"(?s)Remote actor <class 'monarch.python.tests.test_allocator.FailInitActor'>.__init__ call failed.*fail on init",
284+
r"(?s)fail on init",
285285
):
286286
await actor_mesh.dummy.call()
287287

python/tests/test_python_actors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ def check(module, path):
237237
if isinstance(value, ModuleType):
238238
check(value, f"{path}.{name}")
239239
elif hasattr(value, "__module__"):
240+
print(f"{name} {value.__module__} {path}")
240241
assert value.__name__ == name
241242
assert value.__module__ == path
242243

python/tests/test_rdma.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
reason="CUDA not available",
1717
)
1818
needs_rdma = pytest.mark.skipif(
19-
not rdma_available,
19+
not rdma_available(),
2020
reason="RDMA not available",
2121
)
2222

0 commit comments

Comments
 (0)