
Commit 8908273

allenwang28 authored and facebook-github-bot committed
(6/final) Updates rdma.py with tensor_engine RDMA (#582)
Summary: This diff:

- Marks rdma.py for deprecation
- Replaces the existing RDMABuffer stubs with the real monarch_extension based version
- Updates the examples/meta/rl toy_actor example to use RDMABuffer
- Splits out a few RDMA tests from test_python_actors.py into `test_rdma.py`

Differential Revision: D78366430
1 parent 0aae80c commit 8908273
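In practice, this moves user code from the pure-Python `monarch.rdma` stubs to the bindings-backed `monarch.tensor_engine` API. A minimal sketch of the new entry points (it assumes the code runs inside a Monarch proc mesh / actor context, which is elided here; the flat uint8 view requirement comes from the `RDMABuffer` docstring in the diff below):

```python
import torch

from monarch.tensor_engine import is_available, RDMABuffer

# RDMA support depends on the platform and the Rust bindings, so probe first.
if is_available():
    t = torch.zeros(64, dtype=torch.float32)
    # RDMABuffer only accepts 1D contiguous tensors with 1 byte per item;
    # expose the tensor as a flat uint8 view before wrapping it.
    buf = RDMABuffer(t.view(torch.uint8).flatten())
```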

6 files changed (+245, −337 lines)

python/monarch/_src/actor/proc_mesh.py

Lines changed: 8 additions & 7 deletions
```diff
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import logging
 import os
 import sys
 import warnings
@@ -24,7 +25,6 @@
 )
 
 from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
-
 from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     Alloc,
     AllocConstraints,
@@ -63,13 +63,14 @@
 
 HAS_TENSOR_ENGINE = False
 try:
-    # TODO: while the tensor_engine submodule doesn't exist yet, use the
-    # available of monarch.rdma as a proxy.
-    # type: ignore
-    from monarch.rdma import RDMAManager  # @manual
+    from monarch._rust_bindings.rdma import (  # type: ignore[import]
+        _RdmaManager,
+        create_rdma_manager_blocking,
+    )
 
     HAS_TENSOR_ENGINE = True
 except ImportError:
+    logging.warning("RDMA is not available on this platform")
     pass
 
 
@@ -102,7 +103,7 @@ def __init__(
         self._proc_mesh = hy_proc_mesh
         self._mock_shape: Optional[Shape] = _mock_shape
         # type: ignore[21]
-        self._rdma_manager: Optional["RDMAManager"] = None
+        self._rdma_manager: Optional["_RdmaManager"] = None
         self._debug_manager: Optional[DebugManager] = None
         self._mailbox: Mailbox = self._proc_mesh.client
         self._code_sync_client: Optional[CodeSyncMeshClient] = None
@@ -118,7 +119,7 @@ def __init__(
         with fake_sync_state():
             if _mock_shape is None and HAS_TENSOR_ENGINE:
                 # type: ignore[21]
-                self._rdma_manager = self.spawn("rdma_manager", RDMAManager).get()
+                self._rdma_manager = create_rdma_manager_blocking(self._proc_mesh)
         if not _is_initializing_debugger and _mock_shape is None:
             self._debug_manager = self.spawn(
                 _DEBUG_MANAGER_ACTOR_NAME, DebugManager, debug_client()
```
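The net effect of these hunks is a standard optional-dependency guard: the Rust RDMA bindings may be missing on some platforms, so the import failure is downgraded to a logged warning plus a flag that the rest of `ProcMesh.__init__` consults. In isolation, the pattern looks like this (a sketch, not the full module):

```python
import logging

HAS_TENSOR_ENGINE = False
try:
    # The Rust extension is only present on platforms with RDMA support.
    from monarch._rust_bindings.rdma import create_rdma_manager_blocking  # noqa: F401

    HAS_TENSOR_ENGINE = True
except ImportError:
    logging.warning("RDMA is not available on this platform")

# Later, RDMA setup is skipped rather than crashing the mesh, e.g.:
# if HAS_TENSOR_ENGINE: self._rdma_manager = create_rdma_manager_blocking(...)
```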

python/monarch/_src/tensor_engine/rdma.py

Lines changed: 10 additions & 5 deletions
```diff
@@ -9,12 +9,15 @@
 from typing import Optional
 
 import torch
-from monarch._rust_bindings.rdma import _RdmaBuffer
 
+try:
+    from monarch._rust_bindings.rdma import _RdmaBuffer
+except ImportError as e:
+    logging.error("RDMA is not available: {}".format(e))
+    raise e
+from monarch._src.actor.actor_mesh import MonarchContext
 from monarch._src.actor.future import Future
 
-from monarch.actor import MonarchContext
-
 
 # RDMARead/WriteTransferWarnings are warnings that are only printed once per process.
 # Remove these once GPU support is added.
@@ -30,7 +33,7 @@ class RDMAWriteTransferWarning(Warning):
 warnings.simplefilter("once", RDMAWriteTransferWarning)
 
 
-def rdma_supported():
+def is_available():
     return _RdmaBuffer.rdma_supported()
 
 
@@ -52,7 +55,9 @@ def __init__(self, data: torch.Tensor) -> None:
 
         TODO: Create TensorBuffer, which will be main user API supporting non-contiguous , multi-byte-per-elment tensors
         """
-        assert _RdmaBuffer.rdma_supported()
+        assert (
+            is_available()
+        ), "Tried to create an RDMABuffer, but RDMA is not available on this platform."
 
         if data.device.type != "cpu":
             # TODO - CUDA support for RDMABuffer exists at the Rust layer, but
```
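With `rdma_supported()` renamed to `is_available()` and the constructor now asserting with a readable message, callers can probe availability up front instead of hitting the `AssertionError`. A hedged sketch (`make_buffer` is a hypothetical helper, not part of the diff):

```python
import torch

from monarch.tensor_engine import is_available, RDMABuffer


def make_buffer(t: torch.Tensor) -> RDMABuffer:
    # Probe availability explicitly so callers can fall back gracefully
    # rather than relying on the constructor's assert.
    if not is_available():
        raise RuntimeError("RDMA unavailable; use a plain copy path instead")
    return RDMABuffer(t.view(torch.uint8).flatten())
```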

python/monarch/rdma.py

Lines changed: 6 additions & 152 deletions
```diff
@@ -4,158 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import ctypes
+import warnings
 
-from dataclasses import dataclass
-from typing import cast, Dict, Optional, Tuple
-
-import torch
-
-from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
-from monarch._src.actor.actor_mesh import (
-    _ActorMeshRefImpl,
-    Actor,
-    ActorMeshRef,
-    endpoint,
-    MonarchContext,
+warnings.warn(
+    "monarch.rdma is deprecated, please import from monarch.tensor_engine.rdma instead.",
+    DeprecationWarning,
+    stacklevel=2,
 )
 
-
-@dataclass
-class LocalRDMARecord:
-    data: torch.Tensor
-
-
-_local_buffers: Dict[int, "LocalRDMARecord"] = {}
-
-
-def _get_bytes(storage: torch.Tensor, offset: int, size: int) -> bytearray:
-    """Extracts a bytearray from a 1D, 1byte per item tensor."""
-    if offset + size > storage.numel():
-        raise ValueError(f"Read out of range: {offset + size} > {storage.size()}")
-    addr = storage.data_ptr()
-    if storage.device.type != "cpu":
-        result = bytearray(size)
-        result_tensor = torch.frombuffer(
-            result,
-            dtype=torch.uint8,
-        )
-        source_tensor = storage[offset:]
-        result_tensor.copy_(source_tensor)
-    else:
-        ctypes_array = (ctypes.c_byte * size).from_address(addr)
-        result = bytearray(ctypes_array)
-    return result
-
-
-class RDMAManager(Actor):
-    @staticmethod
-    def on_proc(proc_id: str) -> "RDMAManager":
-        ctx = MonarchContext.get()
-        return cast(
-            RDMAManager,
-            ActorMeshRef(
-                RDMAManager,
-                _ActorMeshRefImpl.from_actor_id(
-                    ctx.mailbox,
-                    ActorId.from_string(f"{proc_id}.rdma_manager[0]"),
-                ),
-                ctx.mailbox,
-            ),
-        )
-
-    @endpoint
-    async def drop(self, addr: int) -> None:
-        if addr in _local_buffers:
-            del _local_buffers[addr]
-
-    @endpoint
-    async def fetch(self, addr: int, offset: int, nbytes: int) -> bytearray:
-        if addr not in _local_buffers:
-            raise ValueError(f"Unknown buffer {addr}")
-        storage = _local_buffers[addr].data
-        return _get_bytes(storage, offset, nbytes)
-
-    @endpoint
-    async def put(self, addr: int, offset: int, bytes: bytearray) -> None:
-        if addr not in _local_buffers:
-            raise ValueError(f"Unknown buffer {addr}")
-        storage = _local_buffers[addr].data
-        storage[offset : offset + len(bytes)] = torch.frombuffer(
-            bytes, dtype=storage.dtype
-        )
-
-
-def _assert_tensor_is_1d_contiguous_uint8(t: torch.Tensor) -> None:
-    if t.ndim != 1:
-        raise ValueError(f"Tensor must be 1D, got {t.ndim}D")
-    if t.dtype != torch.uint8:
-        raise ValueError(f"Tensor must be uint8, got {t.dtype}")
-    if not t.is_contiguous():
-        raise ValueError("Tensor must be contiguous")
-
-
-class RDMABuffer:
-    def __init__(self, data: torch.Tensor) -> None:
-        """
-        RDMABuffer only supports 1D contiguous tensors that are 1 byte per item.
-
-        To create a 1 byte, 1D view, use t.view(torch.uint8).flatten()
-
-        TODO: Create TensorBuffer, which will be main user API supporting non-contiguous , multi-byte-per-elment tensors
-        """
-        _assert_tensor_is_1d_contiguous_uint8(data)
-        assert data.storage_offset() == 0
-        storage = data.untyped_storage()
-        self.addr: int = storage.data_ptr()
-        self.begin = 0
-        self.end: int = storage.size()
-        self.proc_id: str = MonarchContext.get().proc_id
-        self.local_data: object = None
-        _local_buffers[self.addr] = LocalRDMARecord(data)
-
-    def drop(self) -> None:
-        if self.proc_id is None:
-            del _local_buffers[self.addr]
-            return
-        rmda_actor = RDMAManager.on_proc(self.proc_id)
-        # pyre-ignore[16]: Undefined attribute [16]: `Endpoint` has no attribute `cast`.
-        rmda_actor.drop.cast(self.addr)
-
-    def __getstate__(self) -> Tuple[int, int, int, Optional[str]]:
-        proc_id = self.proc_id
-        # locally created RDMABuffer being set remotely,
-        # record its proc_id so we know how to establish connections to it
-        if proc_id is None:
-            proc_id = MonarchContext.get().proc_id
-        return (self.addr, self.begin, self.end, proc_id)
-
-    def __setstate__(self, state: Tuple[int, int, int, str]) -> None:
-        self.local_data = None
-        self.addr, self.begin, self.end, self.proc_id = state
-
-    async def read_into(self, dst: torch.Tensor, offset: int = 0) -> None:
-        """
-        Read data from the RDMABuffer into a destination tensor.
-
-        The destination tensor must be contiguous and 1 byte per item.
-        """
-        _assert_tensor_is_1d_contiguous_uint8(dst)
-        bytes = await RDMAManager.on_proc(self.proc_id).fetch.call_one(
-            self.addr, offset, dst.numel()
-        )
-        dst.copy_(torch.frombuffer(bytes, dtype=torch.uint8))
-
-    async def write(self, src: torch.Tensor, offset: int = 0) -> None:
-        """
-        Write data from a source tensor into the RDMABuffer.
-
-        The source tensor must be contiguous and 1 byte per item.
-        """
-        _assert_tensor_is_1d_contiguous_uint8(src)
-        bytes = _get_bytes(
-            src,
-            cast(int, src.storage_offset()),
-            src.numel(),
-        )
-        await RDMAManager.on_proc(self.proc_id).put.call_one(self.addr, offset, bytes)
+from monarch.tensor_engine import *  # noqa
```
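The module is now a pure deprecation shim: it warns at import time (`stacklevel=2` attributes the warning to the importer) and re-exports the new API. A quick way to observe the behavior, assuming this is the first import of `monarch.rdma` in the process (module-level warnings fire only once) and that the RDMA bindings are present so the re-export chain imports cleanly:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    # DeprecationWarning is ignored by default; surface it for the check.
    warnings.simplefilter("always")
    import monarch.rdma  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```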
python/monarch/tensor_engine/__init__.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Monarch Tensor Engine API - Public interface for tensor engine functionality.
+"""
+
+from monarch._src.tensor_engine.rdma import (
+    is_available,
+    RDMABuffer,
+    RDMAReadTransferWarning,
+    RDMAWriteTransferWarning,
+)
+
+__all__ = [
+    "is_available",
+    "RDMABuffer",
+    "RDMAReadTransferWarning",
+    "RDMAWriteTransferWarning",
+]
```
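Because the new package defines `__all__`, the star import in the `monarch.rdma` shim above re-exports exactly these four names. A small check of that surface (again assuming the RDMA bindings import cleanly, since `monarch._src.tensor_engine.rdma` re-raises on ImportError):

```python
import monarch.tensor_engine as te

# Uppercase names sort before lowercase ones in ASCII order.
assert sorted(te.__all__) == [
    "RDMABuffer",
    "RDMAReadTransferWarning",
    "RDMAWriteTransferWarning",
    "is_available",
]
```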
