Skip to content

Commit a465298

Browse files
committed
add flag
1 parent 84ff679 commit a465298

File tree

3 files changed

+17
-27
lines changed

3 files changed

+17
-27
lines changed

csrc/deep_ep.cpp

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,10 @@ size_t get_size_align_to_granularity(size_t size_raw, size_t granularity) {
4545
return size;
4646
}
4747

48-
bool support_fabric() {
49-
int device_count;
50-
CUDA_CHECK(cudaGetDeviceCount(&device_count));
51-
52-
for (int device = 0; device < device_count; ++device) {
53-
int support = 0;
54-
CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
55-
if (!support) {
56-
return false;
57-
}
58-
}
59-
60-
return true;
61-
}
62-
63-
SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {}
48+
SharedMemoryAllocator::SharedMemoryAllocator(bool use_fabric) : use_fabric(use_fabric) {}
6449

6550
void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
66-
if (enable_fabric) {
51+
if (use_fabric) {
6752
CUdevice device;
6853
CU_CHECK(cuCtxGetDevice(&device));
6954

@@ -90,7 +75,7 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
9075
}
9176

9277
void SharedMemoryAllocator::free(void* ptr) {
93-
if (enable_fabric) {
78+
if (use_fabric) {
9479
cu_mem_free(ptr);
9580
} else {
9681
CUDA_CHECK(cudaFree(ptr));
@@ -103,7 +88,7 @@ void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) {
10388

10489
mem_handle->size = size;
10590

106-
if (enable_fabric) {
91+
if (use_fabric) {
10792
CUmemGenericAllocationHandle handle;
10893
CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
10994

@@ -114,7 +99,7 @@ void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) {
11499
}
115100

116101
void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) {
117-
if (enable_fabric) {
102+
if (use_fabric) {
118103
size_t size = mem_handle->size;
119104

120105
CUmemGenericAllocationHandle handle;
@@ -129,7 +114,7 @@ void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) {
129114
}
130115

131116
void SharedMemoryAllocator::close_mem_handle(void* ptr) {
132-
if (enable_fabric) {
117+
if (use_fabric) {
133118
cu_mem_free(ptr);
134119
} else {
135120
CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
@@ -145,15 +130,17 @@ Buffer::Buffer(int rank,
145130
int64_t num_rdma_bytes,
146131
bool low_latency_mode,
147132
bool explicitly_destroy,
148-
bool enable_shrink)
133+
bool enable_shrink,
134+
bool use_fabric)
149135
: rank(rank),
150136
num_ranks(num_ranks),
151137
num_nvl_bytes(num_nvl_bytes),
152138
num_rdma_bytes(num_rdma_bytes),
153139
enable_shrink(enable_shrink),
154140
low_latency_mode(low_latency_mode),
155141
explicitly_destroy(explicitly_destroy),
156-
comm_stream(at::cuda::getStreamFromPool(true)) {
142+
comm_stream(at::cuda::getStreamFromPool(true)),
143+
shared_memory_allocator(use_fabric) {
157144
// Metadata memory
158145
int64_t barrier_signal_bytes = NUM_MAX_NVL_PEERS * sizeof(int);
159146
int64_t buffer_ptr_bytes = NUM_MAX_NVL_PEERS * sizeof(void*);

csrc/deep_ep.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,14 @@ constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
3737

3838
class SharedMemoryAllocator {
3939
public:
40-
SharedMemoryAllocator();
40+
SharedMemoryAllocator(bool use_fabric);
4141
void malloc(void** ptr, size_t size);
4242
void free(void* ptr);
4343
void get_mem_handle(MemHandle* mem_handle, void* ptr);
4444
void open_mem_handle(void** ptr, MemHandle* mem_handle);
4545
void close_mem_handle(void* ptr);
4646
private:
47-
bool enable_fabric;
47+
bool use_fabric;
4848
};
4949
}
5050

@@ -118,7 +118,8 @@ struct Buffer {
118118
int64_t num_rdma_bytes,
119119
bool low_latency_mode,
120120
bool explicitly_destroy,
121-
bool enable_shrink);
121+
bool enable_shrink,
122+
bool use_fabric);
122123

123124
~Buffer() noexcept(false);
124125

deep_ep/buffer.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(self,
3737
num_qps_per_rank: int = 24,
3838
allow_nvlink_for_low_latency_mode: bool = True,
3939
allow_mnnvl: bool = False,
40+
use_fabric: bool = False,
4041
explicitly_destroy: bool = False,
4142
enable_shrink: bool = False,
4243
comm: Optional["mpi4py.MPI.Comm"] = None) -> None: # noqa: F821
@@ -55,6 +56,7 @@ def __init__(self,
5556
Warning: PCIe connections may lead to errors due to memory ordering issues,
5657
please make sure all connections are via NVLink.
5758
allow_mnnvl: whether to allow MNNVL
59+
use_fabric: whether to use the fabric API for memory buffers.
5860
enable_shrink: whether to enable shrink mode. When enabled, a mask buffer is allocated to support masking ranks dynamically.
5961
explicitly_destroy: If this flag is set to True, you need to explicitly call `destroy()` to release resources;
6062
otherwise, the resources will be released by the destructor.
@@ -88,7 +90,7 @@ def all_gather_object(obj):
8890
self.explicitly_destroy = explicitly_destroy
8991
self.enable_shrink = enable_shrink
9092
self.runtime = deep_ep_cpp.Buffer(self.rank, self.group_size, num_nvl_bytes, num_rdma_bytes, low_latency_mode, explicitly_destroy,
91-
enable_shrink)
93+
enable_shrink, use_fabric)
9294

9395
# Synchronize device IDs
9496
local_device_id = self.runtime.get_local_device_id()

0 commit comments

Comments
 (0)