
Commit 2abd7e3

Allow NVSHMEM PE-to-NIC mapping to be initialized by rank
The `nvshmemi_get_devices_by_distance` default initialization method in NVSHMEM does not work optimally for GPU configurations where 2 GPUs and 2 RDMA NICs share a PCIe bus, such as the x86-based GCP A3 Ultra (H200) and A4 (B200) instance types: https://cloud.google.com/compute/docs/gpus/gpu-network-bandwidth#h200-gpus. GPU0 and GPU1 (on two independent processes) can each observe that NIC0 and NIC1 on the same PCIe switch are equidistant, so both GPUs end up using NIC0, halving the observed RDMA bandwidth in test_internode.py and in vLLM wide-EP.

The alternative is a static mapping between GPU host index (PE) and NIC index (HCA), but the NVSHMEMX_INIT_WITH_UNIQUEID initialization method bypasses setting `mype_node` and `npes_node`. With this initialization method, `nvshmemi_boot_handle.pg_rank` is always 0 and `nvshmemi_boot_handle.pg_size` is always 2, preventing NVSHMEM_ENABLE_NIC_PE_MAPPING from leveraging a static list of devices in transport.cpp#nvshmemi_setup_connections: `selected_devices[0] = nvshmemi_state->mype_node % (tcurr->n_devices > 0 ? tcurr->n_devices : 1);` evaluates with `mype_node = 0` on every rank, so all PEs select the same device.

To allow static assignment, introduce a DEEP_EP_DEVICE_TO_HCA_MAPPING environment variable, read during Buffer Python initialization, that accepts `<cuda_device_id>:<HCA_name>:<HCA_port>` entries, resolves `torch.cuda.current_device()` against them, and sets NVSHMEM_HCA_LIST to the matching value or raises an error.

Co-Authored-By: Keon Jang <[email protected]>
Signed-off-by: Clayton Coleman <[email protected]>
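For illustration, a minimal per-rank sketch of how the variable might be populated before constructing the DeepEP `Buffer` on an 8-GPU, 8-NIC host. The HCA names (mlx5_0..mlx5_7), the ":1" port suffix, and the `Buffer(...)` constructor arguments are assumptions for this sketch, not part of this commit.

import os
import torch
import torch.distributed as dist
from deep_ep import Buffer

# Assumed topology: 8 local GPUs, one RDMA NIC per GPU, NICs named mlx5_0..mlx5_7
# with port 1 (verify real names with `ibv_devinfo` on the target host).
os.environ['DEEP_EP_DEVICE_TO_HCA_MAPPING'] = ','.join(f'{i}:mlx5_{i}:1' for i in range(8))

dist.init_process_group(backend='nccl')
torch.cuda.set_device(dist.get_rank() % 8)

# Buffer reads DEEP_EP_DEVICE_TO_HCA_MAPPING during initialization and sets
# NVSHMEM_HCA_LIST for this process based on torch.cuda.current_device().
# Constructor arguments below are placeholders, not prescribed by this commit.
buffer = Buffer(dist.group.WORLD, num_rdma_bytes=1 << 30, low_latency_mode=True, num_qps_per_rank=8)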
1 parent bfded34 commit 2abd7e3

File tree

1 file changed: +25 -0 lines changed


deep_ep/buffer.py

Lines changed: 25 additions & 0 deletions
@@ -101,6 +101,8 @@ def all_gather_object(obj):
         # Synchronize NVSHMEM unique IDs
         root_unique_id = None
         if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
+            self._setup_device_hca_mapping()
+
             # Enable IBGDA
             assert num_qps_per_rank > 0
             os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1'
@@ -133,6 +135,29 @@ def all_gather_object(obj):
         self.runtime.sync(device_ids, ipc_handles, root_unique_id)
         assert self.runtime.is_available()
 
+    def _setup_device_hca_mapping(self):
+        """
+        Set up the device-to-NIC mapping using the DEEP_EP_DEVICE_TO_HCA_MAPPING environment variable.
+        The mapping format is "0:mlx5_0:1,1:mlx5_1:1,...", where each entry maps a CUDA device ID
+        to an HCA name, separated by a colon. The HCA name may include additional suffixes such as ":1".
+        """
+        if 'DEEP_EP_DEVICE_TO_HCA_MAPPING' in os.environ:
+            device_mapping = {}
+            mapping_str = os.environ['DEEP_EP_DEVICE_TO_HCA_MAPPING']
+            # Parse a mapping string like "0:mlx5_0:1,1:mlx5_1:1,..."
+            for mapping in mapping_str.split(','):
+                assert ':' in mapping, f"Invalid mapping format '{mapping}' in DEEP_EP_DEVICE_TO_HCA_MAPPING. Expected format: '<device_id>:<hca_name>'"
+                parts = mapping.split(':', 1)  # Split only on the first colon
+                device_id = int(parts[0])
+                hca_name = parts[1]  # Keep the rest as the HCA name (including ":1")
+                device_mapping[device_id] = hca_name
+
+            # Get the current device and set the appropriate HCA
+            current_device = torch.cuda.current_device()
+            assert current_device in device_mapping, f"Current CUDA device {current_device} not found in DEEP_EP_DEVICE_TO_HCA_MAPPING"
+            os.environ['NVSHMEM_ENABLE_PE_MAPPING'] = '1'
+            os.environ['NVSHMEM_HCA_LIST'] = device_mapping[current_device]
+
     def destroy(self):
         """
         Destroy the cpp runtime and release resources.
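As a quick sanity check (under the assumed HCA naming from the example above), the process whose current CUDA device is 3 should end up with the following per-process NVSHMEM settings after `Buffer` initialization; this is only an illustration of what the new helper writes, not captured output.

import os
import torch

# With DEEP_EP_DEVICE_TO_HCA_MAPPING = "0:mlx5_0:1,1:mlx5_1:1,...,7:mlx5_7:1" (assumed names)
# and torch.cuda.current_device() == 3, _setup_device_hca_mapping() leaves:
assert os.environ.get('NVSHMEM_ENABLE_PE_MAPPING') == '1'
assert os.environ.get('NVSHMEM_HCA_LIST') == 'mlx5_3:1'
print('PE on device', torch.cuda.current_device(), '->', os.environ['NVSHMEM_HCA_LIST'])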
