 import logging
 import multiprocessing
 import os
+import socket
 import struct
 import time
 import warnings
@@ -142,13 +143,14 @@ def __init__(
         self.paused = False
         self.stopped = False
         self.enable_chunked_prefill = enable_chunked_prefill
+        self.rank = torch.distributed.get_rank()

         self.inference_logging_step_interval = inference_logging_step_interval
         # Configure wandb to use separate step counter for inference metrics (only once)
         if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
             logging.info(
                 f"\033[1;93m[INFERENCE]\033[0m "
-                f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m"
+                f"\033[1;95mLogging inference metrics to wandb (rank {self.rank})\033[0m"
             )
             if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
                 # Make all inference/* metrics use inference_step as their x-axis
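For reference, the wandb wiring that the comment above alludes to is typically done with `wandb.define_metric`; the following is a minimal, hypothetical sketch (the actual call site is outside this hunk, and the project and metric names here are made up):

import wandb

# Hypothetical sketch: give inference metrics their own x-axis so they do not
# advance the training step counter.
wandb.init(project="example-project")
wandb.define_metric("inference_step")
wandb.define_metric("inference/*", step_metric="inference_step")

# Inference metrics are then logged against that dedicated counter.
wandb.log({"inference/throughput": 1234.5, "inference_step": 10})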
@@ -202,7 +204,7 @@ def create_cuda_graphs(self, reset_context: bool = True):

         if moe_pad_experts and context.non_decode_cuda_graphs:
             context.non_decode_cuda_graphs = False
-            if torch.distributed.get_rank() == 0:
+            if self.rank == 0:
                 warnings.warn(
                     "MoE models do not support non-decode cuda graphs. "
                     "Forcing non_decode_cuda_graphs to False."
@@ -301,16 +303,18 @@ async def start_listening_to_data_parallel_coordinator(
         `InferenceCoordinator`. It configures different ZMQ socket patterns
         based on the rank's role within the distributed topology.

+        Note that this method must be called on all ranks, as it uses blocking torch broadcasts.
+
         The setup involves two primary roles within each data-parallel group:
-        1. **TP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly
+        1. **MP Coordinator (TP_rank=0, PP_rank=0)**: This rank connects directly
            to the central coordinator via a ZMQ `DEALER` socket. It receives
            requests and uses a ZMQ `PUB` (publisher) socket to broadcast them
-           to all other ranks within its tensor-parallel (TP) group.
-        2. **TP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber)
-           sockets to listen for requests broadcast by their local TP Coordinator.
+           to all other ranks within its model-parallel (MP) group.
+        2. **MP Workers (all other ranks)**: These ranks use ZMQ `SUB` (subscriber)
+           sockets to listen for requests broadcast by their local MP Coordinator.

-        This architecture uses fast Inter-Process Communication (`ipc`) sockets for
-        intra-node broadcasts within a TP group.
+        This architecture uses TCP sockets for both inter-node and intra-node broadcasts
+        within an MP group.

         Finally, after setting up the communication channels and ensuring all ranks
         are synchronized, this method starts the main engine processing loop
@@ -322,12 +326,6 @@ async def start_listening_to_data_parallel_coordinator(
             launch_inference_coordinator (bool, optional): If True, the global rank 0
                 process will spawn and manage the `InferenceCoordinator`
                 process. Defaults to True.
-
-        Note:
-            The current implementation uses `ipc` sockets for broadcasting requests
-            within a Tensor Parallel group, which limits each TP group to a single
-            physical node. For example, if you have 8 GPUs per node, then this will only
-            work with TP=[1,2,4,8]
         """

         assert HAVE_ZMQ, (
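To make the socket roles in the docstring concrete, a stripped-down pyzmq sketch of the MP coordinator and worker sides is shown below; the endpoint addresses and identity are placeholders, not the engine's real configuration:

import zmq

ctx = zmq.Context.instance()

# MP coordinator (tp_rank == 0 and pp_rank == 0): DEALER towards the central
# coordinator, PUB towards the rest of its model-parallel group.
dealer = ctx.socket(zmq.DEALER)
dealer.setsockopt(zmq.IDENTITY, b"mp-coord-0")
dealer.connect("tcp://127.0.0.1:5555")  # placeholder coordinator endpoint
dealer.send(b"")  # register with the coordinator

publisher = ctx.socket(zmq.PUB)
publisher.bind("tcp://127.0.0.1:5556")  # placeholder broadcast endpoint

# MP worker (any other rank in the group): SUB socket that receives everything
# the coordinator publishes.
subscriber = ctx.socket(zmq.SUB)
subscriber.connect("tcp://127.0.0.1:5556")
subscriber.setsockopt_string(zmq.SUBSCRIBE, "")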
@@ -338,7 +336,32 @@ async def start_listening_to_data_parallel_coordinator(
338336 "pip install msgpack"
339337 )
340338
341- if launch_inference_coordinator and torch .distributed .get_rank () == 0 :
339+ self .zmq_context = zmq .Context ().instance ()
340+ self .zmq_sockets = [] # keep track of all sockets created by this engine
341+
342+ # Get world info.
343+ dp_group = parallel_state .get_data_parallel_group ()
344+ dp_src = parallel_state .get_data_parallel_src_rank ()
345+ dp_size = parallel_state .get_data_parallel_world_size ()
346+ dp_rank = parallel_state .get_data_parallel_rank ()
347+
348+ mp_group = parallel_state .get_model_parallel_group ()
349+ mp_src = parallel_state .get_model_parallel_src_rank ()
350+ tp_rank = parallel_state .get_tensor_model_parallel_rank ()
351+ pp_rank = parallel_state .get_pipeline_model_parallel_rank ()
352+
353+ self .is_mp_coordinator = tp_rank == 0 and pp_rank == 0
354+ self .is_dp_coordinator = (dp_rank == 0 ) and self .is_mp_coordinator
355+
356+ # Get local IP.
357+ with socket .socket (socket .AF_INET , socket .SOCK_DGRAM ) as tmp_sock :
358+ tmp_sock .setsockopt (socket .SOL_SOCKET , socket .SO_BROADCAST , 1 )
359+ tmp_sock .connect (('<broadcast>' , 0 ))
360+ local_ip = tmp_sock .getsockname ()[0 ]
361+ del tmp_sock
362+
363+ # Spawn a DP coordinator process and get the connection info.
364+ if launch_inference_coordinator and self .is_dp_coordinator :
342365 spawn_context = multiprocessing .get_context ('spawn' )
343366 coordinator_ready_event = spawn_context .Event ()
344367 self .inference_coordinator_process = spawn_context .Process (
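The `# Get local IP` block above relies on the fact that connect() on a UDP socket sends no packets; it only asks the kernel to choose an outbound interface, after which getsockname() reports that interface's address. A common standalone variant of the same trick (a sketch, not this PR's code) points at a routable address instead of the broadcast address:

import socket

def get_local_ip() -> str:
    # No traffic is actually sent: connect() on a datagram socket only picks a route.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.connect(("8.8.8.8", 80))  # any routable address works
        return s.getsockname()[0]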
@@ -351,61 +374,65 @@ async def start_listening_to_data_parallel_coordinator(
             )
             self.inference_coordinator_process.start()

-        # Todo [Siddharth]: can we move this code to another file?
-        self.zmq_context = zmq.Context()
-        self.zmq_sockets = []  # keep track of all sockets created by this engine
+        # Find available ports for MP and bind to them.
+        if self.is_mp_coordinator:
+            mp_req_sock = self.zmq_context.socket(zmq.PUB)
+            mp_req_sock.bind_to_random_port(f"tcp://{local_ip}")
+            mp_req_addr = [mp_req_sock.getsockopt_string(zmq.LAST_ENDPOINT)]
+
+            mp_len_sock = self.zmq_context.socket(zmq.PUB)
+            mp_len_sock.bind_to_random_port(f"tcp://{local_ip}")
+            mp_len_addr = [mp_len_sock.getsockopt_string(zmq.LAST_ENDPOINT)]
+        else:
+            mp_req_addr = [None]
+            mp_len_addr = [None]
+
+        # Broadcast addresses to respective ranks.
+        torch.distributed.broadcast_object_list(mp_req_addr, src=mp_src, group=mp_group)
+        torch.distributed.broadcast_object_list(mp_len_addr, src=mp_src, group=mp_group)
+
         ip_address_of_dp_coordinator = os.getenv('MASTER_ADDR', '127.0.0.1')
-        identity = f'tp-coord-{parallel_state.get_data_parallel_rank()}'
-        if (
-            parallel_state.get_tensor_model_parallel_rank() == 0
-            and parallel_state.get_pipeline_model_parallel_rank() == 0
-        ):
+        dp_addr = [f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}"]
+        identity = f'mp-coord-{dp_rank}'
+        if self.is_mp_coordinator:
             # 1. Create dealer sockets where tp_rank = 0 and pp_rank = 0
             # These will receive requests from an InferenceCoordinator.
             self.socket_for_receiving_requests = self.zmq_context.socket(zmq.DEALER)

             self.socket_for_receiving_requests.setsockopt(zmq.IDENTITY, identity.encode('utf-8'))
-            self.socket_for_receiving_requests.connect(
-                f"tcp://{ip_address_of_dp_coordinator}:{inference_coordinator_port}"
-            )
+            self.socket_for_receiving_requests.connect(dp_addr[0])

             # send empty string. this is used to register with the coordinator.
             self.socket_for_receiving_requests.send(b"")

             # 2. Create a publisher socket. This is used to publish or broadcast
-            # requests within the tensor parallel group
-            self.tensor_parallel_publisher_socket = self.zmq_context.socket(zmq.PUB)
-            self.tensor_parallel_publisher_socket.bind(f"ipc:///tmp/{identity}-tp-bcast-socket-req")
+            # requests within the model parallel group
+            self.model_parallel_publisher_socket = mp_req_sock

             # 3. Create another publisher socket to broadcast the number of messages to receive.
-            self.tensor_parallel_num_msgs_publisher_socket = self.zmq_context.socket(zmq.PUB)
-            self.tensor_parallel_num_msgs_publisher_socket.bind(
-                f"ipc:///tmp/{identity}-tp-bcast-socket-len"
-            )
+            self.model_parallel_num_msgs_publisher_socket = mp_len_sock
             self.zmq_sockets += [
                 self.socket_for_receiving_requests,
-                self.tensor_parallel_num_msgs_publisher_socket,
-                self.tensor_parallel_publisher_socket,
+                self.model_parallel_num_msgs_publisher_socket,
+                self.model_parallel_publisher_socket,
             ]
-        # All TP ranks subscribe to the two publisher sockets
-        self.tensor_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB)
-        self.tensor_parallel_subscriber_socket.connect(f"ipc:///tmp/{identity}-tp-bcast-socket-req")
-        self.tensor_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "")
-
-        self.tensor_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB)
-        self.tensor_parallel_num_msgs_subscriber_socket.connect(
-            f"ipc:///tmp/{identity}-tp-bcast-socket-len"
-        )
-        self.tensor_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "")
+        # All MP ranks subscribe to the two publisher sockets
+        self.model_parallel_subscriber_socket = self.zmq_context.socket(zmq.SUB)
+        self.model_parallel_subscriber_socket.connect(mp_req_addr[0])
+        self.model_parallel_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "")
+
+        self.model_parallel_num_msgs_subscriber_socket = self.zmq_context.socket(zmq.SUB)
+        self.model_parallel_num_msgs_subscriber_socket.connect(mp_len_addr[0])
+        self.model_parallel_num_msgs_subscriber_socket.setsockopt_string(zmq.SUBSCRIBE, "")

         self.zmq_sockets += [
-            self.tensor_parallel_subscriber_socket,
-            self.tensor_parallel_num_msgs_subscriber_socket,
+            self.model_parallel_subscriber_socket,
+            self.model_parallel_num_msgs_subscriber_socket,
         ]

-        torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group())
+        torch.distributed.barrier(mp_group)

-        if launch_inference_coordinator and torch.distributed.get_rank() == 0:
+        if launch_inference_coordinator and self.is_dp_coordinator:
             await await_process_event(coordinator_ready_event, self.inference_coordinator_process)
             logging.info("Inference co-ordinator is ready to receive requests!")

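The port handshake added in this hunk follows a bind-then-share pattern: only the MP coordinator binds PUB sockets on random TCP ports, and every rank then learns the chosen endpoints through a collective. A condensed sketch of that pattern, assuming `torch.distributed` is already initialized and using made-up names, is:

import torch
import zmq

def bind_and_share_endpoint(is_coordinator, local_ip, src_rank, group):
    """Coordinator binds a PUB socket on a random port; all ranks learn its address."""
    addr = [None]
    sock = None
    if is_coordinator:
        sock = zmq.Context.instance().socket(zmq.PUB)
        sock.bind_to_random_port(f"tcp://{local_ip}")
        addr = [sock.getsockopt_string(zmq.LAST_ENDPOINT)]
    # Non-source ranks receive the endpoint string in-place.
    torch.distributed.broadcast_object_list(addr, src=src_rank, group=group)
    return sock, addr[0]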
@@ -455,7 +482,7 @@ def _add_request(
         try:
             eod = self.controller.tokenizer.eod
         except AttributeError:
-            if torch.distributed.get_rank() == 0:
+            if self.rank == 0:
                 warnings.warn(
                     "Termination ID not specified, and tokenizer does not define eod."
                     "Defaulting to not using termination id."
@@ -932,16 +959,16 @@ def schedule_requests(self) -> int:
932959 """Drains the ZMQ socket for a batch of requests and adds them to the engine.
933960
934961 This method is a collective and synchronous operation that must be called
935- by all ranks in a Tensor Parallel (TP ) group at the same time. It ensures
962+ by all ranks in a Model Parallel (MP ) group at the same time. It ensures
936963 that all ranks process the exact same batch of incoming requests and
937964 control signals.
938965
939966 The synchronization works as follows:
940- 1. The TP rank 0 drains all pending messages from its subscriber socket
967+ 1. The MP rank 0 drains all pending messages from its subscriber socket
941968 in a non-blocking manner.
942- 2. TP rank 0 then broadcasts the number of messages it received to all other
943- ranks in its TP group using a dedicated publisher socket.
944- 3. The other TP ranks wait to receive this count, and then receive exactly
969+ 2. MP rank 0 then broadcasts the number of messages it received to all other
970+ ranks in its MP group using a dedicated publisher socket.
971+ 3. The other MP ranks wait to receive this count, and then receive exactly
945972 that many messages from their subscriber sockets.
946973
947974 Once all ranks have the same batch of messages, they are unpacked and
@@ -950,18 +977,17 @@ def schedule_requests(self) -> int:

         Note:
             This function is synchronous and must be called collectively by all
-            ranks in a TP group. It should not be launched in a separate coroutine
+            ranks in a MP group. It should not be launched in a separate coroutine
             to ensure all ranks execute it in lockstep before proceeding to the
             next engine step.

         Returns:
             int: The number of messages that were received and processed in this batch.
         """

-        rank = parallel_state.get_tensor_model_parallel_rank()
         torch.cuda.nvtx.range_push("drain_zmq_socket")
         all_messages = []
-        if rank == 0:
+        if self.is_mp_coordinator:
             while True:
                 try:
                     # Receive messages in a non-blocking way.
@@ -973,22 +999,22 @@ def schedule_requests(self) -> int:
             # First publish the number of messages to dequeue.
             # This is important because we want all tensor parallel ranks
             # to dequeue the same number of messages.
-            self.tensor_parallel_num_msgs_publisher_socket.send(
+            self.model_parallel_num_msgs_publisher_socket.send(
                 struct.pack('!i', messages_to_dequeue)
             )
-            # Now publish the actual messages to all tensor parallel ranks
+            # Now publish the actual messages to all model parallel ranks
             for message in all_messages:
-                self.tensor_parallel_publisher_socket.send(message)
+                self.model_parallel_publisher_socket.send(message)
         else:
-            # First, receive the number of messages to dequeue from tp-rank 0
+            # First, receive the number of messages to dequeue from mp-rank 0
             messages_to_dequeue = struct.unpack(
-                '!i', self.tensor_parallel_num_msgs_subscriber_socket.recv()
+                '!i', self.model_parallel_num_msgs_subscriber_socket.recv()
             )[0]
             # Now, dequeue the same number of messages from the subscriber socket.
             # Note that these receives are blocking, because the messages
             # are guaranteed to be available after the tp-rank 0 has sent them.
             for _ in range(messages_to_dequeue):
-                all_messages.append(self.tensor_parallel_subscriber_socket.recv())
+                all_messages.append(self.model_parallel_subscriber_socket.recv())

         torch.cuda.nvtx.range_pop()
         for message in all_messages:
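In isolation, the count-then-payload protocol implemented above can be summarized as the sketch below; the socket arguments are assumed to exist and are not the engine's attributes:

import struct
import zmq

def drain_and_replicate(is_leader, request_sock, len_pub, len_sub, msg_pub, msg_sub):
    """Leader drains whatever is pending; followers replay exactly the same batch."""
    messages = []
    if is_leader:
        while True:
            try:
                messages.append(request_sock.recv(flags=zmq.NOBLOCK))
            except zmq.Again:
                break
        # Publish the count first so every rank dequeues the same number of messages.
        len_pub.send(struct.pack('!i', len(messages)))
        for m in messages:
            msg_pub.send(m)
    else:
        count = struct.unpack('!i', len_sub.recv())[0]
        for _ in range(count):
            messages.append(msg_sub.recv())
    return messages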
@@ -1080,12 +1106,8 @@ async def run_engine_with_coordinator(

             engine_output = await self.async_step(verbose=verbose)

-            is_tp0_and_pp0 = (
-                parallel_state.get_tensor_model_parallel_rank() == 0
-                and parallel_state.get_pipeline_model_parallel_rank() == 0
-            )
             if (
-                is_tp0_and_pp0
+                self.is_mp_coordinator
                 and engine_output is not None
                 and engine_output["finished_requests"]
             ):