Merge

tdene · tdene · commit 56a5a0f0feff · 2025-11-18T10:31:39.000-06:00
diff --git a/examples/inference/gpt/utils.py b/examples/inference/gpt/utils.py
@@ -117,12 +117,6 @@ def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
         '`--prompt-file` above). The first `--prompt-file-num-truncate` samples '
         'will be used, in order.',
     )
-    group.add_argument(
-        "--inference-coordinator-port",
-        type=int,
-        help="This port will be used to setup the inference co-ordinator on node-0",
-        default=12346
-    )
     group.add_argument(
         "--use-flashinfer-fused-rope",
         action='store_true',
diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
@@ -375,16 +375,7 @@ async def start_listening_to_data_parallel_coordinator(
             launch_inference_coordinator (bool, optional): If True, the global rank 0
                 process will spawn and manage the `InferenceCoordinator`
                 process. Defaults to True.
-<<<<<<< HEAD
             verbose (bool): Whether to run in verbose mode.
-
-        Note:
-            The current implementation uses `ipc` sockets for broadcasting requests
-            within a Tensor Parallel group, which limits each TP group to a single
-            physical node. For example, if you have 8 GPUs per node, then this will only
-            work with TP=[1,2,4,8]
-=======
->>>>>>> a28d34db94 (Clean up DP coord unit-test and code reuse)
         """
 
         assert HAVE_ZMQ, (
@@ -1285,7 +1276,6 @@ def stop(self):
         for socket in self.zmq_sockets:
             socket.close()
         self.zmq_context.term()
-        parallel_state.destroy_model_parallel()
 
     @trace_async_exceptions
     async def run_engine(
@@ -1306,7 +1296,6 @@ async def run_engine(
                             )
                         )
                     )
-
                 await self.async_step(verbose=verbose)
         except asyncio.CancelledError:
             pass
@@ -1345,7 +1334,6 @@ async def run_engine_with_coordinator(
                     self.suspend()
                     await asyncio.sleep(0.02)
                     continue
-
                 else:
                     self.resume()
 
diff --git a/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py b/tests/unit_tests/inference/test_data_parallel_inference_coordinator.py
@@ -85,6 +85,9 @@ async def async_step(self, *, verbose: Optional[bool] = False) -> Dict:
         to_remove = []
         for request_id, record in self.request_records.items():
             if record[-1].status == Status.ACTIVE_AND_GENERATING_TOKENS:
+                record[-1].sampling_params.num_tokens_to_generate -= 1
+                if record[-1].sampling_params.num_tokens_to_generate > 0:
+                    continue
                 record[-1].status = Status.COMPLETED
                 self.context.active_cnt -= 1
                 finished_request_records.append(record)
@@ -122,6 +125,7 @@ class CoordinatorTestConfig:
     num_requests: int = 10**1
     min_time_offset: float = 10 ** (-4)
     max_time_offset: float = 10 ** (-3)
+    num_steps_to_finish: int = 1
     num_iterations: int = 1
 
     tensor_model_parallel_size: int = 1
@@ -154,7 +158,10 @@ def _build_requests(cls, test_config: CoordinatorTestConfig) -> List[Tuple]:
 
         for _ in range(test_config.num_requests):
             arrival_delta = random.uniform(test_config.min_time_offset, test_config.max_time_offset)
-            ret.append(("Hello world!", SamplingParams(), arrival_delta))
+            num_tokens = test_config.num_steps_to_finish
+            ret.append(
+                ("Hello world!", SamplingParams(num_tokens_to_generate=num_tokens), arrival_delta)
+            )
         return ret
 
     @classmethod
@@ -165,6 +172,7 @@ def _build_test_env(cls, test_config):
         )
         requests = cls._build_requests(test_config)
         engine = DummyEngine()
+        engine.num_steps_to_finish = test_config.num_steps_to_finish
         return CoordinatorTestEnv(config=test_config, requests=requests, engine=engine)
 
     @classmethod
@@ -180,37 +188,48 @@ async def _run_test(cls, **test_config_kwargs):
             launch_inference_coordinator=test_config.launch_inference_coordinator,
         )
 
-        if dist.get_rank() == 0:
-            client = InferenceClient(test_config.port)
-            await client.start()
-            env.timing_data["init_time"] = time.time()
-
-            all_results = []
-            for _ in range(test_config.num_iterations):
-                futures = []
-                for request in tqdm(env.requests, "add_requests"):
-                    prompt, sampling_params, arrival_delta = request
-                    await asyncio.sleep(arrival_delta)
-                    fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
-                    futures.append(fut)
-                results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures)
-                all_results.append(results)
-            env.timing_data["done_time"] = time.time()
-
-            if test_config.stop_engines:
-                client.stop_engines()
-            client.stop()
-
-        if test_config.stop_engines:
-            await env.engine.engine_loop_task
+        results_success = False
+        shutdown_success = False
+        try:
+            if dist.get_rank() == 0:
+                client = InferenceClient(test_config.port)
+                await client.start()
+                env.timing_data["init_time"] = time.time()
+
+                all_results = []
+                for _ in range(test_config.num_iterations):
+                    futures = []
+                    for request in tqdm(env.requests, "add_requests"):
+                        prompt, sampling_params, arrival_delta = request
+                        await asyncio.sleep(arrival_delta)
+                        fut = client.add_request(prompt=prompt, sampling_params=sampling_params)
+                        futures.append(fut)
+                    results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures)
+                    all_results.append(results)
+                env.timing_data["done_time"] = time.time()
+            results_success = True
+        finally:
+            try:
+                if dist.get_rank() == 0:
+                    if test_config.stop_engines:
+                        client.stop_engines()
+                    client.stop()
+                if test_config.stop_engines:
+                    await env.engine.engine_loop_task
+                shutdown_success = True
+            except:
+                env.engine.engine_loop_task.cancel()
+
         env.timing_data["stop_time"] = time.time()
 
+        assert results_success, "Did not receive all results successfully."
+        assert shutdown_success, "Did not shutdown successfully."
         if dist.get_rank() == 0:
             env.responses = all_results
             if test_config.verify_results:
                 for batch in all_results:
-                    for result in batch:
-                        assert result.status == Status.COMPLETED
+                    for record in batch:
+                        assert record[-1].status == Status.COMPLETED
 
         return env
 
@@ -267,9 +286,9 @@ async def test_throughput(self):
             init_duration = (env.timing_data["init_time"] - env.timing_data["start_time"]) * 10**3
             golden_init_duration = 4445.64  # ms
             run_duration = (env.timing_data["done_time"] - env.timing_data["init_time"]) * 10**3
-            golden_run_duration = 3088.87  # ms
+            golden_run_duration = 2906.29  # ms
             stop_duration = (env.timing_data["stop_time"] - env.timing_data["done_time"]) * 10**3
-            golden_stop_duration = 129.57  # ms
+            golden_stop_duration = 10.77  # ms
 
             # Print current results.
             print(f"Initialization time: {init_duration:.2f} ms")
@@ -288,7 +307,7 @@ def clamp_to_golden_value(value, golden_value, delta=0.1):
                 f"WARNING: Run duration {run_duration:.2f}s deviates from "
                 f"golden value {golden_run_duration:.2f}s"
             )
-            assert clamp_to_golden_value(stop_duration, golden_stop_duration, delta=0.3), (
+            assert clamp_to_golden_value(stop_duration, golden_stop_duration, delta=1.0), (
                 f"WARNING: Stop duration {stop_duration:.2f}s deviates from "
                 f"golden value {golden_stop_duration:.2f}s"
             )
@@ -304,10 +323,10 @@ def clamp_to_golden_value(value, golden_value, delta=0.1):
 if __name__ == "__main__":
     test = TestCoordinator()
     asyncio.run(test.test_simple())
-    test.test_tp()
-    test.test_pp()
-    test_test.tp_pp()
-    test_test.throughput()
+    asyncio.run(test.test_tp())
+    asyncio.run(test.test_pp())
+    asyncio.run(test.test_tp_pp())
+    asyncio.run(test.test_throughput())
     test.teardown_method(None)
     print("~~~")
     print("success.")