Skip to content

Commit 29c1a0e

Browse files
JunyiXu-nv and mikeiovine
authored and committed
[https://nvbugs/5670793][fix] Solve trtllm-serve launch_disaggregated… (#9324)
Signed-off-by: Junyi Xu <[email protected]> Signed-off-by: Mike Iovine <[email protected]>
1 parent 870e98b commit 29c1a0e

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

tensorrt_llm/commands/serve.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
parse_disagg_config_file,
2929
parse_metadata_server_config_file)
3030
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict
31-
from tensorrt_llm.llmapi.mpi_session import find_free_port
31+
from tensorrt_llm.llmapi.mpi_session import find_free_ipc_addr
3232
from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory
3333
from tensorrt_llm.logger import logger, severity_map
3434
from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer
@@ -700,10 +700,10 @@ def _launch_disaggregated_leader(sub_comm, instance_idx: int, config_file: str,
700700

701701
# This mimics the behavior of trtllm-llmapi-launch
702702
# TODO: Make the port allocation atomic
703-
free_port = find_free_port()
703+
free_ipc_addr = find_free_ipc_addr()
704704
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS] = "1"
705-
os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.
706-
value] = f"tcp://127.0.0.1:{free_port}"
705+
os.environ[
706+
LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.value] = free_ipc_addr
707707
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_RUN_REMOTE_MPI_SESSION_CLIENT.
708708
value] = "1"
709709
os.environ[DisaggLauncherEnvs.TLLM_DISAGG_INSTANCE_IDX] = str(instance_idx)

tensorrt_llm/llmapi/mpi_session.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -541,6 +541,13 @@ def find_free_port() -> int:
541541
return s.getsockname()[1]
542542

543543

544+
def find_free_ipc_addr() -> str:
545+
import os
546+
import tempfile
547+
import uuid
548+
return f'ipc://{os.path.join(tempfile.gettempdir(), "rpc_" + str(uuid.uuid4()))}'
549+
550+
544551
def get_mpi_world_size() -> int:
545552
# avoid cyclic import
546553
from ..executor.utils import get_spawn_proxy_process_env

0 commit comments

Comments
 (0)