From da6077d68f5da288ef3ca31e9d4d88368bf7b729 Mon Sep 17 00:00:00 2001 From: Junyi Chen Date: Tue, 22 Jul 2025 15:29:59 +0800 Subject: [PATCH 1/2] add shm size check --- lightllm/server/api_cli.py | 3 +++ lightllm/server/api_start.py | 45 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index ee5518ea8..da00f9516 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -187,6 +187,9 @@ def make_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.") parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.") + parser.add_argument( + "--enable-shm-warning", action="store_true", default=True, help="Enable periodic shm warning logs" + ) parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch") parser.add_argument( diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index 5b24e37a1..e0e101de5 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -4,6 +4,7 @@ import uuid import subprocess import signal +import shutil from lightllm.utils.net_utils import alloc_can_use_network_port, PortLocker from lightllm.utils.start_utils import process_manager, kill_recursive from .metrics.manager import start_metric_manager @@ -19,6 +20,37 @@ logger = init_logger(__name__) +def get_shm_size_gb(): + """ + 获取 /dev/shm 的总大小(以GB为单位)。 + """ + try: + shm_path = "/dev/shm" + if not os.path.exists(shm_path): + logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.") + return 0 + + # shutil.disk_usage 返回 (total, used, free) + total_bytes = shutil.disk_usage(shm_path).total + total_gb = total_bytes / (1024 ** 3) + return total_gb + except Exception as e: + logger.error(f"Error getting /dev/shm size: {e}") + return 0 + + +def check_shm_size(): + RED = "\033[91m" + GREEN = "\033[92m" + ENDC = "\033[0m" + shm_size = get_shm_size_gb() + required_size = 128 # 128G + if shm_size < required_size: + logger.warning(f"{RED}Available shm size is less than 128G: {shm_size:.2f}G{ENDC}") + else: + logger.info(f"{GREEN}/dev/shm available space is sufficient ({shm_size:.2f} GB >= {required_size} GB).{ENDC}") + + def setup_signal_handlers(http_server_process, process_manager): def signal_handler(sig, frame): if sig == signal.SIGINT: @@ -62,6 +94,19 @@ def signal_handler(sig, frame): def normal_or_p_d_start(args): set_unique_server_name(args) + check_shm_size() + + if args.enable_shm_warning: + import threading + + def periodic_shm_warning(): + while True: + check_shm_size() + time.sleep(120) # 每 120 秒打印一次警告日志 + + shm_warning_thread = threading.Thread(target=periodic_shm_warning, daemon=True) + shm_warning_thread.start() + if args.enable_mps: from lightllm.utils.device_utils import enable_mps From 4ebea761667c13b7642b009b3788aefbb9b0c763 Mon Sep 17 00:00:00 2001 From: Junyi Chen Date: Tue, 22 Jul 2025 15:38:56 +0800 Subject: [PATCH 2/2] fix --- lightllm/server/api_cli.py | 2 +- lightllm/server/api_start.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index da00f9516..c3f9388d0 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -188,7 +188,7 @@ def make_argument_parser() -> argparse.ArgumentParser: parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.") parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.") parser.add_argument( - "--enable-shm-warning", action="store_true", default=True, help="Enable periodic shm warning logs" + "--disable-shm-warning", action="store_true", default=False, help="Disable periodic shm warning logs" ) parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch") diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py index e0e101de5..b9f376450 100644 --- a/lightllm/server/api_start.py +++ b/lightllm/server/api_start.py @@ -96,7 +96,7 @@ def normal_or_p_d_start(args): check_shm_size() - if args.enable_shm_warning: + if not args.disable_shm_warning: import threading def periodic_shm_warning():