Skip to content

Add shm size check #978

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lightllm/server/api_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ def make_argument_parser() -> argparse.ArgumentParser:
)
parser.add_argument("--disable_log_stats", action="store_true", help="disable logging throughput stats.")
parser.add_argument("--log_stats_interval", type=int, default=10, help="log stats interval in second.")
parser.add_argument(
"--disable-shm-warning", action="store_true", default=False, help="Disable periodic shm warning logs"
)

parser.add_argument("--router_token_ratio", type=float, default=0.0, help="token ratio to control router dispatch")
parser.add_argument(
Expand Down
45 changes: 45 additions & 0 deletions lightllm/server/api_start.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import uuid
import subprocess
import signal
import shutil
from lightllm.utils.net_utils import alloc_can_use_network_port, PortLocker
from lightllm.utils.start_utils import process_manager, kill_recursive
from .metrics.manager import start_metric_manager
Expand All @@ -19,6 +20,37 @@
logger = init_logger(__name__)


def get_shm_size_gb():
    """Return the total size of /dev/shm in GB (bytes / 1024**3).

    Returns:
        The total capacity reported by ``shutil.disk_usage`` for /dev/shm,
        converted to GB as a float. Returns 0 when /dev/shm does not exist
        or when any exception is raised while querying it; both cases are
        logged as errors.
    """
    shm_path = "/dev/shm"
    try:
        if os.path.exists(shm_path):
            # shutil.disk_usage returns a (total, used, free) tuple in bytes;
            # only the total capacity is used here.
            usage = shutil.disk_usage(shm_path)
            return usage.total / (1024 ** 3)

        logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
        return 0
    except Exception as e:
        # Best-effort probe: report the failure and fall back to 0.
        logger.error(f"Error getting /dev/shm size: {e}")
        return 0
Comment on lines +23 to +39

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The function calculates the total SHM size but the PR description refers to "available" space. This can be misleading. To check for available space, use shutil.disk_usage(shm_path).free and rename the function to get_shm_free_size_gb for clarity.

Suggested change
def get_shm_size_gb():
"""
获取 /dev/shm 的总大小(以GB为单位)。
"""
try:
shm_path = "/dev/shm"
if not os.path.exists(shm_path):
logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
return 0
# shutil.disk_usage 返回 (total, used, free)
total_bytes = shutil.disk_usage(shm_path).total
total_gb = total_bytes / (1024 ** 3)
return total_gb
except Exception as e:
logger.error(f"Error getting /dev/shm size: {e}")
return 0
def get_shm_free_size_gb():
"""
获取 /dev/shm 的可用大小(以GB为单位)。
"""
try:
shm_path = "/dev/shm"
if not os.path.exists(shm_path):
logger.error(f"{shm_path} not exist, this may indicate a system or Docker configuration anomaly.")
return 0
# shutil.disk_usage 返回 (total, used, free)
free_bytes = shutil.disk_usage(shm_path).free
free_gb = free_bytes / (1024 ** 3)
return free_gb
except Exception as e:
logger.error(f"Error getting /dev/shm size: {e}")
return 0



def check_shm_size(required_size=128):
    """Log whether /dev/shm meets the required size.

    The size is obtained from ``get_shm_size_gb()`` and compared against
    ``required_size``. A warning is logged (in red) when the size is below
    the threshold; otherwise an info message is logged (in green).

    Args:
        required_size: Minimum expected /dev/shm size in GB. Defaults to 128,
            so existing no-argument callers behave as before.

    Returns:
        None. The result is only reported through the module logger.
    """
    # ANSI escape sequences used to colour the log output.
    RED = "\033[91m"
    GREEN = "\033[92m"
    ENDC = "\033[0m"

    shm_size = get_shm_size_gb()
    if shm_size < required_size:
        # Interpolate the threshold instead of hard-coding "128G" so the
        # message stays correct if the threshold is changed.
        # NOTE(review): the message says "Available", but the value is whatever
        # get_shm_size_gb() returns — confirm whether total or free is intended.
        logger.warning(f"{RED}Available shm size is less than {required_size}G: {shm_size:.2f}G{ENDC}")
    else:
        logger.info(f"{GREEN}/dev/shm available space is sufficient ({shm_size:.2f} GB >= {required_size} GB).{ENDC}")
Comment on lines +47 to +51

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For better maintainability, define required_size as a constant REQUIRED_SIZE_GB and use it in the warning message to avoid hardcoding the value 128G.

Suggested change
required_size = 128 # 128G
if shm_size < required_size:
logger.warning(f"{RED}Available shm size is less than 128G: {shm_size:.2f}G{ENDC}")
else:
logger.info(f"{GREEN}/dev/shm available space is sufficient ({shm_size:.2f} GB >= {required_size} GB).{ENDC}")
REQUIRED_SIZE_GB = 128 # 128G
if shm_size < REQUIRED_SIZE_GB:
logger.warning(f"{RED}Available shm size is less than {REQUIRED_SIZE_GB}G: {shm_size:.2f}G{ENDC}")
else:
logger.info(f"{GREEN}/dev/shm available space is sufficient ({shm_size:.2f} GB >= {REQUIRED_SIZE_GB} GB).{ENDC}")



def setup_signal_handlers(http_server_process, process_manager):
def signal_handler(sig, frame):
if sig == signal.SIGINT:
Expand Down Expand Up @@ -62,6 +94,19 @@ def signal_handler(sig, frame):
def normal_or_p_d_start(args):
set_unique_server_name(args)

check_shm_size()

if not args.disable_shm_warning:
import threading

def periodic_shm_warning():
while True:
check_shm_size()
time.sleep(120) # 每 120 秒打印一次警告日志

shm_warning_thread = threading.Thread(target=periodic_shm_warning, daemon=True)
shm_warning_thread.start()
Comment on lines +100 to +108

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For style and maintainability:

  • Move import threading to the top of the file (PEP 8).
  • Define the sleep interval 120 as a named constant (e.g., SHM_CHECK_INTERVAL_S = 120) at the module level.

Since I can't suggest changes outside the diff, I'll define the constant locally, but consider moving both the import and the constant to the module level.

Suggested change
import threading
def periodic_shm_warning():
while True:
check_shm_size()
time.sleep(120) # 每 120 秒打印一次警告日志
shm_warning_thread = threading.Thread(target=periodic_shm_warning, daemon=True)
shm_warning_thread.start()
import threading
SHM_CHECK_INTERVAL_S = 120 # Consider moving this to a module-level constant
def periodic_shm_warning():
while True:
check_shm_size()
time.sleep(SHM_CHECK_INTERVAL_S) # 每 120 秒打印一次警告日志
shm_warning_thread = threading.Thread(target=periodic_shm_warning, daemon=True)
shm_warning_thread.start()


if args.enable_mps:
from lightllm.utils.device_utils import enable_mps

Expand Down