vLLM: Update vLLM cpu to v0.7.1 #12777

Open · wants to merge 2 commits into base: main
14 changes: 8 additions & 6 deletions python/llm/src/ipex_llm/vllm/cpu/engine/engine.py
@@ -230,21 +230,23 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs,
return super().from_engine_args(engine_args, usage_context, ipc_path)


def signal_handler(*_) -> None:
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa


def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
ipc_path: str, load_in_low_bit: str, engine_alive):

def signal_handler(*_) -> None:
# Interrupt server on sigterm
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa

try:
signal.signal(signal.SIGTERM, signal_handler)

engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
usage_context=usage_context,
ipc_path=ipc_path,
load_in_low_bit=load_in_low_bit)

signal.signal(signal.SIGTERM, signal_handler)

engine.start()

except BaseException as e:
logger.exception(e)
engine_alive.value = False
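
Because the rendered diff above interleaves the removed and added lines, here is a minimal sketch of how `run_mp_engine` reads after this change (names taken from the diff; `IPEXLLMMQLLMEngine` and `logger` are assumed to be the module's existing imports). The point of the change is that the SIGTERM handler is now defined inside the function and registered before the engine is constructed, so a termination signal received during low-bit model loading is handled instead of ignored.

```python
import signal

def run_mp_engine(engine_args, usage_context, ipc_path, load_in_low_bit,
                  engine_alive):

    def signal_handler(*_) -> None:
        # Interrupt server on sigterm
        raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa

    try:
        # Register the handler before building the engine, so SIGTERM during
        # (potentially slow) low-bit model loading already terminates cleanly.
        signal.signal(signal.SIGTERM, signal_handler)

        engine = IPEXLLMMQLLMEngine.from_engine_args(
            engine_args=engine_args,
            usage_context=usage_context,
            ipc_path=ipc_path,
            load_in_low_bit=load_in_low_bit)

        engine.start()
    except BaseException as e:
        logger.exception(e)
        engine_alive.value = False
```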
851 changes: 119 additions & 732 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/api_server.py

Large diffs are not rendered by default.

265 changes: 198 additions & 67 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py

Large diffs are not rendered by default.

90 changes: 60 additions & 30 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py
@@ -12,7 +12,8 @@
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
validate_chat_template)
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
PromptAdapterPath)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.utils import FlexibleArgumentParser
@@ -79,29 +80,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host",
type=nullable_str,
default=None,
help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number")
help="Host name.")
parser.add_argument("--port", type=int, default=8000, help="Port number.")
parser.add_argument(
"--uvicorn-log-level",
type=str,
default="info",
choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
help="log level for uvicorn")
help="Log level for uvicorn.")
parser.add_argument("--allow-credentials",
action="store_true",
help="allow credentials")
help="Allow credentials.")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
help="allowed origins")
help="Allowed origins.")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
help="allowed methods")
help="Allowed methods.")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
help="allowed headers")
help="Allowed headers.")
parser.add_argument("--api-key",
type=nullable_str,
default=None,
@@ -115,10 +116,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action=LoRAParserAction,
help="LoRA module configurations in either 'name=path' format"
"or JSON format. "
"Example (old format): 'name=path' "
"Example (old format): ``'name=path'`` "
"Example (new format): "
"'{\"name\": \"name\", \"local_path\": \"path\", "
"\"base_model_name\": \"id\"}'")
"``{\"name\": \"name\", \"path\": \"lora_path\", "
"\"base_model_name\": \"id\"}``")
parser.add_argument(
"--prompt-adapters",
type=nullable_str,
@@ -132,7 +133,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="The file path to the chat template, "
"or the template in single-line form "
"for the specified model")
"for the specified model.")
parser.add_argument(
'--chat-template-content-format',
type=str,
@@ -141,38 +142,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help='The format to render message content within a chat template.'
'\n\n'
'* "string" will render the content as a string. '
'Example: "Hello World"\n'
'Example: ``"Hello World"``\n'
'* "openai" will render the content as a list of dictionaries, '
'similar to OpenAI schema. '
'Example: [{"type": "text", "text": "Hello world!"}]')
'Example: ``[{"type": "text", "text": "Hello world!"}]``')
parser.add_argument("--response-role",
type=nullable_str,
default="assistant",
help="The role name to return if "
"`request.add_generation_prompt=true`.")
"``request.add_generation_prompt=true``.")
parser.add_argument("--ssl-keyfile",
type=nullable_str,
default=None,
help="The file path to the SSL key file")
help="The file path to the SSL key file.")
parser.add_argument("--ssl-certfile",
type=nullable_str,
default=None,
help="The file path to the SSL cert file")
help="The file path to the SSL cert file.")
parser.add_argument("--ssl-ca-certs",
type=nullable_str,
default=None,
help="The CA certificates file")
help="The CA certificates file.")
parser.add_argument(
"--ssl-cert-reqs",
type=int,
default=int(ssl.CERT_NONE),
help="Whether client certificate is required (see stdlib ssl module's)"
help="Whether client certificate is required (see stdlib ssl module's)."
)
parser.add_argument(
"--root-path",
type=nullable_str,
default=None,
help="FastAPI root_path when app is behind a path based routing proxy")
help="FastAPI root_path when app is behind a path based routing proxy."
)
parser.add_argument(
"--middleware",
type=nullable_str,
@@ -182,15 +184,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"We accept multiple --middleware arguments. "
"The value should be an import path. "
"If a function is provided, vLLM will add it to the server "
"using @app.middleware('http'). "
"using ``@app.middleware('http')``. "
"If a class is provided, vLLM will add it to the server "
"using app.add_middleware(). ")
"using ``app.add_middleware()``. ")
parser.add_argument(
"--return-tokens-as-token-ids",
action="store_true",
help="When --max-logprobs is specified, represents single tokens as "
"strings of the form 'token_id:{token_id}' so that tokens that "
"are not JSON-encodable can be identified.")
help="When ``--max-logprobs`` is specified, represents single tokens "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified.")
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
@@ -205,8 +207,24 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--enable-auto-tool-choice",
action="store_true",
default=False,
help="Enable auto tool choice for supported models. Use --tool-call-parser"
" to specify which parser to use")
help="Enable auto tool choice for supported models. Use "
"``--tool-call-parser`` to specify which parser to use.")
parser.add_argument(
"--enable-reasoning",
action="store_true",
default=False,
help="Whether to enable reasoning_content for the model. "
"If enabled, the model will be able to generate reasoning content.")

valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys()
parser.add_argument(
"--reasoning-parser",
type=str,
metavar="{" + ",".join(valid_reasoning_parsers) + "}",
default=None,
help="Select the reasoning parser depending on the model that you're using."
" This is used to parse the reasoning content into OpenAI API "
"format. Required for ``--enable-reasoning``.")

valid_tool_parsers = ToolParserManager.tool_parsers.keys()
parser.add_argument(
@@ -217,15 +235,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="Select the tool call parser depending on the model that you're using."
" This is used to parse the model-generated tool call into OpenAI API "
"format. Required for --enable-auto-tool-choice.")
"format. Required for ``--enable-auto-tool-choice``.")

parser.add_argument(
"--tool-parser-plugin",
type=str,
default="",
help="Special the tool parser plugin write to parse the model-generated tool"
" into OpenAI API format, the name register in this plugin can be used "
"in --tool-call-parser.")
"in ``--tool-call-parser``.")

parser = AsyncEngineArgs.add_cli_args(parser)
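
For context, a hypothetical usage sketch (not part of this PR) of how the new reasoning flags flow through the extended parser. The model name and the `deepseek_r1` parser name are illustrative assumptions; the real set of valid choices is whatever `ReasoningParserManager` has registered at runtime.

```python
from ipex_llm.vllm.cpu.entrypoints.openai.cli_args import (
    make_arg_parser, validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

# Build the OpenAI-server argument parser extended above.
parser = make_arg_parser(FlexibleArgumentParser(
    description="vLLM OpenAI-compatible server"))

# Hypothetical invocation: serve a reasoning-capable model with
# reasoning_content enabled. "deepseek_r1" is assumed to be a registered
# reasoning parser; check ReasoningParserManager for the actual names.
args = parser.parse_args([
    "--model", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "--enable-reasoning",
    "--reasoning-parser", "deepseek_r1",
])

# Passes: a reasoning parser is supplied and auto tool choice is off.
validate_parsed_serve_args(args)
```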

@@ -240,7 +258,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--disable-fastapi-docs",
action='store_true',
default=False,
help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."
)
parser.add_argument(
"--enable-prompt-tokens-details",
@@ -270,6 +288,18 @@ def validate_parsed_serve_args(args: argparse.Namespace):
raise TypeError("Error: --enable-auto-tool-choice requires " # noqa
"--tool-call-parser")

# Enable reasoning needs a reasoning parser to be valid
if args.enable_reasoning and not args.reasoning_parser:
raise TypeError("Error: --enable-reasoning requires " # noqa
"--reasoning-parser")

# Ref https://api-docs.deepseek.com/guides/reasoning_model
# tool call and reasoning cannot be enabled at the same time.
if args.enable_auto_tool_choice and args.enable_reasoning:
raise TypeError( # noqa
"Error: --enable-auto-tool-choice and "
"--enable-reasoning cannot be enabled at the same time")


def create_parser_for_docs() -> FlexibleArgumentParser:
parser_for_docs = FlexibleArgumentParser(
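
The checks added to `validate_parsed_serve_args` above reject two flag combinations before the server starts. A hedged sketch of both failure modes, reusing the same illustrative names (`deepseek_r1` and `hermes` are assumed parser names):

```python
from ipex_llm.vllm.cpu.entrypoints.openai.cli_args import (
    make_arg_parser, validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

parser = make_arg_parser(FlexibleArgumentParser())

# --enable-reasoning without --reasoning-parser is rejected ...
args = parser.parse_args(["--enable-reasoning"])
try:
    validate_parsed_serve_args(args)
except TypeError as e:
    print(e)  # Error: --enable-reasoning requires --reasoning-parser

# ... and so is combining reasoning with auto tool choice, per the DeepSeek
# reasoning-model guidance referenced in the diff.
args = parser.parse_args([
    "--enable-reasoning", "--reasoning-parser", "deepseek_r1",
    "--enable-auto-tool-choice", "--tool-call-parser", "hermes",
])
try:
    validate_parsed_serve_args(args)
except TypeError as e:
    print(e)  # Error: ... cannot be enabled at the same time
```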
23 changes: 0 additions & 23 deletions python/llm/src/ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py

This file was deleted.

9 changes: 3 additions & 6 deletions python/llm/src/ipex_llm/vllm/cpu/model_convert.py
@@ -48,7 +48,7 @@ def _sample_get_logits(
logits = lm_head(hidden_states)
if embedding_bias is not None:
logits += embedding_bias
if self.use_gather:
if not self.use_all_gather:
logits = tensor_model_parallel_gather(logits)
else:
logits = tensor_model_parallel_all_gather(logits)
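
The flipped condition above appears to track a vLLM attribute rename on the logits processor: `use_gather` is replaced by `use_all_gather`, so the branch is inverted to preserve the same behavior. A minimal standalone sketch of the resulting logic, assuming vLLM's distributed helpers are importable and a tensor-parallel group is initialized:

```python
from vllm.distributed import (tensor_model_parallel_all_gather,
                              tensor_model_parallel_gather)

def collect_logits(logits, use_all_gather: bool):
    # gather() materializes the full-vocab logits on rank 0 only, while
    # all_gather() replicates them on every tensor-parallel rank.
    if not use_all_gather:
        return tensor_model_parallel_gather(logits)
    return tensor_model_parallel_all_gather(logits)
```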
@@ -65,12 +65,9 @@ def _model_sample_convert():
def _ipex_llm_convert(load_in_low_bit):
from vllm.worker.cpu_model_runner import CPUModelRunner
from ipex_llm.vllm.cpu.ipex_llm_wrapper import get_ipex_llm_wrapper
from ipex_llm.vllm.cpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper
import vllm.executor.ray_utils as ray_utils_v0
import vllm.v1.executor.ray_utils as ray_utils_v1
import vllm.executor.ray_utils as ray_utils
setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit))
setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))
setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit))
setattr(ray_utils, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))


def get_load_function(low_bit):
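
Putting the two `model_convert.py` hunks together, a sketch of what `_ipex_llm_convert` reduces to after this PR (module paths as shown in the diff; `get_load_function` is the function defined just below), now that the vLLM v1 `ray_utils` patching and the deleted `ipex_llm_v1_wrapper` are gone:

```python
import vllm.executor.ray_utils as ray_utils
from vllm.worker.cpu_model_runner import CPUModelRunner
from ipex_llm.vllm.cpu.ipex_llm_wrapper import get_ipex_llm_wrapper

def _ipex_llm_convert(load_in_low_bit):
    # Patch the CPU model runner to load weights through IPEX-LLM low-bit
    # loading, and swap in the IPEX-LLM RayWorkerWrapper for Ray workers.
    setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit))
    setattr(ray_utils, "RayWorkerWrapper",
            get_ipex_llm_wrapper(load_in_low_bit))
```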