vLLM: Update vLLM cpu to v0.7.1 #12777

Open · wants to merge 2 commits into base: main
14 changes: 8 additions & 6 deletions python/llm/src/ipex_llm/vllm/cpu/engine/engine.py
@@ -230,21 +230,23 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs,
return super().from_engine_args(engine_args, usage_context, ipc_path)


def signal_handler(*_) -> None:
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa


def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext,
ipc_path: str, load_in_low_bit: str, engine_alive):

def signal_handler(*_) -> None:
# Interrupt server on sigterm
raise KeyboardInterrupt("MQLLMEngine terminated") # noqa

try:
signal.signal(signal.SIGTERM, signal_handler)

engine = IPEXLLMMQLLMEngine.from_engine_args(engine_args=engine_args,
usage_context=usage_context,
ipc_path=ipc_path,
load_in_low_bit=load_in_low_bit)

signal.signal(signal.SIGTERM, signal_handler)

engine.start()

except BaseException as e:
logger.exception(e)
engine_alive.value = False
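
Because the rendered diff above interleaves the removed and added lines, here is a minimal sketch of how `run_mp_engine` reads after this change (names taken from the diff; `IPEXLLMMQLLMEngine` and `logger` are assumed to be the module's existing imports). The point of the change is that the SIGTERM handler is now defined inside the function and registered before the engine is constructed, so a termination signal received during low-bit model loading is handled instead of ignored.

```python
import signal

def run_mp_engine(engine_args, usage_context, ipc_path, load_in_low_bit,
                  engine_alive):

    def signal_handler(*_) -> None:
        # Interrupt server on sigterm
        raise KeyboardInterrupt("MQLLMEngine terminated")  # noqa

    try:
        # Register the handler before building the engine, so SIGTERM during
        # (potentially slow) low-bit model loading already terminates cleanly.
        signal.signal(signal.SIGTERM, signal_handler)

        engine = IPEXLLMMQLLMEngine.from_engine_args(
            engine_args=engine_args,
            usage_context=usage_context,
            ipc_path=ipc_path,
            load_in_low_bit=load_in_low_bit)

        engine.start()
    except BaseException as e:
        logger.exception(e)
        engine_alive.value = False
```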
851 changes: 119 additions & 732 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/api_server.py

Large diffs are not rendered by default.

265 changes: 198 additions & 67 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/api_server.py

Large diffs are not rendered by default.

90 changes: 60 additions & 30 deletions python/llm/src/ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py
@@ -12,7 +12,8 @@
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
validate_chat_template)
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
PromptAdapterPath)
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.utils import FlexibleArgumentParser
@@ -79,29 +80,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host",
type=nullable_str,
default=None,
help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number")
help="Host name.")
parser.add_argument("--port", type=int, default=8000, help="Port number.")
parser.add_argument(
"--uvicorn-log-level",
type=str,
default="info",
choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
help="log level for uvicorn")
help="Log level for uvicorn.")
parser.add_argument("--allow-credentials",
action="store_true",
help="allow credentials")
help="Allow credentials.")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
help="allowed origins")
help="Allowed origins.")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
help="allowed methods")
help="Allowed methods.")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
help="allowed headers")
help="Allowed headers.")
parser.add_argument("--api-key",
type=nullable_str,
default=None,
@@ -115,10 +116,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action=LoRAParserAction,
help="LoRA module configurations in either 'name=path' format"
"or JSON format. "
"Example (old format): 'name=path' "
"Example (old format): ``'name=path'`` "
"Example (new format): "
"'{\"name\": \"name\", \"local_path\": \"path\", "
"\"base_model_name\": \"id\"}'")
"``{\"name\": \"name\", \"path\": \"lora_path\", "
"\"base_model_name\": \"id\"}``")
parser.add_argument(
"--prompt-adapters",
type=nullable_str,
@@ -132,7 +133,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="The file path to the chat template, "
"or the template in single-line form "
"for the specified model")
"for the specified model.")
parser.add_argument(
'--chat-template-content-format',
type=str,
@@ -141,38 +142,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help='The format to render message content within a chat template.'
'\n\n'
'* "string" will render the content as a string. '
'Example: "Hello World"\n'
'Example: ``"Hello World"``\n'
'* "openai" will render the content as a list of dictionaries, '
'similar to OpenAI schema. '
'Example: [{"type": "text", "text": "Hello world!"}]')
'Example: ``[{"type": "text", "text": "Hello world!"}]``')
parser.add_argument("--response-role",
type=nullable_str,
default="assistant",
help="The role name to return if "
"`request.add_generation_prompt=true`.")
"``request.add_generation_prompt=true``.")
parser.add_argument("--ssl-keyfile",
type=nullable_str,
default=None,
help="The file path to the SSL key file")
help="The file path to the SSL key file.")
parser.add_argument("--ssl-certfile",
type=nullable_str,
default=None,
help="The file path to the SSL cert file")
help="The file path to the SSL cert file.")
parser.add_argument("--ssl-ca-certs",
type=nullable_str,
default=None,
help="The CA certificates file")
help="The CA certificates file.")
parser.add_argument(
"--ssl-cert-reqs",
type=int,
default=int(ssl.CERT_NONE),
help="Whether client certificate is required (see stdlib ssl module's)"
help="Whether client certificate is required (see stdlib ssl module's)."
)
parser.add_argument(
"--root-path",
type=nullable_str,
default=None,
help="FastAPI root_path when app is behind a path based routing proxy")
help="FastAPI root_path when app is behind a path based routing proxy."
)
parser.add_argument(
"--middleware",
type=nullable_str,
@@ -182,15 +184,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"We accept multiple --middleware arguments. "
"The value should be an import path. "
"If a function is provided, vLLM will add it to the server "
"using @app.middleware('http'). "
"using ``@app.middleware('http')``. "
"If a class is provided, vLLM will add it to the server "
"using app.add_middleware(). ")
"using ``app.add_middleware()``. ")
parser.add_argument(
"--return-tokens-as-token-ids",
action="store_true",
help="When --max-logprobs is specified, represents single tokens as "
"strings of the form 'token_id:{token_id}' so that tokens that "
"are not JSON-encodable can be identified.")
help="When ``--max-logprobs`` is specified, represents single tokens "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified.")
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
@@ -205,8 +207,24 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--enable-auto-tool-choice",
action="store_true",
default=False,
help="Enable auto tool choice for supported models. Use --tool-call-parser"
" to specify which parser to use")
help="Enable auto tool choice for supported models. Use "
"``--tool-call-parser`` to specify which parser to use.")
parser.add_argument(
"--enable-reasoning",
action="store_true",
default=False,
help="Whether to enable reasoning_content for the model. "
"If enabled, the model will be able to generate reasoning content.")

valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys()
parser.add_argument(
"--reasoning-parser",
type=str,
metavar="{" + ",".join(valid_reasoning_parsers) + "}",
default=None,
help="Select the reasoning parser depending on the model that you're using."
" This is used to parse the reasoning content into OpenAI API "
"format. Required for ``--enable-reasoning``.")

valid_tool_parsers = ToolParserManager.tool_parsers.keys()
parser.add_argument(
@@ -217,15 +235,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=None,
help="Select the tool call parser depending on the model that you're using."
" This is used to parse the model-generated tool call into OpenAI API "
"format. Required for --enable-auto-tool-choice.")
"format. Required for ``--enable-auto-tool-choice``.")

parser.add_argument(
"--tool-parser-plugin",
type=str,
default="",
help="Special the tool parser plugin write to parse the model-generated tool"
" into OpenAI API format, the name register in this plugin can be used "
"in --tool-call-parser.")
"in ``--tool-call-parser``.")

parser = AsyncEngineArgs.add_cli_args(parser)
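
For context, a hypothetical usage sketch (not part of this PR) of how the new reasoning flags flow through the extended parser. The model name and the `deepseek_r1` parser name are illustrative assumptions; the real set of valid choices is whatever `ReasoningParserManager` has registered at runtime.

```python
from ipex_llm.vllm.cpu.entrypoints.openai.cli_args import (
    make_arg_parser, validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

# Build the OpenAI-server argument parser extended above.
parser = make_arg_parser(FlexibleArgumentParser(
    description="vLLM OpenAI-compatible server"))

# Hypothetical invocation: serve a reasoning-capable model with
# reasoning_content enabled. "deepseek_r1" is assumed to be a registered
# reasoning parser; check ReasoningParserManager for the actual names.
args = parser.parse_args([
    "--model", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "--enable-reasoning",
    "--reasoning-parser", "deepseek_r1",
])

# Passes: a reasoning parser is supplied and auto tool choice is off.
validate_parsed_serve_args(args)
```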

@@ -240,7 +258,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"--disable-fastapi-docs",
action='store_true',
default=False,
help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."
)
parser.add_argument(
"--enable-prompt-tokens-details",
@@ -270,6 +288,18 @@ def validate_parsed_serve_args(args: argparse.Namespace):
raise TypeError("Error: --enable-auto-tool-choice requires " # noqa
"--tool-call-parser")

# Enable reasoning needs a reasoning parser to be valid
if args.enable_reasoning and not args.reasoning_parser:
raise TypeError("Error: --enable-reasoning requires " # noqa
"--reasoning-parser")

# Ref https://api-docs.deepseek.com/guides/reasoning_model
# tool call and reasoning cannot be enabled at the same time.
if args.enable_auto_tool_choice and args.enable_reasoning:
raise TypeError( # noqa
"Error: --enable-auto-tool-choice and "
"--enable-reasoning cannot be enabled at the same time")


def create_parser_for_docs() -> FlexibleArgumentParser:
parser_for_docs = FlexibleArgumentParser(
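
The checks added to `validate_parsed_serve_args` above reject two flag combinations before the server starts. A hedged sketch of both failure modes, reusing the same illustrative names (`deepseek_r1` and `hermes` are assumed parser names):

```python
from ipex_llm.vllm.cpu.entrypoints.openai.cli_args import (
    make_arg_parser, validate_parsed_serve_args)
from vllm.utils import FlexibleArgumentParser

parser = make_arg_parser(FlexibleArgumentParser())

# --enable-reasoning without --reasoning-parser is rejected ...
args = parser.parse_args(["--enable-reasoning"])
try:
    validate_parsed_serve_args(args)
except TypeError as e:
    print(e)  # Error: --enable-reasoning requires --reasoning-parser

# ... and so is combining reasoning with auto tool choice, per the DeepSeek
# reasoning-model guidance referenced in the diff.
args = parser.parse_args([
    "--enable-reasoning", "--reasoning-parser", "deepseek_r1",
    "--enable-auto-tool-choice", "--tool-call-parser", "hermes",
])
try:
    validate_parsed_serve_args(args)
except TypeError as e:
    print(e)  # Error: ... cannot be enabled at the same time
```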
23 changes: 0 additions & 23 deletions python/llm/src/ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py

This file was deleted.

9 changes: 3 additions & 6 deletions python/llm/src/ipex_llm/vllm/cpu/model_convert.py
@@ -48,7 +48,7 @@ def _sample_get_logits(
logits = lm_head(hidden_states)
if embedding_bias is not None:
logits += embedding_bias
if self.use_gather:
if not self.use_all_gather:
logits = tensor_model_parallel_gather(logits)
else:
logits = tensor_model_parallel_all_gather(logits)
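
The flipped condition above appears to track a vLLM attribute rename on the logits processor: `use_gather` is replaced by `use_all_gather`, so the branch is inverted to preserve the same behavior. A minimal standalone sketch of the resulting logic, assuming vLLM's distributed helpers are importable and a tensor-parallel group is initialized:

```python
from vllm.distributed import (tensor_model_parallel_all_gather,
                              tensor_model_parallel_gather)

def collect_logits(logits, use_all_gather: bool):
    # gather() materializes the full-vocab logits on rank 0 only, while
    # all_gather() replicates them on every tensor-parallel rank.
    if not use_all_gather:
        return tensor_model_parallel_gather(logits)
    return tensor_model_parallel_all_gather(logits)
```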
@@ -65,12 +65,9 @@ def _model_sample_convert():
def _ipex_llm_convert(load_in_low_bit):
from vllm.worker.cpu_model_runner import CPUModelRunner
from ipex_llm.vllm.cpu.ipex_llm_wrapper import get_ipex_llm_wrapper
from ipex_llm.vllm.cpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper
import vllm.executor.ray_utils as ray_utils_v0
import vllm.v1.executor.ray_utils as ray_utils_v1
import vllm.executor.ray_utils as ray_utils
setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit))
setattr(ray_utils_v0, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))
setattr(ray_utils_v1, "RayWorkerWrapper", get_ipex_llm_v1_wrapper(load_in_low_bit))
setattr(ray_utils, "RayWorkerWrapper", get_ipex_llm_wrapper(load_in_low_bit))


def get_load_function(low_bit):
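
Putting the two `model_convert.py` hunks together, a sketch of what `_ipex_llm_convert` reduces to after this PR (module paths as shown in the diff; `get_load_function` is the function defined just below), now that the vLLM v1 `ray_utils` patching and the deleted `ipex_llm_v1_wrapper` are gone:

```python
import vllm.executor.ray_utils as ray_utils
from vllm.worker.cpu_model_runner import CPUModelRunner
from ipex_llm.vllm.cpu.ipex_llm_wrapper import get_ipex_llm_wrapper

def _ipex_llm_convert(load_in_low_bit):
    # Patch the CPU model runner to load weights through IPEX-LLM low-bit
    # loading, and swap in the IPEX-LLM RayWorkerWrapper for Ray workers.
    setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit))
    setattr(ray_utils, "RayWorkerWrapper",
            get_ipex_llm_wrapper(load_in_low_bit))
```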