diff --git a/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-sglang.json b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-sglang.json new file mode 100644 index 0000000000..85aa710b4b --- /dev/null +++ b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-sglang.json @@ -0,0 +1,66 @@ +{ + "inference": { + "processors": [ + "gpu" + ], + "version_aliases": { + "0.5": "0.5.8" + }, + "versions": { + "0.5.8": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-6": "633930458069", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "tag_prefix": "0.5.8-transformers4.57.3", + "repository": 
"huggingface-sglang", + "container_version": { + "gpu": "cu129-ubuntu24.04" + } + } + } + } +} diff --git a/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm-omni.json b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm-omni.json new file mode 100644 index 0000000000..99a0138de7 --- /dev/null +++ b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm-omni.json @@ -0,0 +1,66 @@ +{ + "inference": { + "processors": [ + "gpu" + ], + "version_aliases": { + "0.20": "0.20.0" + }, + "versions": { + "0.20.0": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-6": "633930458069", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + 
"us-west-2": "763104351884" + }, + "tag_prefix": "0.20.0-transformers5.8.1", + "repository": "huggingface-vllm-omni", + "container_version": { + "gpu": "cu130-amzn2023" + } + } + } + } +} diff --git a/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm.json b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm.json new file mode 100644 index 0000000000..c03be6a2e8 --- /dev/null +++ b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface-vllm.json @@ -0,0 +1,176 @@ +{ + "inference": { + "processors": [ + "gpu" + ], + "version_aliases": { + "0.14": "0.14.0", + "0.17": "0.17.0", + "0.21": "0.21.0" + }, + "versions": { + "0.14.0": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-6": "633930458069", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": 
"094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "tag_prefix": "0.14.0-transformers4.57.3", + "repository": "huggingface-vllm", + "container_version": { + "gpu": "cu129-ubuntu22.04" + } + }, + "0.17.0": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-6": "633930458069", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "tag_prefix": "0.17.0-transformers4.57.5", + "repository": "huggingface-vllm", + "container_version": { + "gpu": "cu129-ubuntu22.04" + } + }, + "0.21.0": { + "py_versions": [ + "py312" + ], + "registries": { + 
"af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-6": "633930458069", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "tag_prefix": "0.21.0-transformers5.8.1", + "repository": "huggingface-vllm", + "container_version": { + "gpu": "cu130-ubuntu22.04" + } + } + } + } +} diff --git a/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface.json b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface.json index dc3987a8d8..26625217b5 100644 --- a/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface.json +++ b/sagemaker-core/src/sagemaker/core/image_uri_config/huggingface.json @@ -18,7 +18,8 @@ "4.48": "4.48.0", "4.49": "4.49.0", "4.55": "4.55.0", - "4.56": "4.56.2" + 
"4.56": "4.56.2", + "5.3": "5.3.0" }, "versions": { "4.4.2": { @@ -1258,6 +1259,53 @@ "gpu": "cu129-ubuntu22.04" } } + }, + "5.3.0": { + "version_aliases": { + "pytorch2.9": "pytorch2.9.0" + }, + "pytorch2.9.0": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "me-south-1": "217643126080", + "me-central-1": "914824155844", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "repository": "huggingface-pytorch-training", + "container_version": { + "gpu": "cu130-ubuntu22.04" + } + } } } }, @@ -1276,7 +1324,8 @@ "4.28": "4.28.1", "4.37": "4.37.0", "4.49": "4.49.0", - "4.51": "4.51.3" + "4.51": "4.51.3", + "5.5": "5.5.3" }, "versions": { "4.6.1": { @@ -2281,6 +2330,58 @@ "cpu": "ubuntu22.04" } } + }, + "5.5.3": { + "version_aliases": { + "pytorch2.6": "pytorch2.6.0" + }, + "pytorch2.6.0": { + "py_versions": [ + "py312" + ], + "registries": { + "af-south-1": "626614931356", + "il-central-1": "780543022126", + "ap-east-1": "871362719292", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": 
"763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ca-central-1": "763104351884", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "me-south-1": "217643126080", + "me-central-1": "914824155844", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-west-1": "763104351884", + "us-west-2": "763104351884", + "ca-west-1": "204538143572" + }, + "repository": "huggingface-pytorch-inference", + "container_version": { + "gpu": "cu124-ubuntu22.04", + "cpu": "ubuntu22.04" + } + } } } } diff --git a/sagemaker-serve/src/sagemaker/serve/constants.py b/sagemaker-serve/src/sagemaker/serve/constants.py index 7c69dc947d..a66968e14d 100644 --- a/sagemaker-serve/src/sagemaker/serve/constants.py +++ b/sagemaker-serve/src/sagemaker/serve/constants.py @@ -65,6 +65,17 @@ ModelServer.TGI, ModelServer.TEI, ModelServer.SMD, + ModelServer.VLLM, + ModelServer.SGLANG, + ModelServer.VLLM_OMNI, +} + +# HuggingFace pipeline tags that should route to the vLLM-omni (multimodal) server. +# Note: visual-question-answering is folded into image-text-to-text as of transformers v5. 
+OMNI_TASKS: Set[str] = { + "image-text-to-text", + "any-to-any", + "audio-text-to-text", } # ======================================== diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder.py b/sagemaker-serve/src/sagemaker/serve/model_builder.py index 27eaaa8fa3..53da34871c 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder.py @@ -135,7 +135,12 @@ ENDPOINT_CONFIG_ASYNC_KMS_KEY_ID_PATH, MODEL_CONTAINERS_PATH, ) -from sagemaker.serve.constants import LOCAL_MODES, SUPPORTED_MODEL_SERVERS, Framework +from sagemaker.serve.constants import ( + LOCAL_MODES, + SUPPORTED_MODEL_SERVERS, + OMNI_TASKS, + Framework, +) from sagemaker.core.workflow.pipeline_context import PipelineSession, runnable_by_pipeline from sagemaker.core import fw_utils from sagemaker.core.helper.session_helper import container_def @@ -2602,10 +2607,16 @@ def _build_single_modelbuilder( if self.schema_builder is None and model_task is not None: self._hf_schema_builder_init(model_task) + # Task-based auto-selection. SGLang is not auto-selected by task; it is + # opt-in only via model_server=ModelServer.SGLANG, which is handled earlier + # by the _build_for_model_server() short-circuit above. 
if model_task == "text-generation": - self.built_model = self._build_for_tgi() + self.built_model = self._build_for_vllm() + return self.built_model + elif model_task in OMNI_TASKS: + self.built_model = self._build_for_vllm_omni() return self.built_model - elif model_task in ["sentence-similarity", "feature-extraction"]: + elif model_task in ["sentence-similarity", "feature-extraction", "text-ranking"]: self.built_model = self._build_for_tei() return self.built_model else: diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder_servers.py b/sagemaker-serve/src/sagemaker/serve/model_builder_servers.py index 7156ee20ee..6a6c35b36c 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder_servers.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder_servers.py @@ -16,6 +16,7 @@ and deploying machine learning models across different model servers and deployment modes. It supports various frameworks including PyTorch, TensorFlow, HuggingFace, XGBoost, and more. """ + from __future__ import absolute_import, annotations import os @@ -36,7 +37,10 @@ _get_gpu_info_fallback, ) from sagemaker.serve.utils.hf_utils import _get_model_config_properties_from_hf -from sagemaker.serve.detector.image_detector import _get_model_base, _detect_framework_and_version +from sagemaker.serve.detector.image_detector import ( + _get_model_base, + _detect_framework_and_version, +) from sagemaker.serve.detector.pickler import save_pkl from sagemaker.serve.utils.types import ModelServer @@ -48,11 +52,15 @@ from sagemaker.serve.model_server.tgi.utils import _get_default_tgi_configurations from sagemaker.serve.model_server.tgi.prepare import prepare_tgi_js_resources from sagemaker.serve.model_server.djl_serving.prepare import prepare_djl_js_resources -from sagemaker.serve.model_server.multi_model_server.prepare import prepare_mms_js_resources +from sagemaker.serve.model_server.multi_model_server.prepare import ( + prepare_mms_js_resources, +) # Model server preparation from 
sagemaker.serve.model_server.torchserve.prepare import prepare_for_torchserve -from sagemaker.serve.model_server.tensorflow_serving.prepare import prepare_for_tf_serving +from sagemaker.serve.model_server.tensorflow_serving.prepare import ( + prepare_for_tf_serving, +) from sagemaker.serve.model_server.smd.prepare import prepare_for_smd from sagemaker.serve.model_server.multi_model_server.prepare import prepare_for_mms @@ -70,7 +78,6 @@ class _ModelBuilderServers(object): - def _build_for_model_server(self) -> Model: """Build model using explicit model server configuration. @@ -109,6 +116,12 @@ def _build_for_model_server(self) -> Model: return self._build_for_tei() elif self.model_server == ModelServer.TGI: return self._build_for_tgi() + elif self.model_server == ModelServer.VLLM: + return self._build_for_vllm() + elif self.model_server == ModelServer.SGLANG: + return self._build_for_sglang() + elif self.model_server == ModelServer.VLLM_OMNI: + return self._build_for_vllm_omni() elif self.model_server == ModelServer.MMS: return self._build_for_transformers() elif self.model_server == ModelServer.SMD: @@ -289,6 +302,97 @@ def _build_for_tgi(self) -> Model: return model + def _build_for_hf_server(self, model_server: ModelServer) -> Model: + """Build a HuggingFace model for a hub-download serving container. + + Generic build path shared by the vLLM, SGLang, and vLLM-omni servers. It + configures the container to pull the model directly from the HuggingFace Hub + (HF_MODEL_ID), resolves the appropriate DLC image via _auto_detect_image_uri, + and prepares the model for the selected mode. Server-specific tuning (sharding, + tensor parallelism, MAX_* limits, etc.) is intentionally left to the container + defaults and will be added in a follow-up. + + Args: + model_server: The HuggingFace serving model server to build for + (VLLM, SGLANG, or VLLM_OMNI). + + Returns: + Model: Configured model ready for deployment. 
+ """ + self.secret_key = "" + self.model_server = model_server + + # Use notebook instance type if available + nb_instance = _get_nb_instance() + if nb_instance and not self._user_provided_instance_type: + self.instance_type = nb_instance + logger.debug(f"Using detected notebook instance type: {nb_instance}") + + from sagemaker.serve.model_server.tgi.prepare import _create_dir_structure + + _create_dir_structure(self.model_path) + + if isinstance(self.model, str) and not self._is_jumpstart_model_id(): + # These containers download the model directly from the HuggingFace Hub + # TODO(review): confirm HF_MODEL_ID is the only setting these containers need to locate the model. + self.env_vars.setdefault("HF_MODEL_ID", self.model) + + if self.env_vars.get("HUGGING_FACE_HUB_TOKEN"): + self.env_vars["HF_TOKEN"] = self.env_vars.get("HUGGING_FACE_HUB_TOKEN") + + self.s3_upload_path = None + + self._auto_detect_image_uri() + + if not self._optimizing: + if self.mode in LOCAL_MODES: + self._prepare_for_mode(should_upload_artifacts=True) + else: + self.s3_model_data_url, _ = self._prepare_for_mode() + + # Cache management based on mode + if self.mode in LOCAL_MODES: + self.env_vars.update({"HF_HUB_OFFLINE": "1"}) + else: + self.env_vars["HF_HOME"] = "/tmp" + self.env_vars["HUGGINGFACE_HUB_CACHE"] = "/tmp" + + if self.mode == Mode.SAGEMAKER_ENDPOINT and not self.instance_type: + raise ValueError( + "Instance type must be provided when building for SageMaker Endpoint mode." + ) + + model = self._create_model() + + if "HF_HUB_OFFLINE" in self.env_vars: + self.env_vars.update({"HF_HUB_OFFLINE": "0"}) + + return model + + def _build_for_vllm(self) -> Model: + """Build a HuggingFace model for the vLLM serving container. + + Returns: + Model: Configured model ready for vLLM deployment. + """ + return self._build_for_hf_server(ModelServer.VLLM) + + def _build_for_sglang(self) -> Model: + """Build a HuggingFace model for the SGLang serving container. + + Returns: + Model: Configured model ready for SGLang deployment.
+ """ + return self._build_for_hf_server(ModelServer.SGLANG) + + def _build_for_vllm_omni(self) -> Model: + """Build a HuggingFace model for the vLLM-omni (multimodal) serving container. + + Returns: + Model: Configured model ready for vLLM-omni deployment. + """ + return self._build_for_hf_server(ModelServer.VLLM_OMNI) + def _build_for_djl(self) -> Model: """Build model for DJL Serving deployment. diff --git a/sagemaker-serve/src/sagemaker/serve/model_builder_utils.py b/sagemaker-serve/src/sagemaker/serve/model_builder_utils.py index 9a96f9680e..c9ae8c854a 100644 --- a/sagemaker-serve/src/sagemaker/serve/model_builder_utils.py +++ b/sagemaker-serve/src/sagemaker/serve/model_builder_utils.py @@ -62,6 +62,7 @@ def build(self): from sagemaker.serve.compute_resource_requirements import ResourceRequirements from sagemaker.serve.constants import ( DEFAULT_SERIALIZERS_BY_FRAMEWORK, + OMNI_TASKS, Framework, ) from sagemaker.serve.builder.schema_builder import SchemaBuilder @@ -675,7 +676,9 @@ def _detect_huggingface_image(self) -> None: model_task = hf_model_md.get("pipeline_tag") if model_task == "text-generation": - effective_model_server = ModelServer.TGI + effective_model_server = ModelServer.VLLM + elif model_task in OMNI_TASKS: + effective_model_server = ModelServer.VLLM_OMNI - elif model_task in ["sentence-similarity", "feature-extraction"]: + elif model_task in ["sentence-similarity", "feature-extraction", "text-ranking"]: effective_model_server = ModelServer.TEI else: @@ -694,6 +697,36 @@ def _detect_huggingface_image(self) -> None: ) self.framework = Framework.HUGGINGFACE + elif effective_model_server == ModelServer.VLLM: + # vLLM: Use image_uris.retrieve with "huggingface-vllm" framework + self.image_uri = image_uris.retrieve( + "huggingface-vllm", + region=self.region, + version=None, # NOTE(review): huggingface-vllm registers several versions; confirm None resolves to the latest + image_scope="inference", + ) + self.framework = Framework.HUGGINGFACE + + elif effective_model_server == ModelServer.SGLANG: + # SGLang: Use image_uris.retrieve with "huggingface-sglang" framework + self.image_uri = image_uris.retrieve( +
"huggingface-sglang", + region=self.region, + version=None, # Use latest version + image_scope="inference", + ) + self.framework = Framework.HUGGINGFACE + + elif effective_model_server == ModelServer.VLLM_OMNI: + # vLLM-omni: Use image_uris.retrieve with "huggingface-vllm-omni" framework + self.image_uri = image_uris.retrieve( + "huggingface-vllm-omni", + region=self.region, + version=None, # Use latest version + image_scope="inference", + ) + self.framework = Framework.HUGGINGFACE + elif effective_model_server == ModelServer.TEI: # TEI: Use image_uris.retrieve with "huggingface-tei" framework self.image_uri = image_uris.retrieve( diff --git a/sagemaker-serve/src/sagemaker/serve/utils/types.py b/sagemaker-serve/src/sagemaker/serve/utils/types.py index b405d85b21..210dee77f1 100644 --- a/sagemaker-serve/src/sagemaker/serve/utils/types.py +++ b/sagemaker-serve/src/sagemaker/serve/utils/types.py @@ -20,6 +20,9 @@ def __str__(self): TGI = 6 TEI = 7 SMD = 8 + VLLM = 9 + SGLANG = 10 + VLLM_OMNI = 11 class HardwareType(Enum): diff --git a/sagemaker-serve/tests/unit/test_model_builder_utils_extended_coverage.py b/sagemaker-serve/tests/unit/test_model_builder_utils_extended_coverage.py index 919a74fe7d..142bd66234 100644 --- a/sagemaker-serve/tests/unit/test_model_builder_utils_extended_coverage.py +++ b/sagemaker-serve/tests/unit/test_model_builder_utils_extended_coverage.py @@ -150,9 +150,63 @@ def test_detect_hf_image_tei(self, mock_metadata, mock_retrieve): mock_retrieve.return_value = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tei:latest" utils._detect_huggingface_image() - + self.assertIsNotNone(utils.image_uri) + @patch('sagemaker.core.image_uris.retrieve') + @patch.object(_ModelBuilderUtils, 'get_huggingface_model_metadata') + def test_detect_hf_image_vllm(self, mock_metadata, mock_retrieve): + """Test HF image detection resolves the huggingface-vllm framework for vLLM.""" + utils = _ModelBuilderUtils() + utils.model = "gpt2" + utils.region = 
"us-west-2" + utils.model_server = ModelServer.VLLM + mock_retrieve.return_value = ( + "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-vllm:latest" + ) + + utils._detect_huggingface_image() + + self.assertIsNotNone(utils.image_uri) + self.assertEqual(utils.framework, Framework.HUGGINGFACE) + self.assertEqual(mock_retrieve.call_args.args[0], "huggingface-vllm") + + @patch('sagemaker.core.image_uris.retrieve') + @patch.object(_ModelBuilderUtils, 'get_huggingface_model_metadata') + def test_detect_hf_image_sglang(self, mock_metadata, mock_retrieve): + """Test HF image detection resolves the huggingface-sglang framework for SGLang.""" + utils = _ModelBuilderUtils() + utils.model = "gpt2" + utils.region = "us-west-2" + utils.model_server = ModelServer.SGLANG + mock_retrieve.return_value = ( + "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-sglang:latest" + ) + + utils._detect_huggingface_image() + + self.assertIsNotNone(utils.image_uri) + self.assertEqual(utils.framework, Framework.HUGGINGFACE) + self.assertEqual(mock_retrieve.call_args.args[0], "huggingface-sglang") + + @patch('sagemaker.core.image_uris.retrieve') + @patch.object(_ModelBuilderUtils, 'get_huggingface_model_metadata') + def test_detect_hf_image_vllm_omni(self, mock_metadata, mock_retrieve): + """Test HF image detection resolves the huggingface-vllm-omni framework for omni.""" + utils = _ModelBuilderUtils() + utils.model = "llava-hf/llava-1.5-7b-hf" + utils.region = "us-west-2" + utils.model_server = ModelServer.VLLM_OMNI + mock_retrieve.return_value = ( + "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-vllm-omni:latest" + ) + + utils._detect_huggingface_image() + + self.assertIsNotNone(utils.image_uri) + self.assertEqual(utils.framework, Framework.HUGGINGFACE) + self.assertEqual(mock_retrieve.call_args.args[0], "huggingface-vllm-omni") + class TestNormalizeFrameworkToEnum(unittest.TestCase): """Test _normalize_framework_to_enum method.""" diff --git 
a/sagemaker-serve/tests/unit/test_model_builder_v3.py b/sagemaker-serve/tests/unit/test_model_builder_v3.py index 76f13a6d06..703975c95e 100644 --- a/sagemaker-serve/tests/unit/test_model_builder_v3.py +++ b/sagemaker-serve/tests/unit/test_model_builder_v3.py @@ -938,7 +938,7 @@ def test_build_with_djl( self.assertEqual(result, mock_model) mock_build_djl.assert_called_once() - @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_tgi') + @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_vllm') @patch('sagemaker.serve.model_builder.ModelBuilder._is_huggingface_model') @patch('sagemaker.serve.model_builder.ModelBuilder.get_huggingface_model_metadata') @patch('sagemaker.serve.model_builder.ModelBuilder._get_client_translators') @@ -946,29 +946,29 @@ def test_build_with_djl( @patch('sagemaker.serve.model_builder.ModelBuilder._build_validations') @patch('sagemaker.serve.model_builder.ModelBuilder._handle_mlflow_input') @patch('sagemaker.serve.model_builder.ModelBuilder._hf_schema_builder_init') - def test_build_with_tgi_for_text_generation( + def test_build_with_vllm_for_text_generation( self, mock_hf_schema_init, mock_mlflow, mock_validations, mock_serve_setting, - mock_translators, mock_hf_metadata, mock_is_hf, mock_build_tgi + mock_translators, mock_hf_metadata, mock_is_hf, mock_build_vllm ): - """Test build with TGI for text-generation models.""" + """Test build defaults to vLLM for text-generation models.""" mock_model = Mock(spec=Model) - mock_build_tgi.return_value = mock_model + mock_build_vllm.return_value = mock_model mock_translators.return_value = (Mock(), Mock()) mock_serve_setting.return_value = Mock() mock_is_hf.return_value = True mock_hf_metadata.return_value = {"pipeline_tag": "text-generation"} mock_hf_schema_init.return_value = None # Skip schema initialization - + builder = ModelBuilder( model="gpt2", role_arn=self.mock_role_arn, sagemaker_session=self.mock_session ) - + result = builder._build_single_modelbuilder() - + 
self.assertEqual(result, mock_model) - mock_build_tgi.assert_called_once() + mock_build_vllm.assert_called_once() if __name__ == '__main__': diff --git a/sagemaker-serve/tests/unit/test_model_builder_workflows.py b/sagemaker-serve/tests/unit/test_model_builder_workflows.py index eec0053e28..695948eb69 100644 --- a/sagemaker-serve/tests/unit/test_model_builder_workflows.py +++ b/sagemaker-serve/tests/unit/test_model_builder_workflows.py @@ -384,21 +384,21 @@ def setUp(self): @patch('sagemaker.serve.model_builder.ModelBuilder._is_huggingface_model') @patch('sagemaker.serve.model_builder.ModelBuilder.get_huggingface_model_metadata') - @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_tgi') + @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_vllm') @patch('sagemaker.serve.model_builder.ModelBuilder._get_client_translators') @patch('sagemaker.serve.model_builder.ModelBuilder._is_jumpstart_model_id') @patch('sagemaker.serve.model_builder.ModelBuilder._use_jumpstart_equivalent') @patch('sagemaker.serve.model_builder.ModelBuilder._hf_schema_builder_init') - def test_build_single_with_hf_text_generation(self, mock_schema_init, mock_use_js, mock_is_js, mock_get_trans, mock_build_tgi, mock_get_md, mock_is_hf): - """Test _build_single_modelbuilder with HF text-generation model.""" + def test_build_single_with_hf_text_generation(self, mock_schema_init, mock_use_js, mock_is_js, mock_get_trans, mock_build_vllm, mock_get_md, mock_is_hf): + """Test _build_single_modelbuilder routes HF text-generation models to vLLM.""" mock_is_hf.return_value = True mock_is_js.return_value = False mock_use_js.return_value = False mock_get_md.return_value = {"pipeline_tag": "text-generation"} mock_model = Mock(spec=Model) - mock_build_tgi.return_value = mock_model + mock_build_vllm.return_value = mock_model mock_get_trans.return_value = (None, None) - + builder = ModelBuilder( model="gpt2", role_arn=MOCK_ROLE_ARN, @@ -406,12 +406,70 @@ def 
test_build_single_with_hf_text_generation(self, mock_schema_init, mock_use_j mode=Mode.SAGEMAKER_ENDPOINT, image_uri=MOCK_IMAGE_URI ) - + result = builder._build_single_modelbuilder() - + self.assertEqual(result, mock_model) self.assertEqual(builder.model_hub, ModelHub.HUGGINGFACE) - mock_build_tgi.assert_called_once() + mock_build_vllm.assert_called_once() + + @patch('sagemaker.serve.model_builder.ModelBuilder._is_huggingface_model') + @patch('sagemaker.serve.model_builder.ModelBuilder.get_huggingface_model_metadata') + @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_vllm_omni') + @patch('sagemaker.serve.model_builder.ModelBuilder._get_client_translators') + @patch('sagemaker.serve.model_builder.ModelBuilder._is_jumpstart_model_id') + @patch('sagemaker.serve.model_builder.ModelBuilder._use_jumpstart_equivalent') + @patch('sagemaker.serve.model_builder.ModelBuilder._hf_schema_builder_init') + def test_build_single_with_hf_multimodal(self, mock_schema_init, mock_use_js, mock_is_js, mock_get_trans, mock_build_omni, mock_get_md, mock_is_hf): + """Test _build_single_modelbuilder routes HF multimodal models to vLLM-omni.""" + mock_is_hf.return_value = True + mock_is_js.return_value = False + mock_use_js.return_value = False + mock_get_md.return_value = {"pipeline_tag": "image-text-to-text"} + mock_model = Mock(spec=Model) + mock_build_omni.return_value = mock_model + mock_get_trans.return_value = (None, None) + + builder = ModelBuilder( + model="llava-hf/llava-1.5-7b-hf", + role_arn=MOCK_ROLE_ARN, + sagemaker_session=self.mock_session, + mode=Mode.SAGEMAKER_ENDPOINT, + image_uri=MOCK_IMAGE_URI + ) + + result = builder._build_single_modelbuilder() + + self.assertEqual(result, mock_model) + mock_build_omni.assert_called_once() + + @patch('sagemaker.serve.model_builder.ModelBuilder._is_huggingface_model') + @patch('sagemaker.serve.model_builder.ModelBuilder._build_for_sglang') + @patch('sagemaker.serve.model_builder.ModelBuilder._get_client_translators') + 
@patch('sagemaker.serve.model_builder.ModelBuilder._is_jumpstart_model_id') + @patch('sagemaker.serve.model_builder.ModelBuilder._use_jumpstart_equivalent') + def test_build_single_with_hf_sglang_opt_in(self, mock_use_js, mock_is_js, mock_get_trans, mock_build_sglang, mock_is_hf): + """Test _build_single_modelbuilder routes to SGLang when opted in via model_server.""" + mock_is_hf.return_value = True + mock_is_js.return_value = False + mock_use_js.return_value = False + mock_model = Mock(spec=Model) + mock_build_sglang.return_value = mock_model + mock_get_trans.return_value = (None, None) + + builder = ModelBuilder( + model="gpt2", + role_arn=MOCK_ROLE_ARN, + sagemaker_session=self.mock_session, + mode=Mode.SAGEMAKER_ENDPOINT, + model_server=ModelServer.SGLANG, + image_uri=MOCK_IMAGE_URI + ) + + result = builder._build_single_modelbuilder() + + self.assertEqual(result, mock_model) + mock_build_sglang.assert_called_once() @patch('sagemaker.serve.model_builder.ModelBuilder._is_huggingface_model') @patch('sagemaker.serve.model_builder.ModelBuilder.get_huggingface_model_metadata')