
Commit 012be47

Merge pull request #277 from dheerajoruganty/main
VLLM GPU support
2 parents: f4b9f75 + f316729

File tree: 4 files changed, 32 additions, 3 deletions

  src/fmbench/scripts/constants.py
  src/fmbench/scripts/ec2_deploy.py
  src/fmbench/scripts/ec2_predictor.py
  src/fmbench/scripts/inference_containers/vllm_gpu.py

src/fmbench/scripts/constants.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 CONTAINER_TYPE_DJL: str = 'djl'
 CONTAINER_TYPE_VLLM: str = 'vllm'
+CONTAINER_TYPE_VLLM_GPU: str = 'vllm_gpu'
 CONTAINER_TYPE_TRITON: str = 'triton'
 CONTAINER_TYPE_OLLAMA: str = 'ollama'
 CONTAINER_TYPE_HUGGINGFACE: str = 'huggingface'
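
The new constant simply registers 'vllm_gpu' as a recognized container type alongside the existing ones. A minimal sketch, not part of this commit (the set and helper name below are hypothetical), of how deployment code could validate a container type taken from an fmbench config against these constants:

from fmbench.scripts import constants

SUPPORTED_CONTAINER_TYPES = {
    constants.CONTAINER_TYPE_DJL,
    constants.CONTAINER_TYPE_VLLM,
    constants.CONTAINER_TYPE_VLLM_GPU,   # added by this commit
    constants.CONTAINER_TYPE_TRITON,
    constants.CONTAINER_TYPE_OLLAMA,
    constants.CONTAINER_TYPE_HUGGINGFACE,
}

def validate_container_type(container_type: str) -> str:
    # Fail fast if a config names a container type fmbench does not know about.
    if container_type not in SUPPORTED_CONTAINER_TYPES:
        raise ValueError(f"unsupported inference container type: {container_type}")
    return container_type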

src/fmbench/scripts/ec2_deploy.py

Lines changed: 6 additions & 2 deletions
@@ -21,7 +21,7 @@
 from typing import Dict, Union
 from fmbench.scripts import constants
 from ec2_metadata import ec2_metadata
-from fmbench.scripts.inference_containers import (djl, vllm, triton, ollama)
+from fmbench.scripts.inference_containers import (djl, vllm, vllm_gpu, triton, ollama)
 from fmbench.scripts.constants import (IS_NEURON_INSTANCE, LISTEN_PORT)
 from fmbench.scripts.prepare_for_multi_model_containers import prepare_docker_compose_yml

@@ -75,6 +75,8 @@ def _create_deployment_script(image_uri,
             deploy_script_content = djl.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_VLLM:
             deploy_script_content = vllm.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
+        case constants.CONTAINER_TYPE_VLLM_GPU:
+            deploy_script_content = vllm_gpu.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_TRITON:
             deploy_script_content = triton.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_OLLAMA:

@@ -115,7 +117,9 @@ def _check_model_deployment(endpoint, model_id, container_type, model_loading_ti
     logger.info(f"Checking deployment status at {endpoint}")
     if container_type == constants.CONTAINER_TYPE_DJL:
         data = {"inputs": "tell me a story of the little red riding hood"}
-    elif container_type == constants.CONTAINER_TYPE_VLLM or container_type == constants.CONTAINER_TYPE_OLLAMA :
+    elif container_type == constants.CONTAINER_TYPE_VLLM \
+         or container_type == constants.CONTAINER_TYPE_OLLAMA \
+         or container_type == constants.CONTAINER_TYPE_VLLM_GPU:
         data = {"model": model_id, # Specify the model to use
                 "prompt": "tell me a story of the little red riding hood",}
     elif container_type == constants.CONTAINER_TYPE_TRITON:
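
The widened elif means the deployment health check sends the same OpenAI-style completion payload to vllm, ollama, and the new vllm_gpu containers. A minimal sketch of that probe, assuming the endpoint is the server's completions URL; the polling helper, its name, and the retry timings below are illustrative rather than fmbench's actual code:

import time
import requests

def wait_until_ready(endpoint: str, model_id: str, timeout_s: int = 600) -> bool:
    # Poll the endpoint with a tiny completion request until the server answers.
    data = {"model": model_id,
            "prompt": "tell me a story of the little red riding hood"}
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.post(endpoint, json=data, timeout=30).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server is still starting up
        time.sleep(15)
    return False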

src/fmbench/scripts/ec2_predictor.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
             response.raise_for_status()
             response_json = json.loads(response.text)
             full_output = response_json['text_output']
-        elif container_type == constants.CONTAINER_TYPE_VLLM:
+        elif container_type == constants.CONTAINER_TYPE_VLLM or container_type == constants.CONTAINER_TYPE_VLLM_GPU:
             # vllm uses prompt rather than input and then
             # the code in the calling function still expects input
             # so make a copy
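
With this change the predictor treats vllm_gpu responses the same way as vllm ones: the request body uses "prompt" (plus a "model" field) rather than the "inputs" key used elsewhere. A minimal sketch of that field rename, assuming the incoming payload carries an "inputs" key; the helper below is illustrative, not the repo's code:

import copy

def to_vllm_payload(payload: dict, model_id: str) -> dict:
    # Copy first so the caller's payload, which still uses "inputs", is untouched.
    vllm_payload = copy.deepcopy(payload)
    vllm_payload["model"] = model_id
    vllm_payload["prompt"] = vllm_payload.pop("inputs")
    return vllm_payload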

src/fmbench/scripts/inference_containers/vllm_gpu.py (new file)

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+"""
+vllm specific code
+"""
+import logging
+from fmbench.scripts.inference_containers.utils import (STOP_AND_RM_CONTAINER,
+                                                        FMBENCH_MODEL_CONTAINER_NAME)
+
+logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory):
+    """
+    Script for running the docker container for the inference server
+    """
+    script = f"""#!/bin/sh
+
+    {STOP_AND_RM_CONTAINER}
+
+
+    vllm serve {model_id}
+
+    echo "started docker run in daemon mode"
+    """
+    return script
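
Unlike the other inference_containers modules, this template does not start a Docker container: after stopping and removing any previous fmbench container (via STOP_AND_RM_CONTAINER) it runs vllm serve directly on the host, so of the function's parameters only model_id is interpolated into the generated script. A minimal usage sketch with illustrative argument values only; note that vllm serve listens on port 8000 by default since the template passes no --port flag:

from fmbench.scripts.inference_containers import vllm_gpu

script_text = vllm_gpu.create_script(
    region="us-east-1",                                # illustrative values only
    image_uri="",                                      # unused by this template
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",    # hypothetical model id
    model_name="llama3-8b-instruct",
    env_str="",
    privileged_str="",
    hf_token="<hf token>",
    directory="/tmp/fmbench",
)

# Write the rendered shell script to disk so it can be executed on the instance.
with open("deploy_vllm_gpu.sh", "w") as f:
    f.write(script_text)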
