
Commit 012be47

Merge pull request #277 from dheerajoruganty/main
VLLM GPU support
2 parents: f4b9f75 + f316729

File tree: 4 files changed, 32 additions, 3 deletions

  src/fmbench/scripts/constants.py
  src/fmbench/scripts/ec2_deploy.py
  src/fmbench/scripts/ec2_predictor.py
  src/fmbench/scripts/inference_containers/vllm_gpu.py

src/fmbench/scripts/constants.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 CONTAINER_TYPE_DJL: str = 'djl'
 CONTAINER_TYPE_VLLM: str = 'vllm'
+CONTAINER_TYPE_VLLM_GPU: str = 'vllm_gpu'
 CONTAINER_TYPE_TRITON: str = 'triton'
 CONTAINER_TYPE_OLLAMA: str = 'ollama'
 CONTAINER_TYPE_HUGGINGFACE: str = 'huggingface'
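
The new constant simply registers 'vllm_gpu' as a recognized container type alongside the existing ones. A minimal sketch, not part of this commit (the set and helper name below are hypothetical), of how deployment code could validate a container type taken from an fmbench config against these constants:

from fmbench.scripts import constants

SUPPORTED_CONTAINER_TYPES = {
    constants.CONTAINER_TYPE_DJL,
    constants.CONTAINER_TYPE_VLLM,
    constants.CONTAINER_TYPE_VLLM_GPU,   # added by this commit
    constants.CONTAINER_TYPE_TRITON,
    constants.CONTAINER_TYPE_OLLAMA,
    constants.CONTAINER_TYPE_HUGGINGFACE,
}

def validate_container_type(container_type: str) -> str:
    # Fail fast if a config names a container type fmbench does not know about.
    if container_type not in SUPPORTED_CONTAINER_TYPES:
        raise ValueError(f"unsupported inference container type: {container_type}")
    return container_type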

src/fmbench/scripts/ec2_deploy.py

Lines changed: 6 additions & 2 deletions
@@ -21,7 +21,7 @@
 from typing import Dict, Union
 from fmbench.scripts import constants
 from ec2_metadata import ec2_metadata
-from fmbench.scripts.inference_containers import (djl, vllm, triton, ollama)
+from fmbench.scripts.inference_containers import (djl, vllm, vllm_gpu, triton, ollama)
 from fmbench.scripts.constants import (IS_NEURON_INSTANCE, LISTEN_PORT)
 from fmbench.scripts.prepare_for_multi_model_containers import prepare_docker_compose_yml

@@ -75,6 +75,8 @@ def _create_deployment_script(image_uri,
             deploy_script_content = djl.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_VLLM:
             deploy_script_content = vllm.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
+        case constants.CONTAINER_TYPE_VLLM_GPU:
+            deploy_script_content = vllm_gpu.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_TRITON:
             deploy_script_content = triton.create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory)
         case constants.CONTAINER_TYPE_OLLAMA:

@@ -115,7 +117,9 @@ def _check_model_deployment(endpoint, model_id, container_type, model_loading_ti
     logger.info(f"Checking deployment status at {endpoint}")
     if container_type == constants.CONTAINER_TYPE_DJL:
         data = {"inputs": "tell me a story of the little red riding hood"}
-    elif container_type == constants.CONTAINER_TYPE_VLLM or container_type == constants.CONTAINER_TYPE_OLLAMA :
+    elif container_type == constants.CONTAINER_TYPE_VLLM \
+         or container_type == constants.CONTAINER_TYPE_OLLAMA \
+         or container_type == constants.CONTAINER_TYPE_VLLM_GPU:
         data = {"model": model_id, # Specify the model to use
                 "prompt": "tell me a story of the little red riding hood",}
     elif container_type == constants.CONTAINER_TYPE_TRITON:
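
The widened elif means the deployment health check sends the same OpenAI-style completion payload to vllm, ollama, and the new vllm_gpu containers. A minimal sketch of that probe, assuming the endpoint is the server's completions URL; the polling helper, its name, and the retry timings below are illustrative rather than fmbench's actual code:

import time
import requests

def wait_until_ready(endpoint: str, model_id: str, timeout_s: int = 600) -> bool:
    # Poll the endpoint with a tiny completion request until the server answers.
    data = {"model": model_id,
            "prompt": "tell me a story of the little red riding hood"}
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.post(endpoint, json=data, timeout=30).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server is still starting up
        time.sleep(15)
    return False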

src/fmbench/scripts/ec2_predictor.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def get_prediction(self, payload: Dict) -> FMBenchPredictionResponse:
             response.raise_for_status()
             response_json = json.loads(response.text)
             full_output = response_json['text_output']
-        elif container_type == constants.CONTAINER_TYPE_VLLM:
+        elif container_type == constants.CONTAINER_TYPE_VLLM or container_type == constants.CONTAINER_TYPE_VLLM_GPU:
             # vllm uses prompt rather than input and then
             # the code in the calling function still expects input
             # so make a copy
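
With this change the predictor treats vllm_gpu responses the same way as vllm ones: the request body uses "prompt" (plus a "model" field) rather than the "inputs" key used elsewhere. A minimal sketch of that field rename, assuming the incoming payload carries an "inputs" key; the helper below is illustrative, not the repo's code:

import copy

def to_vllm_payload(payload: dict, model_id: str) -> dict:
    # Copy first so the caller's payload, which still uses "inputs", is untouched.
    vllm_payload = copy.deepcopy(payload)
    vllm_payload["model"] = model_id
    vllm_payload["prompt"] = vllm_payload.pop("inputs")
    return vllm_payload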

src/fmbench/scripts/inference_containers/vllm_gpu.py (new file)

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+"""
+vllm specific code
+"""
+import logging
+from fmbench.scripts.inference_containers.utils import (STOP_AND_RM_CONTAINER,
+                                                        FMBENCH_MODEL_CONTAINER_NAME)
+
+logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def create_script(region, image_uri, model_id, model_name, env_str, privileged_str, hf_token, directory):
+    """
+    Script for running the docker container for the inference server
+    """
+    script = f"""#!/bin/sh
+
+    {STOP_AND_RM_CONTAINER}
+
+
+    vllm serve {model_id}
+
+    echo "started docker run in daemon mode"
+    """
+    return script
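
Unlike the other inference_containers modules, this template does not start a Docker container: after stopping and removing any previous fmbench container (via STOP_AND_RM_CONTAINER) it runs vllm serve directly on the host, so of the function's parameters only model_id is interpolated into the generated script. A minimal usage sketch with illustrative argument values only; note that vllm serve listens on port 8000 by default since the template passes no --port flag:

from fmbench.scripts.inference_containers import vllm_gpu

script_text = vllm_gpu.create_script(
    region="us-east-1",                                # illustrative values only
    image_uri="",                                      # unused by this template
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",    # hypothetical model id
    model_name="llama3-8b-instruct",
    env_str="",
    privileged_str="",
    hf_token="<hf token>",
    directory="/tmp/fmbench",
)

# Write the rendered shell script to disk so it can be executed on the instance.
with open("deploy_vllm_gpu.sh", "w") as f:
    f.write(script_text)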
