# config file for a REST endpoint supported on FMBench
# this file uses Meta-Llama-3-8B-Instruct deployed on EC2
general:
  name: "llama3-8b-c8g.24xl-ec2"
  model_name: "llama3-8b-instruct"

# AWS and SageMaker settings
aws:
  # AWS region, this parameter is templatized, no need to change
  region: {region}
  # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
  sagemaker_execution_role: {role_arn}
  # S3 bucket to which metrics, plots and reports are written
  bucket: {write_bucket} ## add the name of your desired bucket

# directory paths in the write bucket, no need to change these
dir_paths:
  data_prefix: data
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-region-account_id
  read_bucket: {read_bucket}
  # S3 prefix in the read bucket where deployment and inference scripts should be placed
  scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on JumpStart

  # deployment and inference script files to be downloaded are placed in this list
  # only needed if you are creating a new deployment script or inference script
  # your HuggingFace token does need to be in this list and should be called "hf_token.txt"
  script_files:
    - hf_token.txt
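  # (illustrative) if you bring your own deployment or inference script, it would also be
  # listed here so it gets downloaded from the scripts prefix; the file name below is a
  # hypothetical example, not part of this config:
  # - my_custom_predictor.py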

  # configuration files (like this one) are placed in this prefix
  configs_prefix: configs

  # list of configuration files to download, for now only pricing.yml needs to be downloaded
  config_files:
    - pricing.yml

  # S3 prefix for the dataset files
  source_data_prefix: source_data
  # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
  source_data_files:
    - 2wikimqa_e.jsonl
    - 2wikimqa.jsonl
    - hotpotqa_e.jsonl
    - hotpotqa.jsonl
    - narrativeqa.jsonl
    - triviaqa_e.jsonl
    - triviaqa.jsonl

  # S3 prefix for the tokenizer to be used with the models
  # NOTE 1: the same tokenizer is used with all the models being tested through a config file
  # NOTE 2: place your model specific tokenizers in a prefix named <model_name>_tokenizer,
  # so the Mistral tokenizer goes in mistral_tokenizer and the Llama2 tokenizer goes in llama2_tokenizer
  tokenizer_prefix: llama3_tokenizer
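  # (illustrative) following the NOTE 2 naming convention above, a Mistral run would instead use:
  # tokenizer_prefix: mistral_tokenizer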

  # S3 prefix for prompt templates
  prompt_template_dir: prompt_template

  # prompt template to use, NOTE: the same prompt template gets used for all models being tested through a config file
  # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one
  prompt_template_file: prompt_template_llama3.txt

# steps to run, usually all of these would be
# set to yes so nothing needs to change here.
# you could, however, bypass some steps; for example,
# set 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
# (see the commented example at the end of this section)
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes
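  # (illustrative) to re-use an already deployed endpoint, flip just the deployment step:
  # 2_deploy_model.ipynb: no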

datasets:
  # dataset related configuration
  prompt_template_keys:
    - input
    - context

  # if your dataset has multiple languages and it has a language
  # field then you could filter it for a language. Similarly,
  # you can filter your dataset to only keep prompts within
  # a certain token length range (the token length is determined
  # using the tokenizer you provide in the tokenizer_prefix prefix in the
  # read S3 bucket). Each of the array entries below creates a payload file
  # containing prompts matching the language and token length criteria.
  filters:
    - language: en
      min_length_in_tokens: 1
      max_length_in_tokens: 500
      payload_file: payload_en_1-500.jsonl
    - language: en
      min_length_in_tokens: 500
      max_length_in_tokens: 1000
      payload_file: payload_en_500-1000.jsonl
    - language: en
      min_length_in_tokens: 1000
      max_length_in_tokens: 2000
      payload_file: payload_en_1000-2000.jsonl
    - language: en
      min_length_in_tokens: 2000
      max_length_in_tokens: 3000
      payload_file: payload_en_2000-3000.jsonl
    - language: en
      min_length_in_tokens: 3000
      max_length_in_tokens: 3840
      payload_file: payload_en_3000-3840.jsonl
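    # (illustrative) additional entries follow the same pattern; for example, a longer-context
    # bucket could be added as (not part of this config):
    # - language: en
    #   min_length_in_tokens: 3840
    #   max_length_in_tokens: 4096
    #   payload_file: payload_en_3840-4096.jsonl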

# While the tests run on all the datasets
# configured in the experiment entries below,
# the price:performance analysis is only done for the one
# dataset listed below as the dataset_of_interest
metrics:
  dataset_of_interest: en_3000-3840
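  # (note) the value above matches the language and token-range portion of a payload file name,
  # so en_3000-3840 corresponds to payload_en_3000-3840.jsonl configured in the filters section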

# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml

# inference parameters, these are added to the payload
# for each inference request. The list here is not static;
# any parameter supported by the inference container can be
# added to the list. Put the SageMaker parameters in the sagemaker
# section and Bedrock parameters in the bedrock section (neither shown here).
# Use the section name (ec2_vllm in this example) in the inference_spec.parameter_set
# setting under experiments.
inference_parameters:
  ec2_vllm:
    model: meta-llama/Meta-Llama-3-8B-Instruct
    temperature: 0.1
    top_p: 0.92
    top_k: 120
    max_tokens: 100
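    # (illustrative) with the vLLM OpenAI-compatible completions endpoint configured as ep_name
    # below, these parameters end up in each request body, which would look roughly like
    # (the prompt value here is a placeholder, not part of this config):
    # {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "<prompt text>",
    #  "temperature": 0.1, "top_p": 0.92, "top_k": 120, "max_tokens": 100}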

# Configuration for the experiments to be run. The experiments section is an array
# so more than one experiment can be added; these could belong to the same model
# but different instance types, or different models, or even different hosting
# options.
experiments:
  - name: "llama3-8b-instruct"
    # AWS region, this parameter is templatized, no need to change
    region: {region}
    # model_id is interpreted in conjunction with the deployment_script, so if you
    # use a JumpStart model id then set the deployment_script to jumpstart.py.
    # if deploying directly from HuggingFace (as done here) this would be the HuggingFace
    # model id to grab; see the DJL serving deployment script in the code repo for reference.
    model_id: meta-llama/Meta-Llama-3-8B-Instruct # model id, version and image uri not needed for a BYO endpoint
    model_version:
    model_name: "llama3-8b-instruct"
    # this can be changed to the IP address of your specific EC2 instance where the model is hosted
    ep_name: 'http://localhost:8000/v1/completions'
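    # (illustrative) assuming the vLLM OpenAI-compatible server is already up, the endpoint can be
    # sanity checked from the instance with a request such as:
    # curl http://localhost:8000/v1/completions -H "Content-Type: application/json" \
    #   -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "Hello", "max_tokens": 16}'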
    instance_type: "c8g.24xlarge"
    image_uri: vllm-cpu-env
    deploy: yes # set to yes to run the deployment script for EC2
    instance_count:
    deployment_script: ec2_deploy.py
    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
    # and Bedrock. You can also add your own. This is an example of a REST predictor for
    # llama3-8b-instruct deployed on EC2
    inference_script: ec2_predictor.py
    # This section defines the settings for Amazon EC2 instances
    ec2:
      # Privileged mode makes the Docker container run as root.
      # This basically means that if you are root in the container you have the privileges of root on the host system.
      # It is only needed here to set the VLLM_CPU_OMP_THREADS_BIND env variable.
      privileged_mode: yes
      # This setting holds the runtime and GPU flags for the container, for example:
      # '--runtime=nvidia' tells the container runtime to use the NVIDIA runtime,
      # '--gpus all' makes all GPUs available to the container, and
      # '--shm-size 12g' sets the size of the shared memory to 12 gigabytes.
      # It is left empty here because this config targets a CPU-only (Graviton) instance.
      gpu_or_neuron_setting:
      # This setting specifies the timeout (in seconds) for loading the model; here it is set to 2400 seconds (40 minutes).
      # If the model takes longer than 40 minutes to load, the process will time out and fail.
      model_loading_timeout: 2400
    inference_spec:
      # this should match one of the sections in the inference_parameters section above
      parameter_set: ec2_vllm
      # if not set, djl is assumed
      container_type: vllm
    # modify the serving properties to match your model and requirements
    serving.properties:
    # runs are done for each combination of payload file and concurrency level
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
      - payload_en_3000-3840.jsonl
    # concurrency level refers to the number of requests sent in parallel to an endpoint;
    # the next set of requests is sent once responses for all concurrent requests have
    # been received.
    concurrency_levels:
      - 1
      # - 2
      # - 4
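    # (illustrative) at a concurrency level of 4, FMBench would send 4 requests at once and wait
    # for all 4 responses before sending the next batch of 4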
    # Environment variables to be passed to the container;
    # this is not a fixed list, you can add more parameters as applicable.
    env:
      MODEL_LOADING_TIMEOUT: 2400
      # This instance is equipped with 96 vCPUs, and we are allocating 93 of them (cores 0-92) to run this container.
      # For additional details, refer to the following URL:
      # https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html#related-runtime-environment-variables
      VLLM_CPU_OMP_THREADS_BIND: 0-92
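      # (illustrative) on a smaller instance this binding would shrink accordingly, e.g. a 32-vCPU
      # instance might use VLLM_CPU_OMP_THREADS_BIND: 0-29 to leave a couple of cores free for the
      # serving frontend; see the vLLM CPU installation docs linked above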

report:
  latency_budget: 35
  cost_per_10k_txn_budget: 60
  error_rate_budget: 0
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025