
Commit b8a9e41

analytics across multiple experiments, config file llama3-70b on g5.48xl
1 parent 5accc93 commit b8a9e41

2 files changed: 277 additions, 0 deletions

analytics.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
"""
Analyze data across multiple fmbench runs.
"""
import os
import math
import glob
import json
import yaml
import logging
import argparse
import pandas as pd
from pathlib import Path

logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

RPM_LIST = [1, 10, 100, 1000, 10000]
LATENCY_THRESHOLD: int = 2
RESULTS_DIR: str = "./"
PAYLOAD_FILE_OF_INTEREST: str = "payload_en_3000-3840.jsonl"
MODEL: str = "llama3-8b-instruct"
PRICING_FILE_PATH: str = os.path.join("src", "fmbench", "configs", "pricing.yml")


def cost_per_txn(row, pricing):
    # hourly instance cost divided by the transactions served per hour
    txns_per_hour = row['transactions_per_minute'] * 60
    instance_cost_per_hour = pricing['pricing']['instance_based'][row['instance_type']]
    cost_per_txn = round(instance_cost_per_hour / txns_per_hour, 4)
    return cost_per_txn


def cost_per_1k_tokens(row, pricing):
    # total (prompt + completion) tokens processed per hour at this throughput
    txns_per_hour = row['transactions_per_minute'] * 60
    tokens_per_hour = (row['prompt_token_count_mean'] + row['completion_token_count_mean']) * txns_per_hour
    instance_cost_per_hour = pricing['pricing']['instance_based'][row['instance_type']]
    cost_per_1k_tokens = round(1000 * (instance_cost_per_hour / tokens_per_hour), 8)
    return cost_per_1k_tokens


# Determine how many instances would be required to serve 100 requests/minute,
# 1,000 requests/minute and 10,000 requests/minute. The idea is that at the low
# end of the requests/minute range, smaller instances that provide good inference
# latency at low concurrency suffice (put another way, the larger, more expensive
# instances are overkill at that scale), but as the requests/minute grow there is
# an inflection point beyond which so many of the smaller instances would be
# needed that it becomes more economical to use fewer of the larger, more
# expensive instances.
def cost_per_n_rpm(r, rpm, pricing):
    instance_count_needed = math.ceil(rpm / r['transactions_per_minute'])
    cost = round(instance_count_needed * pricing['pricing']['instance_based'][r['instance_type']], 2)
    return (instance_count_needed, cost)

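To make the cost arithmetic above concrete, here is a small illustrative example; the throughput, token counts and hourly price are made-up numbers, not measured fmbench results or actual AWS rates:

# Illustrative only: hypothetical row and pricing, not real fmbench output.
row = {
    "instance_type": "ml.g5.48xlarge",
    "transactions_per_minute": 12,       # hypothetical sustained throughput
    "prompt_token_count_mean": 3500,
    "completion_token_count_mean": 100,
}
pricing = {"pricing": {"instance_based": {"ml.g5.48xlarge": 20.0}}}  # USD/hour, made up

print(cost_per_txn(row, pricing))          # 20.0 / (12 * 60)             -> 0.0278
print(cost_per_1k_tokens(row, pricing))    # 1000 * 20.0 / (3600 * 720)   -> 0.00771605
print(cost_per_n_rpm(row, 1000, pricing))  # ceil(1000 / 12) = 84 instances -> (84, 1680.0)

At 1,000 requests/minute this hypothetical configuration needs 84 instances; a configuration sustaining, say, 60 transactions/minute per instance would need only 17, which illustrates the inflection point described in the comment above: past some request rate, fewer large instances beat many small ones on total cost.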
def main():
    parser = argparse.ArgumentParser(description='Analyze multiple FMBench runs')
    parser.add_argument('--latency-threshold',
                        type=int,
                        default=LATENCY_THRESHOLD,
                        help=f'Latency threshold, runs with p95 above this are not useful, default={LATENCY_THRESHOLD}',
                        required=False)
    parser.add_argument('--payload-file',
                        type=str,
                        default=PAYLOAD_FILE_OF_INTEREST,
                        help=f'Payload file representing the payload of interest, default={PAYLOAD_FILE_OF_INTEREST}',
                        required=False)
    parser.add_argument('--model-id',
                        type=str,
                        default=MODEL,
                        help=f'Model for which data is being analyzed, default={MODEL}',
                        required=False)
    args = parser.parse_args()
    print(f"main, {args} = args")

    # load pricing info
    pricing = yaml.safe_load(Path(PRICING_FILE_PATH).read_text())
    logger.info(f"pricing={json.dumps(pricing, indent=2)}")

    # all results files to be parsed
    all_metrics_summary_files = glob.glob(os.path.join(RESULTS_DIR, "results-*",
                                                       "all_metrics_summary.csv"),
                                          recursive=True)
    logger.info(f"found {len(all_metrics_summary_files)} files {all_metrics_summary_files}")

    # read all results files into a single dataframe
    df = pd.concat(list(map(pd.read_csv, all_metrics_summary_files)))
    logger.info(f"read {len(all_metrics_summary_files)} files into a dataframe of shape {df.shape}")

    # filter to keep only relevant data
    df_selected = df[df.latency_p95 <= args.latency_threshold]
    logger.info(f"after filtering to keep rows with latency_p95 <= {args.latency_threshold}s, df shape {df_selected.shape}")

    # for each experiment/payload/instance combination, keep the row with the
    # highest concurrency level that still met the latency threshold
    grouping_cols = ["experiment_name", "payload_file", "instance_type", "instance_count"]
    df_summary_all = df_selected.loc[df_selected.groupby(grouping_cols)['concurrency'].transform('max') == df_selected['concurrency']].copy()

    # find price per txn and price per token
    df_summary_all['cost_per_txn'] = df_summary_all.apply(lambda r: cost_per_txn(r, pricing), axis=1)
    df_summary_all['cost_per_1k_tokens'] = df_summary_all.apply(lambda r: cost_per_1k_tokens(r, pricing), axis=1)

    # extrapolate to price per n requests per minute
    for rpm in RPM_LIST:
        col_name = f"instance_count_and_cost_{rpm}_rpm"
        df_summary_all[col_name] = df_summary_all.apply(lambda r: cost_per_n_rpm(r, rpm, pricing), axis=1)

    df_summary_all = df_summary_all.sort_values(by="cost_per_1k_tokens")

    summary_file: str = f"{args.model_id}-summary-p95-latency={args.latency_threshold}s.csv"
    df_summary_all.to_csv(summary_file, index=False)
    logger.info(f"saved df_summary_all dataframe of shape={df_summary_all.shape} in {summary_file}")

    summary_file_payload_of_interest: str = f"{args.model_id}-summary-{Path(args.payload_file).stem}-p95-latency={args.latency_threshold}s.csv"
    df_summary_payload_of_interest = df_summary_all[df_summary_all.payload_file == args.payload_file]
    df_summary_payload_of_interest = df_summary_payload_of_interest.sort_values(by="cost_per_1k_tokens")

    df_summary_payload_of_interest.to_csv(summary_file_payload_of_interest, index=False)
    logger.info(f"saved df_summary_payload_of_interest dataframe of "
                f"shape={df_summary_payload_of_interest.shape} in {summary_file_payload_of_interest}")
    logger.info("all done")


if __name__ == "__main__":
    main()
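The script indexes pricing as pricing['pricing']['instance_based'][instance_type], so the pricing.yml it loads from src/fmbench/configs/pricing.yml is expected to deserialize to roughly the shape sketched below; this sketch is inferred from that access pattern, and the instance names and hourly amounts shown are placeholders, not actual rates:

# Hypothetical shape of pricing.yml after yaml.safe_load(); values are placeholders.
pricing = {
    "pricing": {
        "instance_based": {
            "ml.g5.48xlarge": 20.0,   # USD per hour, placeholder
            "ml.p4d.24xlarge": 38.0,  # USD per hour, placeholder
        }
    }
}

With that file in place and the results-* folders in the working directory, a typical invocation would be python analytics.py --latency-threshold 2 --payload-file payload_en_3000-3840.jsonl --model-id llama3-70b-instruct; the script writes one summary CSV covering all experiments and a second one restricted to the payload file of interest.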
Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
general:
  name: "llama3-70b-g5-48xl-v1"
  model_name: "llama3-70b-instruct"

# AWS and SageMaker settings
aws:
  region: {region}
  # uncomment and set the Role ARN if not running on SageMaker
  sagemaker_execution_role: {role_arn}
  ## these are the buckets/resources you will create in your account below:
  bucket: {write_bucket} ## add the name of your desired bucket

## WRITE BUCKET -- the results, data, metrics, endpoint.json and payloads are written to these directories under the bucket
dir_paths:
  data_prefix: data ## add the prefix for all your data management/storage
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata ## add a file here to dynamically track the metrics dir

## READ BUCKET -- section for reading scripts, source data and the tokenizer from a separate S3 bucket, for read/write segregation
s3_read_data:
  read_bucket: {read_bucket}
  scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on JumpStart
  script_files:
    - hf_token.txt ## add the script files you have in S3 (including inference files and serving stacks, if any)
  configs_prefix: configs
  config_files:
    - pricing.yml # the config files to download from S3 into the local configs directory
  source_data_prefix: source_data ## add a source_data folder to store your raw data in an S3 path configured by you
  source_data_files:
    # - rajpurkar/squad_v2.jsonl
    - 2wikimqa_e.jsonl
    - 2wikimqa.jsonl
    - hotpotqa_e.jsonl
    - hotpotqa.jsonl
    - narrativeqa.jsonl
    - triviaqa_e.jsonl
    - triviaqa.jsonl
  tokenizer_prefix: llama3_70b_tokenizer ## add the tokenizer.json and config.json for your specific tokenizer
  prompt_template_dir: prompt_template
  prompt_template_file: prompt_template_llama3.txt ## add your desired prompt template

## section that enables the container to run notebooks and python scripts automatically
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes


datasets:
  prompt_template_keys:
    - input
    - context
  filters:
    - language: en
      min_length_in_tokens: 1
      max_length_in_tokens: 500
      payload_file: payload_en_1-500.jsonl
    - language: en
      min_length_in_tokens: 500
      max_length_in_tokens: 1000
      payload_file: payload_en_500-1000.jsonl
    - language: en
      min_length_in_tokens: 1000
      max_length_in_tokens: 2000
      payload_file: payload_en_1000-2000.jsonl
    - language: en
      min_length_in_tokens: 2000
      max_length_in_tokens: 3000
      payload_file: payload_en_2000-3000.jsonl
    - language: en
      min_length_in_tokens: 3000
      max_length_in_tokens: 3840
      payload_file: payload_en_3000-3840.jsonl

metrics:
  dataset_of_interest: en_3000-3840

pricing: pricing.yml

inference_parameters:
  sagemaker:
    max_new_tokens: 100
    top_p: 0.92
    temperature: 0.1
    details: True
    stop: '<|eot_id|>'

experiments:
  - name: llama-3-70b-instruct-g5-48xl-djl-deepspeed0.12.6-cu121
    model_id: meta-textgeneration-llama-3-70b-instruct
    model_version: "*"
    model_name: llama-3-70b-instruct
    ep_name: llama-3-70b-instruct-g5-48xl
    instance_type: "ml.g5.48xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py
    inference_script: sagemaker_predictor.py
    inference_spec:
      parameter_set: sagemaker
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
      - payload_en_3000-3840.jsonl
    concurrency_levels:
      - 1
      - 2
      - 4
      - 6
      - 8

    accept_eula: true
    # optional metadata about the model; not used for anything other than being
    # logged as-is in the report as part of the config file dump
    additional_metadata: |
      job_queue_size: 1000
      max_dynamic_batch_size: 1
      max_batch_delay: 100
      max_idle_time: 60
      load_on_devices: *
      engine: MPI
      mpi_mode: true
      option.entryPoint: null
      option.tensor_parallel_degree: 8
      option.max_rolling_batch_size: 256
      option.mpi_mode: true
      option.model_id: /opt/ml/model
      option.rolling_batch: lmi-dist
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "8"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"

report:
  latency_budget: 2
  cost_per_10k_txn_budget: 20
  error_rate_budget: 0
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025
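The inference_parameters.sagemaker block is the generation parameter set that the experiment's inference_spec points to (parameter_set: sagemaker). As a rough sketch of how such parameters are commonly sent to a SageMaker endpoint running the DJL LMI container, using the ep_name from the experiment above; the request body layout ("inputs"/"parameters") is a common convention assumed here for illustration, not something defined by this config:

# Illustrative sketch only; the request payload layout is an assumption.
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor(endpoint_name="llama-3-70b-instruct-g5-48xl",
                      serializer=JSONSerializer(),
                      deserializer=JSONDeserializer())

response = predictor.predict({
    "inputs": "Summarize the following context ...",   # prompt built from the template
    "parameters": {
        "max_new_tokens": 100,
        "top_p": 0.92,
        "temperature": 0.1,
        "details": True,
        "stop": "<|eot_id|>",
    },
})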
