
Commit b8a9e41

analytics across multiple experiments, config file llama3-70b on g5.48xl
1 parent 5accc93 commit b8a9e41

2 files changed: 277 additions, 0 deletions

analytics.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
"""
Analyze data across multiple fmbench runs.
"""
import os
import math
import glob
import json
import yaml
import logging
import argparse
import pandas as pd
from pathlib import Path

logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

RPM_LIST = [1, 10, 100, 1000, 10000]
LATENCY_THRESHOLD: int = 2
RESULTS_DIR: str = "./"
PAYLOAD_FILE_OF_INTEREST: str = "payload_en_3000-3840.jsonl"
MODEL: str = "llama3-8b-instruct"
PRICING_FILE_PATH: str = os.path.join("src", "fmbench", "configs", "pricing.yml")


def cost_per_txn(row, pricing):
    # hourly instance cost divided by the transactions served per hour
    txns_per_hour = row['transactions_per_minute'] * 60
    instance_cost_per_hour = pricing['pricing']['instance_based'][row['instance_type']]
    cost_per_txn = round(instance_cost_per_hour / txns_per_hour, 4)
    return cost_per_txn


def cost_per_1k_tokens(row, pricing):
    # total (prompt + completion) tokens processed per hour at this throughput
    txns_per_hour = row['transactions_per_minute'] * 60
    tokens_per_hour = (row['prompt_token_count_mean'] + row['completion_token_count_mean']) * txns_per_hour
    instance_cost_per_hour = pricing['pricing']['instance_based'][row['instance_type']]
    cost_per_1k_tokens = round(1000 * (instance_cost_per_hour / tokens_per_hour), 8)
    return cost_per_1k_tokens


# Determine how many instances would be required to serve 100 requests/minute,
# 1,000 requests/minute and 10,000 requests/minute. The idea is that at the low
# end of the requests/minute range, smaller instances that provide good inference
# latency at low concurrency suffice (put another way, the larger, more expensive
# instances are overkill at that scale), but as the requests/minute grow there is
# an inflection point beyond which so many of the smaller instances would be
# needed that it becomes more economical to use fewer of the larger, more
# expensive instances.
def cost_per_n_rpm(r, rpm, pricing):
    instance_count_needed = math.ceil(rpm / r['transactions_per_minute'])
    cost = round(instance_count_needed * pricing['pricing']['instance_based'][r['instance_type']], 2)
    return (instance_count_needed, cost)

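To make the cost arithmetic above concrete, here is a small illustrative example; the throughput, token counts and hourly price are made-up numbers, not measured fmbench results or actual AWS rates:

# Illustrative only: hypothetical row and pricing, not real fmbench output.
row = {
    "instance_type": "ml.g5.48xlarge",
    "transactions_per_minute": 12,       # hypothetical sustained throughput
    "prompt_token_count_mean": 3500,
    "completion_token_count_mean": 100,
}
pricing = {"pricing": {"instance_based": {"ml.g5.48xlarge": 20.0}}}  # USD/hour, made up

print(cost_per_txn(row, pricing))          # 20.0 / (12 * 60)             -> 0.0278
print(cost_per_1k_tokens(row, pricing))    # 1000 * 20.0 / (3600 * 720)   -> 0.00771605
print(cost_per_n_rpm(row, 1000, pricing))  # ceil(1000 / 12) = 84 instances -> (84, 1680.0)

At 1,000 requests/minute this hypothetical configuration needs 84 instances; a configuration sustaining, say, 60 transactions/minute per instance would need only 17, which illustrates the inflection point described in the comment above: past some request rate, fewer large instances beat many small ones on total cost.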
def main():
    parser = argparse.ArgumentParser(description='Analyze multiple FMBench runs')
    parser.add_argument('--latency-threshold',
                        type=int,
                        default=LATENCY_THRESHOLD,
                        help=f'Latency threshold, runs with p95 above this are not useful, default={LATENCY_THRESHOLD}',
                        required=False)
    parser.add_argument('--payload-file',
                        type=str,
                        default=PAYLOAD_FILE_OF_INTEREST,
                        help=f'Payload file representing the payload of interest, default={PAYLOAD_FILE_OF_INTEREST}',
                        required=False)
    parser.add_argument('--model-id',
                        type=str,
                        default=MODEL,
                        help=f'Model for which data is being analyzed, default={MODEL}',
                        required=False)
    args = parser.parse_args()
    print(f"main, {args} = args")

    # load pricing info
    pricing = yaml.safe_load(Path(PRICING_FILE_PATH).read_text())
    logger.info(f"pricing={json.dumps(pricing, indent=2)}")

    # all results files to be parsed
    all_metrics_summary_files = glob.glob(os.path.join(RESULTS_DIR, "results-*",
                                                       "all_metrics_summary.csv"),
                                          recursive=True)
    logger.info(f"found {len(all_metrics_summary_files)} files {all_metrics_summary_files}")

    # read all results files into a single dataframe
    df = pd.concat(list(map(pd.read_csv, all_metrics_summary_files)))
    logger.info(f"read {len(all_metrics_summary_files)} files into a dataframe of shape {df.shape}")

    # filter to keep only relevant data
    df_selected = df[df.latency_p95 <= args.latency_threshold]
    logger.info(f"after filtering to keep rows with latency_p95 <= {args.latency_threshold}s, df shape {df_selected.shape}")

    # for each experiment/payload/instance combination, keep the row with the
    # highest concurrency level that still met the latency threshold
    grouping_cols = ["experiment_name", "payload_file", "instance_type", "instance_count"]
    df_summary_all = df_selected.loc[df_selected.groupby(grouping_cols)['concurrency'].transform('max') == df_selected['concurrency']].copy()

    # find price per txn and price per token
    df_summary_all['cost_per_txn'] = df_summary_all.apply(lambda r: cost_per_txn(r, pricing), axis=1)
    df_summary_all['cost_per_1k_tokens'] = df_summary_all.apply(lambda r: cost_per_1k_tokens(r, pricing), axis=1)

    # extrapolate to price per n requests per minute
    for rpm in RPM_LIST:
        col_name = f"instance_count_and_cost_{rpm}_rpm"
        df_summary_all[col_name] = df_summary_all.apply(lambda r: cost_per_n_rpm(r, rpm, pricing), axis=1)

    df_summary_all = df_summary_all.sort_values(by="cost_per_1k_tokens")

    summary_file: str = f"{args.model_id}-summary-p95-latency={args.latency_threshold}s.csv"
    df_summary_all.to_csv(summary_file, index=False)
    logger.info(f"saved df_summary_all dataframe of shape={df_summary_all.shape} in {summary_file}")

    summary_file_payload_of_interest: str = f"{args.model_id}-summary-{Path(args.payload_file).stem}-p95-latency={args.latency_threshold}s.csv"
    df_summary_payload_of_interest = df_summary_all[df_summary_all.payload_file == args.payload_file]
    df_summary_payload_of_interest = df_summary_payload_of_interest.sort_values(by="cost_per_1k_tokens")

    df_summary_payload_of_interest.to_csv(summary_file_payload_of_interest, index=False)
    logger.info(f"saved df_summary_payload_of_interest dataframe of "
                f"shape={df_summary_payload_of_interest.shape} in {summary_file_payload_of_interest}")
    logger.info("all done")


if __name__ == "__main__":
    main()
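The script indexes pricing as pricing['pricing']['instance_based'][instance_type], so the pricing.yml it loads from src/fmbench/configs/pricing.yml is expected to deserialize to roughly the shape sketched below; this sketch is inferred from that access pattern, and the instance names and hourly amounts shown are placeholders, not actual rates:

# Hypothetical shape of pricing.yml after yaml.safe_load(); values are placeholders.
pricing = {
    "pricing": {
        "instance_based": {
            "ml.g5.48xlarge": 20.0,   # USD per hour, placeholder
            "ml.p4d.24xlarge": 38.0,  # USD per hour, placeholder
        }
    }
}

With that file in place and the results-* folders in the working directory, a typical invocation would be python analytics.py --latency-threshold 2 --payload-file payload_en_3000-3840.jsonl --model-id llama3-70b-instruct; the script writes one summary CSV covering all experiments and a second one restricted to the payload file of interest.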
Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
general:
  name: "llama3-70b-g5-48xl-v1"
  model_name: "llama3-70b-instruct"

# AWS and SageMaker settings
aws:
  region: {region}
  # uncomment and set the Role ARN if not running on SageMaker
  sagemaker_execution_role: {role_arn}
  ## these are the buckets/resources you will create in your account below:
  bucket: {write_bucket} ## add the name of your desired bucket

## WRITE BUCKET -- the results, data, metrics, endpoint.json and payloads are written to these directories under the bucket
dir_paths:
  data_prefix: data ## add the prefix for all your data management/storage
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata ## add a file here to dynamically track the metrics dir

## READ BUCKET -- section for reading scripts, source data and the tokenizer from a separate S3 bucket, for read/write segregation
s3_read_data:
  read_bucket: {read_bucket}
  scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on JumpStart
  script_files:
    - hf_token.txt ## add the script files you have in S3 (including inference files and serving stacks, if any)
  configs_prefix: configs
  config_files:
    - pricing.yml # the config files to download from S3 into the local configs directory
  source_data_prefix: source_data ## add a source_data folder to store your raw data in an S3 path configured by you
  source_data_files:
    # - rajpurkar/squad_v2.jsonl
    - 2wikimqa_e.jsonl
    - 2wikimqa.jsonl
    - hotpotqa_e.jsonl
    - hotpotqa.jsonl
    - narrativeqa.jsonl
    - triviaqa_e.jsonl
    - triviaqa.jsonl
  tokenizer_prefix: llama3_70b_tokenizer ## add the tokenizer.json and config.json for your specific tokenizer
  prompt_template_dir: prompt_template
  prompt_template_file: prompt_template_llama3.txt ## add your desired prompt template

## section that enables the container to run notebooks and python scripts automatically
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: yes
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: yes


datasets:
  prompt_template_keys:
    - input
    - context
  filters:
    - language: en
      min_length_in_tokens: 1
      max_length_in_tokens: 500
      payload_file: payload_en_1-500.jsonl
    - language: en
      min_length_in_tokens: 500
      max_length_in_tokens: 1000
      payload_file: payload_en_500-1000.jsonl
    - language: en
      min_length_in_tokens: 1000
      max_length_in_tokens: 2000
      payload_file: payload_en_1000-2000.jsonl
    - language: en
      min_length_in_tokens: 2000
      max_length_in_tokens: 3000
      payload_file: payload_en_2000-3000.jsonl
    - language: en
      min_length_in_tokens: 3000
      max_length_in_tokens: 3840
      payload_file: payload_en_3000-3840.jsonl

metrics:
  dataset_of_interest: en_3000-3840

pricing: pricing.yml

inference_parameters:
  sagemaker:
    max_new_tokens: 100
    top_p: 0.92
    temperature: 0.1
    details: True
    stop: '<|eot_id|>'

experiments:
  - name: llama-3-70b-instruct-g5-48xl-djl-deepspeed0.12.6-cu121
    model_id: meta-textgeneration-llama-3-70b-instruct
    model_version: "*"
    model_name: llama-3-70b-instruct
    ep_name: llama-3-70b-instruct-g5-48xl
    instance_type: "ml.g5.48xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py
    inference_script: sagemaker_predictor.py
    inference_spec:
      parameter_set: sagemaker
    payload_files:
      - payload_en_1-500.jsonl
      - payload_en_500-1000.jsonl
      - payload_en_1000-2000.jsonl
      - payload_en_2000-3000.jsonl
      - payload_en_3000-3840.jsonl
    concurrency_levels:
      - 1
      - 2
      - 4
      - 6
      - 8

    accept_eula: true
    # optional metadata about the model; not used for anything other than being
    # logged as-is in the report as part of the config file dump
    additional_metadata: |
      job_queue_size: 1000
      max_dynamic_batch_size: 1
      max_batch_delay: 100
      max_idle_time: 60
      load_on_devices: *
      engine: MPI
      mpi_mode: true
      option.entryPoint: null
      option.tensor_parallel_degree: 8
      option.max_rolling_batch_size: 256
      option.mpi_mode: true
      option.model_id: /opt/ml/model
      option.rolling_batch: lmi-dist
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "8"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"

report:
  latency_budget: 2
  cost_per_10k_txn_budget: 20
  error_rate_budget: 0
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025
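The inference_parameters.sagemaker block is the generation parameter set that the experiment's inference_spec points to (parameter_set: sagemaker). As a rough sketch of how such parameters are commonly sent to a SageMaker endpoint running the DJL LMI container, using the ep_name from the experiment above; the request body layout ("inputs"/"parameters") is a common convention assumed here for illustration, not something defined by this config:

# Illustrative sketch only; the request payload layout is an assumption.
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor(endpoint_name="llama-3-70b-instruct-g5-48xl",
                      serializer=JSONSerializer(),
                      deserializer=JSONDeserializer())

response = predictor.predict({
    "inputs": "Summarize the following context ...",   # prompt built from the template
    "parameters": {
        "max_new_tokens": 100,
        "top_p": 0.92,
        "temperature": 0.1,
        "details": True,
        "stop": "<|eot_id|>",
    },
})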
