|
1 | 1 | # SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
2 | 3 | """ |
3 | 4 | LM eval harness on model to compare vs HF baseline computed offline. |
4 | 5 | Configs are found in configs/$MODEL.yaml |
5 | 6 |
|
6 | | -* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml |
7 | | -* export LM_EVAL_TP_SIZE=4 |
8 | | -* pytest -s test_lm_eval_correctness.py |
| 7 | +pytest -s -v test_lm_eval_correctness.py \ |
| 8 | + --config-list-file=configs/models-small.txt \ |
| 9 | + --tp-size=1 |
9 | 10 | """ |
10 | 11 |
|
11 | | -import os |
12 | | -from pathlib import Path |
13 | | - |
14 | 12 | import lm_eval |
15 | | -import numpy |
16 | | -import pytest |
| 13 | +import numpy as np |
17 | 14 | import yaml |
18 | 15 |
|
19 | 16 | RTOL = 0.08 |
20 | | -TEST_DATA_FILE = os.environ.get( |
21 | | - "LM_EVAL_TEST_DATA_FILE", |
22 | | - ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") |
23 | | - |
24 | | -TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) |
25 | | - |
26 | 17 |
|
def launch_lm_eval(eval_config, tp_size):
    """Run the lm-eval harness on a vLLM-backed model.

    Args:
        eval_config: Parsed YAML dict with keys ``model_name``, ``tasks``,
            ``num_fewshot``, ``limit`` and optionally ``trust_remote_code``.
        tp_size: Tensor-parallel size forwarded to the vLLM engine.

    Returns:
        The results dict produced by ``lm_eval.simple_evaluate``.
    """
    # trust_remote_code defaults to False unless the model config opts in.
    remote_ok = eval_config.get("trust_remote_code", False)

    # vLLM engine arguments are passed as a single comma-separated string.
    model_args = ",".join(
        [
            f"pretrained={eval_config['model_name']}",
            f"tensor_parallel_size={tp_size}",
            "enforce_eager=true",
            "add_bos_token=true",
            f"trust_remote_code={remote_ok}",
        ]
    )

    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
44 | 37 |
|
45 | 38 |
|
def test_lm_eval_correctness_param(config_filename, tp_size):
    """Compare lm-eval scores for one model config against stored ground truth.

    Args:
        config_filename: pytest fixture — a Path to the model's YAML config.
        tp_size: pytest fixture — tensor-parallel size for the vLLM engine.

    Every metric is printed before the final assert so that all scores are
    visible in the log even when one of them fails the tolerance check.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)

    all_ok = True
    for task in eval_config["tasks"]:
        task_name = task["name"]
        for metric in task["metrics"]:
            expected = metric["value"]
            measured = results["results"][task_name][metric["name"]]
            print(
                f"{task_name} | {metric['name']}: "
                f"ground_truth={expected} | measured={measured}"
            )
            # Accumulate rather than assert immediately: keep printing the
            # remaining scores for debugging, fail once at the end.
            if not np.isclose(expected, measured, rtol=RTOL):
                all_ok = False

    assert all_ok
0 commit comments