
Commit b26409d

R1 Hybrid: Add Benchmark for DeepSeek R1 transformers example (intel#12854)
* init
* fix
* update
* update
* fix
* fix
1 parent 5d041f9 commit b26409d

4 files changed: +5020 −4 lines

python/llm/dev/test/lint-python

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
 echo $PYTHON_ROOT_DIR
 PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/ipex_llm/langchain/*,$SCRIPT_DIR/../../src/ipex_llm/transformers/gguf/models/model_implement/yuan2/*,benchmark_util_4_29.py,benchmark_util_4_42.py,benchmark_util_4_43.py,benchmark_util_4_44.py,benchmark_util_4_45.py,benchmark_util_4_47.py,tgi_api_server.py,api_server.py"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/ipex_llm/langchain/*,$SCRIPT_DIR/../../src/ipex_llm/transformers/gguf/models/model_implement/yuan2/*,benchmark_util_4_29.py,benchmark_util_4_42.py,benchmark_util_4_43.py,benchmark_util_4_44.py,benchmark_util_4_45.py,benchmark_util_4_47.py,benchmark_util_deepseek.py,tgi_api_server.py,api_server.py"
 PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
 PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
 PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"
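
This change adds the vendored benchmark_util_deepseek.py helper to the lint exclusion list, matching the other benchmark_util_* files. A hypothetical invocation of the lint script (assuming it is run from the repo root with no extra arguments):

bash python/llm/dev/test/lint-python
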
Lines changed: 109 additions & 0 deletions (new file)

@@ -0,0 +1,109 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import List, Optional, Tuple, Union
import warnings
import os

import torch
from torch import nn
import time
import argparse
import ipex_llm
import numpy as np

from ipex_llm.transformers import AutoModelForCausalLM, convert_model_hybrid
from ipex_llm.utils.benchmark_util_deepseek import BenchmarkWrapper

from transformers import AutoTokenizer, GenerationConfig
from transformers.cache_utils import Cache, DynamicCache


PROMPT_FORMAT = """
A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: {prompt}.
Assistant: <think>
"""

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                        help='The huggingface repo id for the Llama2 (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                             ', or the path to the huggingface checkpoint folder')
    parser.add_argument('--prompt', type=str, default="If \( a > 1 \), then the sum of the real solutions of \( \sqrt{a} - \sqrt{a + x} = x \) is equal to:",
                        help='Prompt to infer')
    parser.add_argument('--n-predict', type=int, default=32,
                        help='Max tokens to predict')
    parser.add_argument('--load-path', type=str, default=None,
                        help='The path to load the low-bit model.')
    parser.add_argument('--warm-up', type=int, default=1,
                        help='Num of warm-up trials.')
    parser.add_argument('--num-trials', type=int, default=1,
                        help='Num of trials to run.')

    args = parser.parse_args()
    model_path = args.repo_id_or_model_path

    load_path = args.load_path
    if load_path:
        model = AutoModelForCausalLM.load_low_bit(load_path, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(load_path,
                                                  trust_remote_code=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     load_in_4bit=True,
                                                     optimize_model=True,
                                                     trust_remote_code=True,
                                                     use_cache=True)
        tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                  trust_remote_code=True)

    model = model.bfloat16()
    model = convert_model_hybrid(model)
    print(model)

    model = BenchmarkWrapper(model)
    e2e_time_list = []
    prefill_time_list = []
    rest_cost_mean_list = []

    # Generate predicted tokens
    with torch.inference_mode():
        prompt = PROMPT_FORMAT.format(prompt=args.prompt)
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        # ipex_llm model needs a warmup, then inference time can be accurate
        for i in range(args.warm_up):
            output = model.generate(input_ids,
                                    max_new_tokens=args.n_predict,
                                    min_new_tokens=args.n_predict)

        # start inference
        for i in range(args.num_trials):
            st = time.time()
            output = model.generate(input_ids,
                                    max_new_tokens=args.n_predict,
                                    min_new_tokens=args.n_predict)
            torch.xpu.synchronize()
            end = time.time()
            output = output.cpu()
            e2e_time_list.append(end - st)
            prefill_time_list.append(model.first_cost)
            rest_cost_mean_list.append(model.rest_cost_mean)

        print('-'*20, 'Performance', '-'*20)
        print(f"End-to-end time: {np.mean(e2e_time_list)} s")
        print(f"Prefill time: {np.mean(prefill_time_list)} s")
        print(f"Rest cost mean: {np.mean(rest_cost_mean_list) * 1000} ms")

python/llm/example/GPU/DeepSeek-R1/generate_hybrid.py

Lines changed: 3 additions & 3 deletions
@@ -24,8 +24,8 @@
 import argparse
 import ipex_llm
 
-from ipex_llm.transformers import convert_model_hybrid
-from ipex_llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM, convert_model_hybrid
+
 from transformers import AutoTokenizer, GenerationConfig
 from transformers.cache_utils import Cache, DynamicCache
 
@@ -65,7 +65,7 @@
     tokenizer = AutoTokenizer.from_pretrained(model_path,
                                               trust_remote_code=True)
 
-    #model = model.bfloat16()
+    model = model.bfloat16()
     model = convert_model_hybrid(model)
     print(model)

0 commit comments
