
Commit 00f322d

Finetune ChatGLM with Deepspeed Zero3 LoRA (intel#11314)
* Finetune ChatGLM with Deepspeed Zero3 LoRA
* add deepspeed zero3 config
* rename config
* remove offload_param
* add save_checkpoint parameter
* Update lora_deepspeed_zero3_finetune_chatglm3_6b_arc_2_card.sh
* refine
1 parent 5dad33e commit 00f322d

File tree: 4 files changed, +116 −21 lines


python/llm/example/GPU/LLM-Finetuning/LoRA/README.md

Lines changed: 6 additions & 0 deletions
@@ -33,6 +33,12 @@ Here, we provide example usages on different hardware. Please refer to the appro
 bash lora_finetune_llama2_7b_arc_1_card.sh
 ```
 
+##### Finetuning ChatGLM3-6B on two Arc A770
+
+```bash
+bash lora_deepspeed_zero3_finetune_chatglm3_6b_arc_2_card.sh
+```
+
 ##### Finetuning LLaMA2-7B on four Intel Data Center GPU Max 1100
 
 ```bash

python/llm/example/GPU/LLM-Finetuning/LoRA/alpaca_lora_finetuning.py

Lines changed: 51 additions & 21 deletions
@@ -107,6 +107,8 @@ def train(
         gradient_checkpointing: bool = False,
         deepspeed: str = None,
         training_mode: str = "lora",
+        deepspeed_zero3: bool = False,
+        save_checkpoint: bool = True,
 ):
     invalidInputError(training_mode == "lora",
                       f"This example is for lora training mode, but got training_mode={training_mode}.")
@@ -136,6 +138,8 @@ def train(
             f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
             f"prompt template: {prompt_template_name}\n"
             f"training_mode: {training_mode}\n"
+            f"deepspeed_zero3: {deepspeed_zero3}\n"
+            f"save_checkpoint: {save_checkpoint}\n"
         )
     assert (
         base_model
@@ -154,28 +158,54 @@ def train(
     # Check if parameter passed or if set within environ
     use_wandb = wandb_check(wandb_project, wandb_watch, wandb_log_model)
 
+    if deepspeed_zero3:
+        deepspeed = deepspeed if deepspeed is not None else "./deepspeed_zero3_config.json"
+
     if saved_low_bit_model is not None:
         # Load the low bit optimized model if provide the saved path
-        model = AutoModelForCausalLM.load_low_bit(
-            saved_low_bit_model,
-            optimize_model=False,
-            torch_dtype=torch.bfloat16,
-            modules_to_not_convert=["lm_head"],
-            trust_remote_code=True,
-        )
+        if deepspeed_zero3:
+            import deepspeed as ds
+            with ds.zero.Init(config_dict_or_path=deepspeed):
+                model = AutoModelForCausalLM.load_low_bit(
+                    saved_low_bit_model,
+                    optimize_model=False,
+                    torch_dtype=torch.bfloat16,
+                    modules_to_not_convert=["lm_head"],
+                    trust_remote_code=True,
+                )
+        else:
+            model = AutoModelForCausalLM.load_low_bit(
+                saved_low_bit_model,
+                optimize_model=False,
+                torch_dtype=torch.bfloat16,
+                modules_to_not_convert=["lm_head"],
+                trust_remote_code=True,
+            )
     else:
-        model = AutoModelForCausalLM.from_pretrained(
-            base_model,
-            load_in_low_bit="bf16",
-            optimize_model=False,
-            torch_dtype=torch.bfloat16,
-            modules_to_not_convert=["lm_head"],
-            trust_remote_code=True,
-        )
-
-    print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
-    model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
-    print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
+        if deepspeed_zero3:
+            import deepspeed as ds
+            with ds.zero.Init(config_dict_or_path=deepspeed):
+                model = AutoModelForCausalLM.from_pretrained(
+                    base_model,
+                    load_in_low_bit="bf16",
+                    optimize_model=False,
+                    torch_dtype=torch.bfloat16,
+                    modules_to_not_convert=["lm_head"],
+                    trust_remote_code=True,
+                )
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                base_model,
+                load_in_low_bit="bf16",
+                optimize_model=False,
+                torch_dtype=torch.bfloat16,
+                modules_to_not_convert=["lm_head"],
+                trust_remote_code=True,
+            )
+    if not deepspeed_zero3:
+        print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
+        model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
+        print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
 
     tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
     print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}")
@@ -234,12 +264,12 @@ def train(
             logging_steps=1,
             optim="adamw_torch",
             evaluation_strategy="steps" if val_set_size > 0 else "no",
-            save_strategy="steps",
+            save_strategy="steps" if save_checkpoint else "no",
             eval_steps=100 if val_set_size > 0 else None,
             save_steps=100,
             output_dir=output_dir,
             save_total_limit=100,
-            load_best_model_at_end=True if val_set_size > 0 else False,
+            load_best_model_at_end=True if val_set_size > 0 and save_checkpoint else False,
             ddp_find_unused_parameters=False if ddp else None,
             group_by_length=group_by_length,
             report_to="wandb" if use_wandb else None,
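A note on the `ds.zero.Init` wrapper added above: building the model inside this context lets DeepSpeed ZeRO-3 partition each parameter across the data-parallel ranks as it is created, so no single process has to materialize the full ChatGLM3-6B weights, which is also why the explicit `model.to('xpu:...')` move is skipped when `deepspeed_zero3` is set. A minimal sketch of the pattern, using a hypothetical toy `torch.nn.Linear` stand-in rather than the actual checkpoint, and assuming the process group is set up by a 2-rank launcher like the script below:

```python
# Hypothetical toy sketch of the ZeRO-3 construction pattern used in the diff above.
# Assumes DeepSpeed is installed and the distributed environment is provided by the
# launcher (e.g. the 2-rank mpirun command in the new shell script); this is not the
# actual ChatGLM3-6B load path.
import deepspeed
import torch

ds_config = "./deepspeed_zero3_config.json"  # same file the example defaults to

with deepspeed.zero.Init(config_dict_or_path=ds_config):
    # Parameters built inside this context are partitioned across ranks at
    # construction time instead of being fully materialized on every process.
    model = torch.nn.Linear(4096, 4096, dtype=torch.bfloat16)

for p in model.parameters():
    # DeepSpeed attaches ZeRO metadata to each partitioned parameter:
    # ds_shape is the full logical shape, while this rank only holds its shard.
    print(p.ds_shape, p.ds_status)
```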
python/llm/example/GPU/LLM-Finetuning/LoRA/deepspeed_zero3_config.json

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+{
+    "zero_optimization": {
+        "stage": 3,
+        "contiguous_gradients": true,
+        "overlap_comm": true,
+        "offload_optimizer": {"device": "cpu"}
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "world_size": 2,
+    "train_batch_size": 2,
+    "train_micro_batch_size_per_gpu": 1,
+    "gradient_accumulation_steps": 1,
+    "stage3_gather_16bit_weights_on_model_save": true
+}
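DeepSpeed requires `train_batch_size` to equal `train_micro_batch_size_per_gpu × gradient_accumulation_steps × number of ranks`; with the values above that is 1 × 1 × 2 = 2 (two ranks, matching `NUM_GPUS=2` and the `world_size` entry), which is also why the launch script passes `--micro_batch_size 1 --batch_size 2`. A standalone sanity-check sketch, assuming only that the JSON above is saved as `./deepspeed_zero3_config.json`, the default path the Python example falls back to:

```python
# Standalone sanity check of the ZeRO-3 batch arithmetic (assumes the JSON above
# is saved as ./deepspeed_zero3_config.json; adjust the path if it differs).
import json

with open("./deepspeed_zero3_config.json") as f:
    cfg = json.load(f)

expected = (cfg["train_micro_batch_size_per_gpu"]
            * cfg["gradient_accumulation_steps"]
            * cfg["world_size"])  # 1 * 1 * 2 = 2
assert cfg["train_batch_size"] == expected, (
    f"train_batch_size {cfg['train_batch_size']} != micro*accum*world {expected}")
print("DeepSpeed batch settings are consistent:", expected)
```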
python/llm/example/GPU/LLM-Finetuning/LoRA/lora_deepspeed_zero3_finetune_chatglm3_6b_arc_2_card.sh

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+export MASTER_ADDR=127.0.0.1
+export MASTER_PORT=29503
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+
+basekit_root=/opt/intel/oneapi
+source $basekit_root/setvars.sh --force
+source $basekit_root/ccl/latest/env/vars.sh --force
+
+NUM_GPUS=2 # number of used GPU
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0 # Different from PVC
+
+mpirun -n $NUM_GPUS \
+   python ./alpaca_lora_finetuning.py \
+   --base_model "THUDM/chatglm3-6b" \
+   --data_path "yahma/alpaca-cleaned" \
+   --output_dir "./ipex-llm-lora-alpaca" \
+   --gradient_checkpointing True \
+   --lora_target_modules "['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h']" \
+   --micro_batch_size 1 \
+   --batch_size 2 \
+   --save_checkpoint False \
+   --deepspeed_zero3 True > lora_deepspeed_zero3_finetune_chatglm3_6b.log
+
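Since the `mpirun -n 2` launch assumes both Arc A770 cards are visible to the XPU runtime, a quick pre-flight check can save a failed run. A small optional sketch, assuming `intel_extension_for_pytorch` is installed as in the ipex-llm GPU finetuning environment:

```python
# Optional pre-flight check (assumption: intel_extension_for_pytorch is installed,
# as required by the ipex-llm GPU finetuning environment). Verifies that the two
# Arc devices expected by the mpirun -n 2 launch are actually visible.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers the 'xpu' backend)

count = torch.xpu.device_count()
print(f"Visible XPU devices: {count}")
assert count >= 2, "Expected at least 2 XPU devices for the 2-card ZeRO-3 run"
```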
