2 changes: 1 addition & 1 deletion .gitignore
@@ -15,7 +15,7 @@ data/
jobs/
debug/
audio/

exp/
examples/s2s/scripts/debug
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
26 changes: 2 additions & 24 deletions examples/aispeech_asr/README.md
@@ -66,7 +66,6 @@ dev_scp_file_path= # Path to validation data
train_max_frame_length=1500 # Maximum frame length for training
eval_max_frame_length=1000 # Maximum frame length for evaluation
multitask_prompt_path= # Path to multitask.jsonl
prompt_style="\{\}" # Prompt style, e.g., "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" or "USER: {}\n ASSISTANT:"
projector=linear # Type of projector
encoder_name=whisper # Name of the encoder
llm_name=Qwen2.5-7B-Instruct # Name of the LLM
@@ -86,7 +85,7 @@ For LoRA training, set (with `ckpt_path` pointing to the model saved in the prev
```bash
use_peft=true
if [[ $use_peft == "true" ]]; then
ckpt_path= # For DDP training, provide the path to the saved pt file; for DeepSpeed training, convert mp_rank_00_model_states.pt to model.pt using the `scripts/transcribe_deepspeed_to_pt.py` script
ckpt_path=
fi
```
### Deepspeed
@@ -113,28 +112,7 @@ When using `bf16`/`fp16` for training, deepspeed saves about 20GB of GPU memory
}
}
```

Note that when using `zero-0`/`1`/`2`, the DeepSpeed model is saved in a format that requires a script to convert `mp_rank_00_model_states.pt` to `model.pt`, such as `python scripts/transcribe_deepspeed_to_pt.py mp_rank_00_model_states.pt output_dir`.

```
global_step1000
global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
...
global_step1000/mp_rank_00_model_states.pt
latest
zero_to_fp32.py
```

If training with `Zero-3`, the model is saved in a different format and can be converted using `python zero_to_fp32.py global_step50 outputdir`.

```
global_step50
global_step50/zero_pp_rank_0_mp_rank_00_model_states.pt
global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt
...
latest
zero_to_fp32.py
```
Note that when using `zero-0`/`1`/`2`/`3`, the DeepSpeed model is saved as `pytorch_model.bin`.
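
Since `zero-0`/`1`/`2`/`3` checkpoints are now consolidated into `pytorch_model.bin`, no separate conversion step is needed before inference. A minimal sketch of inspecting such a checkpoint, with a placeholder path:

```python
import torch

# A minimal sketch, assuming "exp/checkpoint" stands in for the real output
# directory: the consolidated checkpoint is an ordinary state dict and loads
# with plain torch.load.
state_dict = torch.load("exp/checkpoint/pytorch_model.bin", map_location="cpu")
print(f"loaded {len(state_dict)} tensors")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
```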
If you use bf16/fp16 training in DeepSpeed and encounter NaN in train/eval loss, check the autocast in `src/slam_llm/utils/deepspeed_utils.py`:

```python
4 changes: 2 additions & 2 deletions examples/aispeech_asr/README_zh.md
@@ -82,11 +82,11 @@ deepspeed_config= # Path to the DeepSpeed config file
use_peft=false
```

When training LoRA, set the following (`ckpt_path` is the path to the model saved in the previous step):
When training LoRA, set the following (`ckpt_path` is the path to the model saved in the previous step, `pytorch_model.bin`/`model.pt`):
```bash
use_peft=true
if [[ $use_peft == "true" ]]; then
ckpt_path= # For DDP training, set this directly to the saved pt file path; for DeepSpeed training, convert mp_rank_00_model_states.pt to model.pt with the `scripts/transcribe_deepspeed_to_pt.py` script
ckpt_path=
fi
```

4 changes: 3 additions & 1 deletion examples/aispeech_asr/aispeech_asr_config.py
@@ -91,8 +91,10 @@ class DataConfig:
dataset: str = "multitask_dataset"
train_max_frame_length: int = 1500
eval_max_frame_length: int = 1000
audio_sample_rate: int = 16000
max_audio_length: int = 30
multitask_prompt_path: str = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl"
prompt_style: str = "\{\}" #
prompt_style: str = "{}" # "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:" Comment:Changed it in aispeech_asr_config.py
append_info_tasks : List = field(default_factory=lambda: [ "hotword"])
file: str = "examples/aispeech_asr/slam_llm/datasets/speech_dataset_large.py:get_speech_dataset"
train_scp_file_path: str = ""
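
The default `prompt_style` of `"{}"` passes each task prompt through unchanged, while chat-style templates wrap it. A small sketch of that assumption (the actual call site lives in the dataset code, not shown in this diff):

```python
# Hypothetical illustration: prompt_style acts as a str.format template
# around each task prompt. Both templates below come from the comment in
# aispeech_asr_config.py.
qwen_style = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
plain_style = "USER: {}\n ASSISTANT:"

prompt = "Transcribe the speech to text."
print(qwen_style.format(prompt))
print(plain_style.format(prompt))
```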
5 changes: 2 additions & 3 deletions examples/aispeech_asr/scripts/decode.sh
@@ -1,10 +1,9 @@
#!/bin/bash
set -e
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM
cd $run_dir
code_dir=examples/aispeech_asr

prompt_style="\{\}" # "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:"
projector=linear
encoder_name=whisper
llm_name=Qwen2.5-7B-Instruct
@@ -15,6 +14,7 @@ encoder_projector_ds_rate=5
eval_max_frame_length=1000
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/librispeech/20250322/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1121/mala_asr_epoch_2_step_25000_best
test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
# prompt_style is now set in aispeech_asr_config.py (e.g. "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:")


# Choose Encoder
@@ -69,7 +69,6 @@ python \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=$encoder_dim \
++model_config.encoder_projector=$projector \
++dataset_config.prompt_style=$prompt_style \
++dataset_config.dataset=$dataset \
++dataset_config.pad_or_trim=$pad_or_trim \
++dataset_config.test_scp_file_path=$test_scp_file_path \
4 changes: 2 additions & 2 deletions examples/aispeech_asr/scripts/decode_deepspeed.sh
@@ -1,10 +1,9 @@
#!/bin/bash
set -e
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM
cd $run_dir
code_dir=examples/aispeech_asr

prompt_style="\{\}" # "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:"
projector=linear
encoder_name=whisper
llm_name=Qwen2.5-7B-Instruct
@@ -15,6 +14,7 @@ encoder_projector_ds_rate=5
eval_max_frame_length=1000
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/librispeech/20250322/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1121/mala_asr_epoch_2_step_25000_best
test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
# prompt_style is now set in aispeech_asr_config.py (e.g. "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:")


# Choose Encoder
13 changes: 6 additions & 7 deletions examples/aispeech_asr/scripts/finetune_deepspeed.sh
@@ -9,16 +9,16 @@ export OMP_NUM_THREADS=1



run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM
cd $run_dir
code_dir=examples/aispeech_asr

train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
train_max_frame_length=500
eval_max_frame_length=500
train_max_frame_length=2000
eval_max_frame_length=2500
multitask_prompt_path="/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl"
prompt_style="\{\}" # "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:"
# prompt_style is now set in aispeech_asr_config.py (e.g. "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:")
projector=linear
encoder_name=whisper
llm_name=Qwen2.5-7B-Instruct
@@ -30,7 +30,7 @@ pad_or_trim=true # For whisper
deepspeed_config=examples/aispeech_asr/conf/ds_config.json

if [[ $use_peft == "true" || $freeze_encoder == false ]];then
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/slidespeech/20250414/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1515_slidespeech_text/mala_asr_epoch_2_step_7000
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/slidespeech/20250414/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1515_slidespeech_text/mala_asr_epoch_2_step_7000/pytorch_model.bin
fi

# Choose Encoder
@@ -86,7 +86,6 @@ hydra.run.dir=$output_dir \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=$encoder_dim \
++model_config.encoder_projector=$projector \
++dataset_config.prompt_style=$prompt_style \
++dataset_config.train_max_frame_length=$train_max_frame_length \
++dataset_config.eval_max_frame_length=$eval_max_frame_length \
++dataset_config.multitask_prompt_path=$multitask_prompt_path \
@@ -107,7 +106,7 @@ hydra.run.dir=$output_dir \
++metric=acc \
"
if [[ $use_peft == "true" || $freeze_encoder == false ]];then
hydra_args+="++ckpt_path=$ckpt_path/model.pt"
hydra_args+="++ckpt_path=$ckpt_path"
fi


17 changes: 8 additions & 9 deletions examples/aispeech_asr/scripts/finetune_torchrun.sh
@@ -9,16 +9,16 @@ export OMP_NUM_THREADS=1



run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU
run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM
cd $run_dir
code_dir=examples/aispeech_asr

train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/test
train_max_frame_length=1500
eval_max_frame_length=3000
train_max_frame_length=1400
eval_max_frame_length=2000
multitask_prompt_path="/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl"
prompt_style="\{\}" # "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:"
# prompt_style is now set in aispeech_asr_config.py (e.g. "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" | "USER: {}\n ASSISTANT:")
projector=linear
encoder_name=whisper
llm_name=Qwen2.5-1.5B-Instruct
@@ -29,7 +29,7 @@ pad_or_trim=true # For whisper


if [[ $use_peft == "true" || $freeze_encoder == false ]];then
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/slidespeech/20250414/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1515_slidespeech_text/mala_asr_epoch_2_step_7000
ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/slidespeech/20250414/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1515_slidespeech_text/mala_asr_epoch_2_step_7000/model.pt
fi

# Choose Encoder
@@ -89,7 +89,6 @@ hydra.run.dir=$output_dir \
++model_config.encoder_path=$speech_encoder_path \
++model_config.encoder_dim=$encoder_dim \
++model_config.encoder_projector=$projector \
++dataset_config.prompt_style=$prompt_style \
++dataset_config.train_max_frame_length=$train_max_frame_length \
++dataset_config.eval_max_frame_length=$eval_max_frame_length \
++dataset_config.multitask_prompt_path=$multitask_prompt_path \
@@ -104,18 +103,18 @@ hydra.run.dir=$output_dir \
++train_config.freeze_llm=true \
++train_config.use_peft=$use_peft \
++train_config.batching_strategy=dynamic \
++train_config.validation_interval=10 \
++train_config.validation_interval=1000 \
++train_config.num_workers_dataloader=8 \
++train_config.output_dir=$output_dir \
++metric=acc \
"
if [[ $use_peft == "true" || $freeze_encoder == false ]];then
hydra_args+="++ckpt_path=$ckpt_path/model.pt"
hydra_args+="++ckpt_path=$ckpt_path"
fi

torchrun \
--nnodes 1 \
--nproc_per_node 2 \
--nproc_per_node 8 \
--master_port=29505 \
$code_dir/finetune_torchrun.py \
--config-path "conf" \
9 changes: 0 additions & 9 deletions examples/aispeech_asr/scripts/transcribe_deepspeed_to_pt.py

This file was deleted.

24 changes: 2 additions & 22 deletions examples/asr_librispeech/README.md
@@ -79,27 +79,7 @@ If you're interested in training with DeepSpeed, refer to the script `finetune_w
}
```

Note that when using `zero-0`/`1`/`2`, the DeepSpeed model is saved in a format that requires a script to convert `mp_rank_00_model_states.pt` to `model.pt`, such as `python transcribe_deepspeed_to_pt.py mp_rank_00_model_states.pt output_dir`.

```
global_step1000
global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
...
global_step1000/mp_rank_00_model_states.pt
latest
zero_to_fp32.py
```

If training with `Zero-3`, the model is saved in a different format and can be converted using `python zero_to_fp32.py global_step50 outputdir`.

```
global_step50
global_step50/zero_pp_rank_0_mp_rank_00_model_states.pt
global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt
...
latest
zero_to_fp32.py
```
Note that when using `zero-0`/`1`/`2`/`3`, the DeepSpeed model is saved as `pytorch_model.bin`; to use it for inference, change `++ckpt_path=$ckpt_path/model.pt` to `++ckpt_path=$ckpt_path/pytorch_model.bin` in the script.
If you use bf16/fp16 training in DeepSpeed and encounter NaN in train/eval loss, check the autocast in `src/slam_llm/utils/deepspeed_utils.py`:

```python
@@ -116,4 +96,4 @@ You can refer to the paper for more results.
journal={arXiv preprint arXiv:2402.08846},
year={2024}
}
```
```

This file was deleted.

4 changes: 3 additions & 1 deletion src/slam_llm/datasets/speech_dataset_large.py
@@ -55,6 +55,8 @@ def __init__(self, dataset_config, tokenizer=None, split='train'):
self.inference_mode = dataset_config.get("inference_mode", False)
self.normalize = dataset_config.get("normalize", False)
self.input_type = dataset_config.get("input_type", None)
self.max_audio_length = dataset_config.get("max_audio_length", 30)
self.audio_sample_rate = dataset_config.get("audio_sample_rate", 16000)
assert self.input_type in ["raw", "mel"], "input_type must be one of [raw, mel]"

def __iter__(self):
@@ -86,7 +88,7 @@ def __iter__(self):
ark_path = item["path"]
numpy_array = kaldiio.load_mat(ark_path)
audio_raw = numpy_array[1].astype(np.float32) / 32768
if len(audio_raw) / 16000 > 30:
if len(audio_raw) / self.audio_sample_rate > self.max_audio_length:
continue
key = item["key"]
target = item["target"]
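
The hard-coded `16000`/`30` filter is now driven by the `audio_sample_rate` and `max_audio_length` config fields. A standalone restatement of the check, with illustrative names:

```python
import numpy as np

# Standalone restatement of the duration filter above; the defaults mirror
# the new config fields (16 kHz, 30 s). Function and variable names are
# illustrative, not part of the codebase.
def exceeds_max_length(audio_raw: np.ndarray,
                       audio_sample_rate: int = 16000,
                       max_audio_length: float = 30.0) -> bool:
    return len(audio_raw) / audio_sample_rate > max_audio_length

ten_seconds = np.zeros(16000 * 10, dtype=np.float32)
print(exceeds_max_length(ten_seconds))  # False: a 10 s clip is kept
```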
14 changes: 11 additions & 3 deletions src/slam_llm/utils/checkpoint_handler.py
@@ -6,7 +6,8 @@
import torch
import time
from collections import OrderedDict

from deepspeed.utils.zero_to_fp32 import (
convert_zero_checkpoint_to_fp32_state_dict)
from torch.distributed.fsdp import (
FullyShardedDataParallel as FSDP,
StateDictType,
@@ -168,11 +169,18 @@ def save_model_checkpoint(
def save_model_checkpoint_deepspeed(model, cfg, checkpoint_name="checkpoint"):
logger.info(f"--> saving model ...")
save_dir = os.path.join(cfg.output_dir, checkpoint_name)
os.makedirs(save_dir, exist_ok=True)
dist.barrier()
if os.environ["RANK"] == "0":
os.makedirs(save_dir, exist_ok=True)
dist.barrier()
# save_full_path = os.path.join(save_dir, "model.pt")
save_full_path = save_dir
model.save_checkpoint(save_dir=save_full_path, exclude_frozen_parameters=True)
logger.info(f"encoder saved at {save_full_path}")
dist.barrier()
if os.environ["RANK"] == "0":
convert_zero_checkpoint_to_fp32_state_dict(save_full_path, save_full_path)
dist.barrier()
logger.info(f"encoder saved at {save_full_path}_model")

def save_model_checkpoint_peft(model, optimizer, rank, cfg, checkpoint_name="checkpoint", save_trainable_only=True):
logger.info(f"--> saving model ...")
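
The rank-0 branch above consolidates the ZeRO shards immediately after `save_checkpoint`, which is what produces the `pytorch_model.bin` mentioned in the READMEs. The same conversion can be run offline; a sketch with a placeholder directory:

```python
from deepspeed.utils.zero_to_fp32 import (
    convert_zero_checkpoint_to_fp32_state_dict)

# Offline equivalent of the rank-0 step above: consolidate the ZeRO shards
# in a checkpoint directory into a single fp32 state dict. "exp/checkpoint"
# is a placeholder for cfg.output_dir/<checkpoint_name>; passing the same
# path for input and output mirrors the call in
# save_model_checkpoint_deepspeed and writes pytorch_model.bin there.
convert_zero_checkpoint_to_fp32_state_dict("exp/checkpoint", "exp/checkpoint")
```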
2 changes: 1 addition & 1 deletion src/slam_llm/utils/deepspeed_utils.py
@@ -123,7 +123,7 @@ def deepspeed_join(group_join):
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
logging.info("Detected uneven workload distribution: {}\n".format(e) +
logging.info("Detected uneven workload distribution. " +
"Break current worker to manually join all workers, " +
"world_size {}, current rank {}, current local_rank {}\n".
format(world_size, rank, local_rank))
6 changes: 3 additions & 3 deletions src/slam_llm/utils/train_utils.py
@@ -88,7 +88,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
best_val_acc = 0.0
for epoch in range(train_config.num_epochs):
epoch_start_time = time.perf_counter()
with MemoryTrace() as memtrace,Join([model,optimizer]): # track the memory usage
with MemoryTrace() as memtrace, Join([model]): # track the memory usage
model.train()
total_loss = 0.0
total_acc = 0.0
@@ -326,8 +326,8 @@
if torch.cuda.device_count() > 1 and (train_config.enable_fsdp or train_config.enable_ddp):
dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
dist.all_reduce(total_acc, op=dist.ReduceOp.SUM)
train_epoch_loss = total_loss / (len(train_dataloader) if train_config.batching_strategy != "dynamic" else (step + 1) *train_config.num_epochs)
train_epoch_acc = total_acc / (len(train_dataloader) if train_config.batching_strategy != "dynamic" else (step + 1) *train_config.num_epochs)
train_epoch_loss = total_loss / (len(train_dataloader) if train_config.batching_strategy != "dynamic" else (step + 1))
train_epoch_acc = total_acc / (len(train_dataloader) if train_config.batching_strategy != "dynamic" else (step + 1))
if train_config.enable_fsdp or train_config.enable_ddp:
train_epoch_loss = train_epoch_loss/world_size
train_epoch_acc = train_epoch_acc/world_size
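
With dynamic batching the dataloader has no fixed length, so the epoch average must divide by the steps actually run; the removed `* train_config.num_epochs` factor was deflating the reported loss and accuracy. A minimal restatement of the corrected denominator, with illustrative names:

```python
# Minimal restatement of the corrected averaging: under dynamic batching the
# mean is taken over the steps executed in this epoch, not a precomputed
# dataloader length scaled by the epoch count.
def epoch_mean(total: float, step: int, dataloader_len: int,
               batching_strategy: str) -> float:
    denom = dataloader_len if batching_strategy != "dynamic" else step + 1
    return total / denom

print(epoch_mean(total=250.0, step=99, dataloader_len=0,
                 batching_strategy="dynamic"))  # 2.5
```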