
Commit 5ab1e81

asr_librispeech support deepspeed and update aispeech_asr
1 parent f07183e commit 5ab1e81

File tree

8 files changed: +143 −53 lines changed


examples/aispeech_asr/README.md

Lines changed: 52 additions & 0 deletions
@@ -89,7 +89,59 @@ if [[ $use_peft == "true" ]]; then
ckpt_path= # For DDP training, provide the path to the saved pt file; for DeepSpeed training, convert mp_rank_00_model_states.pt to model.pt using the `scripts/transcribe_deepspeed_to_pt.py` script
fi
```

### DeepSpeed

When training with `bf16`/`fp16`, DeepSpeed saves about 20 GB of GPU memory compared to `torchrun` when training a 7B model. For 7B models, ZeRO stage 0/1/2 is recommended; for extremely large models, ZeRO-3 can be used, though communication may become a bottleneck.

```json
{
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-4
        }
    },
    "fp16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu"
        }
    }
}
```
Note that with ZeRO stage 0/1/2, DeepSpeed saves the checkpoint in its own layout, so `mp_rank_00_model_states.pt` has to be converted to `model.pt` with a script, e.g. `python scripts/transcribe_deepspeed_to_pt.py mp_rank_00_model_states.pt output_dir`. The checkpoint directory looks like this:

```
global_step1000
global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
...
global_step1000/mp_rank_00_model_states.pt
latest
zero_to_fp32.py
```
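
For reference, the conversion just extracts the `module` state dict from the DeepSpeed checkpoint. Below is a minimal sketch of what `scripts/transcribe_deepspeed_to_pt.py` does (the `map_location` argument is an extra here, not part of the original script):

```python
# Sketch of the ZeRO-0/1/2 checkpoint conversion: mp_rank_00_model_states.pt -> model.pt
import sys
import torch

in_path = sys.argv[1]    # e.g. global_step1000/mp_rank_00_model_states.pt
out_dir = sys.argv[2]    # directory that will receive model.pt
state_dict = torch.load(in_path, map_location="cpu")["module"]  # DeepSpeed stores the weights under "module"
torch.save(state_dict, f"{out_dir}/model.pt")
```

The resulting `model.pt` is a plain state dict, so it can be passed to `ckpt_path` just like a checkpoint produced by DDP training.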

If training with ZeRO-3, the checkpoint is saved in a different layout and can be converted with the `zero_to_fp32.py` script that DeepSpeed writes next to it, e.g. `python zero_to_fp32.py global_step50 outputdir`.

```
global_step50
global_step50/zero_pp_rank_0_mp_rank_00_model_states.pt
global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt
...
latest
zero_to_fp32.py
```

If you train with bf16/fp16 in DeepSpeed and encounter NaN in the train/eval loss, check the `autocast` call in `src/slam_llm/utils/deepspeed_utils.py` and set the dtype explicitly:

```python
with autocast():                       # original code: dtype is chosen automatically
with autocast(dtype=torch.bfloat16):   # set bf16 explicitly; this should work
with autocast(dtype=torch.float16):    # or force fp16
```
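
The reason forcing `torch.bfloat16` helps is dynamic range: fp16 overflows to `inf` above 65504, and the `inf` can then turn into NaN in later ops (e.g. `inf - inf`), while bf16 keeps float32's exponent range. A quick standalone check:

```python
import torch

print(torch.finfo(torch.float16).max)    # 65504.0
print(torch.finfo(torch.bfloat16).max)   # ~3.39e38, same exponent range as float32

x = torch.tensor(70000.0)
print(x.to(torch.float16))               # inf  -> can become NaN downstream
print(x.to(torch.bfloat16))              # 70144., still finite
```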
## Decoding

- **Single-machine single-GPU decoding**: Refer to `scripts/decode.sh`

examples/aispeech_asr/finetune_deepspeed.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ class RunConfig:
     ckpt_path: Optional[str] = field(
         default=None, metadata={"help": "The path to projector checkpoint"}
     )
-    deepspeed_config : str =""
+    deepspeed_config : str ="examples/aispeech_asr/conf/ds_config.json"
     deepspeed_ckpt_path: Optional[str] = field(
         default=None, metadata={"help": "The path to projector checkpoint"}
     )

examples/aispeech_asr/scripts/finetune_deepspeed.sh

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ use_fp16=true
 freeze_encoder=true
 pad_or_trim=true # For whisper
 
-deepspeed_config=code_dir=examples/aispeech_asr/conf/ds_config.json
+deepspeed_config=examples/aispeech_asr/conf/ds_config.json
 
 if [[ $use_peft == "true" || $freeze_encoder == false ]];then
     ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/slidespeech/20250414/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1515_slidespeech_text/mala_asr_epoch_2_step_7000

examples/aispeech_asr/slam_llm

Lines changed: 0 additions & 1 deletion
This file was deleted.

examples/asr_librispeech/README.md

Lines changed: 54 additions & 0 deletions
@@ -53,6 +53,60 @@ Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memor
If you are interested in running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode as the following command. This option loads the model on rank 0 only before moving it to the devices to construct FSDP, which dramatically saves CPU memory when loading large models (on an 8-GPU node, it reduces CPU memory from 2+ TB to 280 GB for the 70B model). This has been tested with `BF16` on 16x A100, 80GB GPUs.

### Fine-tuning using DeepSpeed

If you're interested in training with DeepSpeed, refer to the script `finetune_whisper_large_linear_vicuna_7b_deepspeed.sh`. The training configuration is shown in `conf/ds_config.json`. When training with `bf16`/`fp16`, DeepSpeed saves about 20 GB of GPU memory compared to `torchrun` when training a 7B model. For 7B models, ZeRO stage 0/1/2 is recommended; for extremely large models, ZeRO-3 can be used, though communication may become a bottleneck.

```json
{
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-4
        }
    },
    "fp16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu"
        }
    }
}
```
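
For orientation, this is roughly how DeepSpeed consumes such a config. The snippet below is an illustrative sketch only, not this repo's actual wiring: the toy `torch.nn.Linear` model and the inline dict are placeholders, and in this repo the config is passed as a file path while training is launched through the `deepspeed` CLI.

```python
import deepspeed
import torch

ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "fp16": {"enabled": True},
    # "stage" is the knob discussed above: 0/1/2 for 7B-scale models, 3 for very large ones.
    "zero_optimization": {"stage": 2, "offload_optimizer": {"device": "cpu"}},
}

model = torch.nn.Linear(1024, 1024)  # stand-in for the actual speech-LLM model
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,  # a path such as "conf/ds_config.json" also works here
)
```

Run it under the `deepspeed` launcher (e.g. `deepspeed --num_gpus=8 train.py`) so the distributed environment is initialized.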

Note that with ZeRO stage 0/1/2, DeepSpeed saves the checkpoint in its own layout, so `mp_rank_00_model_states.pt` has to be converted to `model.pt` with a script, e.g. `python transcribe_deepspeed_to_pt.py mp_rank_00_model_states.pt output_dir`. The checkpoint directory looks like this:

```
global_step1000
global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
...
global_step1000/mp_rank_00_model_states.pt
latest
zero_to_fp32.py
```

If training with ZeRO-3, the checkpoint is saved in a different layout and can be converted with the `zero_to_fp32.py` script that DeepSpeed writes next to it, e.g. `python zero_to_fp32.py global_step50 outputdir`.

```
global_step50
global_step50/zero_pp_rank_0_mp_rank_00_model_states.pt
global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt
...
latest
zero_to_fp32.py
```
103+
If you use bf16/fp16 training in DeepSpeed and encounter NaN in train/eval loss, check the autocast in `src/slam_llm/utils/deepspeed_utils.py`:
104+
105+
```python
106+
with autocast() # original code
107+
with autocast(dtype=torch.bfloat16)
108+
with autocast(dtype=torch.float16)
109+
```
## Citation
You can refer to the paper for more results.
```
Lines changed: 23 additions & 47 deletions
@@ -1,7 +1,6 @@
 #!/bin/bash
 # export PYTHONPATH=/root/whisper:$PYTHONPATH
 export PYTHONPATH=/root/fairseq:$PYTHONPATH
-# export CUDA_VISIBLE_DEVICES=6,7
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export TOKENIZERS_PARALLELISM=false
 # export CUDA_LAUNCH_BLOCKING=1
@@ -12,81 +11,58 @@ export OMP_NUM_THREADS=1
 # export NCCL_DEBUG_SUBSYS=ALL
 # export TORCH_DISTRIBUTED_DEBUG=INFO
 
-run_dir=/work/SLAM-LLM
+run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU
 cd $run_dir
 code_dir=examples/asr_librispeech
 
-speech_encoder_path=/cxgroup/model/whisper/large-v3.pt
+speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt
+llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/vicuna-7b-v1.5
+train_data_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multitask_wav.jsonl
+val_data_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multitask_wav.jsonl
 
-llm_path=/cxgroup/model/vicuna-7b-v1.5
-# llm_path=/nfs/maziyang.mzy/models/vicuna-13b-v1.5
-
-output_dir=/work/exps/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-$(date +"%Y%m%d")-deepspeed
+output_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM-NPU/examples/asr_librispeech/exp-$(date +"%Y%m%d")
 
 hydra_args="
     hydra.run.dir=$output_dir \
     ++model_config.llm_name=vicuna-7b-v1.5 \
     ++model_config.llm_path=$llm_path \
     ++model_config.llm_dim=4096 \
-    ++model_config.encoder_name=whisper \
+    ++model_config.encoder_name=wavlm \
+    ++model_config.normalize=true \
+    ++dataset_config.normalize=true \
     ++model_config.encoder_projector_ds_rate=5 \
     ++model_config.encoder_path=$speech_encoder_path \
-    ++model_config.encoder_dim=1280 \
+    ++model_config.encoder_dim=1024 \
     ++model_config.encoder_projector=linear \
     ++dataset_config.dataset=speech_dataset \
-    ++dataset_config.train_data_path=data/librispeech/train960.jsonl \
-    ++dataset_config.val_data_path=data/librispeech/dev.jsonl \
-    ++dataset_config.input_type=mel \
-    ++dataset_config.mel_size=128 \
+    ++dataset_config.train_data_path=$train_data_path \
+    ++dataset_config.val_data_path=$val_data_path \
+    ++dataset_config.input_type=raw \
     ++train_config.model_name=asr \
-    ++train_config.num_epochs=6 \
-    ++train_config.enable_deepspeed=true \
+    ++train_config.num_epochs=3 \
     ++train_config.freeze_encoder=true \
     ++train_config.freeze_llm=true \
     ++train_config.batching_strategy=custom \
     ++train_config.warmup_steps=1000 \
     ++train_config.total_steps=100000 \
     ++train_config.lr=1e-4 \
-    ++train_config.validation_interval=1000 \
-    ++train_config.batch_size_training=4 \
+    ++train_config.validation_interval=50 \
+    ++train_config.batch_size_training=1 \
     ++train_config.val_batch_size=4 \
-    ++train_config.num_workers_dataloader=4 \
+    ++train_config.num_workers_dataloader=2 \
     ++train_config.output_dir=$output_dir \
     ++metric=acc \
 "
-# ++train_config.use_peft=true \
-# ++train_config.peft_config.r=32 \
-# ++model_config.encoder_projector=linear \
-# ++model_config.encoder_projector_ds_rate=5 \
-# ++train_config.peft_config.peft_method=lora \
-# --peft_ckpt "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4" \
-# --ckpt_path "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4/model.pt" \
-#++log_config.log_file=/$output_dir/train.log \
-#++log_config.use_wandb=true \
-#++log_config.wandb_dir=$output_dir \
-#++log_config.wandb_entity_name=zym22 \
-#++log_config.wandb_project_name=slam-llm \
-#++log_config.wandb_exp_name=${0##*/%.*} \
-#++log_config.log_interval 5 \
+
+
+
 
 deepspeed \
-    --include localhost:4,5 \
-    --master_port=29502 \
+    --num_gpus=8 \
+    --num_nodes=1 \
     $code_dir/deepspeed_finetune_asr.py \
     $hydra_args
 # --num_gpus=2 \
 # --num_nodes=1 \
+# --master_port=29502 \
 
-# -m debugpy --listen 5678 --wait-for-client
-# if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then
-# python -m debugpy --listen 5678 --wait-for-client finetune_asr.py \
-# $hydra_args
-# else
-# deepspeed \
-# --num_nodes=1 \
-# --include localhost:6,7 \
-# --master_port=29502 \
-# $code_dir/deepspeed_finetune_asr.py \
-# $hydra_args
-# # --num_gpus=2 \
-# fi
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
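# Convert a DeepSpeed ZeRO-0/1/2 checkpoint (mp_rank_00_model_states.pt) into a plain model.pt.
# Usage: python transcribe_deepspeed_to_pt.py <path/to/mp_rank_00_model_states.pt> <output_dir>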
import argparse
import torch
import torch_npu
import sys
in_path = sys.argv[1]
out_path = sys.argv[2]
weight_dict = torch.load(in_path)["module"]
torch.save(weight_dict, f"{out_path}/model.pt")
print("[Finish]")

src/slam_llm/datasets/speech_dataset_large.py

Lines changed: 3 additions & 3 deletions
@@ -60,22 +60,22 @@ def __init__(self, dataset_config, tokenizer=None, split='train'):
     def __iter__(self):
         multitask_task_path = os.path.join(self.data_path,"multitask.jsonl")
         worker_info = torch.utils.data.get_worker_info()
-        if worker_info is None: # 不在 DataLoader 的多进程环境中
+        if worker_info is None: # Not in the multi-processing environment of DataLoader.
             num_workers = 1
             worker_id = 0
         else:
             num_workers = worker_info.num_workers
             worker_id = worker_info.id
 
-        # 获取分布式环境中的进程信息
+        # Obtain the process information in the distributed environment.
         if dist.is_available() and dist.is_initialized():
             world_size = dist.get_world_size()
             rank = dist.get_rank()
         else:
             world_size = 1
             rank = 0
 
-        # 计算每个 worker 和每个进程应该处理的数据范围
+        # Calculate the data range that each worker and each process should handle.
         total_num_workers = num_workers * world_size
         worker_rank = rank * num_workers + worker_id
         data_index = 0
