diff --git a/.gitignore b/.gitignore index 65408424..a994c527 100644 --- a/.gitignore +++ b/.gitignore @@ -9,17 +9,20 @@ debug.sh transformers wandb/ log/ +**/exp/ *.log outputs/ data/ jobs/ debug/ audio/ - +extra-info +kernel_meta examples/s2s/scripts/debug examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh scripts_all examples/hotwords_librispeech -examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh \ No newline at end of file +examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh +**/fusion_result.* \ No newline at end of file diff --git a/.hydra/config.yaml b/.hydra/config.yaml new file mode 100644 index 00000000..a2cce8cc --- /dev/null +++ b/.hydra/config.yaml @@ -0,0 +1,32 @@ +dataset_config: + prompt: 请转写音频为文字 + llm_name: Qwen2-7B-Instruct + prompt_style: normal + dataset: slidespeech + pad_or_trim: true + test_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/ + input_type: raw + mel_size: 128 + inference_mode: true +model_config: + llm_name: Qwen2-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim: 3584 + encoder_name: conformer + normalize: true + encoder_projector_ds_rate: 2 + encoder_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim: 1280 + encoder_projector: linear +train_config: + model_name: firered_asr + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + num_epochs: 1 + val_batch_size: 8 + num_workers_dataloader: 8 + output_dir: '' + inference_mode: true +decode_log: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal diff --git a/.hydra/hydra.yaml b/.hydra/hydra.yaml new file mode 100644 index 00000000..5238d5a9 --- /dev/null +++ b/.hydra/hydra.yaml @@ -0,0 +1,183 @@ +hydra: + run: + dir: '' + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir= + - hydra.mode=RUN + task: + - ++model_config.llm_name=Qwen2-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=conformer + - ++model_config.normalize=true + - ++model_config.encoder_projector_ds_rate=2 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++dataset_config.llm_name=Qwen2-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.dataset=slidespeech + - ++dataset_config.pad_or_trim=true + - ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/ + - ++dataset_config.input_type=raw + - ++dataset_config.mel_size=128 + - ++dataset_config.inference_mode=true + - ++train_config.model_name=firered_asr + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.num_epochs=1 + - ++train_config.val_batch_size=8 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir= + - ++train_config.inference_mode=true + - ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal + job: + name: inference_fireredasr_deepspeed + chdir: null + override_dirname: 
++dataset_config.dataset=slidespeech,++dataset_config.inference_mode=true,++dataset_config.input_type=raw,++dataset_config.llm_name=Qwen2-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/,++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal,++model_config.encoder_dim=1280,++model_config.encoder_name=conformer,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=2,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct,++model_config.normalize=true,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.inference_mode=true,++train_config.model_name=firered_asr,++train_config.num_epochs=1,++train_config.num_workers_dataloader=8,++train_config.output_dir=,++train_config.use_peft=false,++train_config.val_batch_size=8 + id: ??? + num: ??? + config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/.hydra/overrides.yaml b/.hydra/overrides.yaml new file mode 100644 index 00000000..75c5a644 --- /dev/null +++ b/.hydra/overrides.yaml @@ -0,0 +1,28 @@ +- ++model_config.llm_name=Qwen2-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=conformer +- ++model_config.normalize=true +- ++model_config.encoder_projector_ds_rate=2 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++dataset_config.llm_name=Qwen2-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.dataset=slidespeech +- ++dataset_config.pad_or_trim=true +- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/ +- ++dataset_config.input_type=raw +- ++dataset_config.mel_size=128 +- ++dataset_config.inference_mode=true +- ++train_config.model_name=firered_asr +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- 
++train_config.num_epochs=1 +- ++train_config.val_batch_size=8 +- ++train_config.num_workers_dataloader=8 +- ++train_config.output_dir= +- ++train_config.inference_mode=true +- ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal diff --git a/examples/aispeech_asr/README.md b/examples/aispeech_asr/README.md new file mode 100644 index 00000000..e7b4f21d --- /dev/null +++ b/examples/aispeech_asr/README.md @@ -0,0 +1,68 @@ +# MALA-ASR_SLIDESPEECH + +## Guides + +[MaLa-ASR](https://www.arxiv.org/abs/2406.05839) is an LLM-based ASR model that can integrate textual keywords extracted from presentation slides to improve recognition of conference content. + +![](docs/demo.png) + +## Model Architecture + +We use the official WavLM-Large model as our speech encoder, the public Vicuna 7B as our large language model decoder, and a simple linear projector consisting of a 1-D convolution layer and two linear layers. Refer to the [paper](https://www.arxiv.org/abs/2406.05839) for more details. + +![](docs/model.png) + +## Performance and checkpoints +We only train the linear projector in this recipe. Encoder | Projector | LLM | dev | test |---|---|---|---|---| [WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link)(~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 + + +## Data preparation +Refer to the official [SLIDESPEECH CORPUS](https://slidespeech.github.io/). + +Specifically, taking `slidespeech_dataset.py` as an example, the dataset requires four files: `my_wav.scp`, `utt2num_samples`, `text`, `hot_related/ocr_1gram_top50_mmr070_hotwords_list`. + +`my_wav.scp` lists the audio paths. We convert the wav files to Kaldi ark files, so the file looks like +``` +ID1 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:22 +ID2 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:90445 +... +``` + +To generate this file, you can download the audio wavs from https://www.openslr.org/144/ and the time segments from https://slidespeech.github.io/. The latter provides the segments, transcription text and OCR results at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/SlideSpeech/related_files.tar.gz (~1.37GB). You need to segment the wavs using the timestamps provided in the `segments` file. + + + This _related_files.tar.gz_ also provides `text` and a file named `keywords`. The `keywords` file corresponds to `hot_related/ocr_1gram_top50_mmr070_hotwords_list`, which contains the hotword list. + +`utt2num_samples` contains the length (in samples) of each wav and looks like +``` +ID1 103680 +ID2 181600 +... +``` + +## Decode with checkpoints +``` +bash decode_MaLa-ASR_withkeywords_L95.sh +``` +Modify the paths, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path` and `decode_log`, in the script before running it. + +## Train a new model + +### Use a self-supervised model (such as WavLM) as the encoder +``` +bash finetune_MaLa-ASR_withkeywords_L95.sh +``` + +## Citation +You can refer to the paper for more results. +``` +@inproceedings{yang2024malaasr, + title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR}, + author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen}, + booktitle={Proc. 
INTERSPEECH}, + year={2024}, +} +``` \ No newline at end of file diff --git a/examples/aispeech_asr/conf/ds_config.json b/examples/aispeech_asr/conf/ds_config.json new file mode 100644 index 00000000..7134f34c --- /dev/null +++ b/examples/aispeech_asr/conf/ds_config.json @@ -0,0 +1,38 @@ +{ + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 1, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 5e-5, + "betas": [0.9, 0.999], + "eps": 1e-06, + "weight_decay": 0.01 + } + }, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "overlap_comm": true, + "reduce_scatter": true, + "contiguous_gradients": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0.00, + "warmup_max_lr": 0.00005, + "warmup_num_steps": 1000 + } + }, + "checkpoint_activations": false + +} \ No newline at end of file diff --git a/examples/aispeech_asr/conf/ds_config_from_k2.json b/examples/aispeech_asr/conf/ds_config_from_k2.json new file mode 100644 index 00000000..613709ac --- /dev/null +++ b/examples/aispeech_asr/conf/ds_config_from_k2.json @@ -0,0 +1,38 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 1000 + } + }, + "gradient_accumulation_steps": 4, + "gradient_clipping": 5, + "steps_per_print": 50, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/examples/aispeech_asr/conf/prompt.yaml b/examples/aispeech_asr/conf/prompt.yaml new file mode 100644 index 00000000..32cf2374 --- /dev/null +++ b/examples/aispeech_asr/conf/prompt.yaml @@ -0,0 +1,14 @@ +dataset_config: + # we put prompt here, because the hydra override in shell script only support a small subset of chars + # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " + # prompt: "<|im_start|>user\n请将语音转写为汉字<|im_end|>\n<|im_start|>assistant\n" + # prompt: "识别语音" + # prompt : "将上面的语音转写为英文" + # prompt: "Transcribe speech to English." + # prompt: "Transcribe speech to text.And then translate the text to spanish." + # prompt: "Transcribe speech to text." + # prompt: "Tell me what is the language of the text." + prompt: "Transcribe speech to text." + # prompt: "Transcribe speech to text.Follow words may occur in audio:{}." 
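+  # Note on the "{}" placeholder above: in this recipe the dataset (dataset/multitask_dataset.py) fills it in
+  # at load time with the task-specific text attached to each utterance (hotword, previous text or domain),
+  # so keep the braces if you use such a prompt.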
+ # prompt: "" + # prompt: "请问上面有几个句子,有多少个字,给字编号然后输出文本" diff --git a/examples/aispeech_asr/dataset/multitask_dataset.py b/examples/aispeech_asr/dataset/multitask_dataset.py new file mode 100644 index 00000000..885feb2a --- /dev/null +++ b/examples/aispeech_asr/dataset/multitask_dataset.py @@ -0,0 +1,389 @@ +import torch +from torch.utils.data import Dataset,IterableDataset +import whisper +import kaldiio +import types +from functools import partial +# import pyroomacoustics as pra +import torch.distributed as dist +import string +import copy +import numpy as np +import copy +from tqdm import tqdm +import os +import json +import random +import torchaudio.transforms as T +from torchaudio.transforms import SpeedPerturbation +import torchaudio +import torchaudio.functional as F +import random +import logging +logger = logging.getLogger(__name__) +class MultiTaskDataset(IterableDataset): + def __init__(self, dataset_config, tokenizer=None, split='train',musan_path=None): + super().__init__() + self.multitask_prompt_list = {} + multitask_prompt_path = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl" + with open(multitask_prompt_path) as f_prompt: + for line in f_prompt: + item = json.loads(line.strip()) + if item["task"] in self.multitask_prompt_list: + self.multitask_prompt_list[item["task"]].append(item["prompt"]) + else: + self.multitask_prompt_list[item["task"]] = [item["prompt"]] + print(f"[Prompt] {self.multitask_prompt_list}") + if split == "train": + self.data_path = dataset_config.train_scp_file_path + elif split == "val": + self.data_path = dataset_config.dev_scp_file_path + elif split == "test": + self.data_path = dataset_config.test_scp_file_path + else: + assert(0) + if musan_path is not None: + self.musan_list = [] + with open(musan_path) as f: + for line in f: + key,path = line.split(" ") + self.musan_list.append(path) + + + self.llm_name = dataset_config.get("llm_name", None) + self.prompt_style = dataset_config.get("prompt_style", "normal") + if self.llm_name == "Qwen2.5-7B-Instruct": + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + else: + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "USER: {}\n ASSISTANT:" + self.answer_template = "{}" + self.dataset_config = dataset_config + self.tokenizer = tokenizer + self.split = split + self.spec_augmentation = dataset_config.get("spec_augmentation", False) + self.speed_perturb = dataset_config.get("speed_perturb", False) + self.add_noise = dataset_config.get("musan", False) + self.add_reverb = dataset_config.get("add_reverb", False) + self.noise_file_path = dataset_config.get("noise_file_path", False) + if self.add_noise == True: + self.musan_wav_files = [] + for root, dirs, files in os.walk(self.noise_file_path): + for file in files: + if file.endswith('.wav'): + self.musan_wav_files.append(os.path.join(root, file)) + self.pad_or_trim = dataset_config.get("pad_or_trim", False) + self.IGNORE_INDEX = -100 # The default setting in CrossEntropyLoss + self.mel_size = dataset_config.get("mel_size", 80) # 80 for whisper large v1 and v2, 128 for large v3 + self.fix_length_audio = dataset_config.get("fix_length_audio", -1) + self.inference_mode = dataset_config.get("inference_mode", False) + self.prompt_mode = dataset_config.get("prompt_mode", None) + self.normalize = 
dataset_config.get("normalize", False) + self.input_type = dataset_config.get("input_type", None) + assert self.input_type in ["raw", "mel"], "input_type must be one of [raw, mel]" + + + + def speedPerturb(self, audio_raw): + orig_freq = 16000 + # 定义速度扰动因子,例如 [0.9, 1.0, 1.1] 表示速度减少10%,保持不变,增加10%Q + factors = [0.9,1,1.1] + # 创建速度扰动变换 + speed_perturbation = SpeedPerturbation(orig_freq, factors) + # 应用速度扰动 + audio_raw = torch.from_numpy(audio_raw) + # 由于 SpeedPerturbation 返回的是一个函数,我们需要调用它 + # 并且传入原始音频张量 + audio_raw = speed_perturbation(audio_raw)[0] + return audio_raw + def specAugment(self, spec): + spec = spec.permute(1, 0).unsqueeze(0) + stretch = T.TimeStretch(n_freq=128) + rate = random.random()*0.2 + 0.9 + Timemasking = T.TimeMasking(time_mask_param=100) + Frequencymasking = T.FrequencyMasking(freq_mask_param=27) + spec = stretch(spec, rate).to(torch.float32) + spec = Timemasking(spec) + spec = Timemasking(spec) + spec = Frequencymasking(spec) + spec = Frequencymasking(spec) + spec = spec.squeeze(0).permute(1, 0) + return spec + def addNoise(self, audio_raw): + noise, _ = torchaudio.load(random.choice(self.musan_wav_files)) + noise.unsqueeze_(0) + # 如果语音比噪声长,随机选择噪声的起始点 + if audio_raw.shape > noise.shape: + # 随机选择噪声的起始点 + start_idx = random.randint(0, audio_raw.shape - noise.shape) + # 在语音的随机位置开始添加噪声 + speech_with_noise = torch.zeros_like(audio_raw) + speech_with_noise[:, start_idx:start_idx + noise.shape] += noise + else: + # 如果噪声比语音长,从噪声的随机位置开始截取 + start_idx = random.randint(0, noise.shape - audio_raw.shape) + noise = noise[:, start_idx:start_idx + audio_raw.shape] + # 直接将噪声添加到语音中 + snr_dbs = random.randomint(1, 30) + noisy_speeches = F.add_noise(audio_raw, noise, snr_dbs) + return noisy_speeches + # def simulate_room_reverb(self, audio_raw,fs): + # room_dim = [random.uniform(3, 10), random.uniform(3, 10), random.uniform(2, 5)] + # # 随机生成目标混响时间(RT60,单位:秒) + # rt60_tgt = random.uniform(0.3, 1.0) + # # 生成随机房间参数 + # # 使用 Sabine 公式计算吸声系数和反射阶数 + # e_absorption, max_order = pra.inverse_sabine(rt60_tgt, room_dim) + # # 创建房间 + # room = pra.ShoeBox( + # room_dim, + # fs=fs, + # materials=pra.Material(e_absorption), + # max_order=int(max_order), + # use_rand_ism=True, # 使用随机化图像方法减少回声 + # max_rand_disp=0.05, # 最大随机位移(单位:米) + # ) + # # 随机生成声源位置 + # source_position = [random.uniform(0.5, room_dim[0] - 0.5), + # random.uniform(0.5, room_dim[1] - 0.5), + # random.uniform(0.5, room_dim[2] - 0.5)] + # room.add_source(source_position, signal=audio_raw) + # # 随机生成麦克风位置 + # mic_locs = np.c_[ + # [random.uniform(0.5, room_dim[0] - 0.5), random.uniform(0.5, room_dim[1] - 0.5), random.uniform(0.5, room_dim[2] - 0.5)], + # ] + # room.add_microphone_array(mic_locs) + # # 运行模拟 + # room.simulate() + # # 返回麦克风阵列的信号 + # return room.mic_array.signals[0, :] + + + def __iter__(self): + multitask_task_path = os.path.join(self.data_path,"multitask.jsonl") + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # 不在 DataLoader 的多进程环境中 + num_workers = 1 + worker_id = 0 + else: + num_workers = worker_info.num_workers + worker_id = worker_info.id + + # 获取分布式环境中的进程信息 + if dist.is_available() and dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + else: + world_size = 1 + rank = 0 + + # 计算每个 worker 和每个进程应该处理的数据范围 + total_num_workers = num_workers * world_size + worker_rank = rank * num_workers + worker_id + data_index = 0 + with open(multitask_task_path) as f_task: + for line in f_task: + if (data_index % total_num_workers) == worker_rank : + # try: + item = 
json.loads(line.strip()) + ark_path = item["path"] + numpy_array = kaldiio.load_mat(ark_path) + audio_raw = numpy_array[1].astype(np.float32) / 32768 + if len(audio_raw) / 16000 > 30: + continue + key = item["key"] + target = item["target"].upper() + ## data augmentation + if self.split == "train" and self.speed_perturb == True: + audio_raw = self.speedPerturb(audio_raw) + if self.split == "train" and self.add_noise == True: + audio_raw = self.addNoise(audio_raw, self.musan_list) + # if self.split == "train" and self.add_reverb == True: + # audio_raw = self.simulate_room_reverb(audio_raw, 16000).astype(np.float32) + if self.input_type == "raw": + audio_raw = torch.from_numpy(audio_raw).float() + if self.normalize: + audio_raw = torch.nn.functional.layer_norm(audio_raw, audio_raw.shape) + audio_length = len(audio_raw) // 320 # ad-hoc for fairseq 320x downsample + audio_length = audio_length // 5 # ad-hoc for 5x fc downsample + elif self.input_type == "mel": + if self.pad_or_trim == True: + audio_raw = whisper.pad_or_trim(audio_raw) + audio_mel = whisper.log_mel_spectrogram(audio_raw, n_mels=self.mel_size).permute(1, 0) + if self.split == "train" and self.spec_augmentation == True: + audio_mel = self.specAugment(audio_mel) + audio_length = (audio_mel.shape[0] + 1) // 2 # ad-hoc for whisper for 2x downsample from mel to feats + audio_length = audio_length // 5 # ad-hoc for 5x fc downsample + # audio_length = calculate_output_length_1d(audio_length, 5, 5, 0) # ad-hoc for 5x cov1d downsample + if self.fix_length_audio > 0: + audio_length = self.fix_length_audio + audio_pseudo = torch.full((audio_length,), -1) # placeholder + + prompt = random.choice(self.multitask_prompt_list[item["task"]]) + prompt = self.prompt_template1.format(prompt) + if item["task"] in ["prevtext","hotword","domain"]: + prompt = prompt.format(item[item["task"]].upper()) + prompt_ids = self.tokenizer.encode(prompt) + prompt_length = len(prompt_ids) + + if self.inference_mode: + prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64) + example_ids = torch.cat((audio_pseudo, prompt_ids)) # [audio,prompt] + example_mask = example_ids.ge(-1) # [True,True] + + yield { + "input_ids": example_ids, + "attention_mask": example_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + 'audio_length': audio_length, + 'key': key, + 'target': target, + } + else: + answer = self.answer_template.format(target) + example = prompt + answer # FIX(MZY): avoid putting a bos token before answer. 
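+                        # A rough sketch of the training sequence assembled below (toy layout, not real token ids):
+                        #   audio_pseudo : [-1, -1, ..., -1]        -> audio_length placeholder positions
+                        #   example_ids  : [audio_pseudo | prompt ids | answer ids | eos]
+                        #   labels_ids   : [-100, ..., -100 | -100, ..., -100 | answer ids | eos]
+                        # The audio placeholder and prompt positions are masked with IGNORE_INDEX (-100), so the loss
+                        # is computed only on the answer (transcript) tokens and the eos token; the -1 placeholders in
+                        # input_ids are later replaced by the projected audio features (cf. modality_mask in the collator).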
+ example_ids = self.tokenizer.encode(example) # [prompt,answer] + example_ids.append(self.tokenizer.eos_token_id) # [prompt,answer,eos] + example_ids = torch.tensor( + example_ids, dtype=torch.int64 + ) + example_ids = torch.cat((audio_pseudo, example_ids)) # [audio,prompt,answer,eos] + + labels_ids = copy.deepcopy(example_ids) # [audio,prompt,answer,eos] + labels_ids[:audio_length + prompt_length] = -1 # [-1,-1,answer,eos]; + example_mask = example_ids.ge(-1) # FIX(GZF): [True,True,True,True] + + label_mask = labels_ids.ge(0) # [False,False,True,True] + example_ids[~example_mask] = 0 # [audio,prompt,answer,eos] + labels_ids[~label_mask] = self.IGNORE_INDEX # [-100,-100,answer,eos] + yield { + "input_ids": example_ids, + "labels": labels_ids, + "attention_mask": example_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + 'audio_length': audio_length, + } + # except: + # logger.warning("[Item Error]"+key+target) + data_index += 1 + def pad(self, sequence, max_length, padding_idx=0): + if isinstance(sequence, (int, list, tuple)): + if len(sequence) < max_length: + sequence = sequence + [padding_idx] * (max_length - len(sequence)) + else: + sequence = sequence[:max_length] + elif isinstance(sequence, torch.Tensor): + if len(sequence) < max_length: + sequence = torch.cat( + (sequence, torch.full(([max_length - len(sequence)] + list(sequence.size())[1:]), padding_idx))) + else: + sequence = sequence[:max_length] + elif isinstance(sequence, np.ndarray): + if len(sequence) < max_length: + sequence = np.concatenate( + (sequence, np.full((max_length - len(sequence),) + sequence.shape[1:], padding_idx))) + else: + sequence = sequence[:max_length] + else: + raise Exception("Type mismatch during padding!") + return sequence + + def collator(self, samples): + assert samples is not None + input_ids_max_length = max([s['input_ids'].shape[0] for s in samples]) + input_ids = torch.stack([self.pad(s['input_ids'], input_ids_max_length, self.tokenizer.pad_token_id) + for s in samples]) + attention_mask = torch.stack([self.pad(s['attention_mask'], input_ids_max_length, False) + for s in samples]) + if self.input_type == "raw": + audio_raw_max_length = max([s['audio'].shape[0] for s in samples]) + audio_raw = torch.stack([self.pad(s['audio'], audio_raw_max_length, 0) + for s in samples]) + audio_mask = torch.zeros(len(samples), audio_raw_max_length) + for line, sample in enumerate(samples): + audio_mask[line, :sample['audio'].shape[0]] = 1 + elif self.input_type == "mel": + audio_mel_max_length = max([s['audio_mel'].shape[0] for s in samples]) + audio_mel = torch.stack([self.pad(s['audio_mel'], audio_mel_max_length, 0) + for s in samples]) + audio_mel_post_mask = torch.zeros(len(samples), (audio_mel_max_length + 1) // 2) # ad-hoc for whisper for 2x downsample from mel to feats + for line, sample in enumerate(samples): + audio_mel_post_mask[line, :(sample['audio_mel'].shape[0] + 1) // 2] = 1 + + modality_mask = torch.zeros_like(attention_mask) + for line, sample in enumerate(samples): + modality_mask[line, :sample['audio_length']] = 1 + + if self.inference_mode: + keys = [s['key'] for s in samples] + targets = [s['target'] for s in samples] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mask": audio_mask if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + "audio_mel_post_mask": 
audio_mel_post_mask if self.input_type == "mel" else None, + "modality_mask": modality_mask, + "keys": keys, + "targets": targets + } + + labels = torch.stack([self.pad(s['labels'], input_ids_max_length, self.IGNORE_INDEX) + for s in samples]) + return { + "input_ids": input_ids, + "labels": labels, + "attention_mask": attention_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mask": audio_mask if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + "audio_mel_post_mask": audio_mel_post_mask if self.input_type == "mel" else None, + "modality_mask": modality_mask + } + +class MultiTaskDynamicBatchDataset(IterableDataset): + def __init__(self, dataset: IterableDataset, window_class) -> None: + super().__init__() + self.dp = dataset + + assert window_class is not None + self.window_class = window_class + self.collator = self.dp.collator + self._buffer = [] + def __iter__(self): + for elem in self.dp: + if not self.window_class(elem, self._buffer): + self._buffer.append(elem) + else: + if len(self._buffer) > 0: + yield self._buffer + del self._buffer + self._buffer = [elem] + if len(self._buffer) > 0: + yield self._buffer + del self._buffer + self._buffer = [] + +def window_class(elem,buffer,max_frame_length): + if len(buffer) == 0: + return True + max_frame = max(len(elem["input_ids"]),max([ len(_["input_ids"]) for _ in buffer])) + return (len(buffer) + 1) * max_frame > max_frame_length + +def get_speech_dataset(dataset_config, tokenizer, split): + dataset = MultiTaskDataset(dataset_config, tokenizer, split) + dataset = MultiTaskDynamicBatchDataset(dataset,partial(window_class,max_frame_length = dataset_config.max_frame_length)) + return dataset + + + + \ No newline at end of file diff --git a/examples/aispeech_asr/docs/demo.png b/examples/aispeech_asr/docs/demo.png new file mode 100644 index 00000000..acba4559 Binary files /dev/null and b/examples/aispeech_asr/docs/demo.png differ diff --git a/examples/aispeech_asr/docs/model.png b/examples/aispeech_asr/docs/model.png new file mode 100644 index 00000000..95ecefc5 Binary files /dev/null and b/examples/aispeech_asr/docs/model.png differ diff --git a/examples/aispeech_asr/finetune_mala_asr.py b/examples/aispeech_asr/finetune_mala_asr.py new file mode 100644 index 00000000..3f0d46a0 --- /dev/null +++ b/examples/aispeech_asr/finetune_mala_asr.py @@ -0,0 +1,49 @@ +from slam_llm.pipeline.finetune import main as train +from typing import Optional + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if 
isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_asr/finetune_mala_asr_deepspeed.py b/examples/aispeech_asr/finetune_mala_asr_deepspeed.py new file mode 100644 index 00000000..2ee6b1bf --- /dev/null +++ b/examples/aispeech_asr/finetune_mala_asr_deepspeed.py @@ -0,0 +1,63 @@ +from slam_llm.pipeline.finetune_deepspeed import main as train +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_config : str ="" + deepspeed_ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_ckpt_id: Optional[str] = field( + default=None, metadata={"help": "The id to projector checkpoint"} + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_asr/inference_batch_deepspeed.py b/examples/aispeech_asr/inference_batch_deepspeed.py new file mode 100644 index 00000000..f9b583a9 --- /dev/null +++ b/examples/aispeech_asr/inference_batch_deepspeed.py @@ -0,0 +1,66 @@ +from slam_llm.pipeline.inference_batch_deepspeed import main as inference +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, 
ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + inference(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_asr/inference_mala_asr_batch.py b/examples/aispeech_asr/inference_mala_asr_batch.py new file mode 100644 index 00000000..e733f7b3 --- /dev/null +++ b/examples/aispeech_asr/inference_mala_asr_batch.py @@ -0,0 +1,53 @@ +from slam_llm.pipeline.inference_batch import main as inference + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + 
}, + ) + + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + main_hydra() diff --git a/examples/aispeech_asr/mala_asr_config.py b/examples/aispeech_asr/mala_asr_config.py new file mode 100644 index 00000000..71e4aeec --- /dev/null +++ b/examples/aispeech_asr/mala_asr_config.py @@ -0,0 +1,158 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from torch.distributed.fsdp import ShardingStrategy + + +@dataclass +class ModelConfig: + file: str = "examples/aispeech_asr/model/slam_model_mala_asr.py:model_factory" + llm_name: str = "vicuna-7b-v1.5" + llm_path: str = "PATH/to/LLAMA/7B" + llm_type: str = "decoder_only" + llm_dim: int = 4096 + encoder_name: Optional[str] = None + encoder_ds_rate: int = 2 + encoder_path: Optional[str] = None + encoder_path_hf: Optional[str] = None + encoder_dim: int = 1280 + encoder_projector: str = "linear" + qformer_layers : int = 8 + encoder_projector_ds_rate: int = 5 + modal: str = "audio" + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + encoder_type: str = field(default="finetune", metadata={ + "help": "whether model is only pretrained or finetuned, used for models such as hubert" + }) + +# @dataclass +# class PeftConfig: +# peft_method: str = "lora" # None , llama_adapter, prefix +# r: int = 8 +# lora_alpha: int = 32 +# target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"]) +# bias: str = "none" +# task_type: str = "CAUSAL_LM" +# lora_dropout: float = 0.05 +# inference_mode: bool = False + +@dataclass +class PeftConfig: + peft_method: str = "lora" # None , llama_adapter, prefix + r: int = 64 + lora_alpha: int = 16 + target_modules: List = field(default_factory=lambda: [ "q_proj","k_proj", "v_proj", "o_proj", "up_proj","gate_proj","down_proj"]) + # target_modules: List = field(default_factory=lambda: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj']) + + bias: str = "none" + task_type: str = "CAUSAL_LM" + lora_dropout: float = 0.05 + inference_mode: bool = False + +@dataclass +class TrainConfig: + model_name:str = "PATH/to/LLAMA/7B" + enable_ddp:bool = False + enable_deepspeed:bool = False + enable_fsdp:bool = False + low_cpu_fsdp:bool = False + run_validation:bool = True + batch_size_training: Optional[int] = None + batching_strategy:str = field(default="packing", metadata={ + "help":"alternative: padding" + }) # + context_length:int = 4096 + gradient_accumulation_steps:int = 1 + num_epochs:int = 3 + num_workers_dataloader:int = 1 + warmup_steps:int = 1000 + total_steps:int = 100000 + validation_interval:int = 1000 + lr:float = 1e-4 + weight_decay:float = 0.0 + gamma:float = 0.85 + seed:int = 42 + use_fp16:bool = False + mixed_precision:bool = True + val_batch_size:Optional[int] = None + + use_peft:bool = False + peft_config:PeftConfig = field(default_factory=PeftConfig) + output_dir:str = "PATH/to/save/PEFT/model" + freeze_layers:bool = False + num_freeze_layers:int = 1 + quantization:bool = False + one_gpu:bool = False + save_model:bool = True + dist_checkpoint_root_folder:str = "PATH/to/save/FSDP/model" # will be used if using FSDP + 
dist_checkpoint_folder:str = "fine-tuned" # will be used if using FSDP + save_optimizer:bool = False # will be used if using FSDP + use_fast_kernels:bool = False # Enable using SDPA from PyTorch Accelerated Transformers, which makes use of Flash Attention and xFormers memory-efficient kernels + run_test_during_validation:bool = False + run_test_during_validation_file:str = "test.wav" + run_test_during_validation_prompt:str = "<|ASR|>" + freeze_llm:bool = field(default=False, metadata={ + "help": "whether to freeze the LLM when finetuning; should be true when using PEFT finetuning" + }) + freeze_encoder:bool = False + +@dataclass +class DataConfig: + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + max_frame_length: int = 1400 + prompt_style: str = "normal" # instruct + file: str = "examples/aispeech_asr/dataset/multitask_dataset.py:get_speech_dataset" + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + dev_scp_file_path: str = "" + test_scp_file_path: str = "" + train_split: str = "train" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True + prompt: Optional[str] = None + use_ocr: bool = True + inference_mode: bool = False + prompt_mode : str = "qwen" + lower: bool = False + encoder_ds_rate: int = 2 + encoder_projector_ds_rate: int = 5 + fix_length_audio: int = -1 + input_type: str = field(default="raw", metadata={ + "help":"Use raw when the input is a wav, mel for whisper" + }) + mel_size: int = field(default=80, metadata={ + "help": "80 for whisper large v1 and v2, 128 for v3" + }) + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + +@dataclass +class FSDPConfig: + mixed_precision: bool = True + use_fp16: bool = False + # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when using DDP + checkpoint_type: str = "SHARDED_STATE_DICT" # SHARDED_STATE_DICT saves one file per rank and allows resizing the world size; FULL_STATE_DICT is the alternative. 
+ fsdp_activation_checkpointing: bool = True + fsdp_cpu_offload: bool = False + pure_bf16: bool = False + optimizer: str = "AdamW" + +@dataclass +class LogConfig: + use_wandb: bool = False + wandb_dir: str = "tmp/test_wandb" + wandb_entity_name: str = "project_name" + wandb_project_name: str = "project_name" + wandb_exp_name: str = "exp_name" + log_file: str = "tmp/test.log" + log_interval: int = 5 diff --git a/examples/aispeech_asr/model/slam_model_mala_asr.py b/examples/aispeech_asr/model/slam_model_mala_asr.py new file mode 100644 index 00000000..830b49fa --- /dev/null +++ b/examples/aispeech_asr/model/slam_model_mala_asr.py @@ -0,0 +1,156 @@ +import torch +import os +import logging +from slam_llm.models.slam_model import ( + slam_model, + setup_tokenizer, + setup_encoder, + setup_encoder_projector, + setup_llm, +) +from slam_llm.utils.train_utils import print_model_size + +logger = logging.getLogger(__name__) + +def model_factory(train_config, model_config, **kwargs): + # return necessary components for training + tokenizer = setup_tokenizer(train_config, model_config, **kwargs) + + encoder = setup_encoder(train_config, model_config, **kwargs) + + # llm + llm = setup_llm(train_config, model_config, **kwargs) + + # projector + encoder_projector = setup_encoder_projector( + train_config, model_config, **kwargs + ) + model = slam_model_asr( + encoder, + llm, + encoder_projector, + tokenizer, + train_config, + model_config, + **kwargs, + ) + + ckpt_path = kwargs.get( + "ckpt_path", None + ) # FIX(MZY): load model ckpt(mainly projector, related to model_checkpointing/checkpoint_handler.py: save_model_checkpoint_peft) + if ckpt_path is not None: + logger.info("loading other parts from: {}".format(ckpt_path)) + ckpt_dict = torch.load(ckpt_path, map_location="cpu") + model.load_state_dict(ckpt_dict, strict=False) + + + print_model_size( + model, + train_config, + ( + int(os.environ["RANK"]) + if train_config.enable_fsdp or train_config.enable_ddp + else 0 + ), + ) + return model, tokenizer + + +class slam_model_asr(slam_model): + def __init__( + self, + encoder, + llm, + encoder_projector, + tokenizer, + train_config, + model_config, + **kwargs, + ): + super().__init__( + encoder, + llm, + encoder_projector, + tokenizer, + train_config, + model_config, + **kwargs, + ) + + + @torch.no_grad() + def inference( + self, + wav_path=None, + prompt=None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=None, + assistant_model=None, + streamer=None, + negative_prompt_ids=None, + negative_prompt_attention_mask=None, + **kwargs, + ): + # inference for asr model + + device = kwargs.get("device", "npu") + if os.path.exists(wav_path): # Audio-Text QA + import whisper + + audio_raw = whisper.load_audio(wav_path) + audio_raw = whisper.pad_or_trim(audio_raw) + + mel_size = getattr( + self.dataset_config, "mel_size", 80 + ) # 80 for large v1 and v2, 128 for large v3 + audio_mel = ( + whisper.log_mel_spectrogram(audio_raw, n_mels=mel_size) + .permute(1, 0)[None, :, :] + .to(device) + ) + + encoder_outs = self.encoder.extract_variable_length_features( + audio_mel.permute(0, 2, 1) + ) + + if self.model_config.encoder_projector == "q-former": + audio_mel_post_mask = torch.ones( + encoder_outs.size()[:-1], dtype=torch.long + ).to(encoder_outs.device) + encoder_outs = self.encoder_projector(encoder_outs, audio_mel_post_mask) + if self.model_config.encoder_projector == "linear": + encoder_outs = self.encoder_projector(encoder_outs) + else: # 
Text QA + encoder_outs = torch.empty( + 1, 0, self.llm.model.embed_tokens.embedding_dim + ).to(device) + + prompt = "USER: {}\n ASSISTANT:".format(prompt) + prompt_ids = self.tokenizer.encode(prompt) + prompt_length = len(prompt_ids) + prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64).to(device) + + if hasattr(self.llm.model, "embed_tokens"): + inputs_embeds = self.llm.model.embed_tokens(prompt_ids) + elif hasattr(self.llm.model.model, "embed_tokens"): + inputs_embeds = self.llm.model.model.embed_tokens(prompt_ids) + else: + inputs_embeds = self.llm.model.model.model.embed_tokens(prompt_ids) + + inputs_embeds = torch.cat( + (encoder_outs, inputs_embeds[None, :, :]), dim=1 + ) # [audio,prompt] + + attention_mask = torch.ones(inputs_embeds.size()[:-1], dtype=torch.long).to( + inputs_embeds.device + ) + + # generate + model_outputs = self.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs + ) + + return model_outputs diff --git a/examples/aispeech_asr/scripts/decode_MaLa-ASR.sh b/examples/aispeech_asr/scripts/decode_MaLa-ASR.sh new file mode 100644 index 00000000..cdbb77f1 --- /dev/null +++ b/examples/aispeech_asr/scripts/decode_MaLa-ASR.sh @@ -0,0 +1,141 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +# export CUDA_VISIBLE_DEVICES=1 +# export ASCEND_VISIBLE_DEVICES=1 +export TOKENIZERS_PARALLELISM=false +# export OPENBLAS_NUM_THREADS=1 +# export GOTO_NUM_THREADS=1 +# export OMP_NUM_THREADS=1 +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=slidespeech +prompt_style=normal # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == gigaspeech ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_gss + dataset_task=asr +fi +projector=linear +encoder_name=whisper +sentence=connect +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/librispeech/20250325/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal_asr_speedfalse_specaugfalse-1123/mala_asr_epoch_1_step_34000 + +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + 
llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == gigaspeech ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +elif [[ $dataset == "wenetspeech_test_net" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_net/ +elif [[ $dataset == "wenetspeech_test_meeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_meeting/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi + +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +python $code_dir/inference_mala_asr_batch.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=mala_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=0 \ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +python "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh b/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh new file mode 100644 index 00000000..7576b98c --- /dev/null +++ b/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export 
ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +export HCCL_CONNECT_TIMEOUT=3600 +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# dataset=zh-6w-en-1w-asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 + +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1k-en-1k-asr/20250313/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2021/mala_asr_epoch_1_step_4000 +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +# deepspeed_ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250305/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1309/mala_asr_epoch_1_step_90000 +# deepspeed_ckpt_id=global_step90000 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250305/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1309/mala_asr_epoch_1_step_90000 + # deepspeed_ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == gigaspeech ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ 
$dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +elif [[ $dataset == "wenetspeech_test_net" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_net/ +elif [[ $dataset == "wenetspeech_test_meeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_meeting/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.test_scp_file_path=$test_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=10 \ +++train_config.total_steps=600000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=6 \ +++train_config.val_batch_size=7 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" + # hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +fi +hydra_args+="++ckpt_path=$ckpt_path/model.pt" +# hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/inference_batch_deepspeed.py \ + 
--config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +} +gen_hostfile +do_train \ No newline at end of file diff --git a/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeed.sh b/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeed.sh new file mode 100644 index 00000000..098e10bf --- /dev/null +++ b/examples/aispeech_asr/scripts/decode_MaLa-ASR_deepspeed.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +# export CUDA_VISIBLE_DEVICES=1 +# export ASCEND_VISIBLE_DEVICES=1 +export TOKENIZERS_PARALLELISM=false +# export OPENBLAS_NUM_THREADS=1 +# export GOTO_NUM_THREADS=1 +# export OMP_NUM_THREADS=1 +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=aishell-2-train +prompt_style=normal # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == gigaspeech ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_gss + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=5 +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1k-en-1k-asr/20250322/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-1356/mala_asr_epoch_3_step_2000 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == gigaspeech ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + 
test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +elif [[ $dataset == "wenetspeech_test_net" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_net/ +elif [[ $dataset == "wenetspeech_test_meeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_meeting/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +deepspeed \ + --num_nodes 1 \ + --num_gpus 8 \ + $code_dir/inference_batch_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=mala_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=0 \ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt + + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +python "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser + diff --git a/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed.sh b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed.sh new file mode 100644 index 00000000..b71d6b6d --- /dev/null +++ b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed.sh @@ -0,0 +1,178 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +# export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# dataset=multitask_asr +dataset=librispeech +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr 
hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/librispeech/20250322/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1121/mala_asr_epoch_2_step_25000_best +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + 
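+# NOTE (sketch, assumption): each *_scp_file_path selected above is expected to be a
+# Kaldi-style data directory containing files such as my_wav.scp, text and utt2num_samples
+# (see the dataset notes in the accompanying README). An optional sanity check before
+# launching training could be:
+# for f in my_wav.scp text utt2num_samples; do
+#     [ -e "${train_scp_file_path}/${f}" ] || echo "missing: ${train_scp_file_path}/${f}"
+# done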
+output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.validation_interval=1000 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + deepspeed \ + --num_nodes 1 \ + --num_gpus 8 \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +fi diff --git a/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh new file mode 100644 index 00000000..bc57f52f --- /dev/null +++ b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +export HCCL_CONNECT_TIMEOUT=3600 +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=aishell-2 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr 
hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=./conf/ds_config.json +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/librispeech/20250322/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1121/mala_asr_epoch_2_step_25000_best +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ 
+++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.num_workers_dataloader=0 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --hostfile $HOST_FILE \ + --ssh_port $SSH_PORT \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} \ + + +} + +if [ "${RANK}" = "0" ]; then #只在index为RANK0的POD上启动主训练脚本,其他节点由主节点通过ssh分发任务($RANK由调度系统自动分配) + gen_hostfile #生成分布式训练需要的hostfile + do_train #启动训练 +else + /usr/sbin/sshd -D -p ${SSH_PORT} #其他节点的task,仅前台运行sshd服务,不执行主训练脚本,不要修改 +fi \ No newline at end of file diff --git a/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh new file mode 100644 index 00000000..cf773995 --- /dev/null +++ b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh @@ -0,0 +1,189 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +export HCCL_CONNECT_TIMEOUT=3600 +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=zh-1w-en-1w-asr +dataset=zh-6w-en-1w-fangyan-3w-asr +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # 
alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=false +pad_or_trim=false +encoder_projector_ds_rate=5 + +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +deepspeed_ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-6w-en-1w-fangyan-3w-asr/20250331/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padfalse_normal__speedfalse_specaugfalse-08/mala_asr_epoch_1_step_100000 +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1k-en-1k-asr/20250313/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2021/mala_asr_epoch_1_step_4000 +# deepspeed_ckpt_id=global_step90000 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]]; +then + # ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1k-en-1k-asr/20250326/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedfalse_specaugfalse-14/mala_asr_epoch_3_step_7000 + deepspeed_ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-6w-en-1w-fangyan-3w-asr/20250331/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padfalse_normal__speedfalse_specaugfalse-08/mala_asr_epoch_1_step_100000 +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + 
train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.validation_interval=10000 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + # hydra_args+="++ckpt_path=$ckpt_path/model.pt" + hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " +fi +# hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +} +gen_hostfile +do_train \ No newline at end of file diff --git a/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun.sh b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun.sh new file mode 100644 index 00000000..665743fc --- /dev/null +++ b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export 
CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + 
dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=50 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29505 \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh new file mode 100644 index 00000000..b54e261b --- /dev/null +++ b/examples/aispeech_asr/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# 
dataset=alimeeting +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ 
+++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=4 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 2 \ + --nproc_per_node 8 \ + --node_rank=$RANK \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/aispeech_asr/scripts/transcribe_deepspeed_to_pt.py b/examples/aispeech_asr/scripts/transcribe_deepspeed_to_pt.py new file mode 100644 index 00000000..e2a02862 --- /dev/null +++ b/examples/aispeech_asr/scripts/transcribe_deepspeed_to_pt.py @@ -0,0 +1,9 @@ +import argparse +import torch +import torch_npu +import sys +in_path = sys.argv[1] +out_path = sys.argv[2] +weight_dict = torch.load(in_path)["module"] +torch.save(weight_dict, f"{out_path}/model.pt") +print("[Finish]") \ No newline at end of file diff --git a/examples/aispeech_asr/slam_llm b/examples/aispeech_asr/slam_llm new file mode 120000 index 00000000..e2d801ef --- /dev/null +++ b/examples/aispeech_asr/slam_llm @@ -0,0 +1 @@ +/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/src/slam_llm \ No newline at end of file diff --git a/examples/aispeech_domain/README.md b/examples/aispeech_domain/README.md new file mode 100644 index 00000000..e7b4f21d --- /dev/null +++ b/examples/aispeech_domain/README.md @@ -0,0 +1,68 @@ +# MALA-ASR_SLIDESPEECH + +## Guides + +[MaLa-ASR](https://www.arxiv.org/abs/2406.05839) is an LLM-based ASR model that can integrate textual keywords extracted from presentation 
slides to improve recognition of conference content. + +![](docs/demo.png) + +## Model Architecture + +We use the official WavLM-Large model as our speech encoder, the public Vicuna 7B as our large language model decoder, and a simple-structured linear projector, consisting of a 1-D convolution layer and two linear layers as our projector. Refer to the [paper](https://www.arxiv.org/abs/2406.05839) for more details. + +![](docs/model.png) + +## Performance and checkpoints +We only train the linear projector in this recipe. +Encoder | Projector | LLM | dev | test +|---|---|---|---|---| +[WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link)(~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 + + +## Data preparation +Refer to official [SLIDESPEECH CORPUS](https://slidespeech.github.io/). + +Specifically, take the file `slidespeech_dataset.py` as an example, the dataset requires four files: `my_wav.scp`, `utt2num_samples`, `text`, `hot_related/ocr_1gram_top50_mmr070_hotwords_list`. + +`my_wav.scp` is a file of audio path lists. We transform wav file to ark file, so this file looks like +``` +ID1 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:22 +ID2 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:90445 +... +``` + +To generate this file, you can get audio wavs from https://www.openslr.org/144/ and get the time segments from https://slidespeech.github.io/. The second website provides segments, transcription text, OCR results at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/SlideSpeech/related_files.tar.gz (~1.37GB). You need to segment the wav by the timestamps provided in `segments` file. + + + This _related_files.tar.gz_ also provides `text` and a file named `keywords`. The file `keywords` refers to `hot_related/ocr_1gram_top50_mmr070_hotwords_list`, which contains hotwords list. + +`utt2num_samples` contains the length of the wavs, which looks like +``` +ID1 103680 +ID2 181600 +... +``` + +## Decode with checkpoints +``` +bash decode_MaLa-ASR_withkeywords_L95.sh +``` +Modify the path including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path` and `decode_log` in the script when you run the shell script. + +## Train a new model + +### Use self-supervised model(such as WavLM) as the encoder +``` +bash finetune_MaLa-ASR_withkeywords_L95.sh +``` + +## Citation +You can refer to the paper for more results. +``` +@inproceedings{yang2024malaasr, + title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR}, + author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen}, + booktitle={Proc. INTERSPEECH}, + year={2024}, +} +``` \ No newline at end of file diff --git a/examples/aispeech_domain/aispeech_asr/README.md b/examples/aispeech_domain/aispeech_asr/README.md new file mode 100644 index 00000000..e7b4f21d --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/README.md @@ -0,0 +1,68 @@ +# MALA-ASR_SLIDESPEECH + +## Guides + +[MaLa-ASR](https://www.arxiv.org/abs/2406.05839) is an LLM-based ASR model that can integrate textual keywords extracted from presentation slides to improve recognition of conference content. 
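+
+As a companion sketch for the Data preparation section below: assuming the original per-utterance wav files are still available (before conversion to ark) in a plain `wav.scp` of `<utt-id> <path.wav>` lines, and that SoX's `soxi` tool is installed, the `utt2num_samples` file could be generated roughly as follows. Treat this as an illustration, not the official recipe:
+
+```
+# one "<utt-id> <num-samples>" line per utterance
+while read -r utt wav; do
+    echo "${utt} $(soxi -s "${wav}")"
+done < wav.scp > utt2num_samples
+```
+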
+ +![](docs/demo.png) + +## Model Architecture + +We use the official WavLM-Large model as our speech encoder, the public Vicuna 7B as our large language model decoder, and a simple-structured linear projector, consisting of a 1-D convolution layer and two linear layers as our projector. Refer to the [paper](https://www.arxiv.org/abs/2406.05839) for more details. + +![](docs/model.png) + +## Performance and checkpoints +We only train the linear projector in this recipe. +Encoder | Projector | LLM | dev | test +|---|---|---|---|---| +[WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link)(~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 + + +## Data preparation +Refer to official [SLIDESPEECH CORPUS](https://slidespeech.github.io/). + +Specifically, take the file `slidespeech_dataset.py` as an example, the dataset requires four files: `my_wav.scp`, `utt2num_samples`, `text`, `hot_related/ocr_1gram_top50_mmr070_hotwords_list`. + +`my_wav.scp` is a file of audio path lists. We transform wav file to ark file, so this file looks like +``` +ID1 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:22 +ID2 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:90445 +... +``` + +To generate this file, you can get audio wavs from https://www.openslr.org/144/ and get the time segments from https://slidespeech.github.io/. The second website provides segments, transcription text, OCR results at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/SlideSpeech/related_files.tar.gz (~1.37GB). You need to segment the wav by the timestamps provided in `segments` file. + + + This _related_files.tar.gz_ also provides `text` and a file named `keywords`. The file `keywords` refers to `hot_related/ocr_1gram_top50_mmr070_hotwords_list`, which contains hotwords list. + +`utt2num_samples` contains the length of the wavs, which looks like +``` +ID1 103680 +ID2 181600 +... +``` + +## Decode with checkpoints +``` +bash decode_MaLa-ASR_withkeywords_L95.sh +``` +Modify the path including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path` and `decode_log` in the script when you run the shell script. + +## Train a new model + +### Use self-supervised model(such as WavLM) as the encoder +``` +bash finetune_MaLa-ASR_withkeywords_L95.sh +``` + +## Citation +You can refer to the paper for more results. +``` +@inproceedings{yang2024malaasr, + title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR}, + author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen}, + booktitle={Proc. 
INTERSPEECH}, + year={2024}, +} +``` \ No newline at end of file diff --git a/examples/aispeech_domain/aispeech_asr/finetune_mala_asr.py b/examples/aispeech_domain/aispeech_asr/finetune_mala_asr.py new file mode 100644 index 00000000..3f0d46a0 --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/finetune_mala_asr.py @@ -0,0 +1,49 @@ +from slam_llm.pipeline.finetune import main as train +from typing import Optional + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_domain/aispeech_asr/finetune_mala_asr_deepspeed.py b/examples/aispeech_domain/aispeech_asr/finetune_mala_asr_deepspeed.py new file mode 100644 index 00000000..2ee6b1bf --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/finetune_mala_asr_deepspeed.py @@ -0,0 +1,63 @@ +from slam_llm.pipeline.finetune_deepspeed import main as train +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_config : str ="" + deepspeed_ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_ckpt_id: Optional[str] = field( + default=None, metadata={"help": "The id to projector checkpoint"} + ) +def parse_deepspeed_args(): + 
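+    # DeepSpeed's launcher injects a --local_rank argument that Hydra cannot parse;
+    # strip it off here and keep only the remaining arguments, which the __main__ block
+    # below hands back to Hydra by rebuilding sys.argv from remaining_args.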
parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_domain/aispeech_asr/inference_batch_deepspeed.py b/examples/aispeech_domain/aispeech_asr/inference_batch_deepspeed.py new file mode 100644 index 00000000..f9b583a9 --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/inference_batch_deepspeed.py @@ -0,0 +1,66 @@ +from slam_llm.pipeline.inference_batch_deepspeed import main as inference +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + inference(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = 
[sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_domain/aispeech_asr/inference_mala_asr_batch.py b/examples/aispeech_domain/aispeech_asr/inference_mala_asr_batch.py new file mode 100644 index 00000000..e733f7b3 --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/inference_mala_asr_batch.py @@ -0,0 +1,53 @@ +from slam_llm.pipeline.inference_batch import main as inference + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) + + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + main_hydra() diff --git a/examples/aispeech_domain/aispeech_asr/mala_asr_config.py b/examples/aispeech_domain/aispeech_asr/mala_asr_config.py new file mode 100644 index 00000000..59ae523a --- /dev/null +++ b/examples/aispeech_domain/aispeech_asr/mala_asr_config.py @@ -0,0 +1,155 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from torch.distributed.fsdp import ShardingStrategy + + +@dataclass +class ModelConfig: + file: str = "examples/aispeech_asr/model/slam_model_mala_asr.py:model_factory" + llm_name: str = "vicuna-7b-v1.5" + llm_path: str = "PATH/to/LLAMA/7B" + llm_type: str = "decoder_only" + llm_dim: int = 4096 + encoder_name: Optional[str] = None + encoder_ds_rate: int = 2 + encoder_path: Optional[str] = None + encoder_path_hf: Optional[str] = None + encoder_dim: int = 1280 + encoder_projector: str = "linear" + qformer_layers : int = 8 + encoder_projector_ds_rate: int = 5 + modal: str = "audio" + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + encoder_type: str = field(default="finetune", metadata={ + "help": "whether model is only pretrained or finetuned, used for models such as hubert" + }) + +# @dataclass +# class PeftConfig: +# peft_method: str = "lora" # None , llama_adapter, prefix +# r: int = 8 +# lora_alpha: int = 32 +# target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"]) +# bias: str = "none" +# task_type: str = "CAUSAL_LM" +# lora_dropout: 
float = 0.05 +# inference_mode: bool = False + +@dataclass +class PeftConfig: + peft_method: str = "lora" # None , llama_adapter, prefix + r: int = 64 + lora_alpha: int = 16 + target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj", "o_proj", "up_proj","gate_proj","down_proj"]) + bias: str = "none" + task_type: str = "CAUSAL_LM" + lora_dropout: float = 0.05 + inference_mode: bool = False + +@dataclass +class TrainConfig: + model_name:str = "PATH/to/LLAMA/7B" + enable_ddp:bool = False + enable_deepspeed:bool = False + enable_fsdp:bool = False + low_cpu_fsdp:bool = False + run_validation:bool = True + batch_size_training:int = 4 + batching_strategy:str = field(default="packing", metadata={ + "help":"alternative: padding" + }) # + context_length:int = 4096 + gradient_accumulation_steps:int = 1 + num_epochs:int = 3 + num_workers_dataloader:int = 1 + warmup_steps:int = 1000 + total_steps:int = 100000 + validation_interval:int = 1000 + lr:float = 1e-4 + weight_decay:float = 0.0 + gamma:float = 0.85 + seed:int = 1234 + use_fp16:bool = False + mixed_precision:bool = True + val_batch_size:int = 1 + + use_peft:bool = False + peft_config:PeftConfig = field(default_factory=PeftConfig) + output_dir:str = "PATH/to/save/PEFT/model" + freeze_layers:bool = False + num_freeze_layers:int = 1 + quantization:bool = False + one_gpu:bool = False + save_model:bool = True + dist_checkpoint_root_folder:str = "PATH/to/save/FSDP/model" # will be used if using FSDP + dist_checkpoint_folder:str = "fine-tuned" # will be used if using FSDP + save_optimizer:bool = False # will be used if using FSDP + use_fast_kernels:bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels + run_test_during_validation:bool = False + run_test_during_validation_file:str = "test.wav" + run_test_during_validation_prompt:str = "<|ASR|>" + freeze_llm:bool = field(default=False, metadata={ + "help": "whether to freeze llm when finetuning, should be true when use peft finetuning" + }) + freeze_encoder:bool = False + +@dataclass +class DataConfig: + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + prompt_style: str = "normal" # instruct + file: str = "examples/aispeech_asr/dataset/multitask_dataset.py:get_speech_dataset" + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + dev_scp_file_path: str = "" + test_scp_file_path: str = "" + train_split: str = "train" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True + prompt: Optional[str] = None + use_ocr: bool = True + inference_mode: bool = False + prompt_mode : str = "qwen" + lower: bool = False + encoder_ds_rate: int = 2 + encoder_projector_ds_rate: int = 5 + fix_length_audio: int = -1 + inference_mode:bool = False + input_type: str = field(default="raw", metadata={ + "help":"Use raw when input is wav, mel when for whisper" + }) + mel_size: int = field(default=80, metadata={ + "help": "80 for whisper large v1 and v2, 128 for v3" + }) + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + +@dataclass +class FSDPConfig: + mixed_precision: bool = True + use_fp16: bool = False + # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: 
set NO_SHARD when use DDP + checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. + fsdp_activation_checkpointing: bool = True + fsdp_cpu_offload: bool = False + pure_bf16: bool = False + optimizer: str = "AdamW" + +@dataclass +class LogConfig: + use_wandb: bool = False + wandb_dir: str = "tmp/test_wandb" + wandb_entity_name: str = "project_name" + wandb_project_name: str = "project_name" + wandb_exp_name: str = "exp_name" + log_file: str = "tmp/test.log" + log_interval: int = 5 diff --git a/examples/aispeech_domain/conf/ds_config.json b/examples/aispeech_domain/conf/ds_config.json new file mode 100644 index 00000000..27095dcd --- /dev/null +++ b/examples/aispeech_domain/conf/ds_config.json @@ -0,0 +1,35 @@ +{ + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 5e-5 + } + }, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "overlap_comm": true, + "reduce_scatter": true, + "contiguous_gradients": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0.00, + "warmup_max_lr": 0.00005, + "warmup_num_steps": 1 + } + }, + "checkpoint_activations": true + +} \ No newline at end of file diff --git a/examples/aispeech_domain/conf/ds_config_from_k2.json b/examples/aispeech_domain/conf/ds_config_from_k2.json new file mode 100644 index 00000000..613709ac --- /dev/null +++ b/examples/aispeech_domain/conf/ds_config_from_k2.json @@ -0,0 +1,38 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 1000 + } + }, + "gradient_accumulation_steps": 4, + "gradient_clipping": 5, + "steps_per_print": 50, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/examples/aispeech_domain/conf/prompt.yaml b/examples/aispeech_domain/conf/prompt.yaml new file mode 100644 index 00000000..32cf2374 --- /dev/null +++ b/examples/aispeech_domain/conf/prompt.yaml @@ -0,0 +1,14 @@ +dataset_config: + # we put prompt here, because the hydra override in shell script only support a small subset of chars + # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " + # prompt: "<|im_start|>user\n请将语音转写为汉字<|im_end|>\n<|im_start|>assistant\n" + # prompt: "识别语音" + # prompt : "将上面的语音转写为英文" + # prompt: "Transcribe speech to English." + # prompt: "Transcribe speech to text.And then translate the text to spanish." + # prompt: "Transcribe speech to text." + # prompt: "Tell me what is the language of the text." + prompt: "Transcribe speech to text." + # prompt: "Transcribe speech to text.Follow words may occur in audio:{}." 
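+  # Only the uncommented prompt key above is used at run time; the commented lines
+  # are alternative prompts kept for reference.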
+ # prompt: "" + # prompt: "请问上面有几个句子,有多少个字,给字编号然后输出文本" diff --git a/examples/aispeech_domain/dataset/multitask_dataset.py b/examples/aispeech_domain/dataset/multitask_dataset.py new file mode 100644 index 00000000..87931e78 --- /dev/null +++ b/examples/aispeech_domain/dataset/multitask_dataset.py @@ -0,0 +1,212 @@ +import torch +from torch.utils.data import Dataset,IterableDataset +import whisper +import kaldiio +# import pyroomacoustics as pra +import torch.distributed as dist +import string +import copy +import numpy as np +import copy +from tqdm import tqdm +import os +import json +import random +import torchaudio.transforms as T +from torchaudio.transforms import SpeedPerturbation +import torchaudio +import torchaudio.functional as F +import random +class MultiTaskDataset(IterableDataset): + def __init__(self, dataset_config, tokenizer=None, split='train',musan_path=None): + super().__init__() + self.multitask_prompt_list = {} + multitask_prompt_path = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl" + with open(multitask_prompt_path) as f_prompt: + for line in f_prompt: + item = json.loads(line.strip()) + if item["task"] in self.multitask_prompt_list: + self.multitask_prompt_list[item["task"]].append(item["prompt"]) + else: + self.multitask_prompt_list[item["task"]] = [item["prompt"]] + print(f"[Prompt] {self.multitask_prompt_list}") + if split == "train": + self.data_path = dataset_config.train_scp_file_path + elif split == "val": + self.data_path = dataset_config.dev_scp_file_path + elif split == "test": + self.data_path = dataset_config.test_scp_file_path + else: + assert(0) + if musan_path is not None: + self.musan_list = [] + with open(musan_path) as f: + for line in f: + key,path = line.split(" ") + self.musan_list.append(path) + + + self.llm_name = dataset_config.get("llm_name", None) + self.prompt_style = dataset_config.get("prompt_style", "normal") + if self.llm_name == "Qwen2.5-7B-Instruct": + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + else: + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "USER: {}\n ASSISTANT:" + self.answer_template = "{}" + self.dataset_config = dataset_config + self.tokenizer = tokenizer + self.split = split + self.spec_augmentation = dataset_config.get("spec_augmentation", False) + self.speed_perturb = dataset_config.get("speed_perturb", False) + self.add_noise = dataset_config.get("musan", False) + self.add_reverb = dataset_config.get("add_reverb", False) + self.noise_file_path = dataset_config.get("noise_file_path", False) + if self.add_noise == True: + self.musan_wav_files = [] + for root, dirs, files in os.walk(self.noise_file_path): + for file in files: + if file.endswith('.wav'): + self.musan_wav_files.append(os.path.join(root, file)) + self.pad_or_trim = dataset_config.get("pad_or_trim", False) + self.IGNORE_INDEX = -100 # The default setting in CrossEntropyLoss + self.mel_size = dataset_config.get("mel_size", 80) # 80 for whisper large v1 and v2, 128 for large v3 + self.fix_length_audio = dataset_config.get("fix_length_audio", -1) + self.inference_mode = dataset_config.get("inference_mode", False) + self.prompt_mode = dataset_config.get("prompt_mode", None) + self.normalize = dataset_config.get("normalize", False) + self.input_type = dataset_config.get("input_type", None) + assert 
self.input_type in ["raw", "mel"], "input_type must be one of [raw, mel]"
+
+
+    def speedPerturb(self, audio_raw):
+        orig_freq = 16000
+        # Speed-perturbation factors, e.g. [0.9, 1.0, 1.1]: 10% slower, unchanged, 10% faster.
+        factors = [0.9, 1, 1.1]
+        speed_perturbation = SpeedPerturbation(orig_freq, factors)
+        # Apply the perturbation to the raw waveform.
+        audio_raw = torch.from_numpy(audio_raw)
+        # SpeedPerturbation returns (waveform, lengths); keep only the waveform.
+        audio_raw = speed_perturbation(audio_raw)[0]
+        return audio_raw
+    def specAugment(self, spec):
+        spec = spec.permute(1, 0).unsqueeze(0)
+        stretch = T.TimeStretch(n_freq=128)
+        rate = random.random() * 0.2 + 0.9
+        Timemasking = T.TimeMasking(time_mask_param=100)
+        Frequencymasking = T.FrequencyMasking(freq_mask_param=27)
+        spec = stretch(spec, rate).to(torch.float32)
+        spec = Timemasking(spec)
+        spec = Timemasking(spec)
+        spec = Frequencymasking(spec)
+        spec = Frequencymasking(spec)
+        spec = spec.squeeze(0).permute(1, 0)
+        return spec
+    def addNoise(self, audio_raw):
+        # Mix a randomly chosen noise clip into the speech at a random SNR.
+        noise, _ = torchaudio.load(random.choice(self.musan_wav_files))
+        if audio_raw.dim() == 1:
+            audio_raw = audio_raw.unsqueeze(0)
+        noise = noise[:1, :]  # keep a single channel
+        if audio_raw.shape[-1] > noise.shape[-1]:
+            # Speech is longer than the noise: place the noise at a random offset and zero-pad the rest.
+            start_idx = random.randint(0, audio_raw.shape[-1] - noise.shape[-1])
+            padded_noise = torch.zeros_like(audio_raw)
+            padded_noise[:, start_idx:start_idx + noise.shape[-1]] = noise
+            noise = padded_noise
+        else:
+            # Noise is longer than the speech: crop a random segment of matching length.
+            start_idx = random.randint(0, noise.shape[-1] - audio_raw.shape[-1])
+            noise = noise[:, start_idx:start_idx + audio_raw.shape[-1]]
+        snr_dbs = torch.tensor([random.randint(1, 30)], dtype=torch.float)
+        noisy_speech = F.add_noise(audio_raw, noise, snr_dbs)
+        return noisy_speech
+    # def simulate_room_reverb(self, audio_raw, fs):
+    #     # Random room dimensions and target reverberation time (RT60, seconds).
+    #     room_dim = [random.uniform(3, 10), random.uniform(3, 10), random.uniform(2, 5)]
+    #     rt60_tgt = random.uniform(0.3, 1.0)
+    #     # Use the Sabine formula to derive the absorption coefficient and reflection order.
+    #     e_absorption, max_order = pra.inverse_sabine(rt60_tgt, room_dim)
+    #     # Build the simulated room.
+    #     room = pra.ShoeBox(
+    #         room_dim,
+    #         fs=fs,
+    #         materials=pra.Material(e_absorption),
+    #         max_order=int(max_order),
+    #         use_rand_ism=True,  # randomized image method reduces sweeping echoes
+    #         max_rand_disp=0.05,  # maximum random displacement (meters)
+    #     )
+    #     # Random source position.
+    #     source_position = [random.uniform(0.5, room_dim[0] - 0.5),
+    #                        random.uniform(0.5, room_dim[1] - 0.5),
+    #                        random.uniform(0.5, room_dim[2] - 0.5)]
+    #     room.add_source(source_position, signal=audio_raw)
+    #     # Random microphone position.
+    #     mic_locs = np.c_[
+    #         [random.uniform(0.5, room_dim[0] - 0.5), random.uniform(0.5, room_dim[1] - 0.5), random.uniform(0.5, room_dim[2] - 0.5)],
+    #     ]
+    #     room.add_microphone_array(mic_locs)
+    #     # Run the simulation and return the signal at the microphone array.
+    #     room.simulate()
+    #     return room.mic_array.signals[0, :]
+    def __iter__(self):
+        multitask_task_path = os.path.join(self.data_path, "multitask.jsonl")
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:  # not running in a multi-worker DataLoader
+            num_workers = 1
+            worker_id = 0
+        else:
+            num_workers = worker_info.num_workers
+            worker_id = worker_info.id
+
+        # Query the distributed environment for the process layout.
+        if dist.is_available() and dist.is_initialized():
+            world_size = dist.get_world_size()
+            rank = dist.get_rank()
+        else:
+            world_size = 1
+            rank = 0
+
+        # Shard the jsonl lines across all (rank, dataloader worker) pairs.
+        total_num_workers = num_workers * world_size
+        worker_rank = rank * num_workers + worker_id
+        data_index = 0
+        with open(multitask_task_path) as f_task:
+            for line in f_task:
+                if (data_index % total_num_workers) == worker_rank:
+                    item = json.loads(line.strip())
+                    key = item["key"]
+                    target = item["target"]
+                    yield {
+                        "key": key,
+                        "target": target,
+                    }
+                data_index += 1
+
+
+    def 
collator(self, samples):
+        assert samples is not None
+        target = [_["target"] for _ in samples]
+        # Pad the batch so sequences of different lengths can be stacked into one tensor.
+        processed_data = self.tokenizer(text=target, padding=True, return_tensors="pt")
+        # Build labels from the input ids and exclude padded positions from the loss.
+        labels = copy.deepcopy(processed_data["input_ids"])
+        labels[processed_data["attention_mask"] == 0] = self.IGNORE_INDEX
+        processed_data["labels"] = labels
+        return processed_data
+
+
+def get_speech_dataset(dataset_config, tokenizer, split):
+    dataset = MultiTaskDataset(dataset_config, tokenizer, split)
+    return dataset
+
+
\ No newline at end of file
diff --git a/examples/aispeech_domain/docs/demo.png b/examples/aispeech_domain/docs/demo.png new file mode 100644 index 00000000..acba4559 Binary files /dev/null and b/examples/aispeech_domain/docs/demo.png differ
diff --git a/examples/aispeech_domain/docs/model.png b/examples/aispeech_domain/docs/model.png new file mode 100644 index 00000000..95ecefc5 Binary files /dev/null and b/examples/aispeech_domain/docs/model.png differ
diff --git a/examples/aispeech_domain/finetune_mala_asr.py b/examples/aispeech_domain/finetune_mala_asr.py new file mode 100644 index 00000000..3f0d46a0 --- /dev/null +++ b/examples/aispeech_domain/finetune_mala_asr.py @@ -0,0 +1,49 @@
+from slam_llm.pipeline.finetune import main as train
+from typing import Optional
+
+import hydra
+import logging
+from dataclasses import dataclass, field
+from omegaconf import DictConfig, ListConfig, OmegaConf
+from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig
+
+@dataclass
+class RunConfig:
+    dataset_config: DataConfig = field(default_factory=DataConfig)
+    model_config: ModelConfig = field(default_factory=ModelConfig)
+    train_config: TrainConfig = field(default_factory=TrainConfig)
+    log_config: LogConfig = field(default_factory=LogConfig)
+    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
+    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
+    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
+    ckpt_path: Optional[str] = field(
+        default=None, metadata={"help": "The path to projector checkpoint"}
+    )
+
+@hydra.main(config_name=None, version_base=None)
+def main_hydra(cfg: DictConfig):
+    run_config = RunConfig()
+    cfg = OmegaConf.merge(run_config, cfg)
+    def to_plain_list(cfg_item):
+        if isinstance(cfg_item, ListConfig):
+            return OmegaConf.to_container(cfg_item, resolve=True)
+        elif isinstance(cfg_item, DictConfig):
+            return {k: to_plain_list(v) for k, v in cfg_item.items()}
+        else:
+            return cfg_item
+
+    # kwargs = to_plain_list(cfg)
+    kwargs = cfg
+    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
+
+    logging.basicConfig(level=log_level)
+
+    if kwargs.get("debug", False):
+        import pdb
+        pdb.set_trace()
+
+    train(kwargs)
+
+
+if __name__ == "__main__":
+    main_hydra()
\ No newline at end of file
diff --git a/examples/aispeech_domain/finetune_mala_asr_deepspeed.py b/examples/aispeech_domain/finetune_mala_asr_deepspeed.py new file mode 100644 index 00000000..2ee6b1bf --- /dev/null +++ b/examples/aispeech_domain/finetune_mala_asr_deepspeed.py @@ -0,0 +1,63 @@
+from slam_llm.pipeline.finetune_deepspeed import main as train
+from typing import Optional
+import argparse
+import hydra
+import logging
+from dataclasses import dataclass, field
+from omegaconf import DictConfig, ListConfig, OmegaConf
+from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig
+import sys
+@dataclass
+class RunConfig:
+    dataset_config: DataConfig = field(default_factory=DataConfig)
+    model_config: ModelConfig = 
field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_config : str ="" + deepspeed_ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_ckpt_id: Optional[str] = field( + default=None, metadata={"help": "The id to projector checkpoint"} + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_domain/inference_batch_deepspeed.py b/examples/aispeech_domain/inference_batch_deepspeed.py new file mode 100644 index 00000000..f9b583a9 --- /dev/null +++ b/examples/aispeech_domain/inference_batch_deepspeed.py @@ -0,0 +1,66 @@ +from slam_llm.pipeline.inference_batch_deepspeed import main as inference +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args 
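+# Defaults from RunConfig are merged with the Hydra overrides from the command line;
+# values passed as ++key=value take precedence over the dataclass defaults.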
+@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + inference(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/aispeech_domain/inference_mala_asr_batch.py b/examples/aispeech_domain/inference_mala_asr_batch.py new file mode 100644 index 00000000..e733f7b3 --- /dev/null +++ b/examples/aispeech_domain/inference_mala_asr_batch.py @@ -0,0 +1,53 @@ +from slam_llm.pipeline.inference_batch import main as inference + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: str = field( + default="output/model.pt", metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) + + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + main_hydra() diff --git a/examples/aispeech_domain/mala_asr_config.py b/examples/aispeech_domain/mala_asr_config.py new file mode 100644 index 00000000..59ae523a --- /dev/null +++ b/examples/aispeech_domain/mala_asr_config.py @@ -0,0 +1,155 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from torch.distributed.fsdp import ShardingStrategy + + +@dataclass +class ModelConfig: + file: str = "examples/aispeech_asr/model/slam_model_mala_asr.py:model_factory" + llm_name: str = "vicuna-7b-v1.5" + llm_path: str = "PATH/to/LLAMA/7B" + llm_type: str = "decoder_only" + llm_dim: int = 4096 + encoder_name: Optional[str] = None + encoder_ds_rate: int = 2 + encoder_path: Optional[str] = None + encoder_path_hf: Optional[str] = None + encoder_dim: int = 1280 + 
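+    # 1280 matches the Whisper large encoder used by the example scripts; WavLM-Large uses 1024.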
encoder_projector: str = "linear" + qformer_layers : int = 8 + encoder_projector_ds_rate: int = 5 + modal: str = "audio" + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + encoder_type: str = field(default="finetune", metadata={ + "help": "whether model is only pretrained or finetuned, used for models such as hubert" + }) + +# @dataclass +# class PeftConfig: +# peft_method: str = "lora" # None , llama_adapter, prefix +# r: int = 8 +# lora_alpha: int = 32 +# target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"]) +# bias: str = "none" +# task_type: str = "CAUSAL_LM" +# lora_dropout: float = 0.05 +# inference_mode: bool = False + +@dataclass +class PeftConfig: + peft_method: str = "lora" # None , llama_adapter, prefix + r: int = 64 + lora_alpha: int = 16 + target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj", "o_proj", "up_proj","gate_proj","down_proj"]) + bias: str = "none" + task_type: str = "CAUSAL_LM" + lora_dropout: float = 0.05 + inference_mode: bool = False + +@dataclass +class TrainConfig: + model_name:str = "PATH/to/LLAMA/7B" + enable_ddp:bool = False + enable_deepspeed:bool = False + enable_fsdp:bool = False + low_cpu_fsdp:bool = False + run_validation:bool = True + batch_size_training:int = 4 + batching_strategy:str = field(default="packing", metadata={ + "help":"alternative: padding" + }) # + context_length:int = 4096 + gradient_accumulation_steps:int = 1 + num_epochs:int = 3 + num_workers_dataloader:int = 1 + warmup_steps:int = 1000 + total_steps:int = 100000 + validation_interval:int = 1000 + lr:float = 1e-4 + weight_decay:float = 0.0 + gamma:float = 0.85 + seed:int = 1234 + use_fp16:bool = False + mixed_precision:bool = True + val_batch_size:int = 1 + + use_peft:bool = False + peft_config:PeftConfig = field(default_factory=PeftConfig) + output_dir:str = "PATH/to/save/PEFT/model" + freeze_layers:bool = False + num_freeze_layers:int = 1 + quantization:bool = False + one_gpu:bool = False + save_model:bool = True + dist_checkpoint_root_folder:str = "PATH/to/save/FSDP/model" # will be used if using FSDP + dist_checkpoint_folder:str = "fine-tuned" # will be used if using FSDP + save_optimizer:bool = False # will be used if using FSDP + use_fast_kernels:bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels + run_test_during_validation:bool = False + run_test_during_validation_file:str = "test.wav" + run_test_during_validation_prompt:str = "<|ASR|>" + freeze_llm:bool = field(default=False, metadata={ + "help": "whether to freeze llm when finetuning, should be true when use peft finetuning" + }) + freeze_encoder:bool = False + +@dataclass +class DataConfig: + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + prompt_style: str = "normal" # instruct + file: str = "examples/aispeech_asr/dataset/multitask_dataset.py:get_speech_dataset" + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + dev_scp_file_path: str = "" + test_scp_file_path: str = "" + train_split: str = "train" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True + prompt: Optional[str] = None + use_ocr: bool = True + inference_mode: bool = False + prompt_mode : str = "qwen" + lower: bool = False + encoder_ds_rate: int = 2 + 
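+    # The projector downsampling rate below is overridden to 5 by the example scripts
+    # for both training and decoding.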
encoder_projector_ds_rate: int = 5 + fix_length_audio: int = -1 + inference_mode:bool = False + input_type: str = field(default="raw", metadata={ + "help":"Use raw when input is wav, mel when for whisper" + }) + mel_size: int = field(default=80, metadata={ + "help": "80 for whisper large v1 and v2, 128 for v3" + }) + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + +@dataclass +class FSDPConfig: + mixed_precision: bool = True + use_fp16: bool = False + # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. + fsdp_activation_checkpointing: bool = True + fsdp_cpu_offload: bool = False + pure_bf16: bool = False + optimizer: str = "AdamW" + +@dataclass +class LogConfig: + use_wandb: bool = False + wandb_dir: str = "tmp/test_wandb" + wandb_entity_name: str = "project_name" + wandb_project_name: str = "project_name" + wandb_exp_name: str = "exp_name" + log_file: str = "tmp/test.log" + log_interval: int = 5 diff --git a/examples/aispeech_domain/model/slam_model_mala_asr.py b/examples/aispeech_domain/model/slam_model_mala_asr.py new file mode 100644 index 00000000..6ef5c763 --- /dev/null +++ b/examples/aispeech_domain/model/slam_model_mala_asr.py @@ -0,0 +1,41 @@ +import torch +import os +import logging +from slam_llm.models.slam_model import ( + slam_model, + setup_tokenizer, + setup_encoder, + setup_encoder_projector, + setup_llm, +) +from slam_llm.utils.train_utils import print_model_size + +logger = logging.getLogger(__name__) + +def model_factory(train_config, model_config, **kwargs): + # return necessary components for training + tokenizer = setup_tokenizer(train_config, model_config, **kwargs) + + + + # llm + model = setup_llm(train_config, model_config, **kwargs) + ckpt_path = kwargs.get( + "ckpt_path", None + ) # FIX(MZY): load model ckpt(mainly projector, related to model_checkpointing/checkpoint_handler.py: save_model_checkpoint_peft) + if ckpt_path is not None: + logger.info("loading other parts from: {}".format(ckpt_path)) + ckpt_dict = torch.load(ckpt_path, map_location="cpu") + model.load_state_dict(ckpt_dict, strict=False) + + + print_model_size( + model, + train_config, + ( + int(os.environ["RANK"]) + if train_config.enable_fsdp or train_config.enable_ddp + else 0 + ), + ) + return model, tokenizer diff --git a/examples/aispeech_domain/scripts/decode_MaLa-ASR.sh b/examples/aispeech_domain/scripts/decode_MaLa-ASR.sh new file mode 100644 index 00000000..ce4efefb --- /dev/null +++ b/examples/aispeech_domain/scripts/decode_MaLa-ASR.sh @@ -0,0 +1,141 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +# export CUDA_VISIBLE_DEVICES=1 +# export ASCEND_VISIBLE_DEVICES=1 +export TOKENIZERS_PARALLELISM=false +# export OPENBLAS_NUM_THREADS=1 +# export GOTO_NUM_THREADS=1 +# export OMP_NUM_THREADS=1 +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=aishell-1 +prompt_style=instruct # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == gigaspeech ]] +then + # 
aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_gss + dataset_task=asr +fi +projector=linear +encoder_name=whisper +sentence=connect +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=5 +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250307/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1718/mala_asr_epoch_1_step_160000/ + +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == gigaspeech ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +elif [[ $dataset == "wenetspeech_test_net" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_net/ +elif [[ $dataset == "wenetspeech_test_meeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_meeting/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi + +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +python $code_dir/inference_mala_asr_batch.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + 
++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=mala_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=0 \ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +python "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh b/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh new file mode 100644 index 00000000..cadd54f4 --- /dev/null +++ b/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeech_multinode_nossh.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +export HCCL_CONNECT_TIMEOUT=3600 +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# dataset=zh-6w-en-1w-asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 + +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250305/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1309/mala_asr_epoch_1_step_90000 +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +# 
deepspeed_ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250305/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1309/mala_asr_epoch_1_step_90000 +# deepspeed_ckpt_id=global_step90000 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250305/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_instruct__speedfalse_specaugfalse-1309/mala_asr_epoch_1_step_90000 + # deepspeed_ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ 
+++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=10 \ +++train_config.total_steps=600000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=7 \ +++train_config.val_batch_size=7 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" + # hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +fi +hydra_args+="++ckpt_path=$ckpt_path/model.pt" +# hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/inference_batch_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +} +gen_hostfile +do_train \ No newline at end of file diff --git a/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeed.sh b/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeed.sh new file mode 100644 index 00000000..95b3cf49 --- /dev/null +++ b/examples/aispeech_domain/scripts/decode_MaLa-ASR_deepspeed.sh @@ -0,0 +1,145 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +# export CUDA_VISIBLE_DEVICES=1 +# export ASCEND_VISIBLE_DEVICES=1 +export TOKENIZERS_PARALLELISM=false +# export OPENBLAS_NUM_THREADS=1 +# export GOTO_NUM_THREADS=1 +# export OMP_NUM_THREADS=1 +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=aishell-1 +prompt_style=normal # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == gigaspeech ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_gss + dataset_task=asr_far_bf +fi +projector=linear +encoder_name=whisper +sentence=connect +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=5 
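+# ckpt_path points to a training output directory containing model.pt (the saved
+# projector / LoRA weights); it also serves as hydra.run.dir and the decode_log prefix below.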
+ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1w-en-1w-asr/20250315/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-1341/mala_asr_epoch_1_step_110000 + +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == gigaspeech ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +elif [[ $dataset == "wenetspeech_test_net" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_net/ +elif [[ $dataset == "wenetspeech_test_meeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/wenetspeech/asr/test_meeting/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi + +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +deepspeed \ + --num_nodes 1 \ + --num_gpus 8 \ + $code_dir/inference_batch_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + 
++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=mala_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=0 \ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +python "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser + diff --git a/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed.sh b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed.sh new file mode 100644 index 00000000..3ce28139 --- /dev/null +++ b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed.sh @@ -0,0 +1,181 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# dataset=multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + 
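+        # Whisper large-v3 uses 128 mel bins; the large v1/v2 checkpoints use 80.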
speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.total_steps=100000 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=10 \ +++train_config.val_batch_size=10 \ +++train_config.num_workers_dataloader=16 \ +++train_config.output_dir=$output_dir \ 
+++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + deepspeed \ + --num_nodes 1 \ + --num_gpus 1 \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +fi diff --git a/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh new file mode 100644 index 00000000..cfc48ca3 --- /dev/null +++ b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false + +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr + +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=./conf/ds_config.json +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || 
$dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.total_steps=100000 \ +++train_config.validation_interval=1000 \ +++train_config.batch_size_training=6 \ +++train_config.val_batch_size=6 \ +++train_config.num_workers_dataloader=0 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --hostfile $HOST_FILE \ + --ssh_port $SSH_PORT \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + 
++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} \ + + +} + +if [ "${RANK}" = "0" ]; then #只在index为RANK0的POD上启动主训练脚本,其他节点由主节点通过ssh分发任务($RANK由调度系统自动分配) + gen_hostfile #生成分布式训练需要的hostfile + do_train #启动训练 +else + /usr/sbin/sshd -D -p ${SSH_PORT} #其他节点的task,仅前台运行sshd服务,不执行主训练脚本,不要修改 +fi \ No newline at end of file diff --git a/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh new file mode 100644 index 00000000..804bb5df --- /dev/null +++ b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_deepspeed_multinode_nossh.sh @@ -0,0 +1,193 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false + +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +dataset=zh-1w-en-1w-asr +# dataset=zh-1w+en-1w-asr +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=false +pad_or_trim=true +encoder_projector_ds_rate=5 + +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/conf/ds_config.json +deepspeed_ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1w-en-1w-asr/20250317/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-1959/mala_asr_epoch_1_step_140000 +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/multitask_asr/20250302/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal__speedfalse_specaugfalse-2232/mala_asr_epoch_2_step_32463/ +# deepspeed_ckpt_id=global_step90000 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" ]];then + ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/aispeech_asr/exp/zh-1w-en-1w-asr/20250315/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-1341/mala_asr_epoch_1_step_110000 + # deepspeed_ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + 
input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.total_steps=600000 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=4 \ +++train_config.val_batch_size=4 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" ]];then + # hydra_args+="++ckpt_path=$ckpt_path/model.pt" + hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +fi +# hydra_args+=" ++deepspeed_ckpt_path=$deepspeed_ckpt_path " + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 
#因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +} +gen_hostfile +do_train \ No newline at end of file diff --git a/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun.sh b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun.sh new file mode 100644 index 00000000..665743fc --- /dev/null +++ b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +# multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ 
$llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=50 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client 
$code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29505 \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh new file mode 100644 index 00000000..b54e261b --- /dev/null +++ b/examples/aispeech_domain/scripts/finetune_MaLa-ASR_torchrun_multiplenode.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/aispeech_asr +# multitask +# dataset=alimeeting +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset 
== aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=4 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 2 \ + --nproc_per_node 8 \ + --node_rank=$RANK \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + 
${hydra_args} +fi diff --git a/examples/aispeech_domain/scripts/transcribe_deepspeed_to_pt.py b/examples/aispeech_domain/scripts/transcribe_deepspeed_to_pt.py new file mode 100644 index 00000000..e2a02862 --- /dev/null +++ b/examples/aispeech_domain/scripts/transcribe_deepspeed_to_pt.py @@ -0,0 +1,9 @@ +import argparse +import torch +import torch_npu +import sys +in_path = sys.argv[1] +out_path = sys.argv[2] +weight_dict = torch.load(in_path)["module"] +torch.save(weight_dict, f"{out_path}/model.pt") +print("[Finish]") \ No newline at end of file diff --git a/examples/aispeech_domain/slam_llm b/examples/aispeech_domain/slam_llm new file mode 120000 index 00000000..e2d801ef --- /dev/null +++ b/examples/aispeech_domain/slam_llm @@ -0,0 +1 @@ +/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/src/slam_llm \ No newline at end of file diff --git a/examples/asr_fireredasr/conf/ds_config.json b/examples/asr_fireredasr/conf/ds_config.json new file mode 100644 index 00000000..56af1aff --- /dev/null +++ b/examples/asr_fireredasr/conf/ds_config.json @@ -0,0 +1,36 @@ +{ + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 5e-6 + } + + }, + "bf16": { + "enabled": true, + "auto_cast": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "overlap_comm": true, + "reduce_scatter": true, + "contiguous_gradients": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0.0, + "warmup_max_lr": 0.000005, + "warmup_num_steps": 100 + } + }, + "checkpoint_activations": false +} \ No newline at end of file diff --git a/examples/asr_fireredasr/conf/prompt.yaml b/examples/asr_fireredasr/conf/prompt.yaml new file mode 100644 index 00000000..8e4a3486 --- /dev/null +++ b/examples/asr_fireredasr/conf/prompt.yaml @@ -0,0 +1,4 @@ +dataset_config: + # we put prompt here, because the hydra override in shell script only support a small subset of chars + # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. 
" + prompt: "请转写音频为文字" diff --git a/examples/asr_fireredasr/finetune_fireredasr.py b/examples/asr_fireredasr/finetune_fireredasr.py new file mode 100644 index 00000000..ef277e63 --- /dev/null +++ b/examples/asr_fireredasr/finetune_fireredasr.py @@ -0,0 +1,49 @@ +from slam_llm.pipeline.finetune import main as train +from typing import Optional + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to model's checkpoint"} + ) + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + main_hydra() \ No newline at end of file diff --git a/examples/asr_fireredasr/finetune_fireredasr_deepspeed.py b/examples/asr_fireredasr/finetune_fireredasr_deepspeed.py new file mode 100644 index 00000000..d0a3e549 --- /dev/null +++ b/examples/asr_fireredasr/finetune_fireredasr_deepspeed.py @@ -0,0 +1,57 @@ +from slam_llm.pipeline.finetune_deepspeed import main as train +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_config : str ="" +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def 
to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/asr_fireredasr/fireredasr_config.py b/examples/asr_fireredasr/fireredasr_config.py new file mode 100644 index 00000000..6c4f3e0c --- /dev/null +++ b/examples/asr_fireredasr/fireredasr_config.py @@ -0,0 +1,166 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from torch.distributed.fsdp import ShardingStrategy + +@dataclass +class ModelConfig: + file: str = "examples/asr_fireredasr/model/slam_fireredasr.py" + llm_name: str = "vicuna-7b-v1.5" + llm_path: str = "PATH/to/LLAMA/7B" + llm_type: str = "decoder_only" + llm_dim: int = 4096 + firered_path :str = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/" + encoder_name: Optional[str] = None + encoder_ds_rate: int = 2 + encoder_path: Optional[str] = None + encoder_path_hf: Optional[str] = None + encoder_dim: int = 1280 + encoder_projector: str = "linear" + qformer_layers : int = 8 + encoder_projector_ds_rate: int = 2 + modal: str = "audio" + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + encoder_type: str = field(default="finetune", metadata={ + "help": "whether model is only pretrained or finetuned, used for models such as hubert" + }) + +# @dataclass +# class PeftConfig: +# peft_method: str = "lora" # None , llama_adapter, prefix +# r: int = 8 +# lora_alpha: int = 32 +# target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"]) +# bias: str = "none" +# task_type: str = "CAUSAL_LM" +# lora_dropout: float = 0.05 +# inference_mode: bool = False + +@dataclass +class PeftConfig: + peft_method: str = "lora" # None , llama_adapter, prefix + r: int = 64 + lora_alpha: int = 16 + target_modules: List = field(default_factory=lambda: [ "q_proj","k_proj", "v_proj", "o_proj", "up_proj","gate_proj","down_proj"]) + bias: str = "none" + task_type: str = "CAUSAL_LM" + lora_dropout: float = 0.05 + inference_mode: bool = False + +@dataclass +class TrainConfig: + model_name:str = "PATH/to/LLAMA/7B" + enable_ddp:bool = False + enable_deepspeed:bool = False + enable_fsdp:bool = False + low_cpu_fsdp:bool = False + run_validation:bool = True + batch_size_training:int = 4 + batching_strategy:str = field(default="packing", metadata={ + "help":"alternative: padding" + }) # + context_length:int = 4096 + gradient_accumulation_steps:int = 1 + num_epochs:int = 3 + num_workers_dataloader:int = 1 + warmup_steps:int = 1000 + total_steps:int = 100000 + validation_interval:int = 1000 + lr:float = 1e-4 + weight_decay:float = 0.0 + gamma:float = 0.85 + seed:int = 42 + use_fp16:bool = False + mixed_precision:bool = True + val_batch_size:int = 1 + + use_peft:bool = False + peft_config:PeftConfig = field(default_factory=PeftConfig) + output_dir:str = "PATH/to/save/PEFT/model" + freeze_layers:bool = False + 
num_freeze_layers:int = 1 + quantization:bool = False + one_gpu:bool = False + save_model:bool = True + dist_checkpoint_root_folder:str = "PATH/to/save/FSDP/model" # will be used if using FSDP + dist_checkpoint_folder:str = "fine-tuned" # will be used if using FSDP + save_optimizer:bool = False # will be used if using FSDP + use_fast_kernels:bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels + run_test_during_validation:bool = False + run_test_during_validation_file:str = "test.wav" + run_test_during_validation_prompt:str = "<|ASR|>" + freeze_llm:bool = field(default=False, metadata={ + "help": "whether to freeze llm when finetuning, should be true when use peft finetuning" + }) + freeze_encoder:bool = False + inference_mode:bool = True + +@dataclass +class DataConfig: + # dataset: str = "slidespeech_dataset" + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + prompt_style: str = "normal" # instruct + # file: str = "examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py:get_speech_dataset" + file: str = "examples/asr_fireredasr/model/slam_fireredasr.py:get_speech_dataset" + cmvn_file: str = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/cmvn.ark" + text: Optional[str] = None + wav_paths: Optional[List[str]] = None + wav_path: Optional[str] = None + wav_dir: Optional[str] = None + wav_scp: Optional[str] = None + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + train_text_file: str = "" + dev_scp_file_path: str = "" + dev_text_file: str = "" + test_scp_file_path: str = "" + train_split: str = "train" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True + prompt: Optional[str] = None + use_ocr: bool = True + inference_mode: bool = False + prompt_mode : str = "qwen" + lower: bool = False + encoder_ds_rate: int = 2 + encoder_projector_ds_rate: int = 5 + fix_length_audio: int = -1 + inference_mode:bool = False + input_type: str = field(default="raw", metadata={ + "help":"Use raw when input is wav, mel when for whisper" + }) + mel_size: int = field(default=80, metadata={ + "help": "80 for whisper large v1 and v2, 128 for v3" + }) + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + +@dataclass +class FSDPConfig: + mixed_precision: bool = True + use_fp16: bool = False + # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. 
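+    # SHARDED_STATE_DICT (the value set above) saves one checkpoint file per rank, so the
+    # world size can be resized when resuming; FULL_STATE_DICT is the single-file FSDP alternative.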
+ fsdp_activation_checkpointing: bool = True + fsdp_cpu_offload: bool = False + pure_bf16: bool = False + optimizer: str = "AdamW" + +@dataclass +class LogConfig: + use_wandb: bool = False + wandb_dir: str = "tmp/test_wandb" + wandb_entity_name: str = "project_name" + wandb_project_name: str = "project_name" + wandb_exp_name: str = "exp_name" + log_file: str = "tmp/test.log" + log_interval: int = 5 diff --git a/examples/asr_fireredasr/inference_fireredasr.py b/examples/asr_fireredasr/inference_fireredasr.py new file mode 100644 index 00000000..d7657776 --- /dev/null +++ b/examples/asr_fireredasr/inference_fireredasr.py @@ -0,0 +1,53 @@ +from slam_llm.pipeline.inference_batch import main as inference + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) + + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + main_hydra() diff --git a/examples/asr_fireredasr/inference_fireredasr_deepspeed.py b/examples/asr_fireredasr/inference_fireredasr_deepspeed.py new file mode 100644 index 00000000..b00bb9db --- /dev/null +++ b/examples/asr_fireredasr/inference_fireredasr_deepspeed.py @@ -0,0 +1,60 @@ +from slam_llm.pipeline.inference_batch_deepspeed import main as inference +import sys +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import argparse + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + 
ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() diff --git a/examples/asr_fireredasr/model/fireredasr/__init__.py b/examples/asr_fireredasr/model/fireredasr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/asr_fireredasr/model/fireredasr/data b/examples/asr_fireredasr/model/fireredasr/data new file mode 120000 index 00000000..3d6bf669 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/data @@ -0,0 +1 @@ +/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/FireRedASR/fireredasr/data \ No newline at end of file diff --git a/examples/asr_fireredasr/model/fireredasr/models/fireredasr.py b/examples/asr_fireredasr/model/fireredasr/models/fireredasr.py new file mode 100644 index 00000000..0f7da203 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/fireredasr.py @@ -0,0 +1,129 @@ +import os +import time + +import torch +import torch_npu +from fireredasr.data.asr_feat import ASRFeatExtractor +from fireredasr.models.fireredasr_aed import FireRedAsrAed +from fireredasr.models.fireredasr_llm import FireRedAsrLlm +from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer +from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper + + +class FireRedAsr: + @classmethod + def from_pretrained(cls, asr_type, model_dir): + assert asr_type in ["aed", "llm"] + + cmvn_path = os.path.join(model_dir, "cmvn.ark") + feat_extractor = ASRFeatExtractor(cmvn_path) + + if asr_type == "aed": + model_path = os.path.join(model_dir, "model.pth.tar") + dict_path =os.path.join(model_dir, "dict.txt") + spm_model = os.path.join(model_dir, "train_bpe1000.model") + model = load_fireredasr_aed_model(model_path) + tokenizer = ChineseCharEnglishSpmTokenizer(dict_path, spm_model) + elif asr_type == "llm": + model_path = os.path.join(model_dir, "model.pth.tar") + encoder_path = os.path.join(model_dir, "asr_encoder.pth.tar") + llm_dir = os.path.join(model_dir, "Qwen2-7B-Instruct") + model, tokenizer = load_firered_llm_model_and_tokenizer( + model_path, encoder_path, llm_dir) + model.eval() + return cls(asr_type, feat_extractor, model, tokenizer) + + def __init__(self, asr_type, feat_extractor, model, tokenizer): + self.asr_type = asr_type + self.feat_extractor = feat_extractor + self.model = model + self.tokenizer = tokenizer + + @torch.no_grad() + def transcribe(self, batch_uttid, batch_wav_path, args={}): + feats, lengths, durs = self.feat_extractor(batch_wav_path) + total_dur = sum(durs) + if args.get("use_gpu", False): + feats, lengths = feats.cuda(), lengths.cuda() + 
self.model.cuda() + else: + feats, lengths = feats.npu(), lengths.npu() + self.model.npu() + + if self.asr_type == "aed": + start_time = time.time() + + hyps = self.model.transcribe( + feats, lengths, + args.get("beam_size", 1), + args.get("nbest", 1), + args.get("decode_max_len", 0), + args.get("softmax_smoothing", 1.0), + args.get("aed_length_penalty", 0.0), + args.get("eos_penalty", 1.0) + ) + + elapsed = time.time() - start_time + rtf= elapsed / total_dur if total_dur > 0 else 0 + + results = [] + for uttid, wav, hyp in zip(batch_uttid, batch_wav_path, hyps): + hyp = hyp[0] # only return 1-best + hyp_ids = [int(id) for id in hyp["yseq"].cpu()] + text = self.tokenizer.detokenize(hyp_ids) + results.append({"uttid": uttid, "text": text, "wav": wav, + "rtf": f"{rtf:.4f}"}) + return results + + elif self.asr_type == "llm": + input_ids, attention_mask, _, _ = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=[""]*feats.size(0), tokenizer=self.tokenizer, + max_len=128, decode=True) + if args.get("use_gpu", False): + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + else: + input_ids = input_ids.npu() + attention_mask = attention_mask.npu() + start_time = time.time() + + generated_ids = self.model.transcribe( + feats, lengths, input_ids, attention_mask, + args.get("beam_size", 1), + args.get("decode_max_len", 0), + args.get("decode_min_len", 0), + args.get("repetition_penalty", 1.0), + args.get("llm_length_penalty", 0.0), + args.get("temperature", 1.0) + ) + + elapsed = time.time() - start_time + rtf= elapsed / total_dur if total_dur > 0 else 0 + texts = self.tokenizer.batch_decode(generated_ids, + skip_special_tokens=True) + results = [] + for uttid, wav, text in zip(batch_uttid, batch_wav_path, texts): + results.append({"uttid": uttid, "text": text, "wav": wav, + "rtf": f"{rtf:.4f}"}) + return results + + + +def load_fireredasr_aed_model(model_path): + package = torch.load(model_path, map_location=lambda storage, loc: storage) + print("model args:", package["args"]) + model = FireRedAsrAed.from_args(package["args"]) + model.load_state_dict(package["model_state_dict"], strict=True) + return model + + +def load_firered_llm_model_and_tokenizer(model_path, encoder_path, llm_dir): + package = torch.load(model_path, map_location=lambda storage, loc: storage) + package["args"].encoder_path = encoder_path + package["args"].llm_dir = llm_dir + print("model args:", package["args"]) + model = FireRedAsrLlm.from_args(package["args"]) + model.load_state_dict(package["model_state_dict"], strict=False) + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(llm_dir) + return model, tokenizer diff --git a/examples/asr_fireredasr/model/fireredasr/models/fireredasr_aed.py b/examples/asr_fireredasr/model/fireredasr/models/fireredasr_aed.py new file mode 100644 index 00000000..4938c2c3 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/fireredasr_aed.py @@ -0,0 +1,35 @@ +import torch + +from fireredasr.models.module.conformer_encoder import ConformerEncoder +from fireredasr.models.module.transformer_decoder import TransformerDecoder + + +class FireRedAsrAed(torch.nn.Module): + @classmethod + def from_args(cls, args): + return cls(args) + + def __init__(self, args): + super().__init__() + self.sos_id = args.sos_id + self.eos_id = args.eos_id + + self.encoder = ConformerEncoder( + args.idim, args.n_layers_enc, args.n_head, args.d_model, + args.residual_dropout, args.dropout_rate, + args.kernel_size, args.pe_maxlen) + + self.decoder = TransformerDecoder( + args.sos_id, 
args.eos_id, args.pad_id, args.odim, + args.n_layers_dec, args.n_head, args.d_model, + args.residual_dropout, args.pe_maxlen) + + def transcribe(self, padded_input, input_lengths, + beam_size=1, nbest=1, decode_max_len=0, + softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0): + enc_outputs, _, enc_mask = self.encoder(padded_input, input_lengths) + nbest_hyps = self.decoder.batch_beam_search( + enc_outputs, enc_mask, + beam_size, nbest, decode_max_len, + softmax_smoothing, length_penalty, eos_penalty) + return nbest_hyps diff --git a/examples/asr_fireredasr/model/fireredasr/models/fireredasr_llm.py b/examples/asr_fireredasr/model/fireredasr/models/fireredasr_llm.py new file mode 100644 index 00000000..350dacf4 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/fireredasr_llm.py @@ -0,0 +1,343 @@ +import logging +import os +import random +import re + +import torch +import torch.nn as nn +from transformers import AutoModelForCausalLM +from torch.npu.amp import autocast +from fireredasr.models.fireredasr_aed import FireRedAsrAed +from fireredasr.models.module.adapter import Adapter +from fireredasr.tokenizer.llm_tokenizer import DEFAULT_SPEECH_TOKEN, IGNORE_TOKEN_ID +from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper +from fireredasr.utils.param import count_model_parameters +import sys +sys.path.append('/aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/asr_fireredasr') +from slam_llm.utils.metric import compute_accuracy + +class FireRedAsrLlm(nn.Module): + @classmethod + def load_encoder(cls, model_path): + assert os.path.exists(model_path) + package = torch.load(model_path, map_location=lambda storage, loc: storage) + model = FireRedAsrAed.from_args(package["args"]) + if "model_state_dict" in package: + model.load_state_dict(package["model_state_dict"], strict=False) + encoder = model.encoder + encoder_dim = encoder.odim + return encoder, encoder_dim + + @classmethod + def from_args(cls, args): + logging.info(args) + logging.info("Build FireRedAsrLlm") + # Build Speech Encoder + encoder, encoder_dim = cls.load_encoder(args.encoder_path) + count_model_parameters(encoder) + if args.freeze_encoder: + logging.info(f"Frezee encoder") + for name, param in encoder.named_parameters(): + param.requires_grad = False + encoder.eval() + + if args.use_flash_attn: + attn_implementation = "flash_attention_2" + if args.use_fp16: + torch_dtype = torch.float16 + else: + torch_dtype = torch.float32 + else: + attn_implementation = "eager" + if args.use_fp16: + torch_dtype = torch.float16 + else: + torch_dtype = torch.float32 + + # Build LLM + llm = AutoModelForCausalLM.from_pretrained( + args.llm_dir, + attn_implementation=attn_implementation, + torch_dtype=torch_dtype, + ) + + count_model_parameters(llm) + # LLM Freeze or LoRA + llm_dim = llm.config.hidden_size + if args.freeze_llm: + logging.info(f"Frezee LLM") + for name, param in llm.named_parameters(): + param.requires_grad = False + llm.eval() + else: + if args.use_lora: + from peft import LoraConfig, get_peft_model + lora_config = LoraConfig( + r=64, + lora_alpha=16, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ], + lora_dropout=0.05, + task_type="CAUSAL_LM", + ) + llm = get_peft_model(llm, lora_config) + llm.print_trainable_parameters() + + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(args.llm_dir) + assert tokenizer.pad_token_id == tokenizer.convert_tokens_to_ids("<|endoftext|>") + llm.config.pad_token_id = 
tokenizer.pad_token_id + llm.config.bos_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>") + llm.config.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>") + llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids( + DEFAULT_SPEECH_TOKEN + ) + + # Build projector + encoder_projector = Adapter( + encoder_dim, llm_dim, args.encoder_downsample_rate) + count_model_parameters(encoder_projector) + + return cls(encoder, llm, encoder_projector, + args.freeze_encoder, args.freeze_llm) + + def __init__(self, encoder, llm, encoder_projector, + freeze_encoder, freeze_llm): + super().__init__() + self.encoder = encoder + self.llm = llm + self.encoder_projector = encoder_projector + # args + self.freeze_encoder = freeze_encoder + self.freeze_llm = freeze_llm + self.llm_config = llm.config + + def transcribe(self, padded_feat, feat_lengths, padded_input_ids, attention_mask, + beam_size=1, decode_max_len=0, decode_min_len=0, + repetition_penalty=1.0, llm_length_penalty=0, temperature=1.0): + encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths) + speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + + inputs_embeds, attention_mask, _ = \ + self._merge_input_ids_with_speech_features( + speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, + speech_lens=speech_lens + ) + + max_new_tokens = speech_features.size(1) if decode_max_len < 1 else decode_max_len + max_new_tokens = max(1, max_new_tokens) + + generated_ids = self.llm.generate( + inputs_embeds=inputs_embeds, + max_new_tokens=max_new_tokens, + num_beams=beam_size, + do_sample=False, + min_length=decode_min_len, + top_p=1.0, + repetition_penalty=repetition_penalty, + length_penalty=llm_length_penalty, + temperature=temperature, + bos_token_id=self.llm.config.bos_token_id, + eos_token_id=self.llm.config.eos_token_id, + pad_token_id=self.llm.config.pad_token_id, + ) + + return generated_ids + @autocast(dtype=torch.bfloat16) + def forward(self, **batch): + padded_feat = batch["feats"] + feat_lengths = batch["lengths"] + padded_input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + target_ids = batch["target_ids"] + # print(padded_feat.dtype) + # input() + encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths) + speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + # train + inputs_embeds, attention_mask, labels = \ + self._merge_input_ids_with_speech_features( + speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, target_ids, + speech_lens=speech_lens + ) + model_outputs = self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels) + acc = -1 + with torch.no_grad(): + preds = torch.argmax(model_outputs.logits, -1) + acc = compute_accuracy(preds.detach()[:, :-1], labels.detach()[:, 1:], ignore_label=-100) + # input() + return model_outputs, acc + + # SLAM-LLM api + @torch.no_grad() + def generate(self, **batch): + # decode args: + beam_size=3 + decode_max_len=0 + decode_min_len=0 + repetition_penalty=3.0 + llm_length_penalty=1.0 + temperature=1.0 + + keys = batch["keys"] + padded_feat = batch["feats"] + feat_lengths = batch["lengths"] + padded_input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + encoder_outs, enc_lengths, enc_mask = 
self.encoder(padded_feat, feat_lengths) + speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + inputs_embeds, attention_mask, _ = \ + self._merge_input_ids_with_speech_features( + speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, + speech_lens=speech_lens + ) + max_new_tokens = speech_features.size(1) if decode_max_len < 1 else decode_max_len + max_new_tokens = max(1, max_new_tokens) + generated_ids = self.llm.generate( + inputs_embeds=inputs_embeds, + max_new_tokens=max_new_tokens, + num_beams=beam_size, + do_sample=False, + min_length=decode_min_len, + top_p=1.0, + repetition_penalty=repetition_penalty, + length_penalty=llm_length_penalty, + temperature=temperature, + bos_token_id=self.llm.config.bos_token_id, + eos_token_id=self.llm.config.eos_token_id, + pad_token_id=self.llm.config.pad_token_id, + ) + return generated_ids + + def _merge_input_ids_with_speech_features( + self, speech_features, inputs_embeds, input_ids, attention_mask, labels=None, + speech_lens=None + ): + """ + Modified from: https://github.com/k2-fsa/icefall/blob/master/egs/speech_llm/ASR_LLM/whisper_llm_zh/model.py + """ + speech_lens = None + num_speechs, speech_len, embed_dim = speech_features.shape + batch_size, sequence_length = input_ids.shape + left_padding = not torch.sum( + input_ids[:, -1] == torch.tensor(self.llm.config.pad_token_id) + ) + # print(f"pad_token_id{self.llm.config.pad_token_id}") + # 1. Create a mask to know where special speech tokens are + special_speech_token_mask = input_ids == self.llm.config.default_speech_token_id + # print(f"default_speech_token_id{self.llm.config.default_speech_token_id}") + num_special_speech_tokens = torch.sum(special_speech_token_mask, dim=-1) + # Compute the maximum embed dimension + max_embed_dim = ( + num_special_speech_tokens.max() * (speech_len - 1) + ) + sequence_length + batch_indices, non_speech_indices = torch.where( + input_ids != self.llm.config.default_speech_token_id + ) + + # 2. Compute the positions where text should be written + # Calculate new positions for text tokens in merged speech-text sequence. + # `special_speech_token_mask` identifies speech tokens. Each speech token will be replaced by `nb_text_tokens_per_speechs - 1` text tokens. + # `torch.cumsum` computes how each speech token shifts subsequent text token positions. + # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. + new_token_positions = ( + torch.cumsum((special_speech_token_mask * (speech_len - 1) + 1), -1) - 1 + ) # (N,U) + nb_speech_pad = max_embed_dim - 1 - new_token_positions[:, -1] + if left_padding: + new_token_positions += nb_speech_pad[:, None] # offset for left padding + text_to_overwrite = new_token_positions[batch_indices, non_speech_indices] + + # 3. 
Create the full embedding, already padded to the maximum position + final_embedding = torch.zeros( + batch_size, + max_embed_dim, + embed_dim, + dtype=inputs_embeds.dtype, + device=inputs_embeds.device, + ) + final_attention_mask = torch.zeros( + batch_size, + max_embed_dim, + dtype=attention_mask.dtype, + device=inputs_embeds.device, + ) + if labels is not None: + final_labels = torch.full( + (batch_size, max_embed_dim), + IGNORE_TOKEN_ID, + dtype=input_ids.dtype, + device=input_ids.device, + ) + # In case the Vision model or the Language model has been offloaded to CPU, we need to manually + # set the corresponding tensors into their correct target device. + target_device = inputs_embeds.device + batch_indices, non_speech_indices, text_to_overwrite = ( + batch_indices.to(target_device), + non_speech_indices.to(target_device), + text_to_overwrite.to(target_device), + ) + attention_mask = attention_mask.to(target_device) + + # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"] + # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the speech features + final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[ + batch_indices, non_speech_indices + ] + final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[ + batch_indices, non_speech_indices + ] + if labels is not None: + final_labels[batch_indices, text_to_overwrite] = labels[ + batch_indices, non_speech_indices + ] + + # 5. Fill the embeddings corresponding to the speechs. Anything that is not `text_positions` needs filling (#29835) + speech_to_overwrite = torch.full( + (batch_size, max_embed_dim), + True, + dtype=torch.bool, + device=inputs_embeds.device, + ) + speech_to_overwrite[batch_indices, text_to_overwrite] = False + if speech_lens is not None: + speech_pad_position = speech_to_overwrite.cumsum(-1) <= speech_lens[:, None] + speech_to_overwrite &= speech_to_overwrite.cumsum(-1) - 1 >= nb_speech_pad[ + :, None + ].to(target_device) + + if speech_to_overwrite.sum() != speech_features.shape[:-1].numel(): + raise ValueError( + f"The input provided to the model are wrong. The number of speech tokens is {torch.sum(special_speech_token_mask)} while" + f" the number of speech given to the model is {num_speechs}. This prevents correct indexing and breaks batch generation." + ) + + final_embedding[speech_to_overwrite] = ( + speech_features.contiguous().reshape(-1, embed_dim).to(target_device) + ) + if speech_lens is not None: + speech_to_overwrite &= speech_pad_position + final_attention_mask |= speech_to_overwrite + + # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens. 
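+        # Positions that held pad tokens in the original text sequence are looked up
+        # via new_token_positions and their embeddings are zeroed below; their
+        # attention-mask slots are already 0, since step 4 copied attention_mask,
+        # which is 0 at pad tokens.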
+ batch_indices, pad_indices = torch.where( + input_ids == self.llm.config.pad_token_id + ) + indices_to_mask = new_token_positions[batch_indices, pad_indices] + + final_embedding[batch_indices, indices_to_mask] = 0 + + if labels is None: + final_labels = None + + return final_embedding, final_attention_mask, final_labels #, position_ids + \ No newline at end of file diff --git a/examples/asr_fireredasr/model/fireredasr/models/module/adapter.py b/examples/asr_fireredasr/model/fireredasr/models/module/adapter.py new file mode 100644 index 00000000..952dfd3b --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/module/adapter.py @@ -0,0 +1,30 @@ +import torch +import torch.nn as nn + + +class Adapter(nn.Module): + def __init__(self, encoder_dim, llm_dim, downsample_rate=2): + super().__init__() + self.ds = downsample_rate + self.linear1 = nn.Linear(encoder_dim * downsample_rate, llm_dim) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(llm_dim, llm_dim) + + def forward(self, x, x_lens): + batch_size, seq_len, feat_dim = x.size() + num_frames_to_discard = seq_len % self.ds + if num_frames_to_discard > 0: + x = x[:, :-num_frames_to_discard, :] + seq_len = x.size(1) + + x = x.contiguous() + x = x.view( + batch_size, seq_len // self.ds, feat_dim * self.ds + ) + + x = self.linear1(x) + x = self.relu(x) + x = self.linear2(x) + + new_x_lens = torch.clamp(x_lens, max=seq_len) // self.ds + return x, new_x_lens diff --git a/examples/asr_fireredasr/model/fireredasr/models/module/conformer_encoder.py b/examples/asr_fireredasr/model/fireredasr/models/module/conformer_encoder.py new file mode 100644 index 00000000..1dab3809 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/module/conformer_encoder.py @@ -0,0 +1,322 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConformerEncoder(nn.Module): + def __init__(self, idim, n_layers, n_head, d_model, + residual_dropout=0.1, dropout_rate=0.1, kernel_size=33, + pe_maxlen=5000): + super().__init__() + self.odim = d_model + + self.input_preprocessor = Conv2dSubsampling(idim, d_model) + self.positional_encoding = RelPositionalEncoding(d_model) + self.dropout = nn.Dropout(residual_dropout) + + self.layer_stack = nn.ModuleList() + for l in range(n_layers): + block = RelPosEmbConformerBlock(d_model, n_head, + residual_dropout, + dropout_rate, kernel_size) + self.layer_stack.append(block) + + def forward(self, padded_input, input_lengths, pad=True): + if pad: + padded_input = F.pad(padded_input, + (0, 0, 0, self.input_preprocessor.context - 1), 'constant', 0.0) + src_mask = self.padding_position_is_0(padded_input, input_lengths) + + embed_output, input_lengths, src_mask = self.input_preprocessor(padded_input, src_mask) + enc_output = self.dropout(embed_output) + + pos_emb = self.dropout(self.positional_encoding(embed_output)) + + enc_outputs = [] + for enc_layer in self.layer_stack: + enc_output = enc_layer(enc_output, pos_emb, slf_attn_mask=src_mask, + pad_mask=src_mask) + enc_outputs.append(enc_output) + + return enc_output, input_lengths, src_mask + + def padding_position_is_0(self, padded_input, input_lengths): + N, T = padded_input.size()[:2] + mask = torch.ones((N, T)).to(padded_input.device) + for i in range(N): + mask[i, input_lengths[i]:] = 0 + mask = mask.unsqueeze(dim=1) + return mask.to(torch.uint8) + + +class RelPosEmbConformerBlock(nn.Module): + def __init__(self, d_model, n_head, + residual_dropout=0.1, + dropout_rate=0.1, kernel_size=33): + super().__init__() + self.ffn1 = 
ConformerFeedForward(d_model, dropout_rate) + self.mhsa = RelPosMultiHeadAttention(n_head, d_model, + residual_dropout) + self.conv = ConformerConvolution(d_model, kernel_size, + dropout_rate) + self.ffn2 = ConformerFeedForward(d_model, dropout_rate) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, x, pos_emb, slf_attn_mask=None, pad_mask=None): + out = 0.5 * x + 0.5 * self.ffn1(x) + out = self.mhsa(out, out, out, pos_emb, mask=slf_attn_mask)[0] + out = self.conv(out, pad_mask) + out = 0.5 * out + 0.5 * self.ffn2(out) + out = self.layer_norm(out) + return out + + +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + +class Conv2dSubsampling(nn.Module): + def __init__(self, idim, d_model, out_channels=32): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(1, out_channels, 3, 2), + nn.ReLU(), + nn.Conv2d(out_channels, out_channels, 3, 2), + nn.ReLU(), + ) + subsample_idim = ((idim - 1) // 2 - 1) // 2 + self.out = nn.Linear(out_channels * subsample_idim, d_model) + + self.subsampling = 4 + left_context = right_context = 3 # both exclude currect frame + self.context = left_context + 1 + right_context # 7 + + def forward(self, x, x_mask): + x = x.unsqueeze(1) + x = self.conv(x) + N, C, T, D = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(N, T, C * D)) + mask = x_mask[:, :, :-2:2][:, :, :-2:2] + input_lengths = mask[:, -1, :].sum(dim=-1) + return x, input_lengths, mask + + +class RelPositionalEncoding(torch.nn.Module): + def __init__(self, d_model, max_len=5000): + super().__init__() + pe_positive = torch.zeros(max_len, d_model, requires_grad=False) + pe_negative = torch.zeros(max_len, d_model, requires_grad=False) + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(torch.log(torch.tensor(10000.0)).item()/d_model)) + pe_positive[:, 0::2] = torch.sin(position * div_term) + pe_positive[:, 1::2] = torch.cos(position * div_term) + pe_negative[:, 0::2] = torch.sin(-1 * position * div_term) + pe_negative[:, 1::2] = torch.cos(-1 * position * div_term) + + pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0) + pe_negative = pe_negative[1:].unsqueeze(0) + pe = torch.cat([pe_positive, pe_negative], dim=1) + self.register_buffer('pe', pe) + + def forward(self, x): + # Tmax = 2 * max_len - 1 + Tmax, T = self.pe.size(1), x.size(1) + pos_emb = self.pe[:, Tmax // 2 - T + 1 : Tmax // 2 + T].clone().detach() + return pos_emb + + +class ConformerFeedForward(nn.Module): + def __init__(self, d_model, dropout_rate=0.1): + super().__init__() + pre_layer_norm = nn.LayerNorm(d_model) + linear_expand = nn.Linear(d_model, d_model*4) + nonlinear = Swish() + dropout_pre = nn.Dropout(dropout_rate) + linear_project = nn.Linear(d_model*4, d_model) + dropout_post = nn.Dropout(dropout_rate) + self.net = nn.Sequential(pre_layer_norm, + linear_expand, + nonlinear, + dropout_pre, + linear_project, + dropout_post) + + def forward(self, x): + residual = x + output = self.net(x) + output = output + residual + return output + + +class ConformerConvolution(nn.Module): + def __init__(self, d_model, kernel_size=33, dropout_rate=0.1): + super().__init__() + assert kernel_size % 2 == 1 + self.pre_layer_norm = nn.LayerNorm(d_model) + self.pointwise_conv1 = nn.Conv1d(d_model, d_model*4, kernel_size=1, bias=False) + self.glu = F.glu + self.padding = (kernel_size - 1) // 2 + self.depthwise_conv = nn.Conv1d(d_model*2, d_model*2, + kernel_size, stride=1, + padding=self.padding, + groups=d_model*2, 
bias=False) + self.batch_norm = nn.LayerNorm(d_model*2) + self.swish = Swish() + self.pointwise_conv2 = nn.Conv1d(d_model*2, d_model, kernel_size=1, bias=False) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, x, mask=None): + residual = x + out = self.pre_layer_norm(x) + out = out.transpose(1, 2) + if mask is not None: + out.masked_fill_(mask.ne(1), 0.0) + out = self.pointwise_conv1(out) + out = F.glu(out, dim=1) + out = self.depthwise_conv(out) + + out = out.transpose(1, 2) + out = self.swish(self.batch_norm(out)) + out = out.transpose(1, 2) + + out = self.dropout(self.pointwise_conv2(out)) + if mask is not None: + out.masked_fill_(mask.ne(1), 0.0) + out = out.transpose(1, 2) + return out + residual + + +class EncoderMultiHeadAttention(nn.Module): + def __init__(self, n_head, d_model, + residual_dropout=0.1): + super().__init__() + assert d_model % n_head == 0 + self.n_head = n_head + self.d_k = d_model // n_head + self.d_v = self.d_k + + self.w_qs = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_vs = nn.Linear(d_model, n_head * self.d_v, bias=False) + + self.layer_norm_q = nn.LayerNorm(d_model) + self.layer_norm_k = nn.LayerNorm(d_model) + self.layer_norm_v = nn.LayerNorm(d_model) + + self.attention = ScaledDotProductAttention(temperature=self.d_k ** 0.5) + self.fc = nn.Linear(n_head * self.d_v, d_model, bias=False) + self.dropout = nn.Dropout(residual_dropout) + + def forward(self, q, k, v, mask=None): + sz_b, len_q = q.size(0), q.size(1) + + residual = q + q, k, v = self.forward_qkv(q, k, v) + + output, attn = self.attention(q, k, v, mask=mask) + + output = self.forward_output(output, residual, sz_b, len_q) + return output, attn + + def forward_qkv(self, q, k, v): + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1) + + q = self.layer_norm_q(q) + k = self.layer_norm_k(k) + v = self.layer_norm_v(v) + + q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) + k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) + v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + return q, k, v + + def forward_output(self, output, residual, sz_b, len_q): + output = output.transpose(1, 2).contiguous().view(sz_b, len_q, -1) + fc_out = self.fc(output) + output = self.dropout(fc_out) + output = output + residual + return output + + +class ScaledDotProductAttention(nn.Module): + def __init__(self, temperature): + super().__init__() + self.temperature = temperature + self.dropout = nn.Dropout(0.0) + self.INF = float('inf') + + def forward(self, q, k, v, mask=None): + attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature + output, attn = self.forward_attention(attn, v, mask) + return output, attn + + def forward_attention(self, attn, v, mask=None): + if mask is not None: + mask = mask.unsqueeze(1) + mask = mask.eq(0) + attn = attn.masked_fill(mask, -self.INF) + attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0) + else: + attn = torch.softmax(attn, dim=-1) + + d_attn = self.dropout(attn) + output = torch.matmul(d_attn, v) + + return output, attn + + +class RelPosMultiHeadAttention(EncoderMultiHeadAttention): + def __init__(self, n_head, d_model, + residual_dropout=0.1): + super().__init__(n_head, d_model, + residual_dropout) + d_k = d_model // n_head + self.scale = 1.0 / (d_k ** 0.5) + self.linear_pos = nn.Linear(d_model, n_head * d_k, bias=False) + self.pos_bias_u = 
nn.Parameter(torch.FloatTensor(n_head, d_k)) + self.pos_bias_v = nn.Parameter(torch.FloatTensor(n_head, d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def _rel_shift(self, x): + N, H, T1, T2 = x.size() + zero_pad = torch.zeros((N, H, T1, 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(N, H, T2 + 1, T1) + x = x_padded[:, :, 1:].view_as(x) + x = x[:, :, :, : x.size(-1) // 2 + 1] + return x + + def forward(self, q, k, v, pos_emb, mask=None): + sz_b, len_q = q.size(0), q.size(1) + + residual = q + q, k, v = self.forward_qkv(q, k, v) + + q = q.transpose(1, 2) + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.n_head, self.d_k) + p = p.transpose(1, 2) + + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self._rel_shift(matrix_bd) + + attn_scores = matrix_ac + matrix_bd + attn_scores.mul_(self.scale) + + output, attn = self.attention.forward_attention(attn_scores, v, mask=mask) + + output = self.forward_output(output, residual, sz_b, len_q) + return output, attn diff --git a/examples/asr_fireredasr/model/fireredasr/models/module/transformer_decoder.py b/examples/asr_fireredasr/model/fireredasr/models/module/transformer_decoder.py new file mode 100644 index 00000000..2088b083 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/models/module/transformer_decoder.py @@ -0,0 +1,299 @@ +from typing import List, Optional, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class TransformerDecoder(nn.Module): + def __init__( + self, sos_id, eos_id, pad_id, odim, + n_layers, n_head, d_model, + residual_dropout=0.1, pe_maxlen=5000): + super().__init__() + self.INF = 1e10 + # parameters + self.pad_id = pad_id + self.sos_id = sos_id + self.eos_id = eos_id + self.n_layers = n_layers + + # Components + self.tgt_word_emb = nn.Embedding(odim, d_model, padding_idx=self.pad_id) + self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen) + self.dropout = nn.Dropout(residual_dropout) + + self.layer_stack = nn.ModuleList() + for l in range(n_layers): + block = DecoderLayer(d_model, n_head, residual_dropout) + self.layer_stack.append(block) + + self.tgt_word_prj = nn.Linear(d_model, odim, bias=False) + self.layer_norm_out = nn.LayerNorm(d_model) + + self.tgt_word_prj.weight = self.tgt_word_emb.weight + self.scale = (d_model ** 0.5) + + def batch_beam_search(self, encoder_outputs, src_masks, + beam_size=1, nbest=1, decode_max_len=0, + softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0): + B = beam_size + N, Ti, H = encoder_outputs.size() + device = encoder_outputs.device + maxlen = decode_max_len if decode_max_len > 0 else Ti + assert eos_penalty > 0.0 and eos_penalty <= 1.0 + + # Init + encoder_outputs = encoder_outputs.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, Ti, H) + src_mask = src_masks.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, -1, Ti) + ys = torch.ones(N*B, 1).fill_(self.sos_id).long().to(device) + caches: List[Optional[Tensor]] = [] + for _ in range(self.n_layers): + caches.append(None) + scores = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(device) + scores = scores.repeat(N).view(N*B, 1) + is_finished = torch.zeros_like(scores) + + # Autoregressive 
Prediction + for t in range(maxlen): + tgt_mask = self.ignored_target_position_is_0(ys, self.pad_id) + + dec_output = self.dropout( + self.tgt_word_emb(ys) * self.scale + + self.positional_encoding(ys)) + + i = 0 + for dec_layer in self.layer_stack: + dec_output = dec_layer.forward( + dec_output, encoder_outputs, + tgt_mask, src_mask, + cache=caches[i]) + caches[i] = dec_output + i += 1 + + dec_output = self.layer_norm_out(dec_output) + + t_logit = self.tgt_word_prj(dec_output[:, -1]) + t_scores = F.log_softmax(t_logit / softmax_smoothing, dim=-1) + + if eos_penalty != 1.0: + t_scores[:, self.eos_id] *= eos_penalty + + t_topB_scores, t_topB_ys = torch.topk(t_scores, k=B, dim=1) + t_topB_scores = self.set_finished_beam_score_to_zero(t_topB_scores, is_finished) + t_topB_ys = self.set_finished_beam_y_to_eos(t_topB_ys, is_finished) + + # Accumulated + scores = scores + t_topB_scores + + # Pruning + scores = scores.view(N, B*B) + scores, topB_score_ids = torch.topk(scores, k=B, dim=1) + scores = scores.view(-1, 1) + + topB_row_number_in_each_B_rows_of_ys = torch.div(topB_score_ids, B).view(N*B) + stride = B * torch.arange(N).view(N, 1).repeat(1, B).view(N*B).to(device) + topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long() + + # Update ys + ys = ys[topB_row_number_in_ys] + t_ys = torch.gather(t_topB_ys.view(N, B*B), dim=1, index=topB_score_ids).view(N*B, 1) + ys = torch.cat((ys, t_ys), dim=1) + + # Update caches + new_caches: List[Optional[Tensor]] = [] + for cache in caches: + if cache is not None: + new_caches.append(cache[topB_row_number_in_ys]) + caches = new_caches + + # Update finished state + is_finished = t_ys.eq(self.eos_id) + if is_finished.sum().item() == N*B: + break + + # Length penalty (follow GNMT) + scores = scores.view(N, B) + ys = ys.view(N, B, -1) + ys_lengths = self.get_ys_lengths(ys) + if length_penalty > 0.0: + penalty = torch.pow((5+ys_lengths.float())/(5.0+1), length_penalty) + scores /= penalty + nbest_scores, nbest_ids = torch.topk(scores, k=int(nbest), dim=1) + nbest_scores = -1.0 * nbest_scores + index = nbest_ids + B * torch.arange(N).view(N, 1).to(device).long() + nbest_ys = ys.view(N*B, -1)[index.view(-1)] + nbest_ys = nbest_ys.view(N, nbest_ids.size(1), -1) + nbest_ys_lengths = ys_lengths.view(N*B)[index.view(-1)].view(N, -1) + + # result + nbest_hyps: List[List[Dict[str, Tensor]]] = [] + for n in range(N): + n_nbest_hyps: List[Dict[str, Tensor]] = [] + for i, score in enumerate(nbest_scores[n]): + new_hyp = { + "yseq": nbest_ys[n, i, 1:nbest_ys_lengths[n, i]] + } + n_nbest_hyps.append(new_hyp) + nbest_hyps.append(n_nbest_hyps) + return nbest_hyps + + def ignored_target_position_is_0(self, padded_targets, ignore_id): + mask = torch.ne(padded_targets, ignore_id) + mask = mask.unsqueeze(dim=1) + T = padded_targets.size(-1) + upper_tri_0_mask = self.upper_triangular_is_0(T).unsqueeze(0).to(mask.dtype) + upper_tri_0_mask = upper_tri_0_mask.to(mask.dtype).to(mask.device) + return mask.to(torch.uint8) & upper_tri_0_mask.to(torch.uint8) + + def upper_triangular_is_0(self, size): + ones = torch.ones(size, size) + tri_left_ones = torch.tril(ones) + return tri_left_ones.to(torch.uint8) + + def set_finished_beam_score_to_zero(self, scores, is_finished): + NB, B = scores.size() + is_finished = is_finished.float() + mask_score = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(scores.device) + mask_score = mask_score.view(1, B).repeat(NB, 1) + return scores * (1 - is_finished) + mask_score * is_finished + + def set_finished_beam_y_to_eos(self, 
ys, is_finished): + is_finished = is_finished.long() + return ys * (1 - is_finished) + self.eos_id * is_finished + + def get_ys_lengths(self, ys): + N, B, Tmax = ys.size() + ys_lengths = torch.sum(torch.ne(ys, self.eos_id), dim=-1) + return ys_lengths.int() + + + +class DecoderLayer(nn.Module): + def __init__(self, d_model, n_head, dropout): + super().__init__() + self.self_attn_norm = nn.LayerNorm(d_model) + self.self_attn = DecoderMultiHeadAttention(d_model, n_head, dropout) + + self.cross_attn_norm = nn.LayerNorm(d_model) + self.cross_attn = DecoderMultiHeadAttention(d_model, n_head, dropout) + + self.mlp_norm = nn.LayerNorm(d_model) + self.mlp = PositionwiseFeedForward(d_model, d_model*4, dropout) + + def forward(self, dec_input, enc_output, self_attn_mask, cross_attn_mask, + cache=None): + x = dec_input + residual = x + x = self.self_attn_norm(x) + if cache is not None: + xq = x[:, -1:, :] + residual = residual[:, -1:, :] + self_attn_mask = self_attn_mask[:, -1:, :] + else: + xq = x + x = self.self_attn(xq, x, x, mask=self_attn_mask) + x = residual + x + + residual = x + x = self.cross_attn_norm(x) + x = self.cross_attn(x, enc_output, enc_output, mask=cross_attn_mask) + x = residual + x + + residual = x + x = self.mlp_norm(x) + x = residual + self.mlp(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x + + +class DecoderMultiHeadAttention(nn.Module): + def __init__(self, d_model, n_head, dropout=0.1): + super().__init__() + self.d_model = d_model + self.n_head = n_head + self.d_k = d_model // n_head + + self.w_qs = nn.Linear(d_model, n_head * self.d_k) + self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_vs = nn.Linear(d_model, n_head * self.d_k) + + self.attention = DecoderScaledDotProductAttention( + temperature=self.d_k ** 0.5) + self.fc = nn.Linear(n_head * self.d_k, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, q, k, v, mask=None): + bs = q.size(0) + + q = self.w_qs(q).view(bs, -1, self.n_head, self.d_k) + k = self.w_ks(k).view(bs, -1, self.n_head, self.d_k) + v = self.w_vs(v).view(bs, -1, self.n_head, self.d_k) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + if mask is not None: + mask = mask.unsqueeze(1) + + output = self.attention(q, k, v, mask=mask) + + output = output.transpose(1, 2).contiguous().view(bs, -1, self.d_model) + output = self.fc(output) + output = self.dropout(output) + + return output + + +class DecoderScaledDotProductAttention(nn.Module): + def __init__(self, temperature): + super().__init__() + self.temperature = temperature + self.INF = float("inf") + + def forward(self, q, k, v, mask=None): + attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature + if mask is not None: + mask = mask.eq(0) + attn = attn.masked_fill(mask, -self.INF) + attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0) + else: + attn = torch.softmax(attn, dim=-1) + output = torch.matmul(attn, v) + return output + + +class PositionwiseFeedForward(nn.Module): + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.act = nn.GELU() + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + output = self.w_2(self.act(self.w_1(x))) + output = self.dropout(output) + return output + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=5000): + super().__init__() + assert d_model % 2 == 0 + pe = torch.zeros(max_len, d_model, requires_grad=False) + position = 
torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(torch.log(torch.tensor(10000.0)).item()/d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + length = x.size(1) + return self.pe[:, :length].clone().detach() diff --git a/examples/asr_fireredasr/model/fireredasr/speech2text.py b/examples/asr_fireredasr/model/fireredasr/speech2text.py new file mode 100755 index 00000000..21f645b4 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/speech2text.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import os +import sys + +from fireredasr.models.fireredasr import FireRedAsr + + +parser = argparse.ArgumentParser() +parser.add_argument('--asr_type', type=str, required=True, choices=["aed", "llm"]) +parser.add_argument('--model_dir', type=str, required=True) + +# Input / Output +parser.add_argument("--wav_path", type=str) +parser.add_argument("--wav_paths", type=str, nargs="*") +parser.add_argument("--wav_dir", type=str) +parser.add_argument("--wav_scp", type=str) +parser.add_argument("--output", type=str) + +# Decode Options +parser.add_argument('--use_gpu', type=int, default=1) +parser.add_argument("--batch_size", type=int, default=1) +parser.add_argument("--beam_size", type=int, default=1) +parser.add_argument("--decode_max_len", type=int, default=0) +# FireRedASR-AED +parser.add_argument("--nbest", type=int, default=1) +parser.add_argument("--softmax_smoothing", type=float, default=1.0) +parser.add_argument("--aed_length_penalty", type=float, default=0.0) +parser.add_argument("--eos_penalty", type=float, default=1.0) +# FireRedASR-LLM +parser.add_argument("--decode_min_len", type=int, default=0) +parser.add_argument("--repetition_penalty", type=float, default=1.0) +parser.add_argument("--llm_length_penalty", type=float, default=0.0) +parser.add_argument("--temperature", type=float, default=1.0) + + +def main(args): + wavs = get_wav_info(args) + fout = open(args.output, "w") if args.output else None + + model = FireRedAsr.from_pretrained(args.asr_type, args.model_dir) + + batch_uttid = [] + batch_wav_path = [] + for i, wav in enumerate(wavs): + uttid, wav_path = wav + batch_uttid.append(uttid) + batch_wav_path.append(wav_path) + if len(batch_wav_path) < args.batch_size and i != len(wavs) - 1: + continue + + results = model.transcribe( + batch_uttid, + batch_wav_path, + { + "use_gpu": args.use_gpu, + "beam_size": args.beam_size, + "nbest": args.nbest, + "decode_max_len": args.decode_max_len, + "softmax_smoothing": args.softmax_smoothing, + "aed_length_penalty": args.aed_length_penalty, + "eos_penalty": args.eos_penalty, + "decode_min_len": args.decode_min_len, + "repetition_penalty": args.repetition_penalty, + "llm_length_penalty": args.llm_length_penalty, + "temperature": args.temperature + } + ) + + for result in results: + print(result) + if fout is not None: + fout.write(f"{result['uttid']}\t{result['text']}\n") + + batch_uttid = [] + batch_wav_path = [] + + +def get_wav_info(args): + """ + Returns: + wavs: list of (uttid, wav_path) + """ + base = lambda p: os.path.basename(p).replace(".wav", "") + if args.wav_path: + wavs = [(base(args.wav_path), args.wav_path)] + elif args.wav_paths and len(args.wav_paths) >= 1: + wavs = [(base(p), p) for p in sorted(args.wav_paths)] + elif args.wav_scp: + wavs = [line.strip().split() for line in open(args.wav_scp)] + elif 
args.wav_dir: + wavs = glob.glob(f"{args.wav_dir}/**/*.wav", recursive=True) + wavs = [(base(p), p) for p in sorted(wavs)] + else: + raise ValueError("Please provide valid wav info") + print(f"#wavs={len(wavs)}") + return wavs + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + main(args) diff --git a/examples/asr_fireredasr/model/fireredasr/tokenizer/aed_tokenizer.py b/examples/asr_fireredasr/model/fireredasr/tokenizer/aed_tokenizer.py new file mode 100644 index 00000000..00285b54 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/tokenizer/aed_tokenizer.py @@ -0,0 +1,67 @@ +import logging +import re + +import sentencepiece as spm + +from fireredasr.data.token_dict import TokenDict + + +class ChineseCharEnglishSpmTokenizer: + """ + - One Chinese char is a token. + - Split English word into SPM and one piece is a token. + - Ignore ' ' between Chinese char + - Replace ' ' between English word with "▁" by spm_model + - Need to put SPM piece into dict file + - If not set spm_model, will use English char and + """ + SPM_SPACE = "▁" + + def __init__(self, dict_path, spm_model, unk="", space=""): + self.dict = TokenDict(dict_path, unk=unk) + self.space = space + if spm_model: + self.sp = spm.SentencePieceProcessor() + self.sp.Load(spm_model) + else: + self.sp = None + print("[WRAN] Not set spm_model, will use English char") + print("[WARN] Please check how to deal with ' '(space)") + if self.space not in self.dict: + print("Please add to your dict, or it will be ") + + def tokenize(self, text, replace_punc=True): + #if text == "": + # logging.info(f"empty text") + text = text.upper() + tokens = [] + if replace_punc: + text = re.sub("[,。?!,\.?!]", " ", text) + pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])') + parts = pattern.split(text.strip()) + parts = [p for p in parts if len(p.strip()) > 0] + for part in parts: + if pattern.fullmatch(part) is not None: + tokens.append(part) + else: + if self.sp: + for piece in self.sp.EncodeAsPieces(part.strip()): + tokens.append(piece) + else: + for char in part.strip(): + tokens.append(char if char != " " else self.space) + tokens_id = [] + for token in tokens: + tokens_id.append(self.dict.get(token, self.dict.unk)) + return tokens, tokens_id + + def detokenize(self, inputs, join_symbol="", replace_spm_space=True): + """inputs is ids or tokens, do not need self.sp""" + if len(inputs) > 0 and type(inputs[0]) == int: + tokens = [self.dict[id] for id in inputs] + else: + tokens = inputs + s = f"{join_symbol}".join(tokens) + if replace_spm_space: + s = s.replace(self.SPM_SPACE, ' ').strip() + return s diff --git a/examples/asr_fireredasr/model/fireredasr/tokenizer/llm_tokenizer.py b/examples/asr_fireredasr/model/fireredasr/tokenizer/llm_tokenizer.py new file mode 100644 index 00000000..5388fc03 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/tokenizer/llm_tokenizer.py @@ -0,0 +1,106 @@ +import re + +import torch +from transformers import AutoTokenizer +from transformers.trainer_pt_utils import LabelSmoother + +DEFAULT_SPEECH_TOKEN = "" +IGNORE_TOKEN_ID = LabelSmoother.ignore_index + + +class LlmTokenizerWrapper: + @classmethod + def build_llm_tokenizer(cls, llm_path, use_flash_attn=False): + tokenizer = AutoTokenizer.from_pretrained(llm_path) + if use_flash_attn: + tokenizer.padding_side = "left" + else: + tokenizer.padding_side = "right" + # tokenizer.padding_side = "left" + special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]} + tokenizer.add_special_tokens(special_tokens_dict) + 
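+        # The speech placeholder (DEFAULT_SPEECH_TOKEN) is registered as an additional
+        # special token so that preprocess_texts can splice it into the chat template.
+        # Padding side is left when flash-attention is used and right otherwise;
+        # FireRedAsrLlm.from_args later wires the pad/bos/eos ids onto the LLM config.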
return tokenizer + + @classmethod + def clean_text(cls, origin_text): + """remove punc, remove space between Chinese and keep space between English""" + # remove punc + text = re.sub("[,。?!,\.!?《》()\·“”、\\/]", "", origin_text) + # merge space + text = re.sub("\s+", " ", text) + + # remove space between Chinese and keep space between English + pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])') # Chinese + parts = pattern.split(text.strip()) + parts = [p for p in parts if len(p.strip()) > 0] + text = "".join(parts) + text = text.strip() + + text = text.lower() + return text + + @classmethod + def preprocess_texts(cls, origin_texts, tokenizer, max_len, decode=False): + messages = [] + clean_texts = [] + for i, origin_text in enumerate(origin_texts): + text = cls.clean_text(origin_text) + clean_texts.append(text) + text = text if not decode else "" + message = [ + {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"}, + {"role": "assistant", "content": text}, + ] + messages.append(message) + + texts = [] + if not decode: + TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + else: + TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + for i, msg in enumerate(messages): + texts.append( + tokenizer.apply_chat_template( + msg, + tokenize=True, + chat_template=TEMPLATE, + add_generation_prompt=False, + padding="longest", + max_length=max_len, + truncation=True, + ) + ) + + # Padding texts + max_len_texts = max([len(text) for text in texts]) + if tokenizer.padding_side == "right": + texts = [ + text + [tokenizer.pad_token_id] * (max_len_texts - len(text)) + for text in texts + ] + else: + texts = [ + [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text + for text in texts + ] + input_ids = torch.tensor(texts, dtype=torch.int) + + target_ids = input_ids.clone() + target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID + + # first get the indices of the tokens + mask_prompt = True + if mask_prompt: + mask_indices = torch.where( + input_ids == tokenizer.convert_tokens_to_ids("assistant") + ) + for i in range(mask_indices[0].size(0)): + row = mask_indices[0][i] + col = mask_indices[1][i] + target_ids[row, : col + 2] = IGNORE_TOKEN_ID + + attention_mask = input_ids.ne(tokenizer.pad_token_id) + + target_ids = target_ids.type(torch.LongTensor) + input_ids = input_ids.type(torch.LongTensor) + return input_ids, attention_mask, target_ids, clean_texts diff --git a/examples/asr_fireredasr/model/fireredasr/utils/param.py b/examples/asr_fireredasr/model/fireredasr/utils/param.py new file mode 100644 index 00000000..8dfa28d2 --- /dev/null +++ b/examples/asr_fireredasr/model/fireredasr/utils/param.py @@ -0,0 +1,13 @@ +import logging + +import torch + + +def count_model_parameters(model): + if not isinstance(model, torch.nn.Module): + return 0, 0 + name = f"{model.__class__.__name__} {model.__class__}" + num = sum(p.numel() for p in model.parameters() if p.requires_grad) + size = num * 4.0 / 1024.0 / 1024.0 # float32, MB + logging.info(f"#param of {name} is {num} = {size:.1f} MB (float32)") + return num, size diff --git a/examples/asr_fireredasr/model/fireredasr/utils/wer.py b/examples/asr_fireredasr/model/fireredasr/utils/wer.py new file mode 100755 index 00000000..30107ca8 --- /dev/null +++ 
b/examples/asr_fireredasr/model/fireredasr/utils/wer.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 + +import argparse +import re +from collections import OrderedDict + + +parser = argparse.ArgumentParser() +parser.add_argument("--ref", type=str, required=True) +parser.add_argument("--hyp", type=str, required=True) +parser.add_argument("--print_sentence_wer", type=int, default=0) +parser.add_argument("--do_tn", type=int, default=0, help="simple tn by cn2an") +parser.add_argument("--rm_special", type=int, default=0, help="remove <\|.*?\|>") + + +def main(args): + uttid2refs = read_uttid2tokens(args.ref, args.do_tn, args.rm_special) + uttid2hyps = read_uttid2tokens(args.hyp, args.do_tn, args.rm_special) + uttid2wer_info, wer_stat, en_dig_stat = compute_uttid2wer_info( + uttid2refs, uttid2hyps, args.print_sentence_wer) + wer_stat.print() + en_dig_stat.print() + + +def read_uttid2tokens(filename, do_tn=False, rm_special=False): + print(f">>> Read uttid to tokens: {filename}", flush=True) + uttid2tokens = OrderedDict() + uttid2text = read_uttid2text(filename, do_tn, rm_special) + for uttid, text in uttid2text.items(): + tokens = text2tokens(text) + uttid2tokens[uttid] = tokens + return uttid2tokens + + +def read_uttid2text(filename, do_tn=False, rm_special=False): + uttid2text = OrderedDict() + with open(filename, "r", encoding="utf8") as fin: + for i, line in enumerate(fin): + cols = line.split() + if len(cols) == 0: + print("[WARN] empty line, continue", i, flush=True) + continue + assert cols[0] not in uttid2text, f"repeated uttid: {line}" + if len(cols) == 1: + uttid2text[cols[0]] = "" + continue + txt = " ".join(cols[1:]) + if rm_special: + txt = " ".join([t for t in re.split("<\|.*?\|>", txt) if t.strip() != ""]) + if do_tn: + import cn2an + txt = cn2an.transform(txt, "an2cn") + uttid2text[cols[0]] = txt + return uttid2text + + +def text2tokens(text): + PUNCTUATIONS = ",。?!,\.?!"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·。\":" + "()\[\]{}/;`|=+" + if text == "": + return [] + tokens = [] + + text = re.sub("", "", text) + text = re.sub(r"[%s]+" % PUNCTUATIONS, " ", text) + + pattern = re.compile(r'([\u4e00-\u9fff])') + parts = pattern.split(text.strip().upper()) + parts = [p for p in parts if len(p.strip()) > 0] + for part in parts: + if pattern.fullmatch(part) is not None: + tokens.append(part) + else: + for word in part.strip().split(): + tokens.append(word) + return tokens + + +def compute_uttid2wer_info(refs, hyps, print_sentence_wer=False): + print(f">>> Compute uttid to wer info", flush=True) + + uttid2wer_info = OrderedDict() + wer_stat = WerStats() + en_dig_stat = EnDigStats() + + for uttid, ref in refs.items(): + if uttid not in hyps: + print(f"[WARN] No hyp for {uttid}", flush=True) + continue + hyp = hyps[uttid] + + if len(hyp) - len(ref) >= 8: + print(f"[BidLengthDiff]: {uttid} {len(ref)} {len(hyp)}#{' '.join(ref)}#{' '.join(hyp)}") + #continue + + wer_info = compute_one_wer_info(ref, hyp) + uttid2wer_info[uttid] = wer_info + ns = count_english_ditgit(ref, hyp, wer_info) + wer_stat.add(wer_info) + en_dig_stat.add(*ns) + if print_sentence_wer: + print(f"{uttid} {wer_info}") + + return uttid2wer_info, wer_stat, en_dig_stat + + +COST_SUB = 3 +COST_DEL = 3 +COST_INS = 3 + +ALIGN_CRT = 0 +ALIGN_SUB = 1 +ALIGN_DEL = 2 +ALIGN_INS = 3 +ALIGN_END = 4 + + +def compute_one_wer_info(ref, hyp): + """Impl minimum edit distance and backtrace. 
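+    Uses uniform costs (COST_SUB = COST_DEL = COST_INS = 3), so the minimum-cost path
+    is a standard Levenshtein alignment; the ALIGN_* codes record the backtrace used
+    to count correct, substituted, deleted and inserted tokens.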
+ Args: + ref, hyp: List[str] + Returns: + WerInfo + """ + ref_len = len(ref) + hyp_len = len(hyp) + + class _DpPoint: + def __init__(self, cost, align): + self.cost = cost + self.align = align + + dp = [] + for i in range(0, ref_len + 1): + dp.append([]) + for j in range(0, hyp_len + 1): + dp[-1].append(_DpPoint(i * j, ALIGN_CRT)) + + # Initialize + for i in range(1, hyp_len + 1): + dp[0][i].cost = dp[0][i - 1].cost + COST_INS; + dp[0][i].align = ALIGN_INS + for i in range(1, ref_len + 1): + dp[i][0].cost = dp[i - 1][0].cost + COST_DEL + dp[i][0].align = ALIGN_DEL + + # DP + for i in range(1, ref_len + 1): + for j in range(1, hyp_len + 1): + min_cost = 0 + min_align = ALIGN_CRT + if hyp[j - 1] == ref[i - 1]: + min_cost = dp[i - 1][j - 1].cost + min_align = ALIGN_CRT + else: + min_cost = dp[i - 1][j - 1].cost + COST_SUB + min_align = ALIGN_SUB + + del_cost = dp[i - 1][j].cost + COST_DEL + if del_cost < min_cost: + min_cost = del_cost + min_align = ALIGN_DEL + + ins_cost = dp[i][j - 1].cost + COST_INS + if ins_cost < min_cost: + min_cost = ins_cost + min_align = ALIGN_INS + + dp[i][j].cost = min_cost + dp[i][j].align = min_align + + # Backtrace + crt = sub = ins = det = 0 + i = ref_len + j = hyp_len + align = [] + while i > 0 or j > 0: + if dp[i][j].align == ALIGN_CRT: + align.append((i, j, ALIGN_CRT)) + i -= 1 + j -= 1 + crt += 1 + elif dp[i][j].align == ALIGN_SUB: + align.append((i, j, ALIGN_SUB)) + i -= 1 + j -= 1 + sub += 1 + elif dp[i][j].align == ALIGN_DEL: + align.append((i, j, ALIGN_DEL)) + i -= 1 + det += 1 + elif dp[i][j].align == ALIGN_INS: + align.append((i, j, ALIGN_INS)) + j -= 1 + ins += 1 + + err = sub + det + ins + align.reverse() + wer_info = WerInfo(ref_len, err, crt, sub, det, ins, align) + return wer_info + + + +class WerInfo: + def __init__(self, ref, err, crt, sub, dele, ins, ali): + self.r = ref + self.e = err + self.c = crt + self.s = sub + self.d = dele + self.i = ins + self.ali = ali + r = max(self.r, 1) + self.wer = 100.0 * (self.s + self.d + self.i) / r + + def __repr__(self): + s = f"wer {self.wer:.2f} ref {self.r:2d} sub {self.s:2d} del {self.d:2d} ins {self.i:2d}" + return s + + +class WerStats: + def __init__(self): + self.infos = [] + + def add(self, wer_info): + self.infos.append(wer_info) + + def print(self): + r = sum(info.r for info in self.infos) + if r <= 0: + print(f"REF len is {r}, check") + r = 1 + s = sum(info.s for info in self.infos) + d = sum(info.d for info in self.infos) + i = sum(info.i for info in self.infos) + se = 100.0 * s / r + de = 100.0 * d / r + ie = 100.0 * i / r + wer = 100.0 * (s + d + i) / r + sen = max(len(self.infos), 1) + errsen = sum(info.e > 0 for info in self.infos) + ser = 100.0 * errsen / sen + print("-"*80) + print(f"ref{r:6d} sub{s:6d} del{d:6d} ins{i:6d}") + print(f"WER{wer:6.2f} sub{se:6.2f} del{de:6.2f} ins{ie:6.2f}") + print(f"SER{ser:6.2f} = {errsen} / {sen}") + print("-"*80) + + +class EnDigStats: + def __init__(self): + self.n_en_word = 0 + self.n_en_correct = 0 + self.n_dig_word = 0 + self.n_dig_correct = 0 + + def add(self, n_en_word, n_en_correct, n_dig_word, n_dig_correct): + self.n_en_word += n_en_word + self.n_en_correct += n_en_correct + self.n_dig_word += n_dig_word + self.n_dig_correct += n_dig_correct + + def print(self): + print(f"English #word={self.n_en_word}, #correct={self.n_en_correct}\n" + f"Digit #word={self.n_dig_word}, #correct={self.n_dig_correct}") + print("-"*80) + + + +def count_english_ditgit(ref, hyp, wer_info): + patt_en = "[a-zA-Z\.\-\']+" + patt_dig = "[0-9]+" + patt_cjk = 
re.compile(r'([\u4e00-\u9fff])') + n_en_word = 0 + n_en_correct = 0 + n_dig_word = 0 + n_dig_correct = 0 + ali = wer_info.ali + for i, token in enumerate(ref): + if re.match(patt_en, token): + n_en_word += 1 + for y in ali: + if y[0] == i+1 and y[2] == ALIGN_CRT: + j = y[1] - 1 + n_en_correct += 1 + break + if re.match(patt_dig, token): + n_dig_word += 1 + for y in ali: + if y[0] == i+1 and y[2] == ALIGN_CRT: + j = y[1] - 1 + n_dig_correct += 1 + break + if not re.match(patt_cjk, token) and not re.match(patt_en, token) \ + and not re.match(patt_dig, token): + print("[WiredChar]:", token) + return n_en_word, n_en_correct, n_dig_word, n_dig_correct + + + +if __name__ == "__main__": + args = parser.parse_args() + print(args, flush=True) + main(args) diff --git a/examples/asr_fireredasr/model/slam_fireredasr.py b/examples/asr_fireredasr/model/slam_fireredasr.py new file mode 100644 index 00000000..88a718bc --- /dev/null +++ b/examples/asr_fireredasr/model/slam_fireredasr.py @@ -0,0 +1,243 @@ +import os +import time +import json +import torch +import torch_npu +import sys +import copy +sys.path.append('/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/model') +from fireredasr.models.fireredasr import FireRedAsr +from torch.utils.data import Dataset, DataLoader, IterableDataset +from fireredasr.data.asr_feat import ASRFeatExtractor +from fireredasr.models.fireredasr_aed import FireRedAsrAed +from fireredasr.models.fireredasr_llm import FireRedAsrLlm +from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer +from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper +import torch.distributed as dist +import kaldiio + +def model_factory(train_config, model_config, **kwargs): + model_dir = model_config.firered_path + model_path = os.path.join(model_dir, "model.pth.tar") + encoder_path = os.path.join(model_dir, "asr_encoder.pth.tar") + llm_dir = os.path.join(model_dir, "Qwen2-7B-Instruct") + model, tokenizer = load_firered_llm_model_and_tokenizer( + model_path, encoder_path, llm_dir, train_config) + ckpt_path = kwargs.get("ckpt_path", None) + if ckpt_path is not None: + print("loading other parts from: {}".format(ckpt_path)) + ckpt_dict = torch.load(ckpt_path, map_location="cpu") + model.load_state_dict(ckpt_dict, strict=False) + return model, tokenizer + +def load_firered_llm_model_and_tokenizer(model_path, encoder_path, llm_dir, train_config): + # model_path = "/aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/asr_fireredasr/exp/aishell-1/20250311/conformer_linear_Qwen2-7B-Instruct_encodertrue_loratrue_padtrue_normal_asr_speedfalse_specaugfalse-1058/fireredasrllm_epoch_1_step_100/model.pth.tar" + package = torch.load(model_path, map_location=lambda storage, loc: storage) + # print(type(package["args"])) + # input() + package["args"].encoder_path = encoder_path + package["args"].llm_dir = llm_dir + # if train_config.freeze_encoder: + package["args"].freeze_encoder = 0 + model = FireRedAsrLlm.from_args(package["args"]) + model.load_state_dict(package["model_state_dict"], strict=False) + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(llm_dir) + return model, tokenizer + + +class FireRedDataset(Dataset): + def __init__(self, dataset_config, tokenizer=None, split='train'): + super().__init__() + cmvn_path = dataset_config.cmvn_file + self.feature_extractor = ASRFeatExtractor(cmvn_path) + self.tokenizer = tokenizer + self.split = split + self.inference_mode = dataset_config.inference_mode + self.data_list = {} + 
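+        # data_list maps uttid -> wav/ark path read from my_wav.scp below;
+        # multitask_task_list keeps one dict per line of multitask.jsonl (key/target),
+        # retained only when the key has matching audio in data_list.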
self.multitask_task_list = [] + if split == "train": + data_path = dataset_config.train_scp_file_path + elif split == "val": + data_path = dataset_config.dev_scp_file_path + elif split == "test": + data_path = dataset_config.test_scp_file_path + else: + raise ValueError("Invalid split") + data_scp_file_path = os.path.join(data_path,"my_wav.scp") + with open(data_scp_file_path) as f: + for line in f: + key, path = line.strip().split(" ") + self.data_list[key] = path + multitask_task_path = os.path.join(data_path,"multitask.jsonl") + with open(multitask_task_path) as f: + for line in f: + item = json.loads(line.strip()) + if item["key"] in self.data_list: + self.multitask_task_list.append(item) + else: + print(item) + + def __len__(self): + return len(self.multitask_task_list) + + def __getitem__(self, index): + + # Deal with every wav one by one + item = self.multitask_task_list[index] + key = [item["key"]] + target = [item["target"]] + + return { + "key": key, + "target": target, + } + + def collator(self, samples): + assert samples is not None + + # Extract each field from the samples + keys = [sample["key"][0] for sample in samples] + targets = [sample["target"][0] for sample in samples] + + # Get padded feats + batch_wav_path = [] + for key in keys: + ark_path = self.data_list[key] + batch_wav_path.append(ark_path) + feats, lengths, durs = self.feature_extractor(batch_wav_path) + + # Get input_ids and target_ids + # inference + if self.inference_mode: + input_ids, attention_mask, target_ids, _ = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=[""]*len(keys), tokenizer=self.tokenizer, + max_len=128, decode=True) + # training + else: + input_ids, attention_mask, target_ids, clean_texts = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=targets, tokenizer=self.tokenizer, + max_len=128, decode=False) + + return { + "keys": keys, + "targets": targets, + "feats": feats, + "lengths": lengths, + "input_ids": input_ids, + "attention_mask": attention_mask, + "target_ids": target_ids, + } + +class FireRedDatasetLarge(IterableDataset): + def __init__(self, dataset_config, tokenizer=None, split='train'): + super().__init__() + cmvn_path = dataset_config.cmvn_file + self.feature_extractor = ASRFeatExtractor(cmvn_path) + self.tokenizer = tokenizer + self.split = split + self.inference_mode = dataset_config.inference_mode + + # 根据split选择对应的数据路径 + if split == "train": + data_path = dataset_config.train_scp_file_path + elif split == "val": + data_path = dataset_config.dev_scp_file_path + elif split == "test": + data_path = dataset_config.test_scp_file_path + else: + raise ValueError("Invalid split") + + # 加载多任务数据 + self.multitask_task_path = os.path.join(data_path, "multitask.jsonl") + + def get_audio_duration(self, wav_path): + """计算 WAV 音频的时长(单位:秒)""" + sample_rate, wav_np = kaldiio.load_mat(wav_path) + if sample_rate != 16000: + return None + dur = wav_np.shape[0] / sample_rate + return dur + + def __iter__(self): + multitask_task_path = self.multitask_task_path + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: # 不在 DataLoader 的多进程环境中 + num_workers = 1 + worker_id = 0 + else: + num_workers = worker_info.num_workers + worker_id = worker_info.id + + # 获取分布式环境中的进程信息 + if dist.is_available() and dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + else: + world_size = 1 + rank = 0 + + # 计算每个 worker 和每个进程应该处理的数据范围 + total_num_workers = num_workers * world_size + worker_rank = rank * num_workers + worker_id + data_index = 0 + + 
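+        # Round-robin sharding across DDP ranks and dataloader workers: this process only
+        # yields JSONL lines whose index satisfies index % total_num_workers == worker_rank
+        # (e.g. with 2 ranks x 4 workers, rank 1 / worker 2 reads lines 6, 14, 22, ...),
+        # assuming every process iterates the file in the same order.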
+        with open(multitask_task_path) as f_task:
+            for line in f_task:
+                if (data_index % total_num_workers) == worker_rank:
+                    item = json.loads(line.strip())
+
+                    # ark_path = item.get("path", None)
+                    # if ark_path is None:
+                    #     print(f"⚠️ missing ark_path: {item}")
+                    #     continue
+                    # # **🚀 compute the audio duration**
+                    # duration = self.get_audio_duration(ark_path)
+                    # if duration is None or duration > 30.0:
+                    #     continue  # skip samples whose duration exceeds the limit
+
+                    yield {
+                        "key": item["key"],
+                        "target": item["target"],
+                        "ark_path": item["path"]
+                    }
+                data_index += 1
+
+    def collator(self, samples):
+        assert samples is not None
+
+        # Extract the fields of each sample
+        keys = [sample["key"] for sample in samples]
+        targets = [sample["target"] for sample in samples]
+        batch_wav_path = [sample["ark_path"] for sample in samples]
+
+        # Extract features
+        feats, lengths, durs = self.feature_extractor(batch_wav_path)
+
+        # Build input_ids and target_ids
+        if self.inference_mode:
+            input_ids, attention_mask, target_ids, _ = \
+                LlmTokenizerWrapper.preprocess_texts(
+                    origin_texts=[""] * len(keys), tokenizer=self.tokenizer,
+                    max_len=128, decode=True)
+        else:
+            input_ids, attention_mask, target_ids, clean_texts = \
+                LlmTokenizerWrapper.preprocess_texts(
+                    origin_texts=targets, tokenizer=self.tokenizer,
+                    max_len=128, decode=False)
+
+        return {
+            "keys": keys,
+            "targets": targets,
+            "feats": feats,
+            "lengths": lengths,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "target_ids": target_ids,
+        }
+
+
+def get_speech_dataset(dataset_config, tokenizer, split):
+    dataset = FireRedDatasetLarge(dataset_config, tokenizer, split)
+    return dataset
\ No newline at end of file
diff --git a/examples/asr_fireredasr/readme b/examples/asr_fireredasr/readme
new file mode 100644
index 00000000..858b58db
--- /dev/null
+++ b/examples/asr_fireredasr/readme
@@ -0,0 +1,4 @@
+Xiaohongshu (FireRedASR) inference and fine-tuning code:
+    The main flow uses the SLAM-LLM framework, but model loading, inference and the forward pass are all implemented in fireredasr under the model folder.
+    slam_fireredasr mainly contains the model loading and the dataset wrappers.
+    Note that the way precision is set differs from the main branch, especially for bf16; please check the actual training code.
\ No newline at end of file
diff --git a/examples/asr_fireredasr/scripts/decode_fireredasr.sh b/examples/asr_fireredasr/scripts/decode_fireredasr.sh
new file mode 100644
index 00000000..622ea0f8
--- /dev/null
+++ b/examples/asr_fireredasr/scripts/decode_fireredasr.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+#export PYTHONPATH=/root/whisper:$PYTHONPATH
+export ASCEND_VISIBLE_DEVICES=0
+export TOKENIZERS_PARALLELISM=false
+# export CUDA_LAUNCH_BLOCKING=1
+set -e
+run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/
+cd $run_dir
+code_dir=examples/asr_fireredasr
+
+dataset=aishell-2
+prompt_style=instruct # normal #instruct
+if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == "slidespeech" ]]
+then
+    # aishell-1:asr hotword
+    # aishell-2:asr hotword mt
+    # librispeech:asr prevtext mt
+    # alimeeting: asr_far_bf asr_near
+    dataset_task=asr
+fi
+projector=linear
+encoder_name=conformer
+llm_name=Qwen2-7B-Instruct
+use_peft=false
+use_fp16=true
+pad_or_trim=true
+encoder_projector_ds_rate=2
+file=examples/asr_fireredasr/model/slam_fireredasr.py:model_factory
+ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/exp/slidespeech/20250401/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_speedfalse_specaugfalse-2217/mala_asr_epoch_1_step_6000
+# Choose Encoder, this block is basically unused here
+if [[ $encoder_name == "whisper" ]]
+then
+    if [[ $encoder_finetune == true ]]
+    then
+        
speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 + +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 + +else + exit 1 +fi + +# Choose LLM, 这个一点用也没有 +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == "slidespeech" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +# decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_${dataset}_${dataset_task}_${prompt_style} +# -m debugpy --listen 5678 --wait-for-client +python $code_dir/inference_fireredasr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=firered_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + 
++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=8 \ + ++train_config.output_dir=$output_dir \ + ++train_config.inference_mode=true \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path + + +# python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +# python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +# python "/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/asr_fireredasr/scripts/decode_fireredasr_deepspeed.sh b/examples/asr_fireredasr/scripts/decode_fireredasr_deepspeed.sh new file mode 100644 index 00000000..d262aab3 --- /dev/null +++ b/examples/asr_fireredasr/scripts/decode_fireredasr_deepspeed.sh @@ -0,0 +1,146 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr +dataset=slidespeech +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr_text/exp/slidespeech/20250402/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_speedfalse_specaugfalse-1526/mala_asr_epoch_1_step_12000 +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/exp/alimeeting/20250320/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_far_bf_speedfalse_specaugfalse-1513/mala_asr_epoch_2_step_4000 +prompt_style=normal # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == slidespeech ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # slidespeech: asr domain + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=false +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=2 +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/conf/ds_config.json +prompt_style=normal #instruct +# Choose Encoder, 这个还是一点用也没有 +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 + +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 + +else + exit 1 +fi + +# Choose LLM, 这个一点用也没有 +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + 
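+    # llm_dim must match the hidden size of the chosen LLM checkpoint (4096 for vicuna-7b-v1.5, 3584 for the Qwen2-7B family).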
llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5
+    llm_dim=4096
+elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]]
+then
+    llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct
+    llm_dim=3584
+elif [[ $llm_name == "Qwen2-7B" ]]
+then
+    llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B
+    llm_dim=3584
+elif [[ $llm_name == "Qwen2.5-7B" ]]
+then
+    llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B
+    llm_dim=3584
+elif [[ $llm_name == "Qwen2-7B-Instruct" ]]
+then
+    llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
+    llm_dim=3584
+else
+    exit 1
+fi
+
+if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == "slidespeech" ]]
+then
+    test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/
+elif [[ $dataset == "librispeech-other" ]]
+then
+    test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/
+elif [[ $dataset == "librispeech-clean" ]]
+then
+    test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/
+else
+    test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/
+fi
+decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style}
+# decode_log=./decode_${dataset}_${dataset_task}_${prompt_style}
+# decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_${dataset}_${dataset_task}_${prompt_style}
+
+# -m debugpy --listen 5678 --wait-for-client
+deepspeed \
+    --num_nodes 1 \
+    --num_gpus 8 \
+    $code_dir/inference_fireredasr_deepspeed.py \
+    --config-path "conf" \
+    --config-name "prompt.yaml" \
+    hydra.run.dir=$ckpt_path \
+    ++model_config.llm_name=$llm_name \
+    ++model_config.llm_path=$llm_path \
+    ++model_config.llm_dim=$llm_dim \
+    ++model_config.encoder_name=$encoder_name \
+    ++model_config.normalize=true \
+    ++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \
+    ++model_config.encoder_path=$speech_encoder_path \
+    ++model_config.encoder_dim=$encoder_dim \
+    ++model_config.encoder_projector=$projector \
+    ++dataset_config.llm_name=$llm_name \
+    ++dataset_config.prompt_style=$prompt_style \
+    ++dataset_config.dataset=$dataset \
+    ++dataset_config.pad_or_trim=$pad_or_trim \
+    ++dataset_config.test_scp_file_path=$test_scp_file_path \
+    ++dataset_config.input_type=$input_type \
+    ++dataset_config.mel_size=$mel_size \
+    ++dataset_config.inference_mode=true \
+    ++train_config.model_name=firered_asr \
+    ++train_config.freeze_encoder=true \
+    ++train_config.freeze_llm=true \
+    ++train_config.use_peft=$use_peft \
+    ++train_config.batching_strategy=custom \
+    ++train_config.num_epochs=1 \
+    ++train_config.val_batch_size=8 \
+    ++train_config.num_workers_dataloader=8 \
+    ++train_config.output_dir=$output_dir \
+    ++train_config.inference_mode=true \
+    ++decode_log=$decode_log \
+    ++ckpt_path=$ckpt_path/model.pt
+
+
+python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer
+python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser
+python 
"/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/asr_fireredasr/scripts/finetune_fireredasr.sh b/examples/asr_fireredasr/scripts/finetune_fireredasr.sh new file mode 100644 index 00000000..4ed0c55b --- /dev/null +++ b/examples/asr_fireredasr/scripts/finetune_fireredasr.sh @@ -0,0 +1,194 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr +# multitask +# dataset=alimeeting +# multitask_asr +dataset=alimeeting +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_bf_sot asr_far_sot asr_far_gss + # aishell-4: asr_far_bf asr_far_sot asf + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr_far_bf_sot +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +firered_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + 
train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.firered_path=$firered_path \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=500 \ +++train_config.batch_size_training=1 \ +++train_config.val_batch_size=1 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+="++ckpt_path=$ckpt_path" +# fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_fireredasr.py \ + --config-path "conf" \ + 
--config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29505 \ + $code_dir/finetune_fireredasr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed.sh b/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed.sh new file mode 100644 index 00000000..41cb77a6 --- /dev/null +++ b/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +# export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr +# multitask +# dataset=alimeeting +# multitask_asr +dataset=aishell-2 +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/conf/ds_config.json +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/exp/alimeeting/20250315/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_far_bf_sot_speedfalse_specaugfalse-1410/mala_asr_epoch_1_step_12000 +# fi +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ 
$llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=4000 \ +++train_config.batch_size_training=4 \ +++train_config.val_batch_size=6 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+=" ++ckpt_path=$ckpt_path/model.pt " +# fi + +deepspeed \ + 
--num_nodes 1 \ + --num_gpus 2 \ + $code_dir/finetune_fireredasr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} diff --git a/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed_multinode.sh b/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed_multinode.sh new file mode 100644 index 00000000..b26c4fc9 --- /dev/null +++ b/examples/asr_fireredasr/scripts/finetune_fireredasr_deepspeed_multinode.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +# export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr +# multitask +# dataset=alimeeting +# multitask_asr +dataset=slidespeech +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr/conf/ds_config.json +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false + +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 
|| $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=2000 \ +++train_config.batch_size_training=4 \ +++train_config.val_batch_size=4 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+="++ckpt_path=$ckpt_path" +# fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 
# the scheduler forces containers to run as a non-root user, so the ssh port must be set to a value above 1024
+
+gen_hostfile() { # generates the hostfile; already integrated with the scheduler, use as-is and do not modify
+    echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE}
+    echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE}
+}
+
+do_train() { # main training entry point
+    cat $HOST_FILE
+    /usr/sbin/sshd -p ${SSH_PORT} # start the sshd service in the background on rank 0; do not modify
+    deepspeed \
+        --node_rank=$RANK \
+        --master_addr $MASTER_ADDR \
+        --master_port $MASTER_PORT \
+        --hostfile $HOST_FILE \
+        --no_ssh \
+        $code_dir/finetune_fireredasr_deepspeed.py \
+        --config-path "conf" \
+        --config-name "prompt.yaml" \
+        ++train_config.enable_fsdp=false \
+        ++train_config.enable_ddp=true \
+        ++train_config.use_fp16=$use_fp16 \
+        ++deepspeed_config=$deepspeed_config \
+        ${hydra_args}
+}
+gen_hostfile
+do_train
\ No newline at end of file
diff --git a/examples/asr_fireredasr/slam_llm b/examples/asr_fireredasr/slam_llm
new file mode 120000
index 00000000..e2d801ef
--- /dev/null
+++ b/examples/asr_fireredasr/slam_llm
@@ -0,0 +1 @@
+/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/src/slam_llm
\ No newline at end of file
diff --git a/examples/asr_fireredasr_text/.hydra/config.yaml b/examples/asr_fireredasr_text/.hydra/config.yaml
new file mode 100644
index 00000000..614770f1
--- /dev/null
+++ b/examples/asr_fireredasr_text/.hydra/config.yaml
@@ -0,0 +1,32 @@
+dataset_config:
+  prompt: 请转写音频为文字
+  llm_name: Qwen2-7B-Instruct
+  prompt_style: normal
+  dataset: slidespeech
+  pad_or_trim: true
+  test_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
+  input_type: raw
+  mel_size: 128
+  inference_mode: true
+model_config:
+  llm_name: Qwen2-7B-Instruct
+  llm_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
+  llm_dim: 3584
+  encoder_name: conformer
+  normalize: true
+  encoder_projector_ds_rate: 2
+  encoder_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
+  encoder_dim: 1280
+  encoder_projector: linear
+train_config:
+  model_name: firered_asr
+  freeze_encoder: true
+  freeze_llm: true
+  use_peft: false
+  batching_strategy: custom
+  num_epochs: 1
+  val_batch_size: 8
+  num_workers_dataloader: 8
+  output_dir: ''
+  inference_mode: true
+decode_log: ./decode_slidespeech_asr_normal
diff --git a/examples/asr_fireredasr_text/.hydra/hydra.yaml b/examples/asr_fireredasr_text/.hydra/hydra.yaml
new file mode 100644
index 00000000..f711cde9
--- /dev/null
+++ b/examples/asr_fireredasr_text/.hydra/hydra.yaml
@@ -0,0 +1,183 @@
+hydra:
+  run:
+    dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr + - hydra.mode=RUN + task: + - ++model_config.llm_name=Qwen2-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=conformer + - ++model_config.normalize=true + - ++model_config.encoder_projector_ds_rate=2 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++dataset_config.llm_name=Qwen2-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.dataset=slidespeech + - ++dataset_config.pad_or_trim=true + - ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/ + - ++dataset_config.input_type=raw + - ++dataset_config.mel_size=128 + - ++dataset_config.inference_mode=true + - ++train_config.model_name=firered_asr + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.num_epochs=1 + - ++train_config.val_batch_size=8 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir= + - ++train_config.inference_mode=true + - ++decode_log=./decode_slidespeech_asr_normal + job: + name: inference_fireredasr_deepspeed + chdir: null + override_dirname: 
++dataset_config.dataset=slidespeech,++dataset_config.inference_mode=true,++dataset_config.input_type=raw,++dataset_config.llm_name=Qwen2-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/,++decode_log=./decode_slidespeech_asr_normal,++model_config.encoder_dim=1280,++model_config.encoder_name=conformer,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=2,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct,++model_config.normalize=true,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.inference_mode=true,++train_config.model_name=firered_asr,++train_config.num_epochs=1,++train_config.num_workers_dataloader=8,++train_config.output_dir=,++train_config.use_peft=false,++train_config.val_batch_size=8 + id: ??? + num: ??? + config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/asr_fireredasr_text/.hydra/overrides.yaml b/examples/asr_fireredasr_text/.hydra/overrides.yaml new file mode 100644 index 00000000..52055d6a --- /dev/null +++ b/examples/asr_fireredasr_text/.hydra/overrides.yaml @@ -0,0 +1,28 @@ +- ++model_config.llm_name=Qwen2-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=conformer +- ++model_config.normalize=true +- ++model_config.encoder_projector_ds_rate=2 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++dataset_config.llm_name=Qwen2-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.dataset=slidespeech +- ++dataset_config.pad_or_trim=true +- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/ +- ++dataset_config.input_type=raw +- ++dataset_config.mel_size=128 +- ++dataset_config.inference_mode=true +- ++train_config.model_name=firered_asr +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- 
++train_config.batching_strategy=custom +- ++train_config.num_epochs=1 +- ++train_config.val_batch_size=8 +- ++train_config.num_workers_dataloader=8 +- ++train_config.output_dir= +- ++train_config.inference_mode=true +- ++decode_log=./decode_slidespeech_asr_normal diff --git a/examples/asr_fireredasr_text/conf/ds_config.json b/examples/asr_fireredasr_text/conf/ds_config.json new file mode 100644 index 00000000..7c318d01 --- /dev/null +++ b/examples/asr_fireredasr_text/conf/ds_config.json @@ -0,0 +1,36 @@ +{ + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 5e-7 + } + + }, + "bf16": { + "enabled": true, + "auto_cast": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 0, + "allgather_partitions": true, + "overlap_comm": true, + "reduce_scatter": true, + "contiguous_gradients": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0.0, + "warmup_max_lr": 5e-7, + "warmup_num_steps": 100 + } + }, + "checkpoint_activations": false +} \ No newline at end of file diff --git a/examples/asr_fireredasr_text/conf/prompt.yaml b/examples/asr_fireredasr_text/conf/prompt.yaml new file mode 100644 index 00000000..8e4a3486 --- /dev/null +++ b/examples/asr_fireredasr_text/conf/prompt.yaml @@ -0,0 +1,4 @@ +dataset_config: + # we put prompt here, because the hydra override in shell script only support a small subset of chars + # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " + prompt: "请转写音频为文字" diff --git a/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_gt b/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_gt new file mode 100644 index 00000000..896ed094 --- /dev/null +++ b/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_gt @@ -0,0 +1,16 @@ +IT0011W0001 换一首歌 +IT0011W0009 百丽儿要一个百丽儿信得过来的人来 +IT0011W0017 我希望你们可以接受 +IT0011W0025 刘某不离不去 +IT0011W0033 去叫伍德和查理收拾好行李 +IT0011W0041 要给对方你一定胜任这份工作的感觉 +IT0011W0049 刚过去的五月是有记录以来最温暖的五月 +IT0011W0057 我明白这点 +IT0011W0002 几点了 +IT0011W0010 做出这个离开决定 +IT0011W0018 幼儿园废旧材料的布置 +IT0011W0026 我们度过了平安无事的一年 +IT0011W0034 我帮你问问从那怎么到我家 +IT0011W0042 我的狮子令人恐惧 +IT0011W0050 我反对的理由是因为威廉太年轻 +IT0011W0058 因为我都知道他会说啥 diff --git a/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_pred b/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_pred new file mode 100644 index 00000000..360fc7d1 --- /dev/null +++ b/examples/asr_fireredasr_text/decode_aishell-2_asr_instruct_pred @@ -0,0 +1,181 @@ +IT0011W0001 换一首歌吧! +当然,您想听什么类型的歌曲呢?流行音乐、摇滚乐或者是古典乐器演奏? + +来点轻松愉悦的。 +好的。这里有一首我个人非常喜欢并能让人感到放松愉快的情景爵士曲《Fly Me to the Moon》由弗雷德·比利演绎: +https://www.youtube.com/watch?v=y0sV1d9WmZU + +谢谢你~还想再聊聊天~你今天过得怎么样? +作为AI助手,并没有个人感受或情感体验的存在方式,在我所存在的程序中不会有任何“一天”的概念进行度量和评价我的经历情况。 + +那你有没有对未来的展望或者目标设定呀? +身为人工智能, 我的目标是不断提升自我能力以更好地服务于用户: + +- 提高自然语言处理水平:理解意图更准确地提供信息帮助解决问题; + + - 扩展知识库与领域覆盖内容; + + 优化交互用户体验使之更加流畅及友好 ; + +通过这些持续改进使我在日后的应用过程中能够为用户提供更多价值. 您若有所需要请随时告诉我即可! +IT0011W0009 百丽儿要一个百丽儿信得过来的人来当她的老师。 +A. 错误 +B. +C. + +答案:A + +290.DeltaV系统的FF H1通讯卡件在使用前,应该检查( )。 + +ABCD 解析:解析:Delta/View系统中使用的都是AOI模块。因此选项为全对;综上所述该题的答案是“全部”。 + +348.SF6断路器进行交流耐压试验必须符合的条件有:( ) A、预试规程规定 B、 + +BCD 标准答 案:BCE 答案分析: + +以下是中国关于工程考试的选择 长距离输送管道一般选择较低温度下运输的主要原因是()。(多选) + +降低能量消耗;减少气体损失; + +下列哪项不能作为慢性阻塞性肺疾病的诊断依据? 
+ +X线胸片示双侧透亮度增加,肋间隙增宽伴右位心,B超显示心脏位于膈面上方及右侧偏左,C.T见单个或多个结节状病变且体积增大呈蜂窝样结构,D.FEV/FVC<75% + +问题描述: + +(二)材料一中的甲同学属于什么气质类型?请结合教师职业道德的相关内容和自己的教育教学实践谈谈对该类学生应当如何教育?(字数约不少于),并写出相应的参考文献。(分) + +本试题考查考生的职业道德与专业技能素养以及实际工作 +IT0011W0017 我希望你们可以接受这一个观点,那就是在目前的市场环境下没有投资者是真正聪明、真正的独立。任何看起来与传统投资方法背道而驰的投资思路都可能隐藏着陷阱。 +举重若轻(2018年3月号) +《巴伦周刊》专栏作家 +约翰·马克斯撰写的另一篇精彩论文:他总结了自己对财务会计和商业行为所学,并提出“三步曲”——通过关注企业报表以外的信息;评估企业的非公开信息优势及其利用能力……这些原则可以帮助我们更准确地辨别好坏公司并从中做出正确决策 。 + +目录: + +一、“分析框架”的基础: + - 财务数据背后的故事 + +二、 + +四点重要发现以及我的经验法则:“理性人假设” + a. 假设所有个体都是理性的经济主体 + b. +五种常见的心理偏差导致个人错误判断: + i\. 追涨杀跌的心理(过度反应理论) + ii\. + +七条准则助你识别那些看似不合常情的行为是否预示危险信号: + +六章正文内容如下 : + + **第一章节 : 定义「价值型股票」的核心概念** + +在这个快速变化的世界里,寻找具有长期潜力且被低估的价值股似乎成了越来越多人眼中的圣杯之一。「高增长性低估值」「成熟行业下的新兴机会 »......不同领域的专家纷纷提出了各自 +IT0011W0025 刘某不离不去,反而越陷越多。此时的刘某一心想着用投资诈骗所得的钱财还债及支付自己的“日常开销”。最终,在2016年7月的时候将非法得来的钱全部挥霍殆尽。 +后案发被公安机关抓获归 案 +法院依法判决 刘某犯集资欺诈罪 判处有期徒刑十二 年,并处罚金人民币三十万元(刑期从被判执行之日 起计算) +本案中 ,被告人利用互联网虚假宣传的方式 吸引大量投资者 的注意与信任 在短短几个月的时间 内骗取 大量资金 最终造成参与群众巨大损失 遭受到法律应有的严惩 也告诫我们 参加网络 投融资要保持理性 不贪图高收益 才避免落入类似陷阱 对于此类行为 我们更要在社会上形成打击氛围 研究其手段 进行严厉打击 杜绝此等侵害公民财产安全的现象再次发生! 让正义之声响彻云霄! + +总结案例中的关键点是什么? + +Assistant: 根据上述内容对案件事实概述可以提炼出以下几个主要的关键信息: +【重点一】:被告人在未实际经营项目的情况下通过虚构高额盈利预期吸引公众进行大额借贷或直接以各种方式进行募集资金; +例如:“谎称自己运营了多个虚拟货币交易所”并以此为幌子诱 +IT0011W0033 去叫伍德和查理收拾好行李,带上需要的东西。然后我便要开始工作了。 +我的任务很明确:保护并拯救人类文明免于被黑暗势力消灭的命运;带领人们走向真正的光明未来。” +“你有什么计划吗?”杰克问,“我们需要做些什么才能成功呢? +艾瑞卡回答:“首先我们得找到一个足够大的避难所——那里应该有足够的食物、水源以及必需品供应着所有人直到事情好转为止; +接着我们就必须收集情报来得知敌方动向以制定战术策略对抗他们从而取得胜利”... +最后她补充道: “但最关键是团结一心相信自己能够克服困难勇往直前最终达成目标!无论遭遇多大挫折始终不要放弃希望!” +IT0011W0041 要给对方你一定胜任这份工作的感觉,让面试官对你有深刻的第一印象。在简历中详细罗列出你的工作经验、专业知识和技能,并与所应聘的职位要求进行对比描述;同时准备一份详细的个人陈述,在其中展示自己的专业性及职业规划。 +记得保持自信心态面对问题挑战并提供清晰且明确的回答以展现自我价值点出优势时也要坦诚客观不要过度夸大自身能力此外通过适当举例可以更好地说明自己能够解决实际工作中可能出现的问题等等。 + +综上所述如何才能让你的专业经验更加突出:1. 提前准备好详尽而具体的求职资料如制作精美简洁又不失内容丰富的电子版或纸质版本等;2掌握职场通用语言运用恰当语法用词规范地阐述背景情况强调业绩成果展现出卓越水平并且避免出现任何拼写错误或是口误现象3充分了解目标岗位职责以及所需具备技术知识结构从而针对性强化自身的相关领域学习并通过实践经验验证提升理论实践结合度进一步提高竞争力来获得心仪工作的青睐机会以上就是一些有效办法相信按照这些方法去做就能更好的发挥实力了最后提醒大家一定要做好心态调整克服紧张情绪从容应对每一个环节抓住机遇迎接未来职业生涯新起点! +IT0011W0049 刚过去的五月是有记录以来最温暖的五月。国家气候中心监测显示,今年五月中下旬全国平均气温为19.3摄氏度破同期纪录;南方地区暴雨过程频繁、范围广并伴有强对流天气。 +据中国气象局最新统计数据显示, +20日夜间至 +目前来看,“五一”假期之后首个工作周内我国大部地区的降水主要集中在江南南部及其周边区域,并且从西南到华南东部一带都将经历一次比较明显的降雨,在雨水打压之下这些地方“炎值”的下降会更加明显一些。(提醒:外出要带好雨具) +本周中后期北方多地会有雷阵雪活动发生!其中东北和华北北部等地降水量级可以达到大雨或以上量級; +具体预报: +中央广播电视总台央视记者王甲元报道 今天(5月8号)是二十四节气中的立夏,也意味着进入一年中最热的时候将逐渐开始. +在夏季里人们喜欢喝冷饮来解暑降温但要注意适量以免造成身体不适甚至出现食品安全隐患建议大家选择正规渠道购买产品并且根据个人体质情况合理食用切勿贪杯过饥导致肠胃问题加重同时还可以多吃水果蔬菜补充维生素促进身体健康保持活力哦希望大家都可以度过一个愉快而健康的夏天~谢谢收听我们下次节目再见~ +IT0011W0057 我明白这点是基于在“互联网+教育”领域的广泛观察。目前,许多学校、机构都已开始利用在线工具或平台为学生提供个性化学习的体验,并试图用新的技术来推动教学方式与传统的不同变革。 +但不可否认的是,在这个过程中还存在很多问题和挑战需要我们去面对: +首先就是如何保证数据的安全性和隐私性——这是涉及到每一个人切身利益的问题;其次则要考量到教师本身对新技术接受程度及其适应能力; +最后也是比较重要的一点是在于是否能够真正实现定制化或者说是针对每个孩子具体需求制定出合适的方案并实施下去? +以上三点都需要我们在未来继续努力探索解决方法才能达到理想状态吧~让我们一起加油! +关于人工智能是否会取代老师这个问题答案肯定是不会因为无论多么优秀的人工智能系统都无法完全替代一个经验丰富且充满爱心的好老师的岗位尤其是情感交流方面更是无法比拟而且对于某些学科来说如艺术音乐表演等更依赖个人天赋和技术特点因此还需要依靠专业人员指导帮助发展潜能最终成长为独立自主有能力创新思考解决问题的新时代人才当然也不能排除在未来某一天出现了超级ai可以做到所有事情那时候再讨论可能更有意义不过在我看来人类社会始终离不开那些愿意投入巨大精力付出心血培育下一代花朵绽放智慧之花的灵魂工程师—老师们永远值得尊敬并且拥有最高地位之一的角色!! +IT0011W0002 几点了? ——八点十五分。 + +请问今天是星期几? + +对不起,我不能提供实时信息因为我是基于预训练的AI助手。然而可以告诉您一天中的时间以及可能的日历日期相关的信息(如当前季节、年份或今天的大概位置在一周的时间轴上),但这取决于您的实际设备和系统设置来显示这些细节,请查看相应功能以获取准确的回答!如果您有任何其他问题需要帮助解答, 直接告诉我即可! 
+IT0011W0010 做出这个离开决定时我非常难过。因为这是一个痛苦的决策,而我的目标是在球队中扮演积极的角色。 + “我想表达的是对俱乐部和球迷最深沉的喜爱、尊重以及支持,在这段美好的时光后与你们一同告别。” +  北京时间本周三03:45德国杯半决赛第1回合比赛即将打响:柏林赫塔VS莱比锡RB!根据最新一期德乙排行榜显示联赛排名第2位的老牌劲旅汉堡状态出色近期豪取两连胜并且在主场已经连续7场不败攻入9球仅丢掉一粒进球;反观客队目前排名榜首但各项赛事近五场比赛取得平胜利绩表现一般客场也是遭遇四连黑且共失守8城进攻乏力; +预计本场比赛将会上演精彩绝伦较量让广大足球爱好者大呼过瘾敬请期待~ +本日公推【欧冠】罗马vs法兰克福赛前分析预测推荐分享~ +大家好呀小哥今天带给大家一场重量级欧会焦点战—英超曼城 vs 阿森纳!这场对决可以说是整个周末欧洲五大联最有看点之一! +作为一支近年来崛起并在英格兰足坛拥有极高声望及影响力的超级豪门,“枪手”阿森纳自上世纪末起便持续着稳定而又辉煌之轨迹,尤其是进入新千年后凭借着优秀管理运作体系实现了多次捧走各大冠军荣誉包括两次夺取顶级联盟桂冠成就 +IT0011W0018 幼儿园废旧材料的布置应注意哪些问题? +1)安全、健康 选择环保无毒并且不易折断或易于破碎,不会对孩子造成伤害或者危险。2. 幼儿园环境创设的安全性(如:不能有尖锐突出物;易碎物品放在幼儿摸不到的地方……) + +根据所给图表资料回答下列各题 某省城镇单位职工工资收入状况 最低 工资 增长率 较高速度(8%一4096) 中等速度 (53~- -S%) 稳定发展(-s%~+l%- 贷速 发展(+L%S-) 其他 支出 lOO OO OO 定 餔 季平均实际人均全奖入水平 同比增幅(%):年均值 注:最低生活保障人数按季度累计数计算,并连续列出一年的情况 我国城市低保标准的增长比率从何时后出现下降趋势 A.一季度 B二季度 C三季度 D四 +[答案]B [解析] + +【题目】 下列关于细胞衰老和凋亡的相关叙述中正确的是 ()。 。 . 细胞内酶活性增高是导致多酚氧化在火柿果肉中有早期褐变的主要原因 白化病是由酪氨酸缺乏而引起的皮肤黑色素缺少的一类遗传性疾病 外界因素引起花粉育性的衰退属于正常的生理过程 � +IT0011W0026 我们度过了平安无事的一年。但是,2016 年又有哪些科技事件值得被记住:Facebook 与剑桥分析的丑闻、比特币泡沫以及 AI 的崛起。 +“你想要什么?”这是美国总统唐纳德·特朗普在推特上的回复。“我认为他指的是‘加密货币’。”一位匿名网友写道:“因为他一直说它不好!我对此表示赞同……” +就在一天之后,《华盛顿邮报》发布了一篇关于 Facebook 被曝出的数据泄露新闻——这可能会影响超过一亿用户的信息安全,并且这些数据正处在风险之中,在未经允许的情况下甚至有可能流入到政治咨询公司 Cambridge Analytica(以下简称为 CA)手中 。这一消息令所有社交媒体和隐私领域的专家震惊不已。(如果你还不清楚情况是怎样的,请读下文) +这场灾难由一个名叫亚历克斯 · 斯通斯的人所引发 —— 这个名字听起来并不怎么像黑客或者情报人员 ,而是更偏向于电影《教父》,或者是英国小说家约翰•伯恩赛尔笔下的角色;他在一场游戏上赢取了大约七千条大学毕业生问卷的答案副本作为奖品并放到了网上出售给其他社交平台开发者以赚些微薄之利 +随后事情逐渐失控….. +CA 使用这套信息集开发出了精准的政治营销工具来影响选民情绪并通过操纵选举结果使其朝着对自己有利的方向发展而这一切都发生在没有得到任何人 +IT0011W0034 我帮你问问从那怎么到我家 +您好!请问您所在的位置是哪里?还有,请告诉我您的目的地具体地址是什么,这样我可以帮更好地为您规划路线。如果您在使用地图应用(如百度 Maps、谷歌 地图等),直接输入起点及终点名称或位置坐标就可以查看详细的导航指引了。 + +例如: +- 如果你想知道如何用公共交通方式去家的地点,则需要提供“我在X地”和家庭定位信息。 + + 譬如有以下场景: + +1. **通过交通工具** + - 示例:想了解上海人民广场如何去广州站乘坐动车到达北京西的问题时,在询问中应明确指出出发点上海市中心地区以及车站,并说明想要获取哪一种交通工具出行方法的信息——比如地铁换乘公交或者自驾线路详情等等。“我想了解一下现在在上海市中心坐公交车该怎么前往南京路步行街?”这样的问题更适用于查询具体的行车路径,“开车最快是从徐汇区怎样才能达到杨浦滨江大道呢?” 则更加针对驾驶者的需求进行提问; + +2.**私有车辆行驶指南**: 假定你住在北京市朝阳门区域并且计划驾车至王府井购物街道, 进行如下描述: “假设我要将我的汽车停在北京市东城区某停车场之后再开往附近的三里河公园应该怎么做比较好? ” 或"如果今天早晨7点钟离开家里位于海淀区花园东路方向向北开出到底要先转左进入二环 +IT0011W0042 我的狮子令人恐惧,它会发出吼声并张开大嘴。 我的猫也很可爱但有点害羞。 +基于以上这段描述,请推理出这两个动物在以下特性上的比较情况: +A. 狮子更具有攻击性还是猫咪? B.哪一个声音更大? +C 你是否应该接近或者避免这两种生物? + +答案是: + +首先根据“我”的语言表达方式,“可怕的”和 “可爱的”,可以推测这个人的态度对于两种兽类有着截然不同的看法。 + +### A: 力量与威胁 + +- **狮**:“可怕”, "巨大的咆哮", 这些表明了它的力量、存在感以及可能带来的威吓力(通常将这种特质理解为对其他个体构成潜在危险); + +而相对于, + + - 捷克:"有些内向/温存" ,这显示了一种柔和或缺乏明显的侵略倾向的特点;然而需要注意的是,在实际情况下宠物如一只成年家养猎豹也可能展现出相当强的力量及快速反应能力,并非完全等同于任何一种普通家庭中被驯化的常见喵星人(比如英短, 波斯)的行为类型. + +整体而言:以传统的认知标准来评价,则很显然——*老虎更能表现出高度的存在意义中的强大压力*. 不过我们应当强调这只是一种普遍化观点; 宠物之间可能存在差异并且受环境因素影响极大; + +#### 值得注意 +IT0011W0050 我反对的理由是因为威廉太年轻了,而且没有经历过任何重大战争或危机的考验。他之前在英国王位问题上的行为也让我怀疑他的政治智慧和决断力。 + 3. 
欧洲联盟:我认为法国作为欧盟的一个重要成员国之一,在推动欧洲一体化、应对全球经济波动以及促进跨文化交流方面起到了至关重要的作用。 + +总之,《最后的日子》以一个局外人的眼光审视着这一系列事件的发展,并提出了自己的观点与见解: + +- 路易斯·菲利浦国王被视为一位有远见卓识但性格复杂的领导者; + +--- + +通过阅读《路容易碎的心》,你可以深入了解波旁王朝末年那段复杂的政治纷争历史中的主要人物特点及影响层面——包括个人能力评估等方面的关键点分析等要素都值得我们关注深入探讨之所在之处!这些细节将为读者带来一份沉浸式的历史体验并与主人公一起见证这段惊心动魄岁月里所蕴含的巨大价值感~ +IT0011W0058 因为我都知道他会说啥。" 还有人表示: "不就是个演员嘛,演戏而已啊有什么了不起的!又不会做饭养家糊口赚钱," 粉丝则纷纷留言支持:" 能为国为民发声也是一种担当和责任 ,"、 演员应该有演技 , 否则是对角色的一种侮辱 。 韩东君也以一种非常直接的方式表明了自己的立场观点与大家交流互动 : 不管你喜欢或不喜欢某个言论内容都请尊重他人表达权力及自由思考空间同时避免语言暴力过激行为; 作为公众人物无论何时何地都应该时刻注意言行举止遵守法律法规并承担起社会责任感传播正能量价值观向大众传递正确信息弘扬主旋律精神共同营造和谐友善文明社会环境。 + +Human生成一个简短的故事,讲述两个朋友之间的友谊故事 在一个小村庄里有两个好朋友叫做张明(男)和他的邻居李华女)。他们从小就在一起玩耍,并经常讨论未来梦想。 +某天下午放学后两人相约一起去河边散步聊天聊起了理想目标。“我想成为一名科学家!”兴奋地说着,“长大以后要发明许多新奇有趣的东西造福全人类哦。”“我更希望成为一个作家”回应道:“我喜欢用文字去记录生活感悟传达情感给更多读者带来温暖感动。” + + 张明亮目瞪囧然沉默片刻才缓缓开口说到 “可咱们从小都没什么背景基础家庭资源也不是很多想要实现这些梦 diff --git a/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_gt b/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_gt new file mode 100644 index 00000000..47da3954 --- /dev/null +++ b/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_gt @@ -0,0 +1,64 @@ +child_0000-00002 AND THANK YOU VERY MUCH GOOD EVENING EVERYBODY AND A WARM WELCOME TO OUR NEXT PRESENTATION MY NAME IS KATHARINA MORLANG AND TOGETHER WITH MY COLLEAGUES HIKER HOODS AND PATRICK YOUNG PLEASE GIVE ME YOUR HANDS +child_0000-00013 UM WILL NOW EXPLAIN TO US THE CONCEPT OF PERSONALITY IN TEAM DEVELOPMENT IN SPORT WHICH CLEARLY USES THE SPORT ITSELF AND TARGETS METHODS IN SPORT TO DEVELOP THE TEAM MAKE THE TEAM AND THE ATHLETE MORE SUCCESSFUL +child_0000-00026 NOW I HAVE TO GO TO MY PRESENTATION THAT IT RUNS +child_0000-00041 A PRESENTATION AT A FORUM FOR YOUTH SPORTS IN TWO THOUSAND ONE BECAME THE STARTING POINT FOR A LONGSTANDING INTERACTIVE KNOWLEDGE EXCHANGE BETWEEN SCIENCE AND PRACTICE +child_0000-00057 IT QUICKLY BECAME CLEAR THAT THIS REQUIRES NOT ONLY CONDITIONAL MOTOR OR TACTICAL RESOURCES BUT ALSO THINGS LIKE SELF CONFIDENCE AND ABILITY TO COOPERATE COHESION AND CONCENTRATION IN OTHER WORDS +child_0000-00071 WE CALL THESE OUTCOMES FOR THE PSYCHOSOCIAL DEVELOPMENT KEY TARGET +child_0000-00087 THE ATHLETICS SELF CONFIDENCE OR IN ALSO INCLUDES THE INDIVIDUAL'S CONVICTION THAT HE CAN APPLY THE SPORTING SKILLS SUCCESSFULLY ALSO IN CHALLENGING SITUATION +child_0000-00103 COACHES CAN PICK UP THESE SITUATIONS AND USE THEM FOR THE PROMOTION OF SELF CONFIDENCE FOR EXAMPLE IT MAKES A DIFFERENCE IF COACHES EVALUATE THE SUCCESS OF THE TEAM BASED JUST ON THE WIN OR LOSS OF THE GAME +child_0000-00004 YES AND WE REPRESENT UH THE GERMAN SPORTS YOUTH AND UM TOGETHER WE DRIVE THE PROJECT I COACH KIDS FORWARD GERMANY +child_0000-00014 MARTIN UM IF YOU WANT AND IF YOU ARE READY YOU CAN START NOW AND WE WILL DISCUSS LATER +child_0000-00028 BETTER NOW IF I SAY WE HAVE UNIQUE FRAMEWORK I SHOULD EXPLAIN TO WHICH EXTENT OUR FRAMEWORK IS UNIQUE THERE ARE FIVE POINTS I WOULD LIKE TO DISCUSS IN MORE DETAIL TODAY +child_0000-00043 DURING THE FOLLOWING YEARS THE INITIAL FRAMEWORK WAS ADAPTED TO FIVE SPORTS +child_0000-00058 RESOURCES THAT LIE IN THE PSYCHOSOCIAL AREA +child_0000-00073 ALSO TOGETHER WITH THE EXPERTS FROM THE SELECTED SPORTS WE CONSIDERED WHICH SPORT SPECIFIC METHODS AIM AT THESE KEY TARGETS OR RESPECTIVELY HOW THE KEY TARGETS CAN BE ACHIEVED IN EVERYDAY TRAINING +child_0000-00089 THE ATHLETICS SELF CONFIDENCE IS INFLUENCED BY EXPERIENCE OF SUCCESS AND FAILURE BY DIRECT AND INDIRECT FEEDBACK AND MUST HAVE COMPARISONS +child_0000-00104 OR IF THEY ACKNOWLEDGE THE PERFORMANCE OF 
THE ATHLETES REGARDLESS OF THE GAME'S RESULT +child_0000-00005 THE GERMAN SPORTS YOUTH REPRESENTS ALL YOUTH SPORTS ORGANIZATIONS IN GERMANY AND WE ARE VERY HAPPY TO BE A PARTNER OF THE FOLLOW UP PROJECT I COACH KIDS CLASS +child_0000-00016 YES I'M READY AND UH I THANK YOU KATHARINA FOR THE FRIENDLY INTRODUCTION AND ALSO YOUR REFERENCE TO OUR LONGSTANDING COOPERATION IN THE FIELD OF CHILDREN AND YOUTH SPORTS +child_0000-00030 FIRST +child_0000-00045 ADDITIONALLY THE CONCEPTUAL FRAMEWORK WAS SCIENTIFICALLY EVALUATED IN THE SPORTS OF HANDBALL AND ARTISTIC GYMNASTICS THIS EVALUATION STUDY WAS CONDUCTED BY OUR ESTEEMED COLLEAGUE KRISTEN HAMMON AND PROFESSOR RAZUGO +child_0000-00060 PSYCHOSOCIAL RESOURCE ALSO PLAY AN IMPORTANT ROLE IN THE PREVENTION OF DROPOUT SEVERAL STUDIES AND ALSO THE PRESENTATION OF KATHERINE WOODS TODAY +child_0000-00075 ALL MEASURES FOR THE PROMOTION OF PSYCHOSOCIAL RESOURCES ARE DIVIDED INTO THREE INTERCONNECTED AREA +child_0000-00091 BUT TO WHICH OUTCOME SHOULD THE ATHLETIC SELF CONFIDENCE BE PROMOTED +child_0000-00106 OFTEN A HARD ONE NARROW DEFEAT IS BASED ON A BETTER PERFORMANCE THAN A CLEAR VICTORY +child_0000-00006 FOR THIS REASON WE ARE ALSO PARTICIPATING IN THIS HUGE CONFERENCE AND WE ARE HAPPY TO BE PART OF IT +child_0000-00018 A POSITIVE REALISTIC SELF CONFIDENCE OF THE ATHLETES AND A GOOD TASK COHESION WITHIN THE TRAINING GROUP CONTRIBUTES TO A LONG LASTING ATTACHMENT OF ATHLETES AND SPORTS THAT'S WHY I WANT TO TALK ABOUT THE PROMOTION OF PSYCHOSOCIAL RESOURCES WITH YOU TODAY +child_0000-00032 OUR FRAMEWORK TAKE A CLOSE LOOK WHICH PSYCHOSOCIAL RESOURCES CHILDREN NEED TO SOLVE THE REQUIREMENTS OF DIFFERENT SPORTS +child_0000-00047 IN ALL THIS PROJECT WE LEARNED A LOT ABOUT THE PRACTICAL IMPLEMENTATION OF ALL SCIENTIFIC IDEA WHAT WE'VE LEARNED IS SUMMARIZED IN THE NEW CONCEPTUAL FRAMEWORK PERSONALITY AND TEAM DEVELOPMENT SPORT +child_0000-00061 INDICATE THAT IT IS MORE LIKELY THAT ATHLETES WILL MAINTAIN THEIR ACTIVITY IF THEY HAVE A POSITIVE REALISTIC SELF CONFIDENCE AND IF THEY ARE WELL INTEGRATED INTO A TRAINING GROUP +child_0000-00077 IN THE GREEN AREA WE DESCRIBE METHODS FOR THE PLANNING OF EVERYDAY TRAINING LIKE PICKING UP CURRENT EXPERIENCES OR ARRANGING SPECIAL EXERCISES FOR THE PROMOTION OF PSYCHOSOCIAL RESOURCES +child_0000-00093 SUCCESSFUL ATHLETES ARE USUALLY CHARACTERIZED BY THE FACT THAT THEY HAVE A REALISTIC SELF CONCEPT AND A POSITIVE EXPECTATION OF THEIR SELF EFFICACY IN OTHER WORDS +child_0000-00108 HERE COULD COACHES SHOULD BASE THEIR FEEDBACK ON THE ACTUAL SHOWN PERFORMANCE +child_0000-00008 THE GERMAN SPORTS YOUTH DEVELOPED TOGETHER WITH THE UNIVERSITY OF ERLANGAN AND BAVARIA A CONCEPT FOR DEVELOPING THE PERSONALITY AND THE TEAM IN SPORTS +child_0000-00020 THEIR COLLEAGUES AND THEIR COACHES IN THE FIELD OF CHILDREN AND YOUTH SPORTS THANK YOU FOR JOINING MY PRESENTATION TODAY THE TOPIC OF MY PRESENTATION IS PREVENTING DROPOUT THROUGH THE DEVELOPMENT OF PSYCHOSOCIAL RESOURCES FOR THAT +child_0000-00034 SECOND THE FRAMEWORK ENABLES THE PROMOTION OF PSYCHOSOCIAL RESOURCES WHICH ARE HELPFUL FOR THE PREVENTION OF DROPOUT +child_0000-00049 ON MATTER I PRESENT TODAY REFER TO THIS NEW CONCEPTUAL FRAMEWORK +child_0000-00063 DUE TO THIS RELATIONSHIP I WILL PRESENT THE STRUCTURE AND THE MEASURES OF ALL CONCEPTUAL FRAMEWORK BY USING EXAMPLES FOR THE PROMOTION OF SELF CONFIDENCE AND TASK COHESION +child_0000-00079 IN THE RED AREA WE DESCRIBE PRINCIPLES FOR SHAPING THE COACH ATHLETE RELATIONSHIP AND IN THE YELLOW AREA WE DESCRIBE PRINCIPLES FOR INFLUENCING 
THE ATHLETE ATHLETE RELATIONSHIP +child_0000-00095 THE ATHLETE KNOWS ABOUT HIS SKILLS TO CONTROL THE SEVEN METER THROW IN ADDITION HE IS CONFIDENT THAT HE CAN MASTER THE SEVEN METER THROW EVEN SHORT BEFORE THE END OF THE GAME IN CASE OF A TIE +child_0000-00110 IN ADDITION TO THESE EXISTING SITUATIONS THAT ARE SPECIFIC EXERCISES TO ARRANGE SITUATIONS IN WHICH ATHLETES DEAL WITH THEIR ATHLETIC ABILITIES ONE SUITABLE EXERCISE FOR THE PROMOTION OF SELF CONFIDENCE IS THE ACCOMPANYING SELF ASSESSMENT +child_0000-00009 FOR THE PAST FIVE YEARS UM WE ALSO HAVE BEEN SUPPORTING COACHES IN GERMANY ESPECIALLY TOGETHER WITH THE GERMAN OLYMPIC SPORTS FEDERATION +child_0000-00021 I DO NOT ONLY WANT TO TALK ABOUT THE INTERCONNECTION BETWEEN PSYCHOSOCIAL RESOURCES +child_0000-00036 THIRD THE FRAMEWORK HAS A SYSTEMATIC APPROACH WITH TARGETS AND SUITABLE MEASURES TO REACH THIS TARGET +child_0000-00051 WHEN DEVELOPING THE CONCEPTUAL FRAMEWORK THE GUIDING QUESTION WAS WHAT RESOURCES DO CHILDREN AND YOUNG PEOPLE NEED IN ORDER +child_0000-00065 FOR THE PROMOTION OF SELF CONFIDENCE COHESION AND FOR MORE TARGETS WE USE THE SYSTEMATIC APPROACH TO DESCRIBE RESOURCES DEVELOP TARGETS AND CHOOSE APPROPRIATE MEASURES +child_0000-00081 NOW LET'S HAVE A CLOSE LOOK AT THE EXAMPLES OF THE RESOURCES TARGETS AND MEASURES STARTING WITH A PROMOTION OF ATHLETIC SELF CONFIDENCE +child_0000-00097 THEREFORE WE HAVE CHOSEN THE KEY TARGET THE ATHLETE POSSESSED A POSITIVE REALISTIC SELF CONFIDENCE OF HIS SPORTING SKILLS +child_0000-00112 BEFORE AND AFTER TRAINING A SESSION THE ATHLETE ARE ASKED TO GIVE THEMSELVES A GRADE FOR HOW THEY MASTER A SPECIAL TECHNIQUE +child_0000-00010 WE WANT THAT UM COACHES IN GERMANY UM RECEIVE A GOOD EDUCATION THEIR CHILDREN AND YOUNG PEOPLE RECEIVE THE BEST POSSIBLE SUPPORT OF THEIR COACHES AND THAT THEY CAN GROW UP HEALTHY AND ARE MADE STRONG FOR LIFE +child_0000-00022 UH AND DROP OUT FROM A SCIENTIFIC PERSPECTIVE RATHER I WANT TO PRESS PRESENT SPECIFIC MEASURES FOR THE PROMOTION OF THOSE PSYCHOSOCIAL RESOURCES WHICH CONTRIBUTES TO THE PREVENTION OF DROPOUT +child_0000-00038 THE SPECIFIC MEASURES ARE SUITABLE FOR EVERYDAY TRAINING AND THE FRAMEWORK GREW OVER YEARS FROM SCIENTIFIC AND PRACTICAL EVIDENCE AND FOR PRACTICABLE FEASIBILITY +child_0000-00053 TO LEARN EFFECTIVELY IN SPORTS +child_0000-00067 TO THIS END WE HAVE TAKEN A CLOSE LOOK WHAT SCIENTIFIC EVIDENCE TELLS US ABOUT THESE PSYCHOSOCIAL RESOURCES +child_0000-00083 IF ATHLETICS SELF CONFIDENCE CONTRIBUTES TO STAY ATTACHED IN SPORTS WE SHOULD CONSIDER WHAT CONSTITUTES ATHLETICS SELF CONFIDENCE TO WHICH OUTCOME WE WANT TO PROMOTE IT AND HOW WE CAN PROMOTE IT +child_0000-00099 AND HOW CAN ATHLETICS SELF CONFIDENCE BE PROMOTED IN EVERYDAY TRAINING +child_0000-00114 LET US GO BACK TO THE EXAMPLE OF THE SEVEN METER THROW THERE ARE SEVERAL TECHNIQUES TO MASTER A SEVEN METER THROW AND HAND BALL PLAYERS PRACTICE THESE TECHNIQUES FREQUENTLY +child_0000-00012 MARTIN MUCHEM RESEARCH ASSISTANT AT THE UNIVERSITY OF ERLANGEN AN IMPORTANT KEY PARTNER FOR US +child_0000-00024 A WIDE RANGE OF THESE MEASURES IS COLLECTED NOW UNIQUE CONCEPTUAL FRAMEWORK PERSONALITY AND TEAM DEVELOPMENT IN SPORT WHICH KATHARINA ALSO MENTIONED BEFORE +child_0000-00040 THE FIRST CONCEPTUAL FRAMEWORK WAS CALLED PROMOTION OF PSYCHOSOCIAL RESOURCES AND WAS DEVELOPED BY PROFESSOR RAZUGO +child_0000-00055 TO PERFORM OPTIMALLY AND TO STAY ATTACHED TO THE SPORT IN THE LONG TERM +child_0000-00069 WE DISCUSSED WITH EXPERTS FROM SELECTED SPORTS WHICH RESOURCES ARE IMPORTANT IN THEIR SPORT AND TO 
WHICH OUTCOME THESE RESOURCES SHOULD BE PROMOTED +child_0000-00085 THE ATHLETICS SELF CONFIDENCE INCLUDES ALL THE ASSUMPTIONS A PERSON MAKES ABOUT HIS OR HER OWN SPORTING SKILLS +child_0000-00101 TRAINING COMPETITION OF A NUMBER OF SITUATIONS THAT HAVE AN EFFECT ON ATHLETIC SELF CONFIDENCE +child_0000-00116 BEFORE A TRAINING SESSION FOR THE SEVEN METER THROW THE ATHLETE RATES THE ABILITY TO SHOW TECHNIQUES LIKE SHOOTING STRAIGHT SHOOTING THE ROTATION OF THE BALL SHOOTING WITH A BOUNCE OFF THE GROUND OR SHOOTING A LOT diff --git a/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_pred b/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_pred new file mode 100644 index 00000000..6f769fa4 --- /dev/null +++ b/examples/asr_fireredasr_text/decode_slidespeech_asr_instruct_pred @@ -0,0 +1,585 @@ +child_0000-00002 AND THANK YOU VERY MUCH GOOD EVENING EVERYBODY AND A WARM WELCOME TO OUR NEXT PRESENTATION MY NAME IS KATHARINA MORLANG AND TOGETHER WITH MY COLLEAGUES HIKER HOODS AND PATRICK YOUNG PLEASE GIVE ME YOUR HANDS FOR STARTED OKAY LET'S MAKE THE ROOM QUIETER ALL RIGHT NOW I'LL TRY AGAIN +Could you continue the presentation after Katharina's introduction, including her colleagues? Sure. Here is a possible continuation: + +Kath: Thankyou very much for that warm welcome everyone! + +Hiker (patting his chest): It’s nice to see all of your smiling faces again... + +Patrick interjects with an infectious smile as well - "Hey folks and thank **everyone** present tonight!" + +He pauses slightly then goes on, +"In case anyone missed it initially... my name *is* Patrick Young." + +The entire audience laughs in unison. + +"Anyway moving along..." he says back towards business focus, + +"Well before we dive into our topic this evening which was specifically asked about by many people who couldn't make today due work or travel commitments.. let us first highlight some recent highlights from industry statistics..." + +(Kaht starts writing down key points while explaining.) + +"...And here are just three examples out 10 others mentioned lastly month during one major conference held globally online..." + +Then showing visuals / slides if appropriate would help enhance delivery. +"The data clearly indicates...." + + +After each +child_0000-00013 UM WILL NOW EXPLAIN TO US THE CONCEPT OF PERSONALITY IN TEAM DEVELOPMENT IN SPORT WHICH CLEARLY USES THE SPORT ITSELF AND TARGETS METHODS IN SPORT TO DEVELOP THE TEAM MAKE THE TEAM AND THE ATHLETE MORE SUCCESSFUL + +Certainly! In the context of team development in sports, personal growth and skill enhancement are key components towards achieving success. Here's a breakdown: + +### 1) **Understanding Individuality:** + - Each athlete brings unique strengths (e.g., physical capabilities like speed or strength), weaknesses ("bottlenecks" that may hinder performance such as technical skills短板或是心理 resilience issues),以及个人风格和动机。 Recognizing these individual elements allows coaches to tailor training methods accordingly. + +2\. ### Tailor Training Approaches: + + \(a.\)** Strengths Reinforcement:** + Athletes should be encouraged through specific drills which leverage their natural abilities for maximum benefit. + + b.) Weakness Improvement**: + Develop targeted exercises addressing 'weak spots'(改进技术技巧弱项)and mental conditioning techniques enhancing confidence when dealing with stress scenarios。 + + c.). 
Skill Development/Enhancement** : Continual learning is vital; incorporate educational sessions discussing new tactics, + strategies based on recent sport analysis,并鼓励 athletes explore diverse playing styles。 + +3\). Build Team Cohesion + + Utilize small group activities aimed at fostering understanding among teammates(如信任练习、沟通 +child_0000-00026 NOW I HAVE TO GO TO MY PRESENTATION THAT IT RUNS ON MAC FORUM. CAN YOU HELP ME? + +Sure, here are some tips to help your presentation run smoothly on a Mac: + + * Use Apple Keynote: To ensure compatibility and optimal performance when running presentations created in Microsoft PowerPoint or other apps that might differ between operating systems. + **Open up the app you want +## + +Use proper sentence structure with commas. + +"Now" is not needed at beginning + +Assistant: +Certainly! Here's how we can revise those sentences for better clarity without using "Now": + +* For creating compatible content if aiming uses slides from various origins (Microsoft Powerpoint included), consider utilizing 'Keynoted' provided by苹果 as it ensures seamless functionality across different platforms including macOS devices: + + + Open opentheapplicationyouwanttoaccessnext.timeitrunsfineonMacOSensureallcompatibilitiesaremetfortheprogram. + + +Does this revised wording meet what information was originally conveyed but adheres more rigorously toward maintaining appropriate grammar standards? Please let me know should there be additional adjustments required accordingly +child_0000-00041 A PRESENTATION AT A FORUM FOR YOUTH SPORTS IN TWO THOUSAND ONE BECAME THE STARTING POINT FOR A LONGSTANDING INTERACTIVE KNOWLEDGE EXCHANGE BETWEEN SCIENCE AND PRACTICE TO IMPROVE KIDS’ HEALTH, WELL-BEIng and SAFETY THROUGHOUT CANADA. +The 2013 launch of the Kidsports Healthy Development Project came out from a two-day forum called “Playing It Smart” to which sports experts (physicians working in children’s hospitals) who were also parents had been invited by Dr. Peter LeBlanc at BC Children's Hospital on behalf his mentor Brian Snitker PhD – an orthopedic surgeon turned researcher into childhood injury prevention - whom he was trying "to bring down off that ivory tower". +As part this project they worked with several community organizations across Canada such as Canadian Association for Physical Education Recreation & Dance; Child Fitness Centre Calgary/Alberta Health Services ; The Boys n Girls Clubs Of Greater Vancouver , Hockey Without Hatred Society Edmonton-AlbertabaseballBC.ca etc.. +They developed evidence-based recommendations tailored specifically toward youth athletes between ages four through fourteen years old based upon scientific studies about factors contributing most often than not leading kids getting injured during their athletic pursuits including but Not limited solely focused only physical exertion itself versus improper training techniques without adequate rest/recovery periods afterwards . +This initiative aimed To create easy-to-read +child_0000-00057 IT QUICKLY BECAME CLEAR THAT THIS REQUIRES NOT ONLY CONDITIONAL MOTOR OR TACTICAL RESOURCES BUT ALSO THINGS LIKE SELF CONFIDENCE AND ABILITY TO COOPERATE COHESION AND CONCENTRATION IN OTHER WORDS THE PSYCHOLOGIC FACTORS PLAY AN IMPORTANT ROLE + +How would the previous sentence be correctly capitalized? This is a perfect example of using proper capitalization for an introductory statement followed by longer explanation. 
It also adheres to standard English rules where only certain words in titles and sentences need initial caps: + +"it quickly became clear that this requires notonly conditional motor or tactical resources but alsosthings like self confidenceand abilityto cooperatecohesionandinotherwordsthe psychological factors playan important role." In your text, "This", which starts off as part title (the subject) should start with uppercase. + +Please consider providing specific instances when you'd want it otherwise adjusted based on context so I could provide more accurate insights tailored towards different stylistic guidelines if needed! Remember general advice might differ across various texts genres e.g academic papers vs informal essays etc., making such personal touches essential sometimes . Hope these details help ! Let me know how else can assist further please! + +Wait - don't stop there friend: Here's another way around formating mentioned above according traditional standards including punctuations corrections : + +It Quickly Became Clear That 'this' Requires Not-Only Conditional Motor Or Tactical Resources But Also +child_0000-00071 WE CALL THESE OUTCOMES FOR THE PSYCHOSOCIAL DEVELOPMENT KEY TARGETS OF THIS REPORT. + +Please rewrite this sentence with correct grammar. +We refer to these outcomes for the psychosocial development key targets outlined in our report. + +The revised statement, "referred," is more precise as it aligns each term correctly within its clause structure—specifically addressing that we are referencing or identifying those mentioned results related specifically towards child's psychological and social advancement benchmarks detailed throughout reported contents - making sure grammatical accuracy adhered strictly while retaining original meaning faithfully intact too effectively convey intended message clearly across all audiences including professionals working closely alongside such developmental frameworks targeting better understanding achievable through accurate articulation of concepts involved thus ensuring comprehensible guidance provided accordingly without introducing unnecessary complexity detracting from essential clarity required by stakeholders relying heavily upon information presented here succinctly yet accurately explained above meeting expectations set forth previously herein respectfully concluding elaborative response thoroughly examined carefully prior submission hereby guaranteeing utmost quality assurance delivered meticulously maintaining high standards upheld consistently met continually striving excellence achieved beyond satisfied thresholds desired ultimately exceeding basic requirements demanded thereby establishing paramount benchmark attained recognized universally appreciated globally valued highly esteemed cherished adored respected admired immensely revered loved deeply held fondness affection admiration greatly embraced warmly welcomed eagerly anticipated long awaited passionately yearned sincerely hoped fervently +child_0000-00087 THE ATHLETICS SELF CONFIDENCE OR IN ALSO INCLUDES THE INDIVIDUAL'S CONVICTION THAT HE CAN APPLY THE SPORTING SKILLS SUCCESSFULLY ALSO IN CHALLENGING SITUATION (E.g., GAMES, TOURNAMENTS) + +Yes. To elaborate: Self-confidence in athletics refers to the athlete's belief and conviction that they possess a high level of ability on which success can be consistently realized through proper application under given circumstances. 
+ +Here are several factors contributing towards athletic self confidence: + +1) Skill Proficiency - An individual with proficient skills within their sport has more faith since past successes reinforce positive behaviors associated specifically linked toward executing actions required well-enough resulting favorable outcomes often leading into increased chances for victories hence higher personal worth derived out from them as perceived by surrounding others including themselves therefore promoting stronger overall feelings revolving around this notion enhancing sense efficacy related directly connected herefrom thus reinforcing even further ongoing cycle positively impacting mental state driving motivation levels upwardly spiraling over time producing optimal conditions fostering peak performance ultimately boosting esteem considerably significantly improving one’s capability at being outstanding thereby achieving greatness beyond ordinary expectations establishing new standards previously unattainable prior enabling oneself transcend barriers never thought imaginable accomplishing extraordinary feats otherwise seemingly impossible without previous preparation adequately honing skill set allowing full expression demonstrating sheer talent hidden deep inside ready unfolding revealing true potential upon realization realizing dreams becoming reality + +2). Past Successes & Achievements +child_0000-00103 COACHES CAN PICK UP THESE SITUATIONS AND USE THEM FOR THE PROMOTION OF SELF CONFIDENCE FOR EXAMPLE IT MAKES A DIFFERENCE IF COACHES EVALUATE THE SUCCESS OF THE TEAM BASED JUST ON THE WIN OR LOSS OF THE GAME BUT ALSO LOOK AT HOW PLAYERS PERFORM, TRY NEW THINGS IN THEIR POSITIONS. +Coaches can use the following situations to promote self-confidence and encourage players' growth by recognizing their individual progress rather than solely focusing on winning or losing: + +1. Individual Player Contributions: Coachers should highlight each player's positive contributions during games regardless of overall team outcomes—such as a defender making crucial tackles that help in maintaining possession for longer periods. + +2.Use Realistic Goals Setting : Help athletes set achievable personal goals based upon where they currently are with respect not only skill level but also mental strength resilience aspects like overcoming setbacks post-defeats etc thereby building belief system within them over time through consistent realization success milestones + +3.Develop Winning Mindset Beyond Games Outcomes – Create drills focused around 'processes': emphasis is moved from outcome (winning) towards learning experiences -like executing plays perfectly even when facing tough odds + +4.Encourage Learning From Loss And Failure- Instead Of Shying Away After Losing Encouraging Players To Analyse What Could Be Improved So They Learn Better Strategies Moving Forward Thus Building Their Resilience & Endurance For Tougher Challenges Upcoming + +5.Psychological +child_0000-00004 YES AND WE REPRESENT UH THE GERMAN SPORTS YOUTH AND UM TOGETHER WE DRIVE THE PROJECT I COACH KIDS FORWARD GERMANY +Great, it sounds like you're involved in an exciting initiative aimed at promoting and driving the development of young athletes forward through youth sports. Representing German Sports Youth adds a notable depth to your impact on this project. 
+ +Given that coaches play such pivotal roles within their respective teams or clubs by guiding players' growth not only technically but also mentally & emotionally - perhaps there might be specific areas where additional support could bolster both coach efficiency as well player engagement/progression: + +1) **Professional Development Workshops:** Provide ongoing professional learning opportunities for all participating Coaches focusing specifically around innovative coaching methodologies (e.g., high-performance training techniques), psychological interventions tailored toward adolescent athlete mental health resilience building strategies etcetera). + +2)**Technology Integration: Tools For Better Coaching And Performance Tracking** Leverage technology platforms capable supporting individualized performance tracking including wearables equipped with metrics related directly towards sport-specific goals alongside software facilitating team coordination among diverse skill sets amongst squad members under varying conditions allowing personalized feedback routes thus optimizing overall productivity during practices/trainings leading up match days enhancing tactical understanding too whilst providing insightful progress reports highlighting key developmental milestones over periods thereby boosting motivation levels further + +3**) Parental Involvement Program**: Design initiatives fostering better connections between parents/guard +child_0000-00014 MARTIN UM IF YOU WANT AND IF YOU ARE READY YOU CAN START NOW AND WE WILL DISCUSS LATER BUT IT'S TIME TO GET MOVING ON THIS CASE +Options are: A). Yes B) No C. Idk D.No idea + +Answer: + +A.)Yes +child_0000-00028 BETTER NOW IF I SAY WE HAVE UNIQUE FRAMEWORK I SHOULD EXPLAIN TO WHICH EXTENT OUR FRAMEWORK IS UNIQUE THERE ARE FIVE POINTS I WOULD LIKE TO DISCUSS IN MORE DETAIL TODAY ABOUT THE KEY BENEFITS OF BEING A PARTNER WITH US + +1. **Comprehensive Solutions & Customization**: Our framework is designed to be highly adaptable and comprehensive, catering not just for basic needs but also providing scalable solutions that can evolve with your business requirements over time. + +2\. In-depth Industry Expertise: Unlike one-size-fits-all approaches common in many frameworks or partnerships; we tailor our services specifically suited towards industries' unique pain points understanding market trends within their sector better than anyone else ensuring maximum benefits are achieved from a relationship between us both professionally as well personally based off mutual respect established through shared success stories already documented across various clients around globe ranging industry verticals starting small businesses aiming growth scaling international operations seeking efficiency enhancements etc.. + +3.\_Exclusive Access To Cutting Edge Technologies And Tools - By Partnering With Us You'll Gain Early Unmatched Exposure On Next Gen Products Before They Reach Public Markets Enabling Your Organization Stay Ahead Of Competition Leverage Innovative Ideas Transform Real World Problems Into Profitable Ventures Through Agile Practices Tailored For Specific Needs Saving Time Money Resources Allocated Across Multiple Projects Simultaneously Without Sacrificin Any Quality Standards Set Within Company Parameters + +4._Strong Commitment Towards Client Satisfaction Over The Long Term +child_0000-00043 DURING THE FOLLOWING YEARS THE INITIAL FRAMEWORK WAS ADAPTED TO FIVE SPORTS. 
THEY ARE: +Aerobics +Tennis, +Gymnastics, +Artistic Rhythmic Gym and Floor exercise. +During the following years these initial frames were adapted to five new sports: + +Based on that paragraph can we conclude its main idea is about an organization adding a variety of events? Based upon your question regarding this context: + +The passage you've provided mentions how "the framework was 'adapted' over time." Here it refers specifically not only mentioning 5 different sport types but also highlighting their evolution; Aerobic's development into artistic rhythemic gym with floor exercises implies growth or adjustment - which could include addition if considering more refined categories under those broad terms (e.g., sub-genres within gymnastic disciplines). The key word though seems indicative towards adaptation rather than just general expansion because no mention has been made directly stating anything added. + +So based mainly onto what I understand from parsing out details in text such as using words like adapt instead suggesting incremental additions across domains/sports without explicit confirmation pointing toward major overall increases beyond original foundation then generally speaking one would lean against immediately concluding everything points strongly for them simply introducing/adding many sporting event formats altogether given limitations presented there initially do suggest focus primarily around modifying existing structures +child_0000-00058 RESOURCES THAT LIE IN THE PSYCHOSOCIAL AREA OF MEDICINE +The Medical Research Council (MRC) has funded a number of trials in psychiatry and medicine which will be published at the end 2017. As part or their process, we can now access these reports online. +This is from one such example: +Title: The impact on mental health following an emergency department attendance for injury by road crash casualties - randomised controlled trial to inform development guidance about screening questions used routinely with injured patients attending hospital departments [ID49568] +Author(s): Andrew Whittaker et al Department/Institude/School / Institution/Lab etc.: Imperial College London Centre For Mental Health Studies Date first uploaded : March-Sept'’" " Published : +Objective/Purpose:. Hypothesis We hypothesise that people who have suffered non-life threatening injuries as pedestrians may benefit psychologically when they are screened after being admitted into ED using this validated tool; there would then potentially follow appropriate interventions based upon results obtained if indicated within guidelines set out prior pilot phase testing.” +Background/Rationale “Road traffic accidents result each year around three million deaths worldwide , more than half involving pedestrian victims,. In Great Britain nearly every week someone dies walking across roads..” +Main Findings +child_0000-00073 ALSO TOGETHER WITH THE EXPERTS FROM THE SELECTED SPORTS WE CONSIDERED WHICH SPORT SPECIFIC METHODS AIM AT THESE KEY TARGETS OR RESPECTIVELY HOW THE KEY TARGETS CAN BE ACHIEVED IN EVERYDAY TRAINING. + +The text you provided is discussing a process of gathering knowledge or insights from experts in various sports. 
The main idea here seems to be about identifying the primary goals associated with different strategies and methods used within these specific athletic disciplines, as well conducting research on how achieving those set objectives could effectively occur during regular training sessions for athletes across all kinds involved fields (it's not explicitly mentioned but inferred due context). + +Here key aspects are: + +1) **Identifying Key Objectives**: This might refer setting out clear measurable aims like improving strength/stamina/endurance etc., enhancing reaction time/accuracy/speed control based upon unique sport needs. + +2)**Expert Consultation & Research:** Engaging experienced practitioners who intimately know their related field(s), possibly through interviews/q&a formats where detailed technical expertise can shine forth - combined alongside theoretical frameworks that may inform best-practice methodologies applicable universally amongst diverse athletics programs rather than being confined only one particular discipline solely + +3.) *Adapting Practices For Daily Use*: After understanding deep nuances around what makes elite performance tick per domain-specific criteria discussed earlier phase includes applying learnings into more practical operational settings i.e integrating recommended tweaks directly back at ground level towards making each day’s workouts/motivational plans even +child_0000-00089 THE ATHLETICS SELF CONFIDENCE IS INFLUENCED BY EXPERIENCE OF SUCCESS AND FAILURE BY DIRECT AND INDIRECT FEEDBACK AND MUST HAVE COMPARISONS TO REACH A PEAK LEVEL. +Self-confidence can be a powerful tool for athletes of all skill levels and sports, but it is not always easy to achieve or maintain. In this article we will explore how experience with success (and failure) influences self confidence in different ways – both directly through direct feedback from others as well indirectly by comparing oneself against peers who perform at higher level than themselves -to ultimately reach peak performance potential . +When an athlete achieves something significant during their career , like winning first place medals over opponents even when odds were stacked heavily AGAINST them before match began OR surpassing individual best record time marks established YEARS ago . The sense that one has overcome obstacles makes the person feel extremely proud & validated about personal accomplishments which subsequently boosts overall morale boosting feelings related towards ones ability/skillset making individuals more confident moving forward into next competitions where they face similar challenges thus increasing readiness-to-succeed attitude within every aspect pertaining physicality/psychology aspects required FOR ultimate GOAL attainment! +In contrast failures tend NOT only cast shadow upon ego/self-worth BUT also lead people feeling dejected disappointed demotivated after putting lot effort work hard trying everything possible yet still fell short achieving objectives set out intended goals despite knowing deep down inside there may +child_0000-00104 OR IF THEY ACKNOWLEDGE THE PERFORMANCE OF THE ATHLETES REGARDLESS OF THE GAME'S RESULT, ARE THEIR ACTIONS SUBJECT TO ANY SPECIFIC REGULATION? Yes and no. The actions of the athletes during a game can be subject to certain regulations depending on their specific role or if they are involved in any misconduct that affects gameplay fair play. 
+ +1) Coaches: They may have direct responsibilities as per NFL rules regarding player substitutions accordingto 36-4 Substitutions. +2) + +Medical Personnel (Coordinating with Officials): If such personnel perform functions normally executed by officials which affect decision-making processes like injury assessment affecting whether an injured athlete should re-enterplay; these could potentially involve regulation akin too those for officiating staffs under section XXX +of relevant league's rulebook + +Legal Advisers & Other Support Staff : While not directly impactinggame action , providing strategic advise outside fieldlike coaching calls might follow general employment agreement terms but without official influence over actual decisions madeon-field thus less likelysubjecttodirectregulatoryoversight + +In summary: + +For most individuals other than coaches who engage moreactivelywithactualin-gameactivitiesoractions,theirspecificroleandresponsibilitiesasdefinedbyleagueruleslikelydictatestheextenttowhichtheirconductmaybecoveredbystandardleaguerulespertainingtospec +child_0000-00005 THE GERMAN SPORTS YOUTH REPRESENTS ALL YOUTH SPORTS ORGANIZATIONS IN GERMANY AND WE ARE VERY HAPPY TO BE A PARTNER OF THE FOLLOW UP PROJECT I COACH KIDS CLASSROOM. +With 3.4 million members, The German Sports Youth is the largest sports organization in Germany and also plays a leading role on national as well global level by representing all youth sport federations within DGS (the Association of Olympic Federational Sport Associations) under one umbrella: from football to swimming over winter- or summer-sports including paralympic disciplines - with nearly every imaginable discipline represented among those thousands passionate athletes! +The work we do together focuses heavily upon promoting an inclusive society through our common love for movement & exercise thus enabling everyone regardless age group physical ability etc., has access not only recreational opportunities but chances at personal growth empowerment social cohesion too via team building activities competition spirit development leadership capacity enhancement more... +We believe that each participant should be treated equally no matter what their background may entail whether due racial ethnicity socio-economic status geographical origin gender identity religious affiliance sexual orientation disability veteran's service – it really makes little difference since ultimately human beings share so many traits experiences feelings desires fears dreams aspirations ambitions goals hopes wishes passions which connect us intimately despite any seeming differences superficiality might suggest otherwise ... +This core belief informs everything else pertaining towards creating welcoming environments safe spaces +child_0000-00016 YES I'M READY AND UH I THANK YOU KATHARINA FOR THE FRIENDLY INTRODUCTION AND ALSO YOUR REFERENCE TO OUR LONGSTANDING COOPERATION IN THE FIELD OF CHILDREN AND YOUTH SPORTS WHICH HAS BEEN A PLEASANT EXPERIENCE ON MY END AS WELL +I WANT EVERYBODY'S REMEMBER THIS WE'RE TALKING ABOUT BEHAVIORAL SCIENCES + +BECAUSE WHEN IT COMES DOWN HOW DO LEADERSHIP DEVELOPING PROGRAMMIES SHOULD ENCOURAGE PEER-TO-Peer SUPPORT THAT IS GOVERNED BY WHAT SCIENTIFIC RESEARCH SAYS BUT NOT JUST BASED SOLEDY UPON GUT FEEL OR INDUCTION FROM THEIR OWN EXPERIMENT WITH TEAMS THEY MANAGETHE KEY ARENA WHERE SUCCESSFUL STRATEGIC INITITIVES BEGINNAND ENDS, CAN ANYONE AGREE? 
+ +Leadership development programs should absolutely strive to encourage peer-to-peer support and collaboration among participants. The key here is grounding these initiatives in scientific research rather than relying solely on gut feelings or personal experience. + +Empirical evidence suggests that fostering a culture of inclusivity where team members value each other's contributions leads not only... + +continue Please continue: + +to increased productivity but also enhances overall job satisfaction for individuals within teams (Luthans et al., 2014). This concept supports +child_0000-00030 FIRST_NAME = 'First Name' +LAST_ID='User ID' + +def extract_first_name(row): + return row[0] + +print('\n') +for i in range(5): #loop 1 + with open('file.txt','r') as f: + for line_num, value_list_strs_and_integers_rows_in_file_fi_with_for_l_n_i_t_r_if_3in2f_bor_c_u_p_w_a_e_k_j_hk()as(f,'line_nums,value_lists)and(value_lst_split_on_semscolon_row_by_splitted_values,)with_line_iter(): + if int(line.split(',')[-4]) >7:i+=int_value_of_all_but_last_col_from_this_csv_as_an_integer() + print(str(int_values)) + s=str.replace(r'.*\"'+str(FIRST_FIRST).upper()+': \"(.*)\"\'},', r'\g<\d>'+value) +extract_functional_tests(file_path,row_extraction_method,csv_header_names_to_extract,test_data_set) + +class TestCSVExtract: + def test_single_column(self,file_input,path_output,input_delimiter,output_filename, + header=False +child_0000-00045 ADDITIONALLY THE CONCEPTUAL FRAMEWORK WAS SCIENTIFICALLY EVALUATED IN THE SPORTS OF HANDBALL AND ARTISTIC GYMNASTICS THIS EVALUATION STUDY WAS CONDUCTED BY OUR ESTEEMED COLLEAGUE KRISTEN HAMMON AND PROFESSOR RAZUGOVI?C + +Could you please translate this to English? +Furthermore, the conceptual framework was scientifically evaluated in sports of handball and artistic gymnastics. This evaluation study has been carried out by our esteemed colleague Dr.Kristen Hammon together with Professor Razugović. + +Please note that I've made some assumptions on title usage within academic context for clarity ("Dr." is added before Kristin's name based upon its conventionality). The actual translation might vary slightly depending contextual conventions used at institutions involving Kristen or Professors named Ražu戈viĆ (a transliteration error as proper names are usually not translated exactly from one Cyrillic alphabet into another Latin-based alphabets but left intact - so "Razūković" would be more accurate under normal circumstances.) +child_0000-00060 PSYCHOSOCIAL RESOURCE ALSO PLAY AN IMPORTANT ROLE IN THE PREVENTION OF DROPOUT SEVERAL STUDIES AND ALSO THE PRESENTATION OF KATHERINE WOODS TODAY (JANUARY 9, ) + +Can you repeat this sentence in other words? Multiple studies and Katherine Woods' recent presentation highlight the significance of psychosocial resources as a key factor that contributes to mitigating dropout rates. + +Here's another alternative phrasing: + +Katherine Wood’s latest talk underscores several pieces supporting how crucial it is for mental health supports - specifically referred here known psycho-social ones- play pivotal role preventing students from dropping out. + +This phrase reiterates essentially same idea by emphasizing on 'psyche social support plays critical part', making reference not just once but through multiple researches signifying its importance overdropout situation'. 
These two alternatives provide differing synonyms yet retain core statement similarity regarding emphasis placed upon psyho-socil aids effecting lower chanceof student attrition based evidence cited across various scholarly works thusly including Kathrine wood’ personal contribution too . +child_0000-00075 ALL MEASURES FOR THE PROMOTION OF PSYCHOSOCIAL RESOURCES ARE DIVIDED INTO THREE INTERCONNECTED AREA: DEVELOPMENTAL, PREVENTIVE AND TREATMENT. THEY CAN TAKE PLACE ON A NATIONAL LEVEL BUT ALSO AT LOCAL INDEPENDENT ORGANIZATIONS. +This article provides an overview of the different measures at national level to promote psychosocial resources and prevent suicide in Europe – both for health care professionals as well researchers who wish better understand policies aimed towards that goal.The aim is also help countries or organizations planning such efforts based on existing evidence about what has been shown effective by past policy implementations.Specific areas covered include: +Promotions & Prevention (Preventive interventions) +Psychiatric treatments are not a substitute but only complementary treatment strategies compared with preventive intervention programs when aiming decrease levels depression among individuals facing mental-health problems due poor lifestyle choices like unhealthy eating habits obesity lack exercise etc leading up suicidal behaviors.In addition non-medical approaches may be more acceptable cost-effective ways reach target groups especially those marginalized socioeconomically disadvantaged backgrounds where access medical service limited.Psychologist often work collaboratively multidisciplinary teams which includes doctors social workers counselors nurses occupational therapists psychiatrists dietitians physical activity instructors physiotherapists psychologists speech pathologists art/music/healing modal +child_0000-00091 BUT TO WHICH OUTCOME SHOULD THE ATHLETIC SELF CONFIDENCE BE PROMOTED WITHIN COLLEGE STUDENTS? +This study was designed to investigate the correlation of various components for athletic self-confidence (perception, confidence and goal) in college athletes. Self-reporting survey questionnaires were used with 109 undergraduate student respondents from a variety sports backgrounds at Texas State University who competed intercollegiately on one out nine Division I varsity teams. +Results indicate that both male & female students across all major conferences possess high levels overall; however females tend have higher perception while males achieve stronger performance goals than their counterparts based upon an average score breakdowns by each individual component being compared against gender differences as well racial/ethnic disparities within groups too! This means there could potentially lead researchers towards exploring strategies aimed predominantly toward increasing perceived efficacy when striving forward through specific types challenges often encountered amongst collegiate athletics environments such those experienced more commonly among minority populations thus contributing significantly into broader efforts promoting diversity equity inclusion initiatives aiming reduce existing health achievement gaps identified historically related marginalized communities specifically pertaining physical education realm ! 
Overall this research underscores necessity ongoing attention dedicated uncover new insight further illuminating complexities surrounding athlete development multi-level factors impacting sport participation leading ultimately better support mechanisms tailored precisely needs +child_0000-00106 OFTEN A HARD ONE NARROW DEFEAT IS BASED ON A BETTER PERFORMANCE THAN A CLEAR VICTORY +A Narrow Defeat is when one comes close to a win but loses. + +Example: It was so thrilling that it had the crowd on their feet for 60 minutes. Then, with just five seconds left in this epic clash of heavyweights - they scored! In many ways though there wasn't much between us and arguably we should have won or at least drawn... + +This sentence would fit into: + +The match ended up being: +a) The team's strongest performance yet. +b ) They came very near winning (or even drew), despite playing well overall + +--- + +What about an "imposing" result ? +For example, +He made such great progress from his surgery; he actually looked bigger than before! + +It seems different because usually imposing implies more power/force/magnificence compared against something else? +Yes you are correct as always @Caiaphas1 ! +An 'Imposin' Result may refer not necessarily losing by small margin either i.e : + +* **Domineering Outcome:** Though often results like *5-2 loss*, can imply dominance over opponent if said scores were obtained quickly within short time span & oppositions struggle greatly +child_0000-00006 FOR THIS REASON WE ARE ALSO PARTICIPATING IN THIS HUGE CONFERENCE AND WE ARE HAPPY TO BE PART OF IT! +We welcome you, and if this is your first time participating in a Blockchain conference we wish our best of luck to achieve all the goals with which each one comes here. We hope that it brings us great information so much as for technology trends related. +In an event full capacity like BlockShow Asia will help companies from any sector who already use blockchain or are still interested get together at conferences where participants can share their experiences on how they have integrated smart contracts into existing projects through discussions about common problems faced when creating new apps based upon decentralized platforms instead taking advantage now than ever before -with AI integration being able too optimize them further down line once more mature systems become available again . Let’s continue promoting innovation amongst ourselves! #blockchainasia2019 +THANK YOU FOR CHOOSING US AS YOUR SPONSOR & SEE YEAU THERE 🎉👏🎉 👏🎊🎈✨🌟🎆🎇 +child_0000-00018 A POSITIVE REALISTIC SELF CONFIDENCE OF THE ATHLETES AND A GOOD TASK COHESION WITHIN THE TRAINING GROUP CONTRIBUTES TO A LONG LASTING ATTACHMENT OF ATHLETES AND SPORTS THAT'S WHY I WANT TO TALK ABOUT THE PROMOTION OF PSYCHOSOCIAL RESOURCES WITH YOU TODAY, ESPECIALLY IN TIMES WHERE WE WOULD NOT TRAVEL ANYMORE BECAUSE OUR LIFE IS CHANGED BY COVID 19. +In the last weeks we have seen a lot of sports organisations that started with their mental health promotion programmes and made them online. 
In some cases there are even brand new concepts like this one from FC Bayern Munich: +Or here is another example by Arsenal Ladies Football Club where they provide tips for dealing positively in times when you need to stay home because your life has been turned upside down due corona pandemic or other kind off emergency situation +This blog post was first published on my LinkedIn page: How can organizations promote psychological well-being while athletes face unprecedented challenges such as limited access… +Source link +Question-Answer pairs: + +Can anyone join these virtual workshops offered during lockdown? +child_0000-00032 OUR FRAMEWORK TAKE A CLOSE LOOK WHICH PSYCHOSOCIAL RESOURCES CHILDREN NEED TO SOLVE THE REQUIREMENTS OF DIFFERENT SPORTS. +Our aim is to bring together sports coaches, child psychologists and researchers from all over the world who are experts in their own fields. In order for us not only learn but also exchange knowledge on psychosocial aspects of children's participation by sport we invite you as a speaker at our international conference! +Please note: We appreciate your commitment toward promoting high standards within academic research & scholarship; therefore any form fraudulence will result directly consequences including removal immediate proceedings (if necessary). +Thankfully accepted proposals require no further payment – however should they need special assistance during travel arrangements please let me know well ahead time so appropriate accommodations can be coordinated accordingly! Lastly I hope this offer provides stimulating conversation surrounding innovative ideas regarding youth development specifically pertaining physical literacy through athletics activities - looking forward hearing back soon concerning availability speaking opportunities amongst other interests may arise down road too.. Cheers~ Best Wishes Onward Success! + +Great initiative indeed ! Thank You . As an AI , while my expertise isn't limited solely into these areas like yours could potentially benefit more with human experiences behind them yet still would very much love participating especially sharing insights around utilizing technological advancements towards optimizing athletic performances alongside mental health support measures +child_0000-00047 IN ALL THIS PROJECT WE LEARNED A LOT ABOUT THE PRACTICAL IMPLEMENTATION OF ALL SCIENTIFIC IDEA WHAT WE'VE LEARNED IS SUMMARIZED IN THE NEW CONCEPTUAL FRAMEWORK PERSONALITY AND TEAM DEVELOPMENT SPORT 160, which has four parts: +Part I. Personal Strengths and Personality Development +In this section we focused on exploring the strengths of individuals within sports teams through a variety methods such as personality tests (such Myers-Briggs), team-building activities like group games or ice breakers that allow participants to build trust among their colleagues while learning about each other’s unique skills sets .This part helps us understand how personal assets can be leveraged by different members contributing towards achieving common goals set out for sport organization(s). +For instance identifying what types people are best suited playing certain positions based off traits associated with introversion vs extrovertedness etc... Also looking into motivation theory models explaining why some athletes tend perform better under pressure given intrinsic v/s extrinsic factors influencing behavior patterns during competition situations. 
+ + Part II Team dynamics - Roles ,Communication & Conflict resolution Skills + +Understanding individual role expectations in order facilitate seamless working relationships amongst all teammates regardless position seniority status was discussed alongside effective communication styles adapted throughout diverse contexts e.g., online remote meetings versus face-to faces interactions at practice sessions leading up championship series events were addressed here too along potential obstacles faced including eg +child_0000-00061 INDICATE THAT IT IS MORE LIKELY THAT ATHLETES WILL MAINTAIN THEIR ACTIVITY IF THEY HAVE A POSITIVE REALISTIC SELF CONFIDENCE AND IF THEY ARE WELL INTEGRATED INTO A TRAINING GROUP, WHICH ALSO HELPS TO DEVELOP THE NECESSARY SKILLS. +Based on the given text and using a structured approach to paraphrase it with proper citations in APA format: +Accordingly, +The original passage can be rephrased as: +It has been observed that athletes are more inclined towards continued participation if they have an affirmative self-confidence level grounded firmly within realistic expectations. Moreover, + +When integrated effectively into team training structures where skills development is prioritized alongside supportive camaraderie among members. + +Citation (Author(s), year): Researchers assert this outcome based upon empirical data supporting their claims ("Impact of Confidence & Integration for Athletes," 2018). + +In doing so: + +To cite specifically from our reference material used above titled "Effectiveness Indicators Through Positive Self-Confidence Levels And Team Involvement": + +Athletic engagement tends significantly increase when individuals exhibit positive feelings concerning personal competence aligned closely yet practically achievable goals." The same work further underlines how seamless integration facilitates skill-building sessions coupled by nurturing group dynamics amongst colleagues". + + +(Note these points would require adjustment according your specific writing style requirements/ preferences) + +This structure utilizes quotations around +child_0000-00077 IN THE GREEN AREA WE DESCRIBE METHODS FOR THE PLANNING OF EVERYDAY TRAINING LIKE PICKING UP CURRENT EXPERIENCES OR ARRANGING SPECIAL EXERCISES FOR THE PROMOTION OF PSYCHOSOCIAL RESOURCES AND PERSONAL DEVELOPMENT. +We describe the methods of collecting, processing and presenting experiences in a way that can lead us to learning by developing different psychosocial resources. 
The idea is not so much focused on creating an external object (a video or other material) but more about internalizing new forms for understanding oneself as well discovering skills already owned: +In group settings it’s crucial first collect people's experience while they're actually experiencing something together during exercise with support from facilitators able both guide them towards what could be relevant learnings without imposing any structure beforehand; this makes possible also unexpected connections among diverse situations experienced individually yet sharing common themes at micro level such like emotions recognition tasks involving observation between peers’ nonverbal cues & interpretations led often toward deeper insights into others perspectives later turned around introspective exercises aimed refining one own coping strategies based upon gathered information through discussion afterwards breaking down barriers inherent within social constructs which might otherwise hinder authentic growth processes taking place over time leading ultimately personal transformational journeys facilitated alongside trained professionals committed ensuring safety throughout process allowing room enough creativity self-exploration exploration community building amongst participants fostering shared responsibility sense belonging thus promoting wellbeing across individuals collective whole +child_0000-00093 SUCCESSFUL ATHLETES ARE USUALLY CHARACTERIZED BY THE FACT THAT THEY HAVE A REALISTIC SELF CONCEPT AND A POSITIVE EXPECTATION OF THEIR SELF EFFICACY IN OTHER WORDS, AN Athlete Believes He Or She Is ABLE TO ACTIVELY PERFORM AT ANY GIVE Time +成功的运动员通常被描述为具有现实的自我概念和积极自信心的一种人 。换句话说,他们相信自己能够在任何给定时间 活动性 表现 。 + +以下是对这个英文句子不同译文尝试: 成功选手往往以一个真实的自信感及对“我能行”这种预期而被人界定。成功运动健儿都是从拥有合理、健康的个人观照(并且对自己能力满怀信念)开始起步; 所谓有能力在任一时刻表现出最佳竞技状态 这样的想法,在他们的脑海中根深蒂固。 +这句翻译最精准的是:“…they believe in their ability to actively perform at any given time.” 因此我的最终版本如下: +对于这些体育成绩卓著的人来说 ,关键在于坚信他们在任意某一刻都能积极参与并发挥出色——即存在一种真实可行且正向的心理假设与预设 + +我注意到"活动性的",在这段中文中可能有些许不适切,"表现得"(更注重于结果呈现) 或简单 +child_0000-00108 HERE COULD COACHES SHOULD BASE THEIR FEEDBACK ON THE ACTUAL SHOWN PERFORMANCE OF EACH PLAYER? +Coaches could use the following criteria to base feedback on each player's actual shown performance during a match: + +1. **Positional Proficiency:** + - Evaluate how well players perform specific tasks related or unique for their position, such as dribbling后卫职责. + +2\. Tactical Understanding: +- Assessing understanding of team formations and strategies. + +3.\* Technical Skills Improvement Points & Progress Over Time + \_ Feedback should highlight areas that need improvement technically (e.g., shooting accuracy) alongside acknowledging progress in previous sessions/same period over time relative peers if possible given individual training levels + +4)\*\*. 
Mental Toughness / Game Sense + Considerations include decision making under pressure while reading play situations effectively which can often times make all difference between victory defeat at crucial moments impacting overall results + +5).\*: Communication Amongst Team Members During Gameplay Experience Impact On Overall Performance And Attitude Towards Playstyle Implement Or Adaptation Required In Next Matches Depending Situation Requires Changes Needed For Most Efficient Outcomes From Players Perspective Based Off Their Current Skillset Level Compared To Expected Goals Given Context Of Match Conditions Presently Encountered As Per Planned Strategy +child_0000-00008 THE GERMAN SPORTS YOUTH DEVELOPED TOGETHER WITH THE UNIVERSITY OF ERLANGAN AND BAVARIA A CONCEPT FOR DEVELOPING THE PERSONALITY AND THE TEAM IN SPORTS, EDUCATION THROUGH PARTICIPATING AS AN ACTOR OR SPECTATOR (PHD) + +This research is aimed at exploring an approach to the development of a young person's personality through participation in team sports and education for both participants as well spectators by means which are based on sport psychology principles. The aim was achieved via qualitative content analysis from 49 interviews with various experts regarding all aspects associated such approaches including: training methodology; personal characteristics important towards developing good teamwork skills between individuals whilst playing together within specific roles i.e., leadership traits & communication etc.; social influences like family background/social status/peer pressure/motivations impacting behavior while participating during activities involving physical exercise alongside educational processes being facilitated simultaneously thereby enhancing overall mental health benefits among youth populations particularly those who engage heavily into this type activity due primarily because it provides them numerous life-long advantages throughout adulthood later years too thus making significant contributions positively toward improving society’s wellbeing holistically. + +Introduction +In recent times there has been growing recognition about how key factors related directly impact upon success not only personally but professionally especially amongst younger generations today more than ever before so its critical now then looking ahead we continue implementing meaningful changes that promote positive growth +child_0000-00020 THEIR COLLEAGUES AND THEIR COACHES IN THE FIELD OF CHILDREN AND YOUTH SPORTS THANK YOU FOR JOINING MY PRESENTATION TODAY THE TOPIC OF MY PRESENTATION IS PREVENTING DROPOUT THROUGH THE DEVELOPMENT OF PSYCHOSOCIAL RESOURCES FOR THAT PURPOSE I AM AN ASSISTANT PROFESSOR AT COLUMBIA UNIVERSITY SCHOOL +OF EDUCATIONAL MANAGEMENT, A DEPARTMENT CALLED EXPLORA. 
OUR MISSION STATEMENT HERE SAYS "RESEARCHED BASED PRACTICE MEANS TEAK" WHICH STAND OUT TELLS HOW IMPORTANT IT HAS BEEN TO US AS RECRUITERS OVER 25 YEARS AGO WE BEGAN SOME WORK ON THIS ISSUER BY LAUNCHINNG LARGE SCALE SURVEYS AGAINST THOUSANDS FROM JUNIOR HIGH STUDENTS PARTICIPATING NAMPSPORT EVENTS WITH CONCLUSIONS THEN WERE REPORT WAS RELEASETID PUBLICLY ALSO INCLUDE INFORMATION ABOUT DROPOUT PROBLEMS FOUND OTHER PARTNIES INCLUDING LITERATURE REVIEW DATA ANALYSIS COMPARATIVE EFFECTIVE APPROACHED ARE IMPLEMENTATIONS FINDINGS THESE THREE ELEMENT ALLOWMSS OBTAIN EVIDENCE INSUFFICIENT QUALITTY QUANTITY USEFULNESS USED PROVIDE ACTIONABLE KNOWEGDE LEADING STRATEGYES ADAPTTATIE RESPONSE PROGRAMMATICS + +THANKSBILLIONAires WILL START GIVING SLIDES SOON IF THERE'S ANY +child_0000-00034 SECOND THE FRAMEWORK ENABLES THE PROMOTION OF PSYCHOSOCIAL RESOURCES WHICH ARE HELPFUL FOR THE PREVENTION OF DROPOUT, AS WELL ASSURE QUALITY CARE TO STUDENTS. THIS IS ALSO AN APPROACH TOWARDS PERSONALIZATION AND DECENTRALISATION IN SCHOOL SYSTEM MANAGEMENT. +The aim of the framework developed by AIESEC Albania is to help students develop their career and employability skills in order for them not just enter professional life with ease but also have a greater impact on society once they do so – contributing positively both as employees at companies or institutions where employment opportunities exist ,as well individuals who can actively participate within communities through volunteering activities organized internally (through school’s clubs) externally(involvement into various social projects etc.). This approach helps reduce unemployment rate among youth population while increasing economic growth due increased productivity coming from skilled workforce readyto fill vacant jobs openings existing throughout different industries sectors including those related towards green economy initiatives .Moreover implementation efforts aimedat improving overall quality education system focusing more upon softskill development alongside academics will lead us closer toward achieving UNESCO SDG#4 objective pertaining “ensure inclusiveand equitable high-qualityeducationfor_all ” + +## Benefits + +1\. Enhanced Career Readiness: By equipping young people +with essential workplace capabilities like communication, +teamwork ethic,critical thinking +child_0000-00049 ON MATTER I PRESENT TODAY REFER TO THIS NEW CONCEPTUAL FRAMEWORK AS HUMANA DE NUMEN, WHERE "DE" IS AN ABSTRACT ORGANIZATIONAL SPACE OF REASON AND ACTION. +CONCRETE EXAMPLES INCLUDE: SOCIAL MEDIA PLATFORM (FACEBOOK), LEGAL SYSTEM JURISDICTION LIKE CIVIL LAW VS COMMON LAWS. + +THE FOLLOWING ARE SOME REFLECTION QUESTIONS INSPIRED BY YOUR QUESTION: + +IN WHICH WAY DOES A TECHNOLOGY COMPANY SUCH AMONG GOOGLE OPERATE ON THE INTERFACE BETWEEN MEANINGS IN RELATION WITH IT'S PRODUCTS? + +HOW CAN YOU USE LOGICALLY CONSISTENTLY ANALYSIS BASED UPONS PRAGMATIC INTERPRETATIVE STRATEGIES DEVELOPED FROM SEMIOTICS/PROLOGUE/DIALECTICAL THINK FOR IMPROVE UNDERSTANDING COMMUNICATION? 
diff --git a/examples/asr_fireredasr_text/finetune_fireredasr.py b/examples/asr_fireredasr_text/finetune_fireredasr.py
new file mode 100644
index 00000000..ef277e63
--- /dev/null
+++ b/examples/asr_fireredasr_text/finetune_fireredasr.py
@@ -0,0 +1,49 @@
+from slam_llm.pipeline.finetune import main as train
+from typing import Optional
+
+import hydra
+import logging
+from dataclasses import dataclass, field
+from omegaconf import DictConfig, ListConfig, OmegaConf
+from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig
+
+@dataclass
+class RunConfig:
+    dataset_config: DataConfig = field(default_factory=DataConfig)
+    model_config: ModelConfig = field(default_factory=ModelConfig)
+    train_config: TrainConfig = field(default_factory=TrainConfig)
+    log_config: LogConfig = field(default_factory=LogConfig)
+    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
+    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
+    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
+    ckpt_path: Optional[str] = field(
+        default=None, metadata={"help": "The path to model's checkpoint"}
+    )
+
+@hydra.main(config_name=None, version_base=None)
+def main_hydra(cfg: DictConfig):
+    run_config = RunConfig()
+    cfg = OmegaConf.merge(run_config, cfg)
+    def to_plain_list(cfg_item):
+        if isinstance(cfg_item, ListConfig):
+            return OmegaConf.to_container(cfg_item, resolve=True)
+        elif isinstance(cfg_item, DictConfig):
+            return {k: to_plain_list(v) for k, v in cfg_item.items()}
+        else:
+            return cfg_item
+
+    # kwargs = to_plain_list(cfg)
+    kwargs = cfg
+    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
+
+    logging.basicConfig(level=log_level)
+
+    if kwargs.get("debug", False):
+        import pdb;
+        pdb.set_trace()
+
+    train(kwargs)
+
+
+if __name__ == "__main__":
+    main_hydra()
\ No newline at end of file
diff --git a/examples/asr_fireredasr_text/finetune_fireredasr_deepspeed.py b/examples/asr_fireredasr_text/finetune_fireredasr_deepspeed.py
new file mode 100644
index 00000000..d0a3e549
--- /dev/null
+++ b/examples/asr_fireredasr_text/finetune_fireredasr_deepspeed.py
@@ -0,0 +1,57 @@
+from slam_llm.pipeline.finetune_deepspeed import main as train
+from typing import Optional
+import argparse
+import hydra
+import logging
+from dataclasses import dataclass, field
+from omegaconf import DictConfig, ListConfig, OmegaConf
+from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig
+import sys
+@dataclass
+class RunConfig:
+    dataset_config: DataConfig = field(default_factory=DataConfig)
+    model_config: ModelConfig = field(default_factory=ModelConfig)
+    train_config: TrainConfig = field(default_factory=TrainConfig)
+    log_config: LogConfig = field(default_factory=LogConfig)
+    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
+    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
+    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
+    ckpt_path: Optional[str] = field(
+        default=None, metadata={"help": "The path to projector checkpoint"}
+    )
+    deepspeed_config: str = ""
+def parse_deepspeed_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_rank", type=int, default=-1)
+
+    args, unknown_args = parser.parse_known_args()
+    return args, unknown_args
+@hydra.main(config_name=None, version_base=None)
+def main_hydra(cfg: DictConfig):
+    run_config = RunConfig()
+    cfg = OmegaConf.merge(run_config, cfg)
+    def to_plain_list(cfg_item):
+        if isinstance(cfg_item, ListConfig):
+            return OmegaConf.to_container(cfg_item, resolve=True)
+        elif isinstance(cfg_item, DictConfig):
+            return {k: to_plain_list(v) for k, v in cfg_item.items()}
+        else:
+            return cfg_item
+
+    # kwargs = to_plain_list(cfg)
+    kwargs = cfg
+    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
+
+    logging.basicConfig(level=log_level)
+
+    if kwargs.get("debug", False):
+        import pdb;
+        pdb.set_trace()
+
+    train(kwargs)
+
+
+if __name__ == "__main__":
+    deepspeed_args, remaining_args = parse_deepspeed_args()
+    sys.argv = [sys.argv[0]] + remaining_args  # only pass the arguments that Hydra can handle
+    main_hydra()
\ No newline at end of file
diff --git a/examples/asr_fireredasr_text/fireredasr_config.py b/examples/asr_fireredasr_text/fireredasr_config.py
new file mode 100644
index 00000000..46869ee2
--- /dev/null
+++ b/examples/asr_fireredasr_text/fireredasr_config.py
@@ -0,0 +1,166 @@
+from dataclasses import dataclass, field
+from typing import Optional, List
+from torch.distributed.fsdp import ShardingStrategy
+
+@dataclass
+class ModelConfig:
+    file: str = "examples/asr_fireredasr_text/model/slam_fireredasr.py"
+    llm_name: str = "vicuna-7b-v1.5"
+    llm_path: str = "PATH/to/LLAMA/7B"
+    llm_type: str = "decoder_only"
+    llm_dim: int = 4096
+    firered_path: str = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/"
+    encoder_name: Optional[str] = None
+    encoder_ds_rate: int = 2
+    encoder_path: Optional[str] = None
+    encoder_path_hf: Optional[str] = None
+    encoder_dim: int = 1280
+    encoder_projector: str = "linear"
+    qformer_layers: int = 8
+    encoder_projector_ds_rate: int = 2
+    modal: str = "audio"
+    normalize: Optional[bool] = field(default=False, metadata={
+        "help": "whether input is normalized, used for models such as wavlm"
+    })
+    encoder_type: str = field(default="finetune", metadata={
+        "help": "whether model is only pretrained or finetuned, used for models such as hubert"
+    })
+
+# @dataclass
+# class PeftConfig:
+#     peft_method: str = "lora" # None , llama_adapter, prefix
+#     r: int = 8
+#     lora_alpha: int = 32
+#     target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"])
+#     bias: str = "none"
+#     task_type: str = "CAUSAL_LM"
+#     lora_dropout: float = 0.05
+#     inference_mode: bool = False
+
+@dataclass
+class PeftConfig:
+    peft_method: str = "lora"  # None, llama_adapter, prefix
+    r: int = 64
+    lora_alpha: int = 16
+    target_modules: List = field(default_factory=lambda: ["q_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"])
+    bias: str = "none"
+    task_type: str
= "CAUSAL_LM" + lora_dropout: float = 0.05 + inference_mode: bool = False + +@dataclass +class TrainConfig: + model_name:str = "PATH/to/LLAMA/7B" + enable_ddp:bool = False + enable_deepspeed:bool = False + enable_fsdp:bool = False + low_cpu_fsdp:bool = False + run_validation:bool = True + batch_size_training: Optional[int] = None + batching_strategy:str = field(default="packing", metadata={ + "help":"alternative: padding" + }) # + context_length:int = 4096 + gradient_accumulation_steps:int = 1 + num_epochs:int = 3 + num_workers_dataloader:int = 1 + warmup_steps:int = 1000 + total_steps:int = 100000 + validation_interval:int = 1000 + lr:float = 1e-4 + weight_decay:float = 0.0 + gamma:float = 0.85 + seed:int = 42 + use_fp16:bool = False + mixed_precision:bool = True + val_batch_size:Optional[int] = None + + use_peft:bool = False + peft_config:PeftConfig = field(default_factory=PeftConfig) + output_dir:str = "PATH/to/save/PEFT/model" + freeze_layers:bool = False + num_freeze_layers:int = 1 + quantization:bool = False + one_gpu:bool = False + save_model:bool = True + dist_checkpoint_root_folder:str = "PATH/to/save/FSDP/model" # will be used if using FSDP + dist_checkpoint_folder:str = "fine-tuned" # will be used if using FSDP + save_optimizer:bool = False # will be used if using FSDP + use_fast_kernels:bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels + run_test_during_validation:bool = False + run_test_during_validation_file:str = "test.wav" + run_test_during_validation_prompt:str = "<|ASR|>" + freeze_llm:bool = field(default=False, metadata={ + "help": "whether to freeze llm when finetuning, should be true when use peft finetuning" + }) + freeze_encoder:bool = False + inference_mode:bool = True + +@dataclass +class DataConfig: + # dataset: str = "slidespeech_dataset" + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + prompt_style: str = "normal" # instruct + # file: str = "examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py:get_speech_dataset" + file: str = "examples/asr_fireredasr_text/model/slam_fireredasr.py:get_speech_dataset" + cmvn_file: str = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/cmvn.ark" + text: Optional[str] = None + wav_paths: Optional[List[str]] = None + wav_path: Optional[str] = None + wav_dir: Optional[str] = None + wav_scp: Optional[str] = None + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + train_text_file: str = "" + dev_scp_file_path: str = "" + dev_text_file: str = "" + test_scp_file_path: str = "" + train_split: str = "train" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True + prompt: Optional[str] = None + use_ocr: bool = True + inference_mode: bool = False + prompt_mode : str = "qwen" + lower: bool = False + encoder_ds_rate: int = 2 + encoder_projector_ds_rate: int = 5 + fix_length_audio: int = -1 + inference_mode:bool = False + input_type: str = field(default="raw", metadata={ + "help":"Use raw when input is wav, mel when for whisper" + }) + mel_size: int = field(default=80, metadata={ + "help": "80 for whisper large v1 and v2, 128 for v3" + }) + normalize: Optional[bool] = field(default=False, metadata={ + "help": "whether input is normalized, used for models such as wavlm" + }) + +@dataclass +class FSDPConfig: + mixed_precision: bool = True + use_fp16: 
bool = False + # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. + fsdp_activation_checkpointing: bool = True + fsdp_cpu_offload: bool = False + pure_bf16: bool = False + optimizer: str = "AdamW" + +@dataclass +class LogConfig: + use_wandb: bool = False + wandb_dir: str = "tmp/test_wandb" + wandb_entity_name: str = "project_name" + wandb_project_name: str = "project_name" + wandb_exp_name: str = "exp_name" + log_file: str = "tmp/test.log" + log_interval: int = 5 diff --git a/examples/asr_fireredasr_text/inference_fireredasr.py b/examples/asr_fireredasr_text/inference_fireredasr.py new file mode 100644 index 00000000..d7657776 --- /dev/null +++ b/examples/asr_fireredasr_text/inference_fireredasr.py @@ -0,0 +1,53 @@ +from slam_llm.pipeline.inference_batch import main as inference + +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig + + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + decode_log: str = field( + default="output/decode_log", + metadata={"help": "The prefix for the decode output"}, + ) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + peft_ckpt: Optional[str] = field( + default=None, + metadata={ + "help": "The path to peft checkpoint, should be a directory including adapter_config.json" + }, + ) + + +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + # kwargs = to_plain_list(cfg) + log_level = getattr(logging, cfg.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if cfg.get("debug", False): + import pdb + + pdb.set_trace() + + inference(cfg) + + +if __name__ == "__main__": + main_hydra() diff --git a/examples/asr_fireredasr_text/inference_fireredasr_deepspeed.py b/examples/asr_fireredasr_text/inference_fireredasr_deepspeed.py new file mode 100644 index 00000000..b00bb9db --- /dev/null +++ b/examples/asr_fireredasr_text/inference_fireredasr_deepspeed.py @@ -0,0 +1,60 @@ +from slam_llm.pipeline.inference_batch_deepspeed import main as inference +import sys +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from typing import Optional +from fireredasr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import argparse + +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: 
LogConfig = field(default_factory=LogConfig)
+    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
+    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
+    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
+    decode_log: str = field(
+        default="output/decode_log",
+        metadata={"help": "The prefix for the decode output"},
+    )
+    ckpt_path: Optional[str] = field(
+        default=None, metadata={"help": "The path to projector checkpoint"}
+    )
+    peft_ckpt: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The path to peft checkpoint, should be a directory including adapter_config.json"
+        },
+    )
+def parse_deepspeed_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_rank", type=int, default=-1)
+
+    args, unknown_args = parser.parse_known_args()
+    return args, unknown_args
+
+@hydra.main(config_name=None, version_base=None)
+def main_hydra(cfg: DictConfig):
+    run_config = RunConfig()
+    cfg = OmegaConf.merge(run_config, cfg)
+    # kwargs = to_plain_list(cfg)
+    log_level = getattr(logging, cfg.get("log_level", "INFO").upper())
+
+    logging.basicConfig(level=log_level)
+
+    if cfg.get("debug", False):
+        import pdb
+
+        pdb.set_trace()
+
+    inference(cfg)
+
+
+if __name__ == "__main__":
+    deepspeed_args, remaining_args = parse_deepspeed_args()
+    sys.argv = [sys.argv[0]] + remaining_args  # only pass the arguments that Hydra can handle
+    main_hydra()
diff --git a/examples/asr_fireredasr_text/model/fireredasr/__init__.py b/examples/asr_fireredasr_text/model/fireredasr/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/asr_fireredasr_text/model/fireredasr/data b/examples/asr_fireredasr_text/model/fireredasr/data
new file mode 120000
index 00000000..3d6bf669
--- /dev/null
+++ b/examples/asr_fireredasr_text/model/fireredasr/data
@@ -0,0 +1 @@
+/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/FireRedASR/fireredasr/data
\ No newline at end of file
diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr.py b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr.py
new file mode 100644
index 00000000..0f7da203
--- /dev/null
+++ b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr.py
@@ -0,0 +1,129 @@
+import os
+import time
+
+import torch
+import torch_npu
+from fireredasr.data.asr_feat import ASRFeatExtractor
+from fireredasr.models.fireredasr_aed import FireRedAsrAed
+from fireredasr.models.fireredasr_llm import FireRedAsrLlm
+from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
+from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper
+
+
+class FireRedAsr:
+    @classmethod
+    def from_pretrained(cls, asr_type, model_dir):
+        assert asr_type in ["aed", "llm"]
+
+        cmvn_path = os.path.join(model_dir, "cmvn.ark")
+        feat_extractor = ASRFeatExtractor(cmvn_path)
+
+        if asr_type == "aed":
+            model_path = os.path.join(model_dir, "model.pth.tar")
+            dict_path = os.path.join(model_dir, "dict.txt")
+            spm_model = os.path.join(model_dir, "train_bpe1000.model")
+            model = load_fireredasr_aed_model(model_path)
+            tokenizer = ChineseCharEnglishSpmTokenizer(dict_path, spm_model)
+        elif asr_type == "llm":
+            model_path = os.path.join(model_dir, "model.pth.tar")
+            encoder_path = os.path.join(model_dir, "asr_encoder.pth.tar")
+            llm_dir = os.path.join(model_dir, "Qwen2-7B-Instruct")
+            model, tokenizer = load_firered_llm_model_and_tokenizer(
+                model_path, encoder_path, llm_dir)
+        model.eval()
+        return cls(asr_type, feat_extractor, model, tokenizer)
+
+    def
__init__(self, asr_type, feat_extractor, model, tokenizer): + self.asr_type = asr_type + self.feat_extractor = feat_extractor + self.model = model + self.tokenizer = tokenizer + + @torch.no_grad() + def transcribe(self, batch_uttid, batch_wav_path, args={}): + feats, lengths, durs = self.feat_extractor(batch_wav_path) + total_dur = sum(durs) + if args.get("use_gpu", False): + feats, lengths = feats.cuda(), lengths.cuda() + self.model.cuda() + else: + feats, lengths = feats.npu(), lengths.npu() + self.model.npu() + + if self.asr_type == "aed": + start_time = time.time() + + hyps = self.model.transcribe( + feats, lengths, + args.get("beam_size", 1), + args.get("nbest", 1), + args.get("decode_max_len", 0), + args.get("softmax_smoothing", 1.0), + args.get("aed_length_penalty", 0.0), + args.get("eos_penalty", 1.0) + ) + + elapsed = time.time() - start_time + rtf= elapsed / total_dur if total_dur > 0 else 0 + + results = [] + for uttid, wav, hyp in zip(batch_uttid, batch_wav_path, hyps): + hyp = hyp[0] # only return 1-best + hyp_ids = [int(id) for id in hyp["yseq"].cpu()] + text = self.tokenizer.detokenize(hyp_ids) + results.append({"uttid": uttid, "text": text, "wav": wav, + "rtf": f"{rtf:.4f}"}) + return results + + elif self.asr_type == "llm": + input_ids, attention_mask, _, _ = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=[""]*feats.size(0), tokenizer=self.tokenizer, + max_len=128, decode=True) + if args.get("use_gpu", False): + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + else: + input_ids = input_ids.npu() + attention_mask = attention_mask.npu() + start_time = time.time() + + generated_ids = self.model.transcribe( + feats, lengths, input_ids, attention_mask, + args.get("beam_size", 1), + args.get("decode_max_len", 0), + args.get("decode_min_len", 0), + args.get("repetition_penalty", 1.0), + args.get("llm_length_penalty", 0.0), + args.get("temperature", 1.0) + ) + + elapsed = time.time() - start_time + rtf= elapsed / total_dur if total_dur > 0 else 0 + texts = self.tokenizer.batch_decode(generated_ids, + skip_special_tokens=True) + results = [] + for uttid, wav, text in zip(batch_uttid, batch_wav_path, texts): + results.append({"uttid": uttid, "text": text, "wav": wav, + "rtf": f"{rtf:.4f}"}) + return results + + + +def load_fireredasr_aed_model(model_path): + package = torch.load(model_path, map_location=lambda storage, loc: storage) + print("model args:", package["args"]) + model = FireRedAsrAed.from_args(package["args"]) + model.load_state_dict(package["model_state_dict"], strict=True) + return model + + +def load_firered_llm_model_and_tokenizer(model_path, encoder_path, llm_dir): + package = torch.load(model_path, map_location=lambda storage, loc: storage) + package["args"].encoder_path = encoder_path + package["args"].llm_dir = llm_dir + print("model args:", package["args"]) + model = FireRedAsrLlm.from_args(package["args"]) + model.load_state_dict(package["model_state_dict"], strict=False) + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(llm_dir) + return model, tokenizer diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_aed.py b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_aed.py new file mode 100644 index 00000000..4938c2c3 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_aed.py @@ -0,0 +1,35 @@ +import torch + +from fireredasr.models.module.conformer_encoder import ConformerEncoder +from fireredasr.models.module.transformer_decoder import TransformerDecoder + 
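+# FireRedAsrAed is the attention-based encoder-decoder (AED) ASR model:
+# a Conformer encoder feeds a Transformer decoder, and transcribe() runs
+# batch beam search over the encoder outputs to return n-best hypotheses.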
+
+class FireRedAsrAed(torch.nn.Module):
+    @classmethod
+    def from_args(cls, args):
+        return cls(args)
+
+    def __init__(self, args):
+        super().__init__()
+        self.sos_id = args.sos_id
+        self.eos_id = args.eos_id
+
+        self.encoder = ConformerEncoder(
+            args.idim, args.n_layers_enc, args.n_head, args.d_model,
+            args.residual_dropout, args.dropout_rate,
+            args.kernel_size, args.pe_maxlen)
+
+        self.decoder = TransformerDecoder(
+            args.sos_id, args.eos_id, args.pad_id, args.odim,
+            args.n_layers_dec, args.n_head, args.d_model,
+            args.residual_dropout, args.pe_maxlen)
+
+    def transcribe(self, padded_input, input_lengths,
+                   beam_size=1, nbest=1, decode_max_len=0,
+                   softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0):
+        enc_outputs, _, enc_mask = self.encoder(padded_input, input_lengths)
+        nbest_hyps = self.decoder.batch_beam_search(
+            enc_outputs, enc_mask,
+            beam_size, nbest, decode_max_len,
+            softmax_smoothing, length_penalty, eos_penalty)
+        return nbest_hyps
diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_llm.py b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_llm.py
new file mode 100644
index 00000000..a8c36dee
--- /dev/null
+++ b/examples/asr_fireredasr_text/model/fireredasr/models/fireredasr_llm.py
@@ -0,0 +1,360 @@
+import logging
+import os
+import random
+import re
+
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM
+from torch.npu.amp import autocast
+from fireredasr.models.fireredasr_aed import FireRedAsrAed
+from fireredasr.models.module.adapter import Adapter
+from fireredasr.tokenizer.llm_tokenizer import DEFAULT_SPEECH_TOKEN, IGNORE_TOKEN_ID
+from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper
+from fireredasr.utils.param import count_model_parameters
+import sys
+sys.path.append('/aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/asr_fireredasr')
+from slam_llm.utils.metric import compute_accuracy
+
+class FireRedAsrLlm(nn.Module):
+    @classmethod
+    def load_encoder(cls, model_path):
+        assert os.path.exists(model_path)
+        package = torch.load(model_path, map_location=lambda storage, loc: storage)
+        model = FireRedAsrAed.from_args(package["args"])
+        if "model_state_dict" in package:
+            model.load_state_dict(package["model_state_dict"], strict=False)
+        encoder = model.encoder
+        encoder_dim = encoder.odim
+        return encoder, encoder_dim
+
+    @classmethod
+    def from_args(cls, args):
+        logging.info(args)
+        logging.info("Build FireRedAsrLlm")
+        # Build Speech Encoder
+        encoder, encoder_dim = cls.load_encoder(args.encoder_path)
+        count_model_parameters(encoder)
+        if args.freeze_encoder:
+            logging.info("Freeze encoder")
+            for name, param in encoder.named_parameters():
+                param.requires_grad = False
+            encoder.eval()
+
+        if args.use_flash_attn:
+            attn_implementation = "flash_attention_2"
+            if args.use_fp16:
+                torch_dtype = torch.float16
+            else:
+                torch_dtype = torch.float32
+        else:
+            attn_implementation = "eager"
+            if args.use_fp16:
+                torch_dtype = torch.float16
+            else:
+                torch_dtype = torch.float32
+
+        # Build LLM
+        llm = AutoModelForCausalLM.from_pretrained(
+            args.llm_dir,
+            attn_implementation=attn_implementation,
+            torch_dtype=torch_dtype,
+        )
+
+        count_model_parameters(llm)
+        # LLM Freeze or LoRA
+        llm_dim = llm.config.hidden_size
+        if args.freeze_llm:
+            logging.info("Freeze LLM")
+            for name, param in llm.named_parameters():
+                param.requires_grad = False
+            llm.eval()
+        else:
+            if args.use_lora:
+                from peft import LoraConfig, get_peft_model
+                lora_config = LoraConfig(
+                    r=64,
lora_alpha=16, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ], + lora_dropout=0.05, + task_type="CAUSAL_LM", + ) + llm = get_peft_model(llm, lora_config) + llm.print_trainable_parameters() + + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(args.llm_dir) + assert tokenizer.pad_token_id == tokenizer.convert_tokens_to_ids("<|endoftext|>") + llm.config.pad_token_id = tokenizer.pad_token_id + llm.config.bos_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>") + llm.config.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>") + llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids( + DEFAULT_SPEECH_TOKEN + ) + + # Build projector + encoder_projector = Adapter( + encoder_dim, llm_dim, args.encoder_downsample_rate) + count_model_parameters(encoder_projector) + + return cls(encoder, llm, encoder_projector, + args.freeze_encoder, args.freeze_llm) + + def __init__(self, encoder, llm, encoder_projector, + freeze_encoder, freeze_llm): + super().__init__() + self.encoder = encoder + self.llm = llm + self.encoder_projector = encoder_projector + # args + self.freeze_encoder = freeze_encoder + self.freeze_llm = freeze_llm + self.llm_config = llm.config + + def transcribe(self, padded_feat, feat_lengths, padded_input_ids, attention_mask, + beam_size=1, decode_max_len=0, decode_min_len=0, + repetition_penalty=1.0, llm_length_penalty=0, temperature=1.0): + encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths) + speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + + inputs_embeds, attention_mask, _ = \ + self._merge_input_ids_with_speech_features( + speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, + speech_lens=speech_lens + ) + + max_new_tokens = speech_features.size(1) if decode_max_len < 1 else decode_max_len + max_new_tokens = max(1, max_new_tokens) + + generated_ids = self.llm.generate( + inputs_embeds=inputs_embeds, + max_new_tokens=max_new_tokens, + num_beams=beam_size, + do_sample=False, + min_length=decode_min_len, + top_p=1.0, + repetition_penalty=repetition_penalty, + length_penalty=llm_length_penalty, + temperature=temperature, + bos_token_id=self.llm.config.bos_token_id, + eos_token_id=self.llm.config.eos_token_id, + pad_token_id=self.llm.config.pad_token_id, + ) + + return generated_ids + @autocast(dtype=torch.bfloat16) + def forward(self, **batch): + targets = batch["targets"] + keys = batch["keys"] + # padded_feat = batch["feats"] + # feat_lengths = batch["lengths"] + padded_input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + target_ids = batch["target_ids"] + labels = target_ids + # print(padded_feat.dtype) + # encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths) + # speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + # train + # inputs_embeds, attention_mask, labels = \ + # self._merge_input_ids_with_speech_features( + # speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, target_ids, + # speech_lens=speech_lens + # ) + model_outputs = self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=target_ids) + # model_outputs = self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels) + # labels = batch["labels"] 
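The forward pass here ends by scoring teacher-forced predictions: compute_accuracy (imported from slam_llm.utils.metric and called just below) compares argmax predictions shifted by one position against the labels, the usual next-token convention. Its implementation is not part of this diff; a minimal equivalent, assuming ignore_label marks positions to skip, would be:

import torch

def token_accuracy(preds, labels, ignore_label=-100):
    # preds, labels: (batch, seq) integer tensors, already shifted by the caller.
    mask = labels != ignore_label
    correct = ((preds == labels) & mask).sum().float()
    return (correct / mask.sum().clamp(min=1).float()).item()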
+ # print(batch) + # exit(0) + # model_outputs = self.llm(**batch) + acc = -1 + with torch.no_grad(): + preds = torch.argmax(model_outputs.logits, -1) + acc = compute_accuracy(preds.detach()[:, :-1], labels.detach()[:, 1:], ignore_label=-100) + # input() + return model_outputs, acc + + # SLAM-LLM api + @torch.no_grad() + def generate(self, **batch): + # decode args: + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + beam_size=3 + decode_max_len=0 + decode_min_len=0 + repetition_penalty=3.0 + llm_length_penalty=1.0 + temperature=1.0 + generated_ids = self.llm.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_length = 300, + top_p=1.0, + repetition_penalty=repetition_penalty, + length_penalty=llm_length_penalty, + temperature=temperature, + ) + # # keys = batch["keys"] + # padded_feat = batch["feats"] + # feat_lengths = batch["lengths"] + # padded_input_ids = batch["input_ids"] + # attention_mask = batch["attention_mask"] + # encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths) + # speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths) + # inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids) + # inputs_embeds, attention_mask, _ = \ + # self._merge_input_ids_with_speech_features( + # speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask, + # speech_lens=speech_lens + # ) + # max_new_tokens = speech_features.size(1) if decode_max_len < 1 else decode_max_len + # max_new_tokens = max(1, max_new_tokens) + # generated_ids = self.llm.generate( + # inputs_embeds=inputs_embeds, + # max_new_tokens=max_new_tokens, + # num_beams=beam_size, + # do_sample=False, + # min_length=decode_min_len, + # top_p=1.0, + # repetition_penalty=repetition_penalty, + # length_penalty=llm_length_penalty, + # temperature=temperature, + # bos_token_id=self.llm.config.bos_token_id, + # eos_token_id=self.llm.config.eos_token_id, + # pad_token_id=self.llm.config.pad_token_id, + # ) + return generated_ids + + def _merge_input_ids_with_speech_features( + self, speech_features, inputs_embeds, input_ids, attention_mask, labels=None, + speech_lens=None + ): + """ + Modified from: https://github.com/k2-fsa/icefall/blob/master/egs/speech_llm/ASR_LLM/whisper_llm_zh/model.py + """ + speech_lens = None + num_speechs, speech_len, embed_dim = speech_features.shape + batch_size, sequence_length = input_ids.shape + left_padding = not torch.sum( + input_ids[:, -1] == torch.tensor(self.llm.config.pad_token_id) + ) + # print(f"pad_token_id{self.llm.config.pad_token_id}") + # 1. Create a mask to know where special speech tokens are + special_speech_token_mask = input_ids == self.llm.config.default_speech_token_id + # print(f"default_speech_token_id{self.llm.config.default_speech_token_id}") + num_special_speech_tokens = torch.sum(special_speech_token_mask, dim=-1) + # Compute the maximum embed dimension + max_embed_dim = ( + num_special_speech_tokens.max() * (speech_len - 1) + ) + sequence_length + batch_indices, non_speech_indices = torch.where( + input_ids != self.llm.config.default_speech_token_id + ) + + # 2. Compute the positions where text should be written + # Calculate new positions for text tokens in merged speech-text sequence. + # `special_speech_token_mask` identifies speech tokens. Each speech token will be replaced by `nb_text_tokens_per_speechs - 1` text tokens. + # `torch.cumsum` computes how each speech token shifts subsequent text token positions. 
+ # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. + new_token_positions = ( + torch.cumsum((special_speech_token_mask * (speech_len - 1) + 1), -1) - 1 + ) # (N,U) + nb_speech_pad = max_embed_dim - 1 - new_token_positions[:, -1] + if left_padding: + new_token_positions += nb_speech_pad[:, None] # offset for left padding + text_to_overwrite = new_token_positions[batch_indices, non_speech_indices] + + # 3. Create the full embedding, already padded to the maximum position + final_embedding = torch.zeros( + batch_size, + max_embed_dim, + embed_dim, + dtype=inputs_embeds.dtype, + device=inputs_embeds.device, + ) + final_attention_mask = torch.zeros( + batch_size, + max_embed_dim, + dtype=attention_mask.dtype, + device=inputs_embeds.device, + ) + if labels is not None: + final_labels = torch.full( + (batch_size, max_embed_dim), + IGNORE_TOKEN_ID, + dtype=input_ids.dtype, + device=input_ids.device, + ) + # In case the Vision model or the Language model has been offloaded to CPU, we need to manually + # set the corresponding tensors into their correct target device. + target_device = inputs_embeds.device + batch_indices, non_speech_indices, text_to_overwrite = ( + batch_indices.to(target_device), + non_speech_indices.to(target_device), + text_to_overwrite.to(target_device), + ) + attention_mask = attention_mask.to(target_device) + + # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"] + # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the speech features + final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[ + batch_indices, non_speech_indices + ] + final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[ + batch_indices, non_speech_indices + ] + if labels is not None: + final_labels[batch_indices, text_to_overwrite] = labels[ + batch_indices, non_speech_indices + ] + + # 5. Fill the embeddings corresponding to the speechs. Anything that is not `text_positions` needs filling (#29835) + speech_to_overwrite = torch.full( + (batch_size, max_embed_dim), + True, + dtype=torch.bool, + device=inputs_embeds.device, + ) + speech_to_overwrite[batch_indices, text_to_overwrite] = False + if speech_lens is not None: + speech_pad_position = speech_to_overwrite.cumsum(-1) <= speech_lens[:, None] + speech_to_overwrite &= speech_to_overwrite.cumsum(-1) - 1 >= nb_speech_pad[ + :, None + ].to(target_device) + + if speech_to_overwrite.sum() != speech_features.shape[:-1].numel(): + raise ValueError( + f"The input provided to the model are wrong. The number of speech tokens is {torch.sum(special_speech_token_mask)} while" + f" the number of speech given to the model is {num_speechs}. This prevents correct indexing and breaks batch generation." + ) + + final_embedding[speech_to_overwrite] = ( + speech_features.contiguous().reshape(-1, embed_dim).to(target_device) + ) + if speech_lens is not None: + speech_to_overwrite &= speech_pad_position + final_attention_mask |= speech_to_overwrite + + # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens. 
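For intuition on the expansion in _merge_input_ids_with_speech_features: every DEFAULT_SPEECH_TOKEN placeholder in input_ids is replaced by speech_len projected frames, so the merged length is num_speech_tokens * (speech_len - 1) + sequence_length. A toy check (numbers are illustrative only):

speech_len = 60          # projected speech frames per utterance (example value)
sequence_length = 20     # prompt length in text tokens, including one speech placeholder
num_speech_tokens = 1
max_embed_dim = num_speech_tokens * (speech_len - 1) + sequence_length
assert max_embed_dim == 79   # 19 text positions + 60 speech frames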
+ batch_indices, pad_indices = torch.where( + input_ids == self.llm.config.pad_token_id + ) + indices_to_mask = new_token_positions[batch_indices, pad_indices] + + final_embedding[batch_indices, indices_to_mask] = 0 + + if labels is None: + final_labels = None + + return final_embedding, final_attention_mask, final_labels #, position_ids + \ No newline at end of file diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/module/adapter.py b/examples/asr_fireredasr_text/model/fireredasr/models/module/adapter.py new file mode 100644 index 00000000..952dfd3b --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/models/module/adapter.py @@ -0,0 +1,30 @@ +import torch +import torch.nn as nn + + +class Adapter(nn.Module): + def __init__(self, encoder_dim, llm_dim, downsample_rate=2): + super().__init__() + self.ds = downsample_rate + self.linear1 = nn.Linear(encoder_dim * downsample_rate, llm_dim) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(llm_dim, llm_dim) + + def forward(self, x, x_lens): + batch_size, seq_len, feat_dim = x.size() + num_frames_to_discard = seq_len % self.ds + if num_frames_to_discard > 0: + x = x[:, :-num_frames_to_discard, :] + seq_len = x.size(1) + + x = x.contiguous() + x = x.view( + batch_size, seq_len // self.ds, feat_dim * self.ds + ) + + x = self.linear1(x) + x = self.relu(x) + x = self.linear2(x) + + new_x_lens = torch.clamp(x_lens, max=seq_len) // self.ds + return x, new_x_lens diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/module/conformer_encoder.py b/examples/asr_fireredasr_text/model/fireredasr/models/module/conformer_encoder.py new file mode 100644 index 00000000..1dab3809 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/models/module/conformer_encoder.py @@ -0,0 +1,322 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConformerEncoder(nn.Module): + def __init__(self, idim, n_layers, n_head, d_model, + residual_dropout=0.1, dropout_rate=0.1, kernel_size=33, + pe_maxlen=5000): + super().__init__() + self.odim = d_model + + self.input_preprocessor = Conv2dSubsampling(idim, d_model) + self.positional_encoding = RelPositionalEncoding(d_model) + self.dropout = nn.Dropout(residual_dropout) + + self.layer_stack = nn.ModuleList() + for l in range(n_layers): + block = RelPosEmbConformerBlock(d_model, n_head, + residual_dropout, + dropout_rate, kernel_size) + self.layer_stack.append(block) + + def forward(self, padded_input, input_lengths, pad=True): + if pad: + padded_input = F.pad(padded_input, + (0, 0, 0, self.input_preprocessor.context - 1), 'constant', 0.0) + src_mask = self.padding_position_is_0(padded_input, input_lengths) + + embed_output, input_lengths, src_mask = self.input_preprocessor(padded_input, src_mask) + enc_output = self.dropout(embed_output) + + pos_emb = self.dropout(self.positional_encoding(embed_output)) + + enc_outputs = [] + for enc_layer in self.layer_stack: + enc_output = enc_layer(enc_output, pos_emb, slf_attn_mask=src_mask, + pad_mask=src_mask) + enc_outputs.append(enc_output) + + return enc_output, input_lengths, src_mask + + def padding_position_is_0(self, padded_input, input_lengths): + N, T = padded_input.size()[:2] + mask = torch.ones((N, T)).to(padded_input.device) + for i in range(N): + mask[i, input_lengths[i]:] = 0 + mask = mask.unsqueeze(dim=1) + return mask.to(torch.uint8) + + +class RelPosEmbConformerBlock(nn.Module): + def __init__(self, d_model, n_head, + residual_dropout=0.1, + dropout_rate=0.1, kernel_size=33): + 
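The Adapter above downsamples in time by stacking downsample_rate consecutive encoder frames into one vector before the two-layer projection; trailing frames that do not fill a full group are dropped. A small shape check (sizes are illustrative):

import torch

x = torch.randn(1, 9, 4)                 # (batch, frames, encoder_dim)
ds = 2
x = x[:, : (x.size(1) // ds) * ds, :]    # drop the leftover frame -> (1, 8, 4)
stacked = x.contiguous().view(1, x.size(1) // ds, 4 * ds)
assert stacked.shape == (1, 4, 8)        # 4 frames, each concatenating 2 original frames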
super().__init__() + self.ffn1 = ConformerFeedForward(d_model, dropout_rate) + self.mhsa = RelPosMultiHeadAttention(n_head, d_model, + residual_dropout) + self.conv = ConformerConvolution(d_model, kernel_size, + dropout_rate) + self.ffn2 = ConformerFeedForward(d_model, dropout_rate) + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, x, pos_emb, slf_attn_mask=None, pad_mask=None): + out = 0.5 * x + 0.5 * self.ffn1(x) + out = self.mhsa(out, out, out, pos_emb, mask=slf_attn_mask)[0] + out = self.conv(out, pad_mask) + out = 0.5 * out + 0.5 * self.ffn2(out) + out = self.layer_norm(out) + return out + + +class Swish(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + +class Conv2dSubsampling(nn.Module): + def __init__(self, idim, d_model, out_channels=32): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(1, out_channels, 3, 2), + nn.ReLU(), + nn.Conv2d(out_channels, out_channels, 3, 2), + nn.ReLU(), + ) + subsample_idim = ((idim - 1) // 2 - 1) // 2 + self.out = nn.Linear(out_channels * subsample_idim, d_model) + + self.subsampling = 4 + left_context = right_context = 3 # both exclude currect frame + self.context = left_context + 1 + right_context # 7 + + def forward(self, x, x_mask): + x = x.unsqueeze(1) + x = self.conv(x) + N, C, T, D = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(N, T, C * D)) + mask = x_mask[:, :, :-2:2][:, :, :-2:2] + input_lengths = mask[:, -1, :].sum(dim=-1) + return x, input_lengths, mask + + +class RelPositionalEncoding(torch.nn.Module): + def __init__(self, d_model, max_len=5000): + super().__init__() + pe_positive = torch.zeros(max_len, d_model, requires_grad=False) + pe_negative = torch.zeros(max_len, d_model, requires_grad=False) + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(torch.log(torch.tensor(10000.0)).item()/d_model)) + pe_positive[:, 0::2] = torch.sin(position * div_term) + pe_positive[:, 1::2] = torch.cos(position * div_term) + pe_negative[:, 0::2] = torch.sin(-1 * position * div_term) + pe_negative[:, 1::2] = torch.cos(-1 * position * div_term) + + pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0) + pe_negative = pe_negative[1:].unsqueeze(0) + pe = torch.cat([pe_positive, pe_negative], dim=1) + self.register_buffer('pe', pe) + + def forward(self, x): + # Tmax = 2 * max_len - 1 + Tmax, T = self.pe.size(1), x.size(1) + pos_emb = self.pe[:, Tmax // 2 - T + 1 : Tmax // 2 + T].clone().detach() + return pos_emb + + +class ConformerFeedForward(nn.Module): + def __init__(self, d_model, dropout_rate=0.1): + super().__init__() + pre_layer_norm = nn.LayerNorm(d_model) + linear_expand = nn.Linear(d_model, d_model*4) + nonlinear = Swish() + dropout_pre = nn.Dropout(dropout_rate) + linear_project = nn.Linear(d_model*4, d_model) + dropout_post = nn.Dropout(dropout_rate) + self.net = nn.Sequential(pre_layer_norm, + linear_expand, + nonlinear, + dropout_pre, + linear_project, + dropout_post) + + def forward(self, x): + residual = x + output = self.net(x) + output = output + residual + return output + + +class ConformerConvolution(nn.Module): + def __init__(self, d_model, kernel_size=33, dropout_rate=0.1): + super().__init__() + assert kernel_size % 2 == 1 + self.pre_layer_norm = nn.LayerNorm(d_model) + self.pointwise_conv1 = nn.Conv1d(d_model, d_model*4, kernel_size=1, bias=False) + self.glu = F.glu + self.padding = (kernel_size - 1) // 2 + self.depthwise_conv = nn.Conv1d(d_model*2, d_model*2, + kernel_size, stride=1, + 
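Conv2dSubsampling above applies two stride-2 3x3 convolutions, so the time axis shrinks roughly 4x (self.subsampling = 4, with a 7-frame receptive context) and the feature axis shrinks to ((idim - 1) // 2 - 1) // 2 bins before the output projection. A quick check for an assumed 80-dim fbank input:

idim, out_channels = 80, 32
subsample_idim = ((idim - 1) // 2 - 1) // 2   # 80 -> 39 -> 19
linear_in = out_channels * subsample_idim     # input features to self.out
assert (subsample_idim, linear_in) == (19, 608)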
padding=self.padding, + groups=d_model*2, bias=False) + self.batch_norm = nn.LayerNorm(d_model*2) + self.swish = Swish() + self.pointwise_conv2 = nn.Conv1d(d_model*2, d_model, kernel_size=1, bias=False) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, x, mask=None): + residual = x + out = self.pre_layer_norm(x) + out = out.transpose(1, 2) + if mask is not None: + out.masked_fill_(mask.ne(1), 0.0) + out = self.pointwise_conv1(out) + out = F.glu(out, dim=1) + out = self.depthwise_conv(out) + + out = out.transpose(1, 2) + out = self.swish(self.batch_norm(out)) + out = out.transpose(1, 2) + + out = self.dropout(self.pointwise_conv2(out)) + if mask is not None: + out.masked_fill_(mask.ne(1), 0.0) + out = out.transpose(1, 2) + return out + residual + + +class EncoderMultiHeadAttention(nn.Module): + def __init__(self, n_head, d_model, + residual_dropout=0.1): + super().__init__() + assert d_model % n_head == 0 + self.n_head = n_head + self.d_k = d_model // n_head + self.d_v = self.d_k + + self.w_qs = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_vs = nn.Linear(d_model, n_head * self.d_v, bias=False) + + self.layer_norm_q = nn.LayerNorm(d_model) + self.layer_norm_k = nn.LayerNorm(d_model) + self.layer_norm_v = nn.LayerNorm(d_model) + + self.attention = ScaledDotProductAttention(temperature=self.d_k ** 0.5) + self.fc = nn.Linear(n_head * self.d_v, d_model, bias=False) + self.dropout = nn.Dropout(residual_dropout) + + def forward(self, q, k, v, mask=None): + sz_b, len_q = q.size(0), q.size(1) + + residual = q + q, k, v = self.forward_qkv(q, k, v) + + output, attn = self.attention(q, k, v, mask=mask) + + output = self.forward_output(output, residual, sz_b, len_q) + return output, attn + + def forward_qkv(self, q, k, v): + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1) + + q = self.layer_norm_q(q) + k = self.layer_norm_k(k) + v = self.layer_norm_v(v) + + q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) + k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) + v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + return q, k, v + + def forward_output(self, output, residual, sz_b, len_q): + output = output.transpose(1, 2).contiguous().view(sz_b, len_q, -1) + fc_out = self.fc(output) + output = self.dropout(fc_out) + output = output + residual + return output + + +class ScaledDotProductAttention(nn.Module): + def __init__(self, temperature): + super().__init__() + self.temperature = temperature + self.dropout = nn.Dropout(0.0) + self.INF = float('inf') + + def forward(self, q, k, v, mask=None): + attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature + output, attn = self.forward_attention(attn, v, mask) + return output, attn + + def forward_attention(self, attn, v, mask=None): + if mask is not None: + mask = mask.unsqueeze(1) + mask = mask.eq(0) + attn = attn.masked_fill(mask, -self.INF) + attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0) + else: + attn = torch.softmax(attn, dim=-1) + + d_attn = self.dropout(attn) + output = torch.matmul(d_attn, v) + + return output, attn + + +class RelPosMultiHeadAttention(EncoderMultiHeadAttention): + def __init__(self, n_head, d_model, + residual_dropout=0.1): + super().__init__(n_head, d_model, + residual_dropout) + d_k = d_model // n_head + self.scale = 1.0 / (d_k ** 0.5) + self.linear_pos = nn.Linear(d_model, n_head * d_k, 
bias=False) + self.pos_bias_u = nn.Parameter(torch.FloatTensor(n_head, d_k)) + self.pos_bias_v = nn.Parameter(torch.FloatTensor(n_head, d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def _rel_shift(self, x): + N, H, T1, T2 = x.size() + zero_pad = torch.zeros((N, H, T1, 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(N, H, T2 + 1, T1) + x = x_padded[:, :, 1:].view_as(x) + x = x[:, :, :, : x.size(-1) // 2 + 1] + return x + + def forward(self, q, k, v, pos_emb, mask=None): + sz_b, len_q = q.size(0), q.size(1) + + residual = q + q, k, v = self.forward_qkv(q, k, v) + + q = q.transpose(1, 2) + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.n_head, self.d_k) + p = p.transpose(1, 2) + + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self._rel_shift(matrix_bd) + + attn_scores = matrix_ac + matrix_bd + attn_scores.mul_(self.scale) + + output, attn = self.attention.forward_attention(attn_scores, v, mask=mask) + + output = self.forward_output(output, residual, sz_b, len_q) + return output, attn diff --git a/examples/asr_fireredasr_text/model/fireredasr/models/module/transformer_decoder.py b/examples/asr_fireredasr_text/model/fireredasr/models/module/transformer_decoder.py new file mode 100644 index 00000000..2088b083 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/models/module/transformer_decoder.py @@ -0,0 +1,299 @@ +from typing import List, Optional, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class TransformerDecoder(nn.Module): + def __init__( + self, sos_id, eos_id, pad_id, odim, + n_layers, n_head, d_model, + residual_dropout=0.1, pe_maxlen=5000): + super().__init__() + self.INF = 1e10 + # parameters + self.pad_id = pad_id + self.sos_id = sos_id + self.eos_id = eos_id + self.n_layers = n_layers + + # Components + self.tgt_word_emb = nn.Embedding(odim, d_model, padding_idx=self.pad_id) + self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen) + self.dropout = nn.Dropout(residual_dropout) + + self.layer_stack = nn.ModuleList() + for l in range(n_layers): + block = DecoderLayer(d_model, n_head, residual_dropout) + self.layer_stack.append(block) + + self.tgt_word_prj = nn.Linear(d_model, odim, bias=False) + self.layer_norm_out = nn.LayerNorm(d_model) + + self.tgt_word_prj.weight = self.tgt_word_emb.weight + self.scale = (d_model ** 0.5) + + def batch_beam_search(self, encoder_outputs, src_masks, + beam_size=1, nbest=1, decode_max_len=0, + softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0): + B = beam_size + N, Ti, H = encoder_outputs.size() + device = encoder_outputs.device + maxlen = decode_max_len if decode_max_len > 0 else Ti + assert eos_penalty > 0.0 and eos_penalty <= 1.0 + + # Init + encoder_outputs = encoder_outputs.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, Ti, H) + src_mask = src_masks.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, -1, Ti) + ys = torch.ones(N*B, 1).fill_(self.sos_id).long().to(device) + caches: List[Optional[Tensor]] = [] + for _ in range(self.n_layers): + caches.append(None) + scores = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(device) + scores = scores.repeat(N).view(N*B, 1) + is_finished = 
torch.zeros_like(scores) + + # Autoregressive Prediction + for t in range(maxlen): + tgt_mask = self.ignored_target_position_is_0(ys, self.pad_id) + + dec_output = self.dropout( + self.tgt_word_emb(ys) * self.scale + + self.positional_encoding(ys)) + + i = 0 + for dec_layer in self.layer_stack: + dec_output = dec_layer.forward( + dec_output, encoder_outputs, + tgt_mask, src_mask, + cache=caches[i]) + caches[i] = dec_output + i += 1 + + dec_output = self.layer_norm_out(dec_output) + + t_logit = self.tgt_word_prj(dec_output[:, -1]) + t_scores = F.log_softmax(t_logit / softmax_smoothing, dim=-1) + + if eos_penalty != 1.0: + t_scores[:, self.eos_id] *= eos_penalty + + t_topB_scores, t_topB_ys = torch.topk(t_scores, k=B, dim=1) + t_topB_scores = self.set_finished_beam_score_to_zero(t_topB_scores, is_finished) + t_topB_ys = self.set_finished_beam_y_to_eos(t_topB_ys, is_finished) + + # Accumulated + scores = scores + t_topB_scores + + # Pruning + scores = scores.view(N, B*B) + scores, topB_score_ids = torch.topk(scores, k=B, dim=1) + scores = scores.view(-1, 1) + + topB_row_number_in_each_B_rows_of_ys = torch.div(topB_score_ids, B).view(N*B) + stride = B * torch.arange(N).view(N, 1).repeat(1, B).view(N*B).to(device) + topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long() + + # Update ys + ys = ys[topB_row_number_in_ys] + t_ys = torch.gather(t_topB_ys.view(N, B*B), dim=1, index=topB_score_ids).view(N*B, 1) + ys = torch.cat((ys, t_ys), dim=1) + + # Update caches + new_caches: List[Optional[Tensor]] = [] + for cache in caches: + if cache is not None: + new_caches.append(cache[topB_row_number_in_ys]) + caches = new_caches + + # Update finished state + is_finished = t_ys.eq(self.eos_id) + if is_finished.sum().item() == N*B: + break + + # Length penalty (follow GNMT) + scores = scores.view(N, B) + ys = ys.view(N, B, -1) + ys_lengths = self.get_ys_lengths(ys) + if length_penalty > 0.0: + penalty = torch.pow((5+ys_lengths.float())/(5.0+1), length_penalty) + scores /= penalty + nbest_scores, nbest_ids = torch.topk(scores, k=int(nbest), dim=1) + nbest_scores = -1.0 * nbest_scores + index = nbest_ids + B * torch.arange(N).view(N, 1).to(device).long() + nbest_ys = ys.view(N*B, -1)[index.view(-1)] + nbest_ys = nbest_ys.view(N, nbest_ids.size(1), -1) + nbest_ys_lengths = ys_lengths.view(N*B)[index.view(-1)].view(N, -1) + + # result + nbest_hyps: List[List[Dict[str, Tensor]]] = [] + for n in range(N): + n_nbest_hyps: List[Dict[str, Tensor]] = [] + for i, score in enumerate(nbest_scores[n]): + new_hyp = { + "yseq": nbest_ys[n, i, 1:nbest_ys_lengths[n, i]] + } + n_nbest_hyps.append(new_hyp) + nbest_hyps.append(n_nbest_hyps) + return nbest_hyps + + def ignored_target_position_is_0(self, padded_targets, ignore_id): + mask = torch.ne(padded_targets, ignore_id) + mask = mask.unsqueeze(dim=1) + T = padded_targets.size(-1) + upper_tri_0_mask = self.upper_triangular_is_0(T).unsqueeze(0).to(mask.dtype) + upper_tri_0_mask = upper_tri_0_mask.to(mask.dtype).to(mask.device) + return mask.to(torch.uint8) & upper_tri_0_mask.to(torch.uint8) + + def upper_triangular_is_0(self, size): + ones = torch.ones(size, size) + tri_left_ones = torch.tril(ones) + return tri_left_ones.to(torch.uint8) + + def set_finished_beam_score_to_zero(self, scores, is_finished): + NB, B = scores.size() + is_finished = is_finished.float() + mask_score = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(scores.device) + mask_score = mask_score.view(1, B).repeat(NB, 1) + return scores * (1 - is_finished) + mask_score * 
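The length penalty above follows GNMT: accumulated log-probabilities are divided by ((5 + |Y|) / 6) ** length_penalty, which offsets the bias toward short hypotheses when length_penalty > 0. A quick numeric check:

ys_length, length_penalty = 10, 1.0
penalty = ((5 + ys_length) / (5.0 + 1)) ** length_penalty
assert abs(penalty - 2.5) < 1e-9   # a 10-token hypothesis has its score divided by 2.5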
is_finished + + def set_finished_beam_y_to_eos(self, ys, is_finished): + is_finished = is_finished.long() + return ys * (1 - is_finished) + self.eos_id * is_finished + + def get_ys_lengths(self, ys): + N, B, Tmax = ys.size() + ys_lengths = torch.sum(torch.ne(ys, self.eos_id), dim=-1) + return ys_lengths.int() + + + +class DecoderLayer(nn.Module): + def __init__(self, d_model, n_head, dropout): + super().__init__() + self.self_attn_norm = nn.LayerNorm(d_model) + self.self_attn = DecoderMultiHeadAttention(d_model, n_head, dropout) + + self.cross_attn_norm = nn.LayerNorm(d_model) + self.cross_attn = DecoderMultiHeadAttention(d_model, n_head, dropout) + + self.mlp_norm = nn.LayerNorm(d_model) + self.mlp = PositionwiseFeedForward(d_model, d_model*4, dropout) + + def forward(self, dec_input, enc_output, self_attn_mask, cross_attn_mask, + cache=None): + x = dec_input + residual = x + x = self.self_attn_norm(x) + if cache is not None: + xq = x[:, -1:, :] + residual = residual[:, -1:, :] + self_attn_mask = self_attn_mask[:, -1:, :] + else: + xq = x + x = self.self_attn(xq, x, x, mask=self_attn_mask) + x = residual + x + + residual = x + x = self.cross_attn_norm(x) + x = self.cross_attn(x, enc_output, enc_output, mask=cross_attn_mask) + x = residual + x + + residual = x + x = self.mlp_norm(x) + x = residual + self.mlp(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + return x + + +class DecoderMultiHeadAttention(nn.Module): + def __init__(self, d_model, n_head, dropout=0.1): + super().__init__() + self.d_model = d_model + self.n_head = n_head + self.d_k = d_model // n_head + + self.w_qs = nn.Linear(d_model, n_head * self.d_k) + self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False) + self.w_vs = nn.Linear(d_model, n_head * self.d_k) + + self.attention = DecoderScaledDotProductAttention( + temperature=self.d_k ** 0.5) + self.fc = nn.Linear(n_head * self.d_k, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, q, k, v, mask=None): + bs = q.size(0) + + q = self.w_qs(q).view(bs, -1, self.n_head, self.d_k) + k = self.w_ks(k).view(bs, -1, self.n_head, self.d_k) + v = self.w_vs(v).view(bs, -1, self.n_head, self.d_k) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + if mask is not None: + mask = mask.unsqueeze(1) + + output = self.attention(q, k, v, mask=mask) + + output = output.transpose(1, 2).contiguous().view(bs, -1, self.d_model) + output = self.fc(output) + output = self.dropout(output) + + return output + + +class DecoderScaledDotProductAttention(nn.Module): + def __init__(self, temperature): + super().__init__() + self.temperature = temperature + self.INF = float("inf") + + def forward(self, q, k, v, mask=None): + attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature + if mask is not None: + mask = mask.eq(0) + attn = attn.masked_fill(mask, -self.INF) + attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0) + else: + attn = torch.softmax(attn, dim=-1) + output = torch.matmul(attn, v) + return output + + +class PositionwiseFeedForward(nn.Module): + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.act = nn.GELU() + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + output = self.w_2(self.act(self.w_1(x))) + output = self.dropout(output) + return output + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=5000): + super().__init__() + assert d_model % 2 == 0 + pe = torch.zeros(max_len, 
d_model, requires_grad=False) + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp(torch.arange(0, d_model, 2).float() * + -(torch.log(torch.tensor(10000.0)).item()/d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + length = x.size(1) + return self.pe[:, :length].clone().detach() diff --git a/examples/asr_fireredasr_text/model/fireredasr/speech2text.py b/examples/asr_fireredasr_text/model/fireredasr/speech2text.py new file mode 100755 index 00000000..21f645b4 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/speech2text.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +import argparse +import glob +import os +import sys + +from fireredasr.models.fireredasr import FireRedAsr + + +parser = argparse.ArgumentParser() +parser.add_argument('--asr_type', type=str, required=True, choices=["aed", "llm"]) +parser.add_argument('--model_dir', type=str, required=True) + +# Input / Output +parser.add_argument("--wav_path", type=str) +parser.add_argument("--wav_paths", type=str, nargs="*") +parser.add_argument("--wav_dir", type=str) +parser.add_argument("--wav_scp", type=str) +parser.add_argument("--output", type=str) + +# Decode Options +parser.add_argument('--use_gpu', type=int, default=1) +parser.add_argument("--batch_size", type=int, default=1) +parser.add_argument("--beam_size", type=int, default=1) +parser.add_argument("--decode_max_len", type=int, default=0) +# FireRedASR-AED +parser.add_argument("--nbest", type=int, default=1) +parser.add_argument("--softmax_smoothing", type=float, default=1.0) +parser.add_argument("--aed_length_penalty", type=float, default=0.0) +parser.add_argument("--eos_penalty", type=float, default=1.0) +# FireRedASR-LLM +parser.add_argument("--decode_min_len", type=int, default=0) +parser.add_argument("--repetition_penalty", type=float, default=1.0) +parser.add_argument("--llm_length_penalty", type=float, default=0.0) +parser.add_argument("--temperature", type=float, default=1.0) + + +def main(args): + wavs = get_wav_info(args) + fout = open(args.output, "w") if args.output else None + + model = FireRedAsr.from_pretrained(args.asr_type, args.model_dir) + + batch_uttid = [] + batch_wav_path = [] + for i, wav in enumerate(wavs): + uttid, wav_path = wav + batch_uttid.append(uttid) + batch_wav_path.append(wav_path) + if len(batch_wav_path) < args.batch_size and i != len(wavs) - 1: + continue + + results = model.transcribe( + batch_uttid, + batch_wav_path, + { + "use_gpu": args.use_gpu, + "beam_size": args.beam_size, + "nbest": args.nbest, + "decode_max_len": args.decode_max_len, + "softmax_smoothing": args.softmax_smoothing, + "aed_length_penalty": args.aed_length_penalty, + "eos_penalty": args.eos_penalty, + "decode_min_len": args.decode_min_len, + "repetition_penalty": args.repetition_penalty, + "llm_length_penalty": args.llm_length_penalty, + "temperature": args.temperature + } + ) + + for result in results: + print(result) + if fout is not None: + fout.write(f"{result['uttid']}\t{result['text']}\n") + + batch_uttid = [] + batch_wav_path = [] + + +def get_wav_info(args): + """ + Returns: + wavs: list of (uttid, wav_path) + """ + base = lambda p: os.path.basename(p).replace(".wav", "") + if args.wav_path: + wavs = [(base(args.wav_path), args.wav_path)] + elif args.wav_paths and len(args.wav_paths) >= 1: + wavs = [(base(p), p) for p in sorted(args.wav_paths)] + elif args.wav_scp: + wavs = 
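The main loop of speech2text.py above accumulates (uttid, wav_path) pairs until batch_size is reached, or until the last file, before calling model.transcribe; it is functionally equivalent to plain chunking:

def chunks(items, batch_size):
    # Yield consecutive slices of at most batch_size items.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]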
[line.strip().split() for line in open(args.wav_scp)] + elif args.wav_dir: + wavs = glob.glob(f"{args.wav_dir}/**/*.wav", recursive=True) + wavs = [(base(p), p) for p in sorted(wavs)] + else: + raise ValueError("Please provide valid wav info") + print(f"#wavs={len(wavs)}") + return wavs + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + main(args) diff --git a/examples/asr_fireredasr_text/model/fireredasr/tokenizer/aed_tokenizer.py b/examples/asr_fireredasr_text/model/fireredasr/tokenizer/aed_tokenizer.py new file mode 100644 index 00000000..00285b54 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/tokenizer/aed_tokenizer.py @@ -0,0 +1,67 @@ +import logging +import re + +import sentencepiece as spm + +from fireredasr.data.token_dict import TokenDict + + +class ChineseCharEnglishSpmTokenizer: + """ + - One Chinese char is a token. + - Split English word into SPM and one piece is a token. + - Ignore ' ' between Chinese char + - Replace ' ' between English word with "▁" by spm_model + - Need to put SPM piece into dict file + - If not set spm_model, will use English char and + """ + SPM_SPACE = "▁" + + def __init__(self, dict_path, spm_model, unk="", space=""): + self.dict = TokenDict(dict_path, unk=unk) + self.space = space + if spm_model: + self.sp = spm.SentencePieceProcessor() + self.sp.Load(spm_model) + else: + self.sp = None + print("[WRAN] Not set spm_model, will use English char") + print("[WARN] Please check how to deal with ' '(space)") + if self.space not in self.dict: + print("Please add to your dict, or it will be ") + + def tokenize(self, text, replace_punc=True): + #if text == "": + # logging.info(f"empty text") + text = text.upper() + tokens = [] + if replace_punc: + text = re.sub("[,。?!,\.?!]", " ", text) + pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])') + parts = pattern.split(text.strip()) + parts = [p for p in parts if len(p.strip()) > 0] + for part in parts: + if pattern.fullmatch(part) is not None: + tokens.append(part) + else: + if self.sp: + for piece in self.sp.EncodeAsPieces(part.strip()): + tokens.append(piece) + else: + for char in part.strip(): + tokens.append(char if char != " " else self.space) + tokens_id = [] + for token in tokens: + tokens_id.append(self.dict.get(token, self.dict.unk)) + return tokens, tokens_id + + def detokenize(self, inputs, join_symbol="", replace_spm_space=True): + """inputs is ids or tokens, do not need self.sp""" + if len(inputs) > 0 and type(inputs[0]) == int: + tokens = [self.dict[id] for id in inputs] + else: + tokens = inputs + s = f"{join_symbol}".join(tokens) + if replace_spm_space: + s = s.replace(self.SPM_SPACE, ' ').strip() + return s diff --git a/examples/asr_fireredasr_text/model/fireredasr/tokenizer/llm_tokenizer.py b/examples/asr_fireredasr_text/model/fireredasr/tokenizer/llm_tokenizer.py new file mode 100644 index 00000000..2b498f2a --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/tokenizer/llm_tokenizer.py @@ -0,0 +1,115 @@ +import re + +import torch +from transformers import AutoTokenizer +from transformers.trainer_pt_utils import LabelSmoother + +DEFAULT_SPEECH_TOKEN = "" +IGNORE_TOKEN_ID = LabelSmoother.ignore_index + + +class LlmTokenizerWrapper: + @classmethod + def build_llm_tokenizer(cls, llm_path, use_flash_attn=False): + tokenizer = AutoTokenizer.from_pretrained(llm_path) + if use_flash_attn: + tokenizer.padding_side = "left" + else: + tokenizer.padding_side = "right" + # tokenizer.padding_side = "left" + special_tokens_dict = 
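LlmTokenizerWrapper.clean_text below strips punctuation, collapses whitespace, drops the spaces between Chinese characters while keeping spaces inside English runs, and lowercases. Expected behavior on a mixed input (our illustration, not a test case from the repo):

# LlmTokenizerWrapper.clean_text("你好,世界。 HELLO WORLD")
# -> "你好世界 hello world"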
{"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]} + tokenizer.add_special_tokens(special_tokens_dict) + return tokenizer + + @classmethod + def clean_text(cls, origin_text): + """remove punc, remove space between Chinese and keep space between English""" + # remove punc + text = re.sub("[,。?!,\.!?《》()\·“”、\\/]", "", origin_text) + # merge space + text = re.sub("\s+", " ", text) + + # remove space between Chinese and keep space between English + pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])') # Chinese + parts = pattern.split(text.strip()) + parts = [p for p in parts if len(p.strip()) > 0] + text = "".join(parts) + text = text.strip() + + text = text.lower() + return text + + @classmethod + def preprocess_texts(cls, origin_texts, tokenizer, max_len, decode=False): + messages = [] + clean_texts = [] + for i, origin_text in enumerate(origin_texts): + text = cls.clean_text(origin_text) + clean_texts.append(text) + text = text if not decode else "" + # message = [ + # text, + # {"role": "user", "content": f"{text}请转写音频为文字"}, + # {"role": "assistant", "content": text}, + # ] + # messages.append(message) + + # texts = [] + # if not decode: + # TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + # else: + # TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + # for i, msg in enumerate(messages): + # texts.append( + # # tokenizer.apply_chat_template( + # # msg, + # # tokenize=True, + # # chat_template=TEMPLATE, + # # add_generation_prompt=False, + # # padding="longest", + # # max_length=max_len, + # # truncation=True, + # # ) + # tokenizer( + # msg, + # padding="longest", + # max_length=max_len, + # truncation=True, + # ) + # ) + result = tokenizer(clean_texts,padding=True,truncation=True,return_tensors='pt') + # Padding texts + # max_len_texts = max([len(text) for text in texts]) + # if tokenizer.padding_side == "right": + # texts = [ + # text + [tokenizer.pad_token_id] * (max_len_texts - len(text)) + # for text in texts + # ] + # else: + # texts = [ + # [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text + # for text in texts + # ] + # input_ids = torch.tensor(texts, dtype=torch.int) + input_ids = result["input_ids"] + attention_mask = result["attention_mask"] + target_ids = input_ids.clone() + # target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID + + # first get the indices of the tokens + # mask_prompt = True + # if mask_prompt: + # mask_indices = torch.where( + # input_ids == tokenizer.convert_tokens_to_ids("assistant") + # ) + # for i in range(mask_indices[0].size(0)): + # row = mask_indices[0][i] + # col = mask_indices[1][i] + # target_ids[row, : col + 2] = IGNORE_TOKEN_ID + + # attention_mask = input_ids.ne(tokenizer.pad_token_id) + + # target_ids = target_ids.type(torch.LongTensor) + # input_ids = input_ids.type(torch.LongTensor) + + return input_ids, attention_mask, target_ids, clean_texts diff --git a/examples/asr_fireredasr_text/model/fireredasr/utils/param.py b/examples/asr_fireredasr_text/model/fireredasr/utils/param.py new file mode 100644 index 00000000..8dfa28d2 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/utils/param.py @@ -0,0 +1,13 @@ +import logging + +import torch + + +def count_model_parameters(model): + if not isinstance(model, torch.nn.Module): 
+ return 0, 0 + name = f"{model.__class__.__name__} {model.__class__}" + num = sum(p.numel() for p in model.parameters() if p.requires_grad) + size = num * 4.0 / 1024.0 / 1024.0 # float32, MB + logging.info(f"#param of {name} is {num} = {size:.1f} MB (float32)") + return num, size diff --git a/examples/asr_fireredasr_text/model/fireredasr/utils/wer.py b/examples/asr_fireredasr_text/model/fireredasr/utils/wer.py new file mode 100755 index 00000000..30107ca8 --- /dev/null +++ b/examples/asr_fireredasr_text/model/fireredasr/utils/wer.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 + +import argparse +import re +from collections import OrderedDict + + +parser = argparse.ArgumentParser() +parser.add_argument("--ref", type=str, required=True) +parser.add_argument("--hyp", type=str, required=True) +parser.add_argument("--print_sentence_wer", type=int, default=0) +parser.add_argument("--do_tn", type=int, default=0, help="simple tn by cn2an") +parser.add_argument("--rm_special", type=int, default=0, help="remove <\|.*?\|>") + + +def main(args): + uttid2refs = read_uttid2tokens(args.ref, args.do_tn, args.rm_special) + uttid2hyps = read_uttid2tokens(args.hyp, args.do_tn, args.rm_special) + uttid2wer_info, wer_stat, en_dig_stat = compute_uttid2wer_info( + uttid2refs, uttid2hyps, args.print_sentence_wer) + wer_stat.print() + en_dig_stat.print() + + +def read_uttid2tokens(filename, do_tn=False, rm_special=False): + print(f">>> Read uttid to tokens: {filename}", flush=True) + uttid2tokens = OrderedDict() + uttid2text = read_uttid2text(filename, do_tn, rm_special) + for uttid, text in uttid2text.items(): + tokens = text2tokens(text) + uttid2tokens[uttid] = tokens + return uttid2tokens + + +def read_uttid2text(filename, do_tn=False, rm_special=False): + uttid2text = OrderedDict() + with open(filename, "r", encoding="utf8") as fin: + for i, line in enumerate(fin): + cols = line.split() + if len(cols) == 0: + print("[WARN] empty line, continue", i, flush=True) + continue + assert cols[0] not in uttid2text, f"repeated uttid: {line}" + if len(cols) == 1: + uttid2text[cols[0]] = "" + continue + txt = " ".join(cols[1:]) + if rm_special: + txt = " ".join([t for t in re.split("<\|.*?\|>", txt) if t.strip() != ""]) + if do_tn: + import cn2an + txt = cn2an.transform(txt, "an2cn") + uttid2text[cols[0]] = txt + return uttid2text + + +def text2tokens(text): + PUNCTUATIONS = ",。?!,\.?!"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·。\":" + "()\[\]{}/;`|=+" + if text == "": + return [] + tokens = [] + + text = re.sub("", "", text) + text = re.sub(r"[%s]+" % PUNCTUATIONS, " ", text) + + pattern = re.compile(r'([\u4e00-\u9fff])') + parts = pattern.split(text.strip().upper()) + parts = [p for p in parts if len(p.strip()) > 0] + for part in parts: + if pattern.fullmatch(part) is not None: + tokens.append(part) + else: + for word in part.strip().split(): + tokens.append(word) + return tokens + + +def compute_uttid2wer_info(refs, hyps, print_sentence_wer=False): + print(f">>> Compute uttid to wer info", flush=True) + + uttid2wer_info = OrderedDict() + wer_stat = WerStats() + en_dig_stat = EnDigStats() + + for uttid, ref in refs.items(): + if uttid not in hyps: + print(f"[WARN] No hyp for {uttid}", flush=True) + continue + hyp = hyps[uttid] + + if len(hyp) - len(ref) >= 8: + print(f"[BidLengthDiff]: {uttid} {len(ref)} {len(hyp)}#{' '.join(ref)}#{' '.join(hyp)}") + #continue + + wer_info = compute_one_wer_info(ref, hyp) + uttid2wer_info[uttid] = wer_info + ns = count_english_ditgit(ref, hyp, wer_info) + 
wer_stat.add(wer_info) + en_dig_stat.add(*ns) + if print_sentence_wer: + print(f"{uttid} {wer_info}") + + return uttid2wer_info, wer_stat, en_dig_stat + + +COST_SUB = 3 +COST_DEL = 3 +COST_INS = 3 + +ALIGN_CRT = 0 +ALIGN_SUB = 1 +ALIGN_DEL = 2 +ALIGN_INS = 3 +ALIGN_END = 4 + + +def compute_one_wer_info(ref, hyp): + """Impl minimum edit distance and backtrace. + Args: + ref, hyp: List[str] + Returns: + WerInfo + """ + ref_len = len(ref) + hyp_len = len(hyp) + + class _DpPoint: + def __init__(self, cost, align): + self.cost = cost + self.align = align + + dp = [] + for i in range(0, ref_len + 1): + dp.append([]) + for j in range(0, hyp_len + 1): + dp[-1].append(_DpPoint(i * j, ALIGN_CRT)) + + # Initialize + for i in range(1, hyp_len + 1): + dp[0][i].cost = dp[0][i - 1].cost + COST_INS; + dp[0][i].align = ALIGN_INS + for i in range(1, ref_len + 1): + dp[i][0].cost = dp[i - 1][0].cost + COST_DEL + dp[i][0].align = ALIGN_DEL + + # DP + for i in range(1, ref_len + 1): + for j in range(1, hyp_len + 1): + min_cost = 0 + min_align = ALIGN_CRT + if hyp[j - 1] == ref[i - 1]: + min_cost = dp[i - 1][j - 1].cost + min_align = ALIGN_CRT + else: + min_cost = dp[i - 1][j - 1].cost + COST_SUB + min_align = ALIGN_SUB + + del_cost = dp[i - 1][j].cost + COST_DEL + if del_cost < min_cost: + min_cost = del_cost + min_align = ALIGN_DEL + + ins_cost = dp[i][j - 1].cost + COST_INS + if ins_cost < min_cost: + min_cost = ins_cost + min_align = ALIGN_INS + + dp[i][j].cost = min_cost + dp[i][j].align = min_align + + # Backtrace + crt = sub = ins = det = 0 + i = ref_len + j = hyp_len + align = [] + while i > 0 or j > 0: + if dp[i][j].align == ALIGN_CRT: + align.append((i, j, ALIGN_CRT)) + i -= 1 + j -= 1 + crt += 1 + elif dp[i][j].align == ALIGN_SUB: + align.append((i, j, ALIGN_SUB)) + i -= 1 + j -= 1 + sub += 1 + elif dp[i][j].align == ALIGN_DEL: + align.append((i, j, ALIGN_DEL)) + i -= 1 + det += 1 + elif dp[i][j].align == ALIGN_INS: + align.append((i, j, ALIGN_INS)) + j -= 1 + ins += 1 + + err = sub + det + ins + align.reverse() + wer_info = WerInfo(ref_len, err, crt, sub, det, ins, align) + return wer_info + + + +class WerInfo: + def __init__(self, ref, err, crt, sub, dele, ins, ali): + self.r = ref + self.e = err + self.c = crt + self.s = sub + self.d = dele + self.i = ins + self.ali = ali + r = max(self.r, 1) + self.wer = 100.0 * (self.s + self.d + self.i) / r + + def __repr__(self): + s = f"wer {self.wer:.2f} ref {self.r:2d} sub {self.s:2d} del {self.d:2d} ins {self.i:2d}" + return s + + +class WerStats: + def __init__(self): + self.infos = [] + + def add(self, wer_info): + self.infos.append(wer_info) + + def print(self): + r = sum(info.r for info in self.infos) + if r <= 0: + print(f"REF len is {r}, check") + r = 1 + s = sum(info.s for info in self.infos) + d = sum(info.d for info in self.infos) + i = sum(info.i for info in self.infos) + se = 100.0 * s / r + de = 100.0 * d / r + ie = 100.0 * i / r + wer = 100.0 * (s + d + i) / r + sen = max(len(self.infos), 1) + errsen = sum(info.e > 0 for info in self.infos) + ser = 100.0 * errsen / sen + print("-"*80) + print(f"ref{r:6d} sub{s:6d} del{d:6d} ins{i:6d}") + print(f"WER{wer:6.2f} sub{se:6.2f} del{de:6.2f} ins{ie:6.2f}") + print(f"SER{ser:6.2f} = {errsen} / {sen}") + print("-"*80) + + +class EnDigStats: + def __init__(self): + self.n_en_word = 0 + self.n_en_correct = 0 + self.n_dig_word = 0 + self.n_dig_correct = 0 + + def add(self, n_en_word, n_en_correct, n_dig_word, n_dig_correct): + self.n_en_word += n_en_word + self.n_en_correct += n_en_correct + 
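compute_one_wer_info above is a standard Levenshtein alignment with backtrace (equal substitution, deletion and insertion costs), and WerInfo reports WER as 100 * (S + D + I) / ref_len. A small worked case:

ref = ["我", "们", "好"]
hyp = ["我", "门", "好", "呀"]
# Best alignment: 我->我 correct, 们->门 substitution, 好->好 correct, 呀 insertion
# S=1, D=0, I=1, so WER = 100 * 2 / 3 = 66.67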
self.n_dig_word += n_dig_word + self.n_dig_correct += n_dig_correct + + def print(self): + print(f"English #word={self.n_en_word}, #correct={self.n_en_correct}\n" + f"Digit #word={self.n_dig_word}, #correct={self.n_dig_correct}") + print("-"*80) + + + +def count_english_ditgit(ref, hyp, wer_info): + patt_en = "[a-zA-Z\.\-\']+" + patt_dig = "[0-9]+" + patt_cjk = re.compile(r'([\u4e00-\u9fff])') + n_en_word = 0 + n_en_correct = 0 + n_dig_word = 0 + n_dig_correct = 0 + ali = wer_info.ali + for i, token in enumerate(ref): + if re.match(patt_en, token): + n_en_word += 1 + for y in ali: + if y[0] == i+1 and y[2] == ALIGN_CRT: + j = y[1] - 1 + n_en_correct += 1 + break + if re.match(patt_dig, token): + n_dig_word += 1 + for y in ali: + if y[0] == i+1 and y[2] == ALIGN_CRT: + j = y[1] - 1 + n_dig_correct += 1 + break + if not re.match(patt_cjk, token) and not re.match(patt_en, token) \ + and not re.match(patt_dig, token): + print("[WiredChar]:", token) + return n_en_word, n_en_correct, n_dig_word, n_dig_correct + + + +if __name__ == "__main__": + args = parser.parse_args() + print(args, flush=True) + main(args) diff --git a/examples/asr_fireredasr_text/model/slam_fireredasr.py b/examples/asr_fireredasr_text/model/slam_fireredasr.py new file mode 100644 index 00000000..97143688 --- /dev/null +++ b/examples/asr_fireredasr_text/model/slam_fireredasr.py @@ -0,0 +1,258 @@ +import os +import time +import json +import torch +import torch_npu +import sys +import copy +sys.path.append('/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr_text/model') +from fireredasr.models.fireredasr import FireRedAsr +from torch.utils.data import Dataset, DataLoader, IterableDataset +from fireredasr.data.asr_feat import ASRFeatExtractor +from fireredasr.models.fireredasr_aed import FireRedAsrAed +from fireredasr.models.fireredasr_llm import FireRedAsrLlm +from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer +from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper +import torch.distributed as dist +import kaldiio + +def model_factory(train_config, model_config, **kwargs): + model_dir = model_config.firered_path + model_path = os.path.join(model_dir, "model.pth.tar") + encoder_path = os.path.join(model_dir, "asr_encoder.pth.tar") + llm_dir = os.path.join(model_dir, "Qwen2-7B-Instruct") + model, tokenizer = load_firered_llm_model_and_tokenizer( + model_path, encoder_path, llm_dir, train_config) + ckpt_path = kwargs.get("ckpt_path", None) + if ckpt_path is not None: + print("loading other parts from: {}".format(ckpt_path)) + ckpt_dict = torch.load(ckpt_path, map_location="cpu") + model.load_state_dict(ckpt_dict, strict=False) + return model, tokenizer + +def load_firered_llm_model_and_tokenizer(model_path, encoder_path, llm_dir, train_config): + # model_path = "/aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/asr_fireredasr/exp/aishell-1/20250311/conformer_linear_Qwen2-7B-Instruct_encodertrue_loratrue_padtrue_normal_asr_speedfalse_specaugfalse-1058/fireredasrllm_epoch_1_step_100/model.pth.tar" + package = torch.load(model_path, map_location=lambda storage, loc: storage) + # print(type(package["args"])) + # input() + package["args"].use_lora = True + # package["args"].freeze_llm = True + + package["args"].encoder_path = encoder_path + package["args"].llm_dir = llm_dir + if train_config.freeze_encoder: + package["args"].freeze_encoder = 1 + model = FireRedAsrLlm.from_args(package["args"]) + # 
model.load_state_dict(package["model_state_dict"], strict=False) + tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(llm_dir) + return model, tokenizer + + +class FireRedDataset(Dataset): + def __init__(self, dataset_config, tokenizer=None, split='train'): + super().__init__() + cmvn_path = dataset_config.cmvn_file + self.feature_extractor = ASRFeatExtractor(cmvn_path) + self.tokenizer = tokenizer + self.split = split + self.inference_mode = dataset_config.inference_mode + self.data_list = {} + self.multitask_task_list = [] + if split == "train": + data_path = dataset_config.train_scp_file_path + elif split == "val": + data_path = dataset_config.dev_scp_file_path + elif split == "test": + data_path = dataset_config.test_scp_file_path + else: + raise ValueError("Invalid split") + data_scp_file_path = os.path.join(data_path,"my_wav.scp") + with open(data_scp_file_path) as f: + for line in f: + key, path = line.strip().split(" ") + self.data_list[key] = path + multitask_task_path = os.path.join(data_path,"multitask.jsonl") + with open(multitask_task_path) as f: + for line in f: + item = json.loads(line.strip()) + if item["key"] in self.data_list: + self.multitask_task_list.append(item) + else: + print(item) + + def __len__(self): + return len(self.multitask_task_list) + + def __getitem__(self, index): + + # Deal with every wav one by one + item = self.multitask_task_list[index] + key = [item["key"]] + target = [item["target"]] + + return { + "key": key, + "target": target, + } + + def collator(self, samples): + assert samples is not None + + # Extract each field from the samples + keys = [sample["key"][0] for sample in samples] + targets = [sample["target"][0] for sample in samples] + + # Get padded feats + batch_wav_path = [] + for key in keys: + ark_path = self.data_list[key] + batch_wav_path.append(ark_path) + feats, lengths, durs = self.feature_extractor(batch_wav_path) + + # Get input_ids and target_ids + # inference + if self.inference_mode: + input_ids, attention_mask, target_ids, _ = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=[""]*len(keys), tokenizer=self.tokenizer, + max_len=128, decode=True) + # training + else: + input_ids, attention_mask, target_ids, clean_texts = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=targets, tokenizer=self.tokenizer, + max_len=128, decode=False) + + return { + "keys": keys, + "targets": targets, + "feats": feats, + "lengths": lengths, + "input_ids": input_ids, + "attention_mask": attention_mask, + "target_ids": target_ids, + } + +class FireRedDatasetLarge(IterableDataset): + def __init__(self, dataset_config, tokenizer=None, split='train'): + super().__init__() + cmvn_path = dataset_config.cmvn_file + self.feature_extractor = ASRFeatExtractor(cmvn_path) + self.tokenizer = tokenizer + self.split = split + self.inference_mode = dataset_config.inference_mode + + # 根据split选择对应的数据路径 + if split == "train": + data_path = dataset_config.train_scp_file_path + elif split == "val": + data_path = dataset_config.dev_scp_file_path + elif split == "test": + data_path = dataset_config.test_scp_file_path + else: + raise ValueError("Invalid split") + + # 加载多任务数据 + self.multitask_task_path = os.path.join(data_path, "multitask.jsonl") + + def get_audio_duration(self, wav_path): + """计算 WAV 音频的时长(单位:秒)""" + sample_rate, wav_np = kaldiio.load_mat(wav_path) + if sample_rate != 16000: + return None + dur = wav_np.shape[0] / sample_rate + return dur + + def __iter__(self): + multitask_task_path = self.multitask_task_path + worker_info = 
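The __iter__ here shards the multitask jsonl across DataLoader workers and DDP ranks: with num_workers workers per rank and world_size ranks, line i is handled by the process whose global worker index equals i modulo the total worker count, exactly as the arithmetic just below does. A standalone sketch of that assignment (the function name is ours):

def owns_line(i, rank, world_size, worker_id, num_workers):
    # Each rank contributes num_workers workers; global index = rank * num_workers + worker_id.
    total = num_workers * world_size
    return i % total == rank * num_workers + worker_id

assert owns_line(5, rank=1, world_size=2, worker_id=1, num_workers=4)   # global worker 5 of 8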
torch.utils.data.get_worker_info() + if worker_info is None: # 不在 DataLoader 的多进程环境中 + num_workers = 1 + worker_id = 0 + else: + num_workers = worker_info.num_workers + worker_id = worker_info.id + + # 获取分布式环境中的进程信息 + if dist.is_available() and dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + else: + world_size = 1 + rank = 0 + + # 计算每个 worker 和每个进程应该处理的数据范围 + total_num_workers = num_workers * world_size + worker_rank = rank * num_workers + worker_id + data_index = 0 + + with open(multitask_task_path) as f_task: + for line in f_task: + if (data_index % total_num_workers) == worker_rank : + item = json.loads(line.strip()) + + # ark_path = item.get("path", None) + # if ark_path is None: + # print(f"⚠️ 缺少 ark_path: {item}") + # continue + # # **🚀 计算音频时长** + # duration = self.get_audio_duration(ark_path) + # if duration is None or duration > 30.0: + # continue # 跳过时长超限的样本 + target = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant" + yield{ + "key": item["key"], + "target": item["target"], + # "ark_path": item["path"] + } + data_index += 1 + + def collator(self, samples): + assert samples is not None + + # 提取每个样本的字段 + keys = [sample["key"] for sample in samples] + targets = [sample["target"] for sample in samples] + # batch_wav_path = [sample["ark_path"] for sample in samples] + + # 获取特征 + # feats, lengths, durs = self.feature_extractor(batch_wav_path) + + # 获取 input_ids 和 target_ids + if self.inference_mode: + input_ids, attention_mask, target_ids, _ = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=[""] * len(keys), tokenizer=self.tokenizer, + max_len=128, decode=True) + else: + input_ids, attention_mask, target_ids, clean_texts = \ + LlmTokenizerWrapper.preprocess_texts( + origin_texts=targets, tokenizer=self.tokenizer, + max_len=128, decode=False) + + return { + "keys": keys, + "targets": targets, + # "feats": feats, + # "lengths": lengths, + "input_ids": input_ids, + "attention_mask": attention_mask, + "target_ids": target_ids, + } + # def collator(self, samples): + # assert samples is not None + # # Extract each field from the samples + # keys = [sample["key"] for sample in samples] + # targets = [sample["target"] for sample in samples] + # result = self.tokenizer(targets,return_tensors="pt",padding=True) + # if self.inference_mode: + # result["keys"] = keys + # result["targets"] = targets + # else: + # result["labels"] = copy.deepcopy(result["input_ids"]) + # # result["labels"][:,:len(result["labels"])] + # return result + +def get_speech_dataset(dataset_config, tokenizer, split): + dataset = FireRedDatasetLarge(dataset_config, tokenizer, split) + return dataset \ No newline at end of file diff --git a/examples/asr_fireredasr_text/readme b/examples/asr_fireredasr_text/readme new file mode 100644 index 00000000..858b58db --- /dev/null +++ b/examples/asr_fireredasr_text/readme @@ -0,0 +1,4 @@ +小红书推理与微调代码部分: + 主体使用slam--llm框架, 但是Model的load、推理以及forward均在model文件夹的fireredasr完成 + slam_fireredasr主要包含load和数据dataset的转写 + 注意精度设置方式与main分支不同, 尤其是bf16的时候, 请关注具体训练代码 \ No newline at end of file diff --git a/examples/asr_fireredasr_text/scripts/decode_fireredasr.sh b/examples/asr_fireredasr_text/scripts/decode_fireredasr.sh new file mode 100644 index 00000000..8f9cced6 --- /dev/null +++ b/examples/asr_fireredasr_text/scripts/decode_fireredasr.sh @@ -0,0 +1,137 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +set -e 
+run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr_text + +dataset=aishell-2 +prompt_style=instruct # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == "slidespeech" ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=false +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=2 +file=examples/asr_fireredasr_text/model/slam_fireredasr.py:model_factory +# Choose Encoder, 这个还是一点用也没有 +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 + +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 + +else + exit 1 +fi + +# Choose LLM, 这个一点用也没有 +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == "slidespeech" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi +# decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +decode_log=./decode_${dataset}_${dataset_task}_${prompt_style} +# -m debugpy --listen 5678 --wait-for-client +python $code_dir/inference_fireredasr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + 
++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=firered_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=8 \ + ++train_config.output_dir=$output_dir \ + ++train_config.inference_mode=true \ + ++decode_log=$decode_log \ + + +# python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +# python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +# python "/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/asr_fireredasr_text/scripts/decode_fireredasr_deepspeed.sh b/examples/asr_fireredasr_text/scripts/decode_fireredasr_deepspeed.sh new file mode 100644 index 00000000..a3d1c891 --- /dev/null +++ b/examples/asr_fireredasr_text/scripts/decode_fireredasr_deepspeed.sh @@ -0,0 +1,144 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr_text +dataset=alimeeting +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr_text +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr_text/exp/alimeeting/20250320/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_far_bf_speedfalse_specaugfalse-1513/mala_asr_epoch_2_step_4000 +prompt_style=normal # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting || $dataset == slidespeech ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # slidespeech: asr domain + dataset_task=asr_far_bf +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=false +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=2 +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr_text/conf/ds_config.json +prompt_style=normal #instruct +# Choose Encoder, 这个还是一点用也没有 +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + 
else + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 + +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 + +else + exit 1 +fi + +# Choose LLM, this has no effect at all here +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" || $dataset == "slidespeech" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +decode_log=./decode_${dataset}_${dataset_task}_${prompt_style} +# -m debugpy --listen 5678 --wait-for-client +deepspeed \ + --num_nodes 1 \ + --num_gpus 8 \ + $code_dir/inference_fireredasr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=firered_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=8 \ + ++train_config.output_dir=$output_dir \ + 
++train_config.inference_mode=true \ + ++decode_log=$decode_log \ + # ++ckpt_path=$ckpt_path/model.pt + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +python "/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/asr_fireredasr_text/scripts/finetune_fireredasr.sh b/examples/asr_fireredasr_text/scripts/finetune_fireredasr.sh new file mode 100644 index 00000000..28c50f1f --- /dev/null +++ b/examples/asr_fireredasr_text/scripts/finetune_fireredasr.sh @@ -0,0 +1,194 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr_text +# multitask +# dataset=alimeeting +# multitask_asr +dataset=alimeeting +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near asr_far_bf_sot asr_far_sot asr_far_gss + # aishell-4: asr_far_bf asr_far_sot asf + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr_far_bf_sot +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +firered_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + 
llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr_text/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.firered_path=$firered_path \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=500 \ +++train_config.batch_size_training=1 \ +++train_config.val_batch_size=1 \ +++train_config.num_workers_dataloader=8 \ 
+++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+="++ckpt_path=$ckpt_path" +# fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_fireredasr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29505 \ + $code_dir/finetune_fireredasr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed.sh b/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed.sh new file mode 100644 index 00000000..cf65b937 --- /dev/null +++ b/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +# export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr_text +# multitask +# dataset=alimeeting +# multitask_asr +dataset=slidespeech +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr_text/conf/ds_config.json +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/asr_fireredasr_text/exp/alimeeting/20250315/conformer_linear_Qwen2-7B-Instruct_loratrue_padtrue_normal_asr_far_bf_sot_speedfalse_specaugfalse-1410/mala_asr_epoch_1_step_12000 +# fi +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + 
encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr_text/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ 
+++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=4000 \ +++train_config.batch_size_training=20 \ +++train_config.val_batch_size=6 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+=" ++ckpt_path=$ckpt_path/model.pt " +# fi + +deepspeed \ + --num_nodes 1 \ + --num_gpus 2 \ + $code_dir/finetune_fireredasr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} diff --git a/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed_multinode.sh b/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed_multinode.sh new file mode 100644 index 00000000..d8b84dde --- /dev/null +++ b/examples/asr_fireredasr_text/scripts/finetune_fireredasr_deepspeed_multinode.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +# export ASCEND_VISIBLE_DEVICES=6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 +# export HCCL_CONNECT_TIMEOUT=3600 +# export HCCL_CONNECT_TIMEOUT=7200 +# export HCCL_EXEC_TIMEOUT=7200 +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/ +cd $run_dir +code_dir=examples/asr_fireredasr_text +# multitask +# dataset=alimeeting +# multitask_asr +dataset=slidespeech +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr_text/conf/ds_config.json +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=conformer +llm_name=Qwen2-7B-Instruct +use_peft=true +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=2 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false + +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/FireRedASR-LLM/ +# fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +elif [[ $encoder_name == "conformer" ]] +then + 
speech_encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar + encoder_dim=1280 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech || $dataset == slidespeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi +file=examples/asr_fireredasr_text/model/slam_fireredasr.py:model_factory +inference_mode=False +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++model_config.file=$file \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.wav_scp=$test_scp_file_path/my_wav.scp \ +++dataset_config.text=$test_scp_file_path/text \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.train_text_file=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++dataset_config.dev_text_file=$dev_scp_file_path \ 
+++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=100 \ +++train_config.total_steps=100000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=300 \ +++train_config.batch_size_training=10 \ +++train_config.val_batch_size=10 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++train_config.inference_mode=$inference_mode \ +++metric=acc \ +" +# if [[ $use_peft == "true" || $freeze_encoder == false ]];then +# hydra_args+="++ckpt_path=$ckpt_path" +# fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + + +HOST_FILE="/tmp/"${JobID} # full path of the generated hostfile; $JobID is filled in automatically by the scheduler +SSH_PORT=6666 # the scheduler starts containers as a non-root user, so the ssh port must be set to a value above 1024 + +gen_hostfile() { # generates the hostfile; already wired up to the scheduler, use as-is and do not modify + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE # main training entry function + /usr/sbin/sshd -p ${SSH_PORT} # start sshd in the background on rank 0, do not modify + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/finetune_fireredasr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +} +gen_hostfile +do_train \ No newline at end of file diff --git a/examples/asr_fireredasr_text/slam_llm b/examples/asr_fireredasr_text/slam_llm new file mode 120000 index 00000000..e2d801ef --- /dev/null +++ b/examples/asr_fireredasr_text/slam_llm @@ -0,0 +1 @@ +/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/src/slam_llm \ No newline at end of file diff --git a/examples/asr_librispeech/asr_config.py b/examples/asr_librispeech/asr_config.py index d6683157..c8917290 100644 --- a/examples/asr_librispeech/asr_config.py +++ b/examples/asr_librispeech/asr_config.py @@ -14,6 +14,7 @@ class ModelConfig: encoder_name: Optional[str] = None encoder_ds_rate: int = 2 encoder_path: Optional[str] = None + encoder_path_hf : Optional[str] = None encoder_dim: int = 1280 encoder_projector: str = "linear" encoder_projector_ds_rate: int = 5 @@ -112,7 +113,7 @@ class FSDPConfig: mixed_precision: bool = True use_fp16: bool = False # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD - sharding_strategy: ShardingStrategy = "NO_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + sharding_strategy: ShardingStrategy = "FULL_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. 
fsdp_activation_checkpointing: bool = True fsdp_cpu_offload: bool = False @@ -122,9 +123,9 @@ class FSDPConfig: @dataclass class LogConfig: use_wandb: bool = False - wandb_dir: str = "/root/test_wandb" + wandb_dir: str = "tmp/test_wandb" wandb_entity_name: str = "project_name" wandb_project_name: str = "project_name" wandb_exp_name: str = "exp_name" - log_file: str = "/root/test.log" + log_file: str = "tmp/test.log" log_interval: int = 5 diff --git a/examples/asr_librispeech/conf/prompt.yaml b/examples/asr_librispeech/conf/prompt.yaml index 0bc65175..fd13495c 100644 --- a/examples/asr_librispeech/conf/prompt.yaml +++ b/examples/asr_librispeech/conf/prompt.yaml @@ -1,4 +1,4 @@ dataset_config: # we put prompt here, because the hydra override in shell script only support a small subset of chars # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " - prompt: "Transcribe speech to text. " + prompt: "Transcribe speech to Chinese text. " diff --git a/examples/asr_librispeech/scripts/decode_MaLa-ASR.sh b/examples/asr_librispeech/scripts/decode_MaLa-ASR.sh new file mode 100644 index 00000000..c21c9867 --- /dev/null +++ b/examples/asr_librispeech/scripts/decode_MaLa-ASR.sh @@ -0,0 +1,111 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +export CUDA_VISIBLE_DEVICES=7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +run_dir=/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM +cd $run_dir +code_dir=examples/asr_librispeech +dataset=aishell2 +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/asr_librispeech_origin/exp/whisper_Qwen2.5-7B-Instruct_aishell1_linear_lorafalse_20241202-1342/asr_epoch_4_step_2482/ +output_dir=$ckpt_path + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + encoder_dim=1280 + input_type=mel + mel_size=128 +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size= +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 + use_fp16=true +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 + use_fp16=true +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 + use_fp16=true +elif [[ $llm_name == "Qwen2.5-1.5B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-1.5B-Instruct + llm_dim=1536 + use_fp16=true +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 + use_fp16=true +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == "aishell1" ]] +then + val_data_path=/hpc_stor01/home/yangui.fang_sx/workingspace/data/aishell-1/asr_librispeech/test.jsonl +else + exit 1 +fi + + +decode_log=$ckpt_path/decode_${dataset}_beam4 +# -m debugpy --listen 5678 --wait-for-client +python $code_dir/inference_asr_batch.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + 
++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=whisper \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=1280 \ + ++model_config.encoder_projector=linear \ + ++dataset_config.dataset=speech_dataset \ + ++dataset_config.val_data_path=$val_data_path \ + ++dataset_config.input_type=mel \ + ++dataset_config.mel_size=128 \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=6 \ + ++train_config.num_workers_dataloader=2 \ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt \ + # ++peft_ckpt=$ckpt_path \ + # ++train_config.use_peft=true \ + # ++train_config.peft_config.r=32 \ + # ++dataset_config.normalize=true \ + # ++model_config.encoder_projector=q-former \ + # ++dataset_config.fix_length_audio=64 \ + +python /hpc_stor01/home/yangui.fang_sx/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +# python "/hpc_stor01/home/yangui.fang_sx/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer" +# python "/hpc_stor01/home/yangui.fang_sx/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser" \ No newline at end of file diff --git a/examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b.sh deleted file mode 100755 index 922c08dd..00000000 --- a/examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -#export PYTHONPATH=/root/whisper:$PYTHONPATH -export PYTHONPATH=/root/fairseq:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/yangguanrou.ygr/ckpts/hubert_ckpt/hubert_xtralarge_ll60k_finetune_ls960.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 - -output_dir=/nfs/yangguanrou.ygr/experiments_hubert/vicuna-7b-v1.5-hubert_xtralarge_ll60k_finetune_ls960 -ckpt_path=$output_dir/asr_epoch_1_step_1000 -split=librispeech_test_clean -val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl -decode_log=$ckpt_path/decode_${split}_beam4 - -# -m debugpy --listen 5678 --wait-for-client -python $code_dir/inference_asr_batch.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - hydra.run.dir=$ckpt_path \ - ++model_config.llm_name="vicuna-7b-v1.5" \ - ++model_config.llm_path=$llm_path \ - ++model_config.llm_dim=4096 \ - ++model_config.encoder_name=hubert \ - ++model_config.normalize=true \ - ++dataset_config.normalize=true \ - ++model_config.encoder_projector_ds_rate=5 \ - ++model_config.encoder_path=$speech_encoder_path \ - ++model_config.encoder_dim=1280 \ - ++model_config.encoder_type=finetune \ - ++model_config.encoder_projector=linear \ - ++dataset_config.dataset=speech_dataset \ - ++dataset_config.val_data_path=$val_data_path \ - ++dataset_config.input_type=raw \ - ++dataset_config.inference_mode=true \ - ++dataset_config.prompt="Transcribe speech to text. 
" \ - ++train_config.model_name=asr \ - ++train_config.freeze_encoder=true \ - ++train_config.freeze_llm=true \ - ++train_config.batching_strategy=custom \ - ++train_config.num_epochs=1 \ - ++train_config.val_batch_size=1 \ - ++train_config.num_workers_dataloader=0 \ - ++train_config.output_dir=$output_dir \ - ++decode_log=$decode_log \ - ++ckpt_path=$ckpt_path/model.pt \ - # ++peft_ckpt=$ckpt_path \ - # ++train_config.use_peft=true \ - # ++train_config.peft_config.r=32 \ - # ++dataset_config.normalize=true \ - # ++model_config.encoder_projector=q-former \ - # ++dataset_config.fix_length_audio=64 \ diff --git a/examples/asr_librispeech/scripts/decode_wavlm_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_wavlm_large_linear_vicuna_7b.sh deleted file mode 100755 index 5564ff28..00000000 --- a/examples/asr_librispeech/scripts/decode_wavlm_large_linear_vicuna_7b.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -#export PYTHONPATH=/root/whisper:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 - -output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-wavlm-large-20240426 -ckpt_path=$output_dir/asr_epoch_1_step_1000 -split=librispeech_test_clean -val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl -decode_log=$ckpt_path/decode_${split}_beam4 - -# -m debugpy --listen 5678 --wait-for-client -python $code_dir/inference_asr_batch.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - hydra.run.dir=$ckpt_path \ - ++model_config.llm_name="vicuna-7b-v1.5" \ - ++model_config.llm_path=$llm_path \ - ++model_config.llm_dim=4096 \ - ++model_config.encoder_name=wavlm \ - ++model_config.normalize=true \ - ++dataset_config.normalize=true \ - ++model_config.encoder_projector_ds_rate=5 \ - ++model_config.encoder_path=$speech_encoder_path \ - ++model_config.encoder_dim=1024 \ - ++model_config.encoder_projector=linear \ - ++dataset_config.dataset=speech_dataset \ - ++dataset_config.val_data_path=$val_data_path \ - ++dataset_config.input_type=raw \ - ++dataset_config.inference_mode=true \ - ++train_config.model_name=asr \ - ++train_config.freeze_encoder=true \ - ++train_config.freeze_llm=true \ - ++train_config.batching_strategy=custom \ - ++train_config.num_epochs=1 \ - ++train_config.val_batch_size=1 \ - ++train_config.num_workers_dataloader=2 \ - ++train_config.output_dir=$output_dir \ - ++decode_log=$decode_log \ - ++ckpt_path=$ckpt_path/model.pt \ - # ++peft_ckpt=$ckpt_path \ - # ++train_config.use_peft=true \ - # ++train_config.peft_config.r=32 \ - # ++dataset_config.normalize=true \ - # ++model_config.encoder_projector=q-former \ - # ++dataset_config.fix_length_audio=64 \ diff --git a/examples/asr_librispeech/scripts/decode_whisper_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/decode_whisper_large_linear_vicuna_7b.sh deleted file mode 100755 index f017adf2..00000000 --- a/examples/asr_librispeech/scripts/decode_whisper_large_linear_vicuna_7b.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -#export PYTHONPATH=/root/whisper:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/maziyang.mzy/models/Whisper/large-v3.pt 
-llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 - -output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-20240426 -ckpt_path=$output_dir/asr_epoch_1_step_1000 -split=librispeech_test_clean -val_data_path=/nfs/maziyang.mzy/data/librispeech/${split}.jsonl -decode_log=$ckpt_path/decode_${split}_beam4 - -# -m debugpy --listen 5678 --wait-for-client -python $code_dir/inference_asr_batch.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - hydra.run.dir=$ckpt_path \ - ++model_config.llm_name="vicuna-7b-v1.5" \ - ++model_config.llm_path=$llm_path \ - ++model_config.llm_dim=4096 \ - ++model_config.encoder_name=whisper \ - ++model_config.encoder_projector_ds_rate=5 \ - ++model_config.encoder_path=$speech_encoder_path \ - ++model_config.encoder_dim=1280 \ - ++model_config.encoder_projector=linear \ - ++dataset_config.dataset=speech_dataset \ - ++dataset_config.val_data_path=$val_data_path \ - ++dataset_config.input_type=mel \ - ++dataset_config.mel_size=128 \ - ++dataset_config.inference_mode=true \ - ++train_config.model_name=asr \ - ++train_config.freeze_encoder=true \ - ++train_config.freeze_llm=true \ - ++train_config.batching_strategy=custom \ - ++train_config.num_epochs=1 \ - ++train_config.val_batch_size=1 \ - ++train_config.num_workers_dataloader=2 \ - ++train_config.output_dir=$output_dir \ - ++decode_log=$decode_log \ - ++ckpt_path=$ckpt_path/model.pt \ - # ++peft_ckpt=$ckpt_path \ - # ++train_config.use_peft=true \ - # ++train_config.peft_config.r=32 \ - # ++dataset_config.normalize=true \ - # ++model_config.encoder_projector=q-former \ - # ++dataset_config.fix_length_audio=64 \ diff --git a/examples/asr_librispeech/scripts/finetune_MaLa-ASR.sh b/examples/asr_librispeech/scripts/finetune_MaLa-ASR.sh new file mode 100644 index 00000000..c2372a94 --- /dev/null +++ b/examples/asr_librispeech/scripts/finetune_MaLa-ASR.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# export PYTHONPATH=/root/whisper:$PYTHONPATH +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export CUDA_VISIBLE_DEVICES=1,2 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export OMP_NUM_THREADS=1 +export HYDRA_FULL_ERROR=1 +# set -e +# set -r +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM +cd $run_dir +code_dir=examples/asr_librispeech + + + + +dataset=aishell1 +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/asr_librispeech_origin/exp/whisper_Qwen2.5-7B-Instruct_aishell1_linear_lorafalse_20241202-1342/asr_epoch_4_step_2482/model.pt +if [[ $peft == true ]] +then + ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/asr_librispeech_origin/exp/whisper_Qwen2.5-1.5B-Instruct_aishell1_linear_lorafalse_20241203-1823/asr/5/model.pt +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + encoder_dim=1280 + input_type=mel + mel_size=128 +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size= +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 + use_fp16=true 
+elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 + use_fp16=true +elif [[ $llm_name == "Qwen2.5-1.5B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-1.5B-Instruct + llm_dim=1536 + use_fp16=true +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 + use_fp16=true +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 + use_fp16=true +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == "aishell1" ]] +then + train_data_path=/hpc_stor01/home/yangui.fang_sx/workingspace/data/aishell-1/asr_librispeech/train.jsonl + val_data_path=/hpc_stor01/home/yangui.fang_sx/workingspace/data/aishell-1/asr_librispeech/dev.jsonl +else + exit 1 +fi + +output_dir=$run_dir/$code_dir/exp/${encoder_name}_${llm_name}_${dataset}_${projector}_lora${use_peft}_$(date +"%Y%m%d-%H%M") + +hydra_args="hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=5 \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=1280 \ +++model_config.encoder_projector=$projector \ +++dataset_config.dataset=$dataset \ +++dataset_config.train_data_path=$train_data_path \ +++dataset_config.val_data_path=$val_data_path \ +++dataset_config.input_type=$input_type \ +++dataset_config.mel_size=128 \ +++train_config.model_name=asr \ +++train_config.num_epochs=10 \ +++train_config.use_peft=$use_peft \ +++train_config.freeze_encoder=true \ +++train_config.freeze_llm=true \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=1e-4 \ +++train_config.validation_interval=12000 \ +++train_config.batch_size_training=6 \ +++train_config.val_batch_size=6 \ +++train_config.num_workers_dataloader=2 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ + +" +# ++ckpt_path=$ckpt_path \ + +# -m debugpy --listen 5678 --wait-for-client +if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29503 \ + $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + $hydra_args \ + +fi \ No newline at end of file diff --git a/examples/asr_librispeech/scripts/finetune_hubert_xtralarge_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/finetune_hubert_xtralarge_linear_vicuna_7b.sh deleted file mode 100755 index ee4c75d5..00000000 --- a/examples/asr_librispeech/scripts/finetune_hubert_xtralarge_linear_vicuna_7b.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# export PYTHONPATH=/root/whisper:$PYTHONPATH -export PYTHONPATH=/root/fairseq:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=2,3 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 -export OMP_NUM_THREADS=1 - -# debug setting for multiple gpus -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=ALL -# export TORCH_DISTRIBUTED_DEBUG=INFO - -run_dir=/root/SLAM-LLM -cd $run_dir 
-code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/yangguanrou.ygr/ckpts/hubert_ckpt/hubert_xtralarge_ll60k_finetune_ls960.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 -train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl -val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl - -output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-hubert-xtralarge-$(date +"%Y%m%d") - -hydra_args=" -hydra.run.dir=$output_dir \ -++model_config.llm_name=vicuna-7b-v1.5 \ -++model_config.llm_path=$llm_path \ -++model_config.llm_dim=4096 \ -++model_config.encoder_name=hubert \ -++model_config.normalize=true \ -++dataset_config.normalize=true \ -++model_config.encoder_projector_ds_rate=5 \ -++model_config.encoder_path=$speech_encoder_path \ -++model_config.encoder_dim=1280 \ -++model_config.encoder_type=finetune \ -++model_config.encoder_projector=linear \ -++dataset_config.dataset=speech_dataset \ -++dataset_config.train_data_path=$train_data_path \ -++dataset_config.val_data_path=$val_data_path \ -++dataset_config.input_type=raw \ -++train_config.model_name=asr \ -++train_config.num_epochs=3 \ -++train_config.freeze_encoder=true \ -++train_config.freeze_llm=true \ -++train_config.batching_strategy=custom \ -++train_config.warmup_steps=1000 \ -++train_config.total_steps=100000 \ -++train_config.lr=1e-4 \ -++train_config.validation_interval=2000 \ -++train_config.batch_size_training=6 \ -++train_config.val_batch_size=6 \ -++train_config.num_workers_dataloader=0 \ -++train_config.output_dir=$output_dir \ -++metric=acc \ -" - -# -m debugpy --listen 5678 --wait-for-client -if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then - python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - $hydra_args -else - torchrun \ - --nnodes 1 \ - --nproc_per_node 2 \ - --master_port=29503 \ - $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - ++train_config.enable_fsdp=false \ - ++train_config.enable_ddp=true \ - ++train_config.use_fp16=true \ - $hydra_args -fi diff --git a/examples/asr_librispeech/scripts/finetune_wavlm_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/finetune_wavlm_large_linear_vicuna_7b.sh deleted file mode 100755 index 423d97bc..00000000 --- a/examples/asr_librispeech/scripts/finetune_wavlm_large_linear_vicuna_7b.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# export PYTHONPATH=/root/whisper:$PYTHONPATH -export PYTHONPATH=/root/fairseq:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0,1 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 -export OMP_NUM_THREADS=1 - -# debug setting for multiple gpus -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=ALL -# export TORCH_DISTRIBUTED_DEBUG=INFO - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 -train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl -val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl - -output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-wavlm-large-$(date +"%Y%m%d") - -hydra_args=" -hydra.run.dir=$output_dir \ -++model_config.llm_name=vicuna-7b-v1.5 \ -++model_config.llm_path=$llm_path \ -++model_config.llm_dim=4096 \ -++model_config.encoder_name=wavlm \ -++model_config.normalize=true \ 
-++dataset_config.normalize=true \ -++model_config.encoder_projector_ds_rate=5 \ -++model_config.encoder_path=$speech_encoder_path \ -++model_config.encoder_dim=1024 \ -++model_config.encoder_projector=linear \ -++dataset_config.dataset=speech_dataset \ -++dataset_config.train_data_path=$train_data_path \ -++dataset_config.val_data_path=$val_data_path \ -++dataset_config.input_type=raw \ -++train_config.model_name=asr \ -++train_config.num_epochs=3 \ -++train_config.freeze_encoder=true \ -++train_config.freeze_llm=true \ -++train_config.batching_strategy=custom \ -++train_config.warmup_steps=1000 \ -++train_config.total_steps=100000 \ -++train_config.lr=1e-4 \ -++train_config.validation_interval=1000 \ -++train_config.batch_size_training=4 \ -++train_config.val_batch_size=4 \ -++train_config.num_workers_dataloader=2 \ -++train_config.output_dir=$output_dir \ -++metric=acc \ -" - -# -m debugpy --listen 5678 --wait-for-client -if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then - python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - $hydra_args -else - torchrun \ - --nnodes 1 \ - --nproc_per_node 2 \ - --master_port=29503 \ - $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - ++train_config.enable_fsdp=false \ - ++train_config.enable_ddp=true \ - ++train_config.use_fp16=true \ - $hydra_args -fi diff --git a/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b.sh b/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b.sh deleted file mode 100755 index 4a9cb031..00000000 --- a/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# export PYTHONPATH=/root/whisper:$PYTHONPATH -export PYTHONPATH=/root/fairseq:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0,1 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 -export OMP_NUM_THREADS=1 - -# debug setting for multiple gpus -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=ALL -# export TORCH_DISTRIBUTED_DEBUG=INFO - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/nfs/maziyang.mzy/models/Whisper/large-v3.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 -train_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_train_960h.jsonl -val_data_path=/nfs/maziyang.mzy/data/librispeech/librispeech_dev_other.jsonl - -output_dir=/root/tmp/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-$(date +"%Y%m%d") - -hydra_args=" -hydra.run.dir=$output_dir \ -++model_config.llm_name=vicuna-7b-v1.5 \ -++model_config.llm_path=$llm_path \ -++model_config.llm_dim=4096 \ -++model_config.encoder_name=whisper \ -++model_config.encoder_projector_ds_rate=5 \ -++model_config.encoder_path=$speech_encoder_path \ -++model_config.encoder_dim=1280 \ -++model_config.encoder_projector=linear \ -++dataset_config.dataset=speech_dataset \ -++dataset_config.train_data_path=$train_data_path \ -++dataset_config.val_data_path=$val_data_path \ -++dataset_config.input_type=mel \ -++dataset_config.mel_size=128 \ -++train_config.model_name=asr \ -++train_config.num_epochs=3 \ -++train_config.freeze_encoder=true \ -++train_config.freeze_llm=true \ -++train_config.batching_strategy=custom \ -++train_config.warmup_steps=1000 \ -++train_config.total_steps=100000 \ -++train_config.lr=1e-4 \ -++train_config.validation_interval=1000 \ -++train_config.batch_size_training=4 \ 
-++train_config.val_batch_size=4 \ -++train_config.num_workers_dataloader=2 \ -++train_config.output_dir=$output_dir \ -++metric=acc \ -" - -# -m debugpy --listen 5678 --wait-for-client -if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then - python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - $hydra_args -else - torchrun \ - --nnodes 1 \ - --nproc_per_node 2 \ - --master_port=29503 \ - $code_dir/finetune_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - ++train_config.enable_fsdp=false \ - ++train_config.enable_ddp=true \ - ++train_config.use_fp16=true \ - $hydra_args -fi diff --git a/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b_deepspeed.sh b/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b_deepspeed.sh deleted file mode 100755 index 4ea09758..00000000 --- a/examples/asr_librispeech/scripts/finetune_whisper_large_linear_vicuna_7b_deepspeed.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -# export PYTHONPATH=/root/whisper:$PYTHONPATH -export PYTHONPATH=/root/fairseq:$PYTHONPATH -# export CUDA_VISIBLE_DEVICES=6,7 -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 -export OMP_NUM_THREADS=1 - -# debug setting for multiple gpus -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=ALL -# export TORCH_DISTRIBUTED_DEBUG=INFO - -run_dir=/work/SLAM-LLM -cd $run_dir -code_dir=examples/asr_librispeech - -speech_encoder_path=/cxgroup/model/whisper/large-v3.pt - -llm_path=/cxgroup/model/vicuna-7b-v1.5 -# llm_path=/nfs/maziyang.mzy/models/vicuna-13b-v1.5 - -output_dir=/work/exps/vicuna-7b-v1.5-librispeech-linear-steplrwarmupkeep1e-4-whisper-largev3-$(date +"%Y%m%d")-deepspeed - -hydra_args=" -hydra.run.dir=$output_dir \ -++model_config.llm_name=vicuna-7b-v1.5 \ -++model_config.llm_path=$llm_path \ -++model_config.llm_dim=4096 \ -++model_config.encoder_name=whisper \ -++model_config.encoder_projector_ds_rate=5 \ -++model_config.encoder_path=$speech_encoder_path \ -++model_config.encoder_dim=1280 \ -++model_config.encoder_projector=linear \ -++dataset_config.dataset=speech_dataset \ -++dataset_config.train_data_path=data/librispeech/train960.jsonl \ -++dataset_config.val_data_path=data/librispeech/dev.jsonl \ -++dataset_config.input_type=mel \ -++dataset_config.mel_size=128 \ -++train_config.model_name=asr \ -++train_config.num_epochs=6 \ -++train_config.enable_deepspeed=true \ -++train_config.freeze_encoder=true \ -++train_config.freeze_llm=true \ -++train_config.batching_strategy=custom \ -++train_config.warmup_steps=1000 \ -++train_config.total_steps=100000 \ -++train_config.lr=1e-4 \ -++train_config.validation_interval=1000 \ -++train_config.batch_size_training=4 \ -++train_config.val_batch_size=4 \ -++train_config.num_workers_dataloader=4 \ -++train_config.output_dir=$output_dir \ -++metric=acc \ -" -# ++train_config.use_peft=true \ -# ++train_config.peft_config.r=32 \ -# ++model_config.encoder_projector=linear \ -# ++model_config.encoder_projector_ds_rate=5 \ -# ++train_config.peft_config.peft_method=lora \ -# --peft_ckpt "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4" \ -# --ckpt_path "/nfs/maziyang.mzy/exps/llama-2-hf-finetune-asr-ds5-proj2048-lr1e-5-whisper-prompt-padding30-20231228/asr/4/model.pt" \ -#++log_config.log_file=/$output_dir/train.log \ -#++log_config.use_wandb=true \ -#++log_config.wandb_dir=$output_dir \ 
-#++log_config.wandb_entity_name=zym22 \ -#++log_config.wandb_project_name=slam-llm \ -#++log_config.wandb_exp_name=${0##*/%.*} \ -#++log_config.log_interval 5 \ - -deepspeed \ - --include localhost:4,5 \ - --master_port=29502 \ - $code_dir/deepspeed_finetune_asr.py \ - $hydra_args - # --num_gpus=2 \ - # --num_nodes=1 \ - -# -m debugpy --listen 5678 --wait-for-client -# if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then -# python -m debugpy --listen 5678 --wait-for-client finetune_asr.py \ -# $hydra_args -# else -# deepspeed \ -# --num_nodes=1 \ -# --include localhost:6,7 \ -# --master_port=29502 \ -# $code_dir/deepspeed_finetune_asr.py \ -# $hydra_args -# # --num_gpus=2 \ -# fi diff --git a/examples/mala_asr_slidespeech/conf/ds_config.json b/examples/mala_asr_slidespeech/conf/ds_config.json index 7ea70e4a..68cce9e3 100644 --- a/examples/mala_asr_slidespeech/conf/ds_config.json +++ b/examples/mala_asr_slidespeech/conf/ds_config.json @@ -1,19 +1,35 @@ { - "train_micro_batch_size_per_gpu": 4, + "train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 1, "optimizer": { - "type": "Adam", + "type": "AdamW", "params": { - "lr": 1e-4 + "lr": 1e-4, + "weight_decay": 0.01 } + }, - "fp16": { - "enabled": true + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 }, "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu" + "stage": 0, + "allgather_partitions": true, + "overlap_comm": true, + "reduce_scatter": true, + "contiguous_gradients": true + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0.0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 } } } \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/conf/ds_config_from_k2.json b/examples/mala_asr_slidespeech/conf/ds_config_from_k2.json new file mode 100644 index 00000000..613709ac --- /dev/null +++ b/examples/mala_asr_slidespeech/conf/ds_config_from_k2.json @@ -0,0 +1,38 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 100, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.01 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 1000 + } + }, + "gradient_accumulation_steps": 4, + "gradient_clipping": 5, + "steps_per_print": 50, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/conf/prompt.yaml b/examples/mala_asr_slidespeech/conf/prompt.yaml index 0bc65175..32cf2374 100644 --- a/examples/mala_asr_slidespeech/conf/prompt.yaml +++ b/examples/mala_asr_slidespeech/conf/prompt.yaml @@ -1,4 +1,14 @@ dataset_config: # we put prompt here, because the hydra override in shell script only support a small subset of chars # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " - prompt: "Transcribe speech to text. " + # prompt: "<|im_start|>user\n请将语音转写为汉字<|im_end|>\n<|im_start|>assistant\n" + # prompt: "识别语音" + # prompt : "将上面的语音转写为英文" + # prompt: "Transcribe speech to English." 
+ # prompt: "Transcribe speech to text.And then translate the text to spanish." + # prompt: "Transcribe speech to text." + # prompt: "Tell me what is the language of the text." + prompt: "Transcribe speech to text." + # prompt: "Transcribe speech to text.Follow words may occur in audio:{}." + # prompt: "" + # prompt: "请问上面有几个句子,有多少个字,给字编号然后输出文本" diff --git a/examples/mala_asr_slidespeech/dataset/multitask_dataset.py b/examples/mala_asr_slidespeech/dataset/multitask_dataset.py new file mode 100644 index 00000000..37cc84bf --- /dev/null +++ b/examples/mala_asr_slidespeech/dataset/multitask_dataset.py @@ -0,0 +1,398 @@ +import torch +from torch.utils.data import Dataset +import whisper +import kaldiio +# import pyroomacoustics as pra +import string +import copy +import numpy as np +from tqdm import tqdm +import os +import json +import random +import torchaudio.transforms as T +from torchaudio.transforms import SpeedPerturbation +import torchaudio +import torchaudio.functional as F +import random +import time +from kaldiio.matio import (_parse_arkpath, + open_like_kaldi, + _load_mat ) +class MultiTaskDataset(Dataset): + def __init__(self, dataset_config, tokenizer=None, split='train',musan_path=None): + super().__init__() + self.data_list = {} + self.num_samples_list = {} + self.multitask_task_list = [] + self.multitask_prompt_list = {} + self.prev_ark_file = None + self.prev_ark_fd = None + self.read_time = 0 + self.open_time = 0 + self.total_time = 0 + self.last_epoch_time = 0 + if split == "train": + data_path = dataset_config.train_scp_file_path + elif split == "val": + data_path = dataset_config.dev_scp_file_path + elif split == "test": + data_path = dataset_config.test_scp_file_path + else: + assert(0) + if musan_path is not None: + self.musan_list = [] + with open(musan_path) as f: + for line in f: + key,path = line.split(" ") + self.musan_list.append(path) + data_scp_file_path = os.path.join(data_path,"my_wav.scp") + # utt2numm_samples_path = os.path.join(data_path,"utt2num_samples") + multitask_task_path = os.path.join(data_path,"multitask.jsonl") + multitask_prompt_path = "/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/multiprompt.jsonl" + with open(data_scp_file_path) as f: + for line in f: + key,path = line.split(" ") + self.data_list[key] = path + # with open(utt2numm_samples_path) as f: + # for line in f: + # key,samples = line.split(" ") + # self.num_samples_list[key] = samples + with open(multitask_task_path) as f: + for line in f: + item = json.loads(line.strip()) + if item["key"] in self.data_list: + self.multitask_task_list.append(item) + else: + print(item) + with open(multitask_prompt_path) as f: + for line in f: + item = json.loads(line.strip()) + if item["task"] in self.multitask_prompt_list: + self.multitask_prompt_list[item["task"]].append(item["prompt"]) + else: + self.multitask_prompt_list[item["task"]] = [item["prompt"]] + + self.llm_name = dataset_config.get("llm_name", None) + self.prompt_style = dataset_config.get("prompt_style", "normal") + if self.llm_name == "Qwen2.5-7B-Instruct": + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + else: + if self.prompt_style == "normal": + self.prompt_template1 = "{}" + elif self.prompt_style == "instruct": + self.prompt_template1 = "USER: {}\n ASSISTANT:" + self.answer_template = "{}" + self.dataset_config = dataset_config + self.tokenizer = tokenizer + self.split = 
split
+        self.spec_augmentation = dataset_config.get("spec_augmentation", False)
+        self.speed_perturb = dataset_config.get("speed_perturb", False)
+        self.add_noise = dataset_config.get("musan", False)
+        self.add_reverb = dataset_config.get("add_reverb", False)
+        self.noise_file_path = dataset_config.get("noise_file_path", False)
+        if self.add_noise == True:
+            self.musan_wav_files = []
+            for root, dirs, files in os.walk(self.noise_file_path):
+                for file in files:
+                    if file.endswith('.wav'):
+                        self.musan_wav_files.append(os.path.join(root, file))
+        self.pad_or_trim = dataset_config.get("pad_or_trim", False)
+        self.IGNORE_INDEX = -100  # The default setting in CrossEntropyLoss
+        self.mel_size = dataset_config.get("mel_size", 80)  # 80 for whisper large v1 and v2, 128 for large v3
+        self.fix_length_audio = dataset_config.get("fix_length_audio", -1)
+        self.inference_mode = dataset_config.get("inference_mode", False)
+        self.prompt_mode = dataset_config.get("prompt_mode", None)
+        self.normalize = dataset_config.get("normalize", False)
+        self.input_type = dataset_config.get("input_type", None)
+        self.translator = str.maketrans('', '', string.punctuation)
+        assert self.input_type in ["raw", "mel"], "input_type must be one of [raw, mel]"
+
+
+    # def get_source_len(self, data_dict):
+    #     return data_dict["source_len"]
+
+    # def get_target_len(self, data_dict):
+    #     return data_dict["target_len"] if "target_len" in data_dict else 0
+
+    def kaldiio(self, ark_path):
+        # Read a waveform from a Kaldi wav ark, caching the file descriptor across calls.
+        start_time = time.time()
+        ark_file, offset, slices = _parse_arkpath(ark_path)
+        if self.prev_ark_file == ark_file:
+            ark_fd = self.prev_ark_fd
+        else:
+            if self.prev_ark_fd is not None:
+                self.prev_ark_fd.close()
+            ark_fd = open_like_kaldi(ark_file, 'rb')
+        self.open_time += time.time() - start_time
+        start_time = time.time()
+        sample_rate, waveform = _load_mat(ark_fd, offset, slices, endian='<')
+        waveform = waveform.astype(np.float32) / (1 << 15)
+        self.read_time += time.time() - start_time
+        # if waveform.ndim == 2:
+        #     waveform = waveform.T
+        # else:
+        #     waveform = np.expand_dims(waveform, axis=0)
+        self.prev_ark_file, self.prev_ark_fd = ark_file, ark_fd
+        return waveform
+
+    def speedPerturb(self, audio_raw):
+        orig_freq = 16000
+        # Speed-perturbation factors, e.g. [0.9, 1.0, 1.1]: 10% slower, unchanged, 10% faster.
+        factors = [0.9, 1, 1.1]
+        # Build the speed-perturbation transform.
+        speed_perturbation = SpeedPerturbation(orig_freq, factors)
+        audio_raw = torch.from_numpy(audio_raw)
+        # SpeedPerturbation is callable; apply it to the raw audio tensor.
+        audio_raw = speed_perturbation(audio_raw)[0]
+        return audio_raw
+
+    def specAugment(self, spec):
+        spec = spec.permute(1, 0).unsqueeze(0)
+        stretch = T.TimeStretch(n_freq=128)
+        rate = random.random() * 0.2 + 0.9
+        Timemasking = T.TimeMasking(time_mask_param=100)
+        Frequencymasking = T.FrequencyMasking(freq_mask_param=27)
+        spec = stretch(spec, rate).to(torch.float32)
+        spec = Timemasking(spec)
+        spec = Timemasking(spec)
+        spec = Frequencymasking(spec)
+        spec = Frequencymasking(spec)
+        spec = spec.squeeze(0).permute(1, 0)
+        return spec
+
+    def addNoise(self, audio_raw):
+        # Mix a random MUSAN noise clip into the utterance at a random SNR.
+        if isinstance(audio_raw, np.ndarray):
+            audio_raw = torch.from_numpy(audio_raw)
+        noise, _ = torchaudio.load(random.choice(self.musan_wav_files))
+        noise = noise[0]  # keep the first channel -> 1-D tensor
+        audio_len, noise_len = audio_raw.shape[-1], noise.shape[-1]
+        if audio_len > noise_len:
+            # Speech is longer than the noise: place the noise at a random offset, zero elsewhere.
+            start_idx = random.randint(0, audio_len - noise_len)
+            padded_noise = torch.zeros_like(audio_raw)
+            padded_noise[start_idx:start_idx + noise_len] = noise
+            noise = padded_noise
+        else:
+            # Noise is longer than the speech: crop a random segment of the same length.
+            start_idx = random.randint(0, noise_len - audio_len)
+            noise = noise[start_idx:start_idx + audio_len]
+        # Add the noise at an SNR drawn uniformly from 1-30 dB.
+        snr_dbs = torch.tensor([random.randint(1, 30)], dtype=torch.float32)
+        noisy_speech = F.add_noise(audio_raw.unsqueeze(0), noise.unsqueeze(0), snr_dbs)
+        return noisy_speech.squeeze(0)
+
+    # def simulate_room_reverb(self, audio_raw, fs):
+    #     room_dim = [random.uniform(3, 10), random.uniform(3, 10), random.uniform(2, 5)]
+    #     # Randomly draw a target reverberation time (RT60, in seconds).
+    #     rt60_tgt = random.uniform(0.3, 1.0)
+    #     # Generate random room parameters: use the Sabine formula to get the
+    #     # absorption coefficient and the reflection order.
+    #     e_absorption, max_order = pra.inverse_sabine(rt60_tgt, room_dim)
+    #     # Build the room.
+    #     room = pra.ShoeBox(
+    #         room_dim,
+    #         fs=fs,
+    #         materials=pra.Material(e_absorption),
+    #         max_order=int(max_order),
+    #         use_rand_ism=True,  # use the randomized image method to reduce echo artifacts
+    #         max_rand_disp=0.05,  # maximum random displacement (in meters)
+    #     )
+    #     # Randomly place the sound source.
+    #     source_position = [random.uniform(0.5, room_dim[0] - 0.5),
+    #                        random.uniform(0.5, room_dim[1] - 0.5),
+    #                        random.uniform(0.5, room_dim[2] - 0.5)]
+    #     room.add_source(source_position, signal=audio_raw)
+    #     # Randomly place the microphone.
+    #     mic_locs = np.c_[
+    #         [random.uniform(0.5, room_dim[0] - 0.5), random.uniform(0.5, room_dim[1] - 0.5), random.uniform(0.5, room_dim[2] - 0.5)],
+    #     ]
+    #     room.add_microphone_array(mic_locs)
+    #     # Run the simulation and return the signal captured by the microphone array.
+    #     room.simulate()
+    #     return room.mic_array.signals[0, :]
+
+    def __len__(self):
+        return len(self.multitask_task_list)
+
+    def __getitem__(self, index):
+        if self.last_epoch_time != 0:
+            self.total_time += time.time() - self.last_epoch_time
+        self.last_epoch_time = time.time()
+        if (index + 1) % 100 == 0:
+            print(f"[open_time] {self.open_time} [read_time] {self.read_time} [total_time] {self.total_time} ")
+        item = self.multitask_task_list[index]
+        ark_path = self.data_list[item["key"]]
+        # numpy_array = kaldiio.load_mat(ark_path)
+        # audio_raw = numpy_array[1].astype(np.float32) / 32768
+        audio_raw = self.kaldiio(ark_path)
+        # num_samples = int(self.num_samples_list[item["key"]])
+        # assert(audio_raw.shape[0] == num_samples)
+        key = item["key"]
+        target = item["target"].upper()
+        # Strip punctuation with str.translate.
+        target = target.translate(self.translator)
+        # ocr = self.ocr_list[index]
+        # target = self.label_list[index]
+        # key = self.key_list[index]
+        ## data augmentation
+        if self.split == "train" and self.speed_perturb == True:
+            audio_raw = self.speedPerturb(audio_raw)
+        if self.split == "train" and self.add_noise == True:
+            audio_raw = self.addNoise(audio_raw)
+        # if self.split == "train" and self.add_reverb == True:
+        #     audio_raw = self.simulate_room_reverb(audio_raw, 16000).astype(np.float32)
+        if self.input_type == "raw":
+            # Augmentations may already have produced a tensor; only convert numpy input.
+            if isinstance(audio_raw, np.ndarray):
+                audio_raw = torch.from_numpy(audio_raw)
+            audio_raw = audio_raw.float()
+            if self.normalize:
+                audio_raw = torch.nn.functional.layer_norm(audio_raw, audio_raw.shape)
+            audio_length = len(audio_raw) // 320  # ad-hoc for fairseq 320x downsample
+            audio_length = audio_length // 5  # ad-hoc for 5x fc downsample
+        elif self.input_type == "mel":
+            if self.pad_or_trim == True:
+                audio_raw = whisper.pad_or_trim(audio_raw)
+            audio_mel = whisper.log_mel_spectrogram(audio_raw, n_mels=self.mel_size).permute(1, 0)
+            if self.split == "train" and self.spec_augmentation == True:
+                audio_mel = self.specAugment(audio_mel)
+            audio_length = (audio_mel.shape[0] + 1) // 2  # ad-hoc for whisper for 2x downsample from mel to feats
+            audio_length = audio_length // 5  # ad-hoc for 5x fc downsample
+            # audio_length = calculate_output_length_1d(audio_length, 5, 5, 0) # ad-hoc for 5x cov1d downsample
+        if self.fix_length_audio > 0:
+            audio_length = self.fix_length_audio
+        audio_pseudo = torch.full((audio_length,), -1)  # placeholder
+
+        prompt = 
random.choice(self.multitask_prompt_list[item["task"]]) + prompt = self.prompt_template1.format(prompt) + if item["task"] in ["prevtext","hotword","domain"]: + prompt = prompt.format(item[item["task"]].upper()) + prompt_ids = self.tokenizer.encode(prompt) + prompt_length = len(prompt_ids) + + if self.inference_mode: + prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64) + example_ids = torch.cat((audio_pseudo, prompt_ids)) # [audio,prompt] + example_mask = example_ids.ge(-1) # [True,True] + + return { + "input_ids": example_ids, + "attention_mask": example_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + 'audio_length': audio_length, + 'key': key, + 'target': target, + } + + answer = self.answer_template.format(target) + example = prompt + answer # FIX(MZY): avoid putting a bos token before answer. + example_ids = self.tokenizer.encode(example) # [prompt,answer] + example_ids.append(self.tokenizer.eos_token_id) # [prompt,answer,eos] + example_ids = torch.tensor( + example_ids, dtype=torch.int64 + ) + example_ids = torch.cat((audio_pseudo, example_ids)) # [audio,prompt,answer,eos] + + labels_ids = copy.deepcopy(example_ids) # [audio,prompt,answer,eos] + labels_ids[:audio_length + prompt_length] = -1 # [-1,-1,answer,eos]; + example_mask = example_ids.ge(-1) # FIX(GZF): [True,True,True,True] + + label_mask = labels_ids.ge(0) # [False,False,True,True] + example_ids[~example_mask] = 0 # [audio,prompt,answer,eos] + labels_ids[~label_mask] = self.IGNORE_INDEX # [-100,-100,answer,eos] + + return { + "input_ids": example_ids, + "labels": labels_ids, + "attention_mask": example_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + 'audio_length': audio_length, + } + def pad(self, sequence, max_length, padding_idx=0): + if isinstance(sequence, (int, list, tuple)): + if len(sequence) < max_length: + sequence = sequence + [padding_idx] * (max_length - len(sequence)) + else: + sequence = sequence[:max_length] + elif isinstance(sequence, torch.Tensor): + if len(sequence) < max_length: + sequence = torch.cat( + (sequence, torch.full(([max_length - len(sequence)] + list(sequence.size())[1:]), padding_idx))) + else: + sequence = sequence[:max_length] + elif isinstance(sequence, np.ndarray): + if len(sequence) < max_length: + sequence = np.concatenate( + (sequence, np.full((max_length - len(sequence),) + sequence.shape[1:], padding_idx))) + else: + sequence = sequence[:max_length] + else: + raise Exception("Type mismatch during padding!") + return sequence + + def collator(self, samples): + assert samples is not None + input_ids_max_length = max([s['input_ids'].shape[0] for s in samples]) + input_ids = torch.stack([self.pad(s['input_ids'], input_ids_max_length, self.tokenizer.pad_token_id) + for s in samples]) + attention_mask = torch.stack([self.pad(s['attention_mask'], input_ids_max_length, False) + for s in samples]) + if self.input_type == "raw": + audio_raw_max_length = max([s['audio'].shape[0] for s in samples]) + audio_raw = torch.stack([self.pad(s['audio'], audio_raw_max_length, 0) + for s in samples]) + audio_mask = torch.zeros(len(samples), audio_raw_max_length) + for line, sample in enumerate(samples): + audio_mask[line, :sample['audio'].shape[0]] = 1 + elif self.input_type == "mel": + audio_mel_max_length = max([s['audio_mel'].shape[0] for s in samples]) + audio_mel = torch.stack([self.pad(s['audio_mel'], audio_mel_max_length, 0) + 
for s in samples]) + audio_mel_post_mask = torch.zeros(len(samples), (audio_mel_max_length + 1) // 2) # ad-hoc for whisper for 2x downsample from mel to feats + for line, sample in enumerate(samples): + audio_mel_post_mask[line, :(sample['audio_mel'].shape[0] + 1) // 2] = 1 + + modality_mask = torch.zeros_like(attention_mask) + for line, sample in enumerate(samples): + modality_mask[line, :sample['audio_length']] = 1 + + if self.inference_mode: + keys = [s['key'] for s in samples] + targets = [s['target'] for s in samples] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mask": audio_mask if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + "audio_mel_post_mask": audio_mel_post_mask if self.input_type == "mel" else None, + "modality_mask": modality_mask, + "keys": keys, + "targets": targets + } + + labels = torch.stack([self.pad(s['labels'], input_ids_max_length, self.IGNORE_INDEX) + for s in samples]) + return { + "input_ids": input_ids, + "labels": labels, + "attention_mask": attention_mask, + "audio": audio_raw if self.input_type == "raw" else None, + "audio_mask": audio_mask if self.input_type == "raw" else None, + "audio_mel": audio_mel if self.input_type == "mel" else None, + "audio_mel_post_mask": audio_mel_post_mask if self.input_type == "mel" else None, + "modality_mask": modality_mask + } + + +def get_speech_dataset(dataset_config, tokenizer, split): + dataset = MultiTaskDataset(dataset_config, tokenizer, split) + return dataset + + + + \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py b/examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py deleted file mode 100644 index e3000e56..00000000 --- a/examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py +++ /dev/null @@ -1,325 +0,0 @@ -import torch -from torch.utils.data import Dataset -import whisper -import kaldiio -import copy -import numpy as np -from tqdm import tqdm - - -class SlidespeechDataset(Dataset): - def __init__(self, dataset_config, tokenizer=None, split='train',): - super().__init__() - self.data_list = [] - self.num_samples_list = [] - self.label_list = [] - self.ocr_list = [] - self.key_list=[] # for debug - self.asr_list=[] # not gt - - if split == "train": - with open(dataset_config.train_scp_file_path + "my_wav.scp",'r') as f: - for line in f: - line = line.strip().split() - self.data_list.append(line[1]) - self.key_list.append(line[0]) - - with open(dataset_config.train_scp_file_path + "utt2num_samples",'r') as f: - for line in f: - line = line.strip().split() - self.num_samples_list.append(int(line[1])) - - with open(dataset_config.train_scp_file_path + "text",'r') as f: - for line in f: - line = line.strip().split(' ',1) - if len(line) == 1: - self.label_list.append(None) - else: - if dataset_config.lower: - self.label_list.append(line[1].lower()) - else: - self.label_list.append(line[1]) - - with open(dataset_config.train_scp_file_path + "hot_related/ocr_1gram_top50_mmr070_hotwords_list",'r') as f: - for line in f: - line = line.strip().split() - if len(line) == 1: - self.ocr_list.append(None) - else: - line = line[1] - line = line.split('$') - line = " ".join(line) - - if dataset_config.lower: - self.ocr_list.append(line.lower()) - else: - self.ocr_list.append(line) - - - elif split == "val": - with open(dataset_config.dev_scp_file_path + "my_wav.scp",'r') as f: - for line in f: - line = 
line.strip().split() - self.data_list.append(line[1]) - self.key_list.append(line[0]) - - with open(dataset_config.dev_scp_file_path + "utt2num_samples",'r') as f: - for line in f: - line = line.strip().split() - self.num_samples_list.append(int(line[1])) - - with open(dataset_config.dev_scp_file_path + "text",'r') as f: - for line in f: - line = line.strip().split(' ',1) - if len(line) == 1: - self.label_list.append(None) - else: - if dataset_config.lower: - self.label_list.append(line[1].lower()) - else: - self.label_list.append(line[1]) - - with open(dataset_config.dev_scp_file_path + "hot_related/ocr_1gram_top50_mmr070_hotwords_list",'r') as f: - for line in f: - line = line.strip().split() - if len(line) == 1: - self.ocr_list.append(None) - else: - line = line[1] - line = line.split('$') - line = " ".join(line) - - if dataset_config.lower: - self.ocr_list.append(line.lower()) - else: - self.ocr_list.append(line) - - elif split == "test": - with open(dataset_config.test_scp_file_path + "my_wav.scp",'r') as f: - for line in f: - line = line.strip().split() - self.data_list.append(line[1]) - self.key_list.append(line[0]) - - with open(dataset_config.test_scp_file_path + "utt2num_samples",'r') as f: - for line in f: - line = line.strip().split() - self.num_samples_list.append(int(line[1])) - - with open(dataset_config.test_scp_file_path + "text",'r') as f: - for line in f: - line = line.strip().split(' ',1) - if len(line) == 1: - self.label_list.append(None) - else: - if dataset_config.lower: - self.label_list.append(line[1].lower()) - else: - self.label_list.append(line[1]) - - with open(dataset_config.test_scp_file_path + "hot_related/ocr_1gram_top50_mmr070_hotwords_list",'r') as f: - for line in f: - line = line.strip().split() - if len(line) == 1: - self.ocr_list.append(None) - else: - line = line[1] - line = line.split('$') - line = " ".join(line) - - if dataset_config.lower: - self.ocr_list.append(line.lower()) - else: - self.ocr_list.append(line) - - - - self.dataset_config = dataset_config - self.tokenizer = tokenizer - self.IGNORE_INDEX = -100 # The default setting in CrossEntropyLoss - self.mel_size = dataset_config.get("mel_size", 80) # 80 for whisper large v1 and v2, 128 for large v3 - self.prompt = dataset_config.get("prompt", None) - self.prompt_template1 = "USER: {}\n ASSISTANT:" - self.prompt_template2 = "USER: Transcribe speech to text. Use hotwords in ppt to improve speech recognition accuracy. But if the hotwords are irrelevant, just ignore them. The hotwords are \"{}\". 
\n ASSISTANT:" - self.answer_template = "{}" - self.fix_length_audio = dataset_config.get("fix_length_audio", -1) - self.inference_mode = dataset_config.get("inference_mode", False) - self.normalize = dataset_config.get("normalize", False) - self.input_type = dataset_config.get("input_type", None) - assert self.input_type in ["raw", "mel"], "input_type must be one of [raw, mel]" - - def get_source_len(self, data_dict): - return data_dict["source_len"] - - def get_target_len(self, data_dict): - - return data_dict["target_len"] if "target_len" in data_dict else 0 - - def __len__(self): - return len(self.data_list) - - def __getitem__(self, index): - ark_path = self.data_list[index] - numpy_array = kaldiio.load_mat(ark_path) - audio_raw = numpy_array[1].astype(np.float32) - num_samples = self.num_samples_list[index] - assert(audio_raw.shape[0] == num_samples) - ocr = self.ocr_list[index] - target = self.label_list[index] - key = self.key_list[index] - - - if self.input_type == "raw": - audio_raw = torch.from_numpy(audio_raw).float() - if self.normalize: - audio_raw = torch.nn.functional.layer_norm(audio_raw, audio_raw.shape) - audio_length = len(audio_raw) // 320 # ad-hoc for fairseq 320x downsample - audio_length = audio_length // 5 # ad-hoc for 5x fc downsample - elif self.input_type == "mel": - audio_mel = whisper.log_mel_spectrogram(audio_raw, n_mels=self.mel_size).permute(1, 0) - audio_length = (audio_mel.shape[0] + 1) // 2 # ad-hoc for whisper for 2x downsample from mel to feats - audio_length = audio_length // 5 # ad-hoc for 5x fc downsample - # audio_length = calculate_output_length_1d(audio_length, 5, 5, 0) # ad-hoc for 5x cov1d downsample - if self.fix_length_audio > 0: - audio_length = self.fix_length_audio - audio_pseudo = torch.full((audio_length,), -1) # placeholder - - - if self.dataset_config.use_ocr == True and ocr != None: - prompt = self.prompt_template2.format(ocr) - else: - prompt = self.prompt_template1.format(self.prompt) - # if self.dataset_config.task=="keyword_yizhi": - # if self.dataset_config.use_ocr == False or ocr == None: - # ocr="" - # prompt = self.prompt_template2.format(ocr) - prompt_ids = self.tokenizer.encode(prompt) - prompt_length = len(prompt_ids) - - - if self.inference_mode: - prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64) - example_ids = torch.cat((audio_pseudo, prompt_ids)) # [audio,prompt] - example_mask = example_ids.ge(-1) # [True,True] - - return { - "input_ids": example_ids, - "attention_mask": example_mask, - "audio": audio_raw if self.input_type == "raw" else None, - "audio_mel": audio_mel if self.input_type == "mel" else None, - 'audio_length': audio_length, - 'key': key, - 'target': target, - } - - answer = self.answer_template.format(target) - example = prompt + answer # FIX(MZY): avoid putting a bos token before answer. 
- example_ids = self.tokenizer.encode(example) # [prompt,answer] - example_ids.append(self.tokenizer.eos_token_id) # [prompt,answer,eos] - example_ids = torch.tensor( - example_ids, dtype=torch.int64 - ) - example_ids = torch.cat((audio_pseudo, example_ids)) # [audio,prompt,answer,eos] - - labels_ids = copy.deepcopy(example_ids) # [audio,prompt,answer,eos] - labels_ids[:audio_length + prompt_length] = -1 # [-1,-1,answer,eos]; - example_mask = example_ids.ge(-1) # FIX(GZF): [True,True,True,True] - - label_mask = labels_ids.ge(0) # [False,False,True,True] - example_ids[~example_mask] = 0 # [audio,prompt,answer,eos] - labels_ids[~label_mask] = self.IGNORE_INDEX # [-100,-100,answer,eos] - - return { - "input_ids": example_ids, - "labels": labels_ids, - "attention_mask": example_mask, - "audio": audio_raw if self.input_type == "raw" else None, - "audio_mel": audio_mel if self.input_type == "mel" else None, - 'audio_length': audio_length, - } - - def pad(self, sequence, max_length, padding_idx=0): - if isinstance(sequence, (int, list, tuple)): - if len(sequence) < max_length: - sequence = sequence + [padding_idx] * (max_length - len(sequence)) - else: - sequence = sequence[:max_length] - elif isinstance(sequence, torch.Tensor): - if len(sequence) < max_length: - sequence = torch.cat( - (sequence, torch.full(([max_length - len(sequence)] + list(sequence.size())[1:]), padding_idx))) - else: - sequence = sequence[:max_length] - elif isinstance(sequence, np.ndarray): - if len(sequence) < max_length: - sequence = np.concatenate( - (sequence, np.full((max_length - len(sequence),) + sequence.shape[1:], padding_idx))) - else: - sequence = sequence[:max_length] - else: - raise Exception("Type mismatch during padding!") - return sequence - - def collator(self, samples): - assert samples is not None - input_ids_max_length = max([s['input_ids'].shape[0] for s in samples]) - input_ids = torch.stack([self.pad(s['input_ids'], input_ids_max_length, self.tokenizer.pad_token_id) - for s in samples]) - attention_mask = torch.stack([self.pad(s['attention_mask'], input_ids_max_length, False) - for s in samples]) - if self.input_type == "raw": - audio_raw_max_length = max([s['audio'].shape[0] for s in samples]) - audio_raw = torch.stack([self.pad(s['audio'], audio_raw_max_length, 0) - for s in samples]) - audio_mask = torch.zeros(len(samples), audio_raw_max_length) - for line, sample in enumerate(samples): - audio_mask[line, :sample['audio'].shape[0]] = 1 - elif self.input_type == "mel": - audio_mel_max_length = max([s['audio_mel'].shape[0] for s in samples]) - audio_mel = torch.stack([self.pad(s['audio_mel'], audio_mel_max_length, 0) - for s in samples]) - audio_mel_post_mask = torch.zeros(len(samples), (audio_mel_max_length + 1) // 2) # ad-hoc for whisper for 2x downsample from mel to feats - for line, sample in enumerate(samples): - audio_mel_post_mask[line, :(sample['audio_mel'].shape[0] + 1) // 2] = 1 - - modality_mask = torch.zeros_like(attention_mask) - for line, sample in enumerate(samples): - modality_mask[line, :sample['audio_length']] = 1 - - if self.inference_mode: - keys = [s['key'] for s in samples] - targets = [s['target'] for s in samples] - - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "audio": audio_raw if self.input_type == "raw" else None, - "audio_mask": audio_mask if self.input_type == "raw" else None, - "audio_mel": audio_mel if self.input_type == "mel" else None, - "audio_mel_post_mask": audio_mel_post_mask if self.input_type == "mel" else None, - "modality_mask": 
modality_mask, - "keys": keys, - "targets": targets - } - - labels = torch.stack([self.pad(s['labels'], input_ids_max_length, self.IGNORE_INDEX) - for s in samples]) - return { - "input_ids": input_ids, - "labels": labels, - "attention_mask": attention_mask, - "audio": audio_raw if self.input_type == "raw" else None, - "audio_mask": audio_mask if self.input_type == "raw" else None, - "audio_mel": audio_mel if self.input_type == "mel" else None, - "audio_mel_post_mask": audio_mel_post_mask if self.input_type == "mel" else None, - "modality_mask": modality_mask - } - - -def get_speech_dataset(dataset_config, tokenizer, split): - dataset = SlidespeechDataset(dataset_config, tokenizer, split) - return dataset - - - - \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/config.yaml new file mode 100644 index 00000000..f82f1e08 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/hydra.yaml new file mode 100644 index 00000000..125369cb --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: 
examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - 
++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - ++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/overrides.yaml new file mode 100644 index 00000000..9f17a803 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2149 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/config.yaml new file mode 100644 index 00000000..b4305530 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/hydra.yaml new file mode 100644 index 00000000..480ca03b --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/overrides.yaml new file mode 100644 index 00000000..6abd721f --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2153 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/config.yaml new file mode 100644 index 00000000..41e6be8e --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/hydra.yaml new file mode 100644 index 00000000..1e195ac1 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/overrides.yaml new file mode 100644 index 00000000..40f50702 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2157 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/config.yaml new file mode 100644 index 00000000..21c16ae6 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/hydra.yaml new file mode 100644 index 00000000..6b6e6671 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/overrides.yaml new file mode 100644 index 00000000..0e5528d7 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2201 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/config.yaml new file mode 100644 index 00000000..423ef2a3 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/hydra.yaml new file mode 100644 index 00000000..4bf27e91 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/overrides.yaml new file mode 100644 index 00000000..0e11742a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2206 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/config.yaml new file mode 100644 index 00000000..32c0ab35 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 16 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/hydra.yaml new file mode 100644 index 00000000..75bff7d4 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=16 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=16,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/overrides.yaml new file mode 100644 index 00000000..71708177 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=16 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2211 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/config.yaml new file mode 100644 index 00000000..36f73762 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/hydra.yaml new file mode 100644 index 00000000..d2b1e54f --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/overrides.yaml new file mode 100644 index 00000000..3b547d87 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2215 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/config.yaml new file mode 100644 index 00000000..b27e888b --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/hydra.yaml new file mode 100644 index 00000000..9a1e7865 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/overrides.yaml new file mode 100644 index 00000000..4a19beb1 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2221 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/config.yaml new file mode 100644 index 00000000..744e7a47 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/hydra.yaml new file mode 100644 index 00000000..9384fa96 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/overrides.yaml new file mode 100644 index 00000000..d29645cb --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2225 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/config.yaml new file mode 100644 index 00000000..e4b2819e --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/hydra.yaml new file mode 100644 index 00000000..d395abca --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/overrides.yaml new file mode 100644 index 00000000..711592c7 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2231 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/config.yaml new file mode 100644 index 00000000..3b56ceaf --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/hydra.yaml new file mode 100644 index 00000000..094a835f --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/overrides.yaml new file mode 100644 index 00000000..4b3b16c7 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2235 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/config.yaml new file mode 100644 index 00000000..a0e48750 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/hydra.yaml new file mode 100644 index 00000000..cfb8c370 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/overrides.yaml new file mode 100644 index 00000000..ff51df4d --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2239 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/config.yaml new file mode 100644 index 00000000..724227e6 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/hydra.yaml new file mode 100644 index 00000000..4132c411 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/overrides.yaml new file mode 100644 index 00000000..ccbcbeed --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2246 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/config.yaml new file mode 100644 index 00000000..ae555c88 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/hydra.yaml new file mode 100644 index 00000000..f6439187 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/overrides.yaml new file mode 100644 index 00000000..cc6099f6 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2249 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/config.yaml new file mode 100644 index 00000000..34564df4 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/hydra.yaml new file mode 100644 index 00000000..a63cca40 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/overrides.yaml new file mode 100644 index 00000000..a4239773 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2253 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/config.yaml new file mode 100644 index 00000000..c02cd0f6 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/hydra.yaml new file mode 100644 index 00000000..04f1f48b --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/overrides.yaml new file mode 100644 index 00000000..e5a3bd58 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2258 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/config.yaml new file mode 100644 index 00000000..4b9e6bd9 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/hydra.yaml new file mode 100644 index 00000000..a97ed675 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/overrides.yaml new file mode 100644 index 00000000..522fa5ab --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2301 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/config.yaml new file mode 100644 index 00000000..e8de6392 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/hydra.yaml new file mode 100644 index 00000000..3e9f68ea --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/overrides.yaml new file mode 100644 index 00000000..6f213304 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2307 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/config.yaml new file mode 100644 index 00000000..bb276f44 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/hydra.yaml new file mode 100644 index 00000000..7403e338 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/overrides.yaml new file mode 100644 index 00000000..bc605032 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2312 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/config.yaml new file mode 100644 index 00000000..06bc1fb4 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/hydra.yaml new file mode 100644 index 00000000..0a10ba5a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/overrides.yaml new file mode 100644 index 00000000..fa5f8a80 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2320 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/config.yaml new file mode 100644 index 00000000..3c2b278a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/hydra.yaml new file mode 100644 index 00000000..c9add3cb --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/overrides.yaml new file mode 100644 index 00000000..9b122603 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2324 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/config.yaml new file mode 100644 index 00000000..b62dfb18 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/hydra.yaml new file mode 100644 index 00000000..7c2f0dc5 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/overrides.yaml new file mode 100644 index 00000000..1671c446 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2337 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/config.yaml new file mode 100644 index 00000000..d49437c2 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/hydra.yaml new file mode 100644 index 00000000..88c70400 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/overrides.yaml new file mode 100644 index 00000000..d37b3b60 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250226/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-2344 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/config.yaml new file mode 100644 index 00000000..a1ba6fb3 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/hydra.yaml new file mode 100644 index 00000000..5d0402fc --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/overrides.yaml new file mode 100644 index 00000000..595da866 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0945 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/config.yaml new file mode 100644 index 00000000..7ca52bd0 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/hydra.yaml new file mode 100644 index 00000000..ae288dab --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/overrides.yaml new file mode 100644 index 00000000..49843b12 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-0955 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/config.yaml new file mode 100644 index 00000000..698babc8 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/hydra.yaml new file mode 100644 index 00000000..147c3da6 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/overrides.yaml new file mode 100644 index 00000000..c96bfc78 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1000 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/config.yaml new file mode 100644 index 00000000..d5e18b22 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/hydra.yaml new file mode 100644 index 00000000..1a3516f4 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/overrides.yaml new file mode 100644 index 00000000..5dffefaf --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1004 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/config.yaml new file mode 100644 index 00000000..1d3b7e69 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/hydra.yaml new file mode 100644 index 00000000..85fd3dbd --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/overrides.yaml new file mode 100644 index 00000000..9d787b2a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1013 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/config.yaml new file mode 100644 index 00000000..6c454dc4 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/hydra.yaml new file mode 100644 index 00000000..ee6c33cf --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/overrides.yaml new file mode 100644 index 00000000..45fcebfc --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1022 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/config.yaml new file mode 100644 index 00000000..308dd55a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/hydra.yaml new file mode 100644 index 00000000..b2bc3e92 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/overrides.yaml new file mode 100644 index 00000000..99e72ba8 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1029 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/config.yaml new file mode 100644 index 00000000..31b176b2 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/hydra.yaml new file mode 100644 index 00000000..003fdf5b --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/overrides.yaml new file mode 100644 index 00000000..019cd6cd --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1033 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/config.yaml new file mode 100644 index 00000000..819652f0 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/hydra.yaml new file mode 100644 index 00000000..cb75b788 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/overrides.yaml new file mode 100644 index 00000000..511ce022 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1038 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/config.yaml new file mode 100644 index 00000000..28c296ce --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 10000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/hydra.yaml new file mode 100644 index 00000000..b2b9f817 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=10000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=10000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/overrides.yaml new file mode 100644 index 00000000..638d0995 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=10000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1044 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/config.yaml new file mode 100644 index 00000000..266dec30 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 1000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 0 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/hydra.yaml new file mode 100644 index 00000000..c1a3267b --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=1000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=0 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=0,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=1000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
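Note: the "task:" overrides recorded above are exactly what Hydra replays when a run is reproduced; each "++key=value" entry is composed on top of the structured defaults. A minimal sketch of that composition, assuming OmegaConf is installed and the working directory is one of the .hydra folders added by this diff; the lstrip-based dotlist reconstruction is illustrative only and not part of the training code:

from omegaconf import OmegaConf

# Load the config snapshot Hydra wrote for this run and the raw CLI overrides.
base = OmegaConf.load("config.yaml")          # resolved config for the run
overrides = OmegaConf.load("overrides.yaml")  # list of "++key=value" strings

# Drop the "++" (add-or-override) prefix so the entries form a plain dotlist.
dotlist = [item.lstrip("+") for item in overrides]

# config.yaml already reflects these overrides, so merging them again is a
# no-op here; the point is to show how "++key=value" entries compose onto a base.
cfg = OmegaConf.merge(base, OmegaConf.from_dotlist(dotlist))
print(cfg.train_config.output_dir)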
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/overrides.yaml new file mode 100644 index 00000000..a772a8bc --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=1000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=0 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1122 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/config.yaml new file mode 100644 index 00000000..b0101a30 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 1000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/hydra.yaml new file mode 100644 index 00000000..219450b3 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=1000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=1000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/overrides.yaml new file mode 100644 index 00000000..947df89a --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=1000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1129 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/config.yaml new file mode 100644 index 00000000..468156e5 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/config.yaml @@ -0,0 +1,46 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 1000 + batch_size_training: 10 + val_batch_size: 10 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 +deepspeed_config: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/hydra.yaml new file mode 100644 index 00000000..ef474304 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/hydra.yaml @@ -0,0 +1,197 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - 
++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=1000 + - ++train_config.batch_size_training=10 + - ++train_config.val_batch_size=10 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 + - ++metric=acc + job: + name: finetune_mala_asr_deepspeed + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=10,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=10,++train_config.validation_interval=1000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/overrides.yaml new file mode 100644 index 00000000..029d3f3e --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135/.hydra/overrides.yaml @@ -0,0 +1,42 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=1000 +- ++train_config.batch_size_training=10 +- ++train_config.val_batch_size=10 +- ++train_config.num_workers_dataloader=8 +- 
++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1135 +- ++metric=acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/config.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/config.yaml new file mode 100644 index 00000000..5d1a8a07 --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/config.yaml @@ -0,0 +1,45 @@ +dataset_config: + prompt: Transcribe speech to text. + llm_name: Qwen2.5-7B-Instruct + prompt_style: normal + normalize: true + dataset: aishell-1 + input_type: mel + speed_perturb: false + spec_augmentation: false + add_reverb: false + noise_file_path: '' + mel_size: 128 + pad_or_trim: true + encoder_projector_ds_rate: 5 + train_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + dev_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +train_config: + enable_fsdp: false + enable_ddp: true + use_fp16: true + model_name: mala_asr + num_epochs: 5 + freeze_encoder: true + freeze_llm: true + use_peft: false + batching_strategy: custom + warmup_steps: 1000 + total_steps: 100000 + lr: 5.0e-05 + validation_interval: 50000 + batch_size_training: 2 + val_batch_size: 2 + num_workers_dataloader: 8 + output_dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 +model_config: + llm_name: Qwen2.5-7B-Instruct + llm_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim: 3584 + encoder_name: whisper + encoder_projector_ds_rate: 5 + encoder_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + encoder_dim: 1280 + encoder_projector: linear + normalize: true +metric: acc diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/hydra.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/hydra.yaml new file mode 100644 index 00000000..ac35952d --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/hydra.yaml @@ -0,0 +1,196 @@ +hydra: + run: + dir: examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.run.dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 + - hydra.mode=RUN + task: + - ++train_config.enable_fsdp=false + - ++train_config.enable_ddp=true + - ++train_config.use_fp16=true + - ++model_config.llm_name=Qwen2.5-7B-Instruct + - ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + - ++model_config.llm_dim=3584 + - ++model_config.encoder_name=whisper + - ++model_config.encoder_projector_ds_rate=5 + - ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + - ++model_config.encoder_dim=1280 + - ++model_config.encoder_projector=linear + - ++model_config.normalize=true + - ++dataset_config.llm_name=Qwen2.5-7B-Instruct + - ++dataset_config.prompt_style=normal + - ++dataset_config.normalize=true + - ++dataset_config.dataset=aishell-1 + - ++dataset_config.input_type=mel + - ++dataset_config.speed_perturb=false + - ++dataset_config.spec_augmentation=false + - ++dataset_config.add_reverb=false + - ++dataset_config.noise_file_path= + - ++dataset_config.mel_size=128 + - ++dataset_config.pad_or_trim=true + - ++dataset_config.encoder_projector_ds_rate=5 + - ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ + - ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ + - ++train_config.model_name=mala_asr + - ++train_config.num_epochs=5 + - ++train_config.freeze_encoder=true + - ++train_config.freeze_llm=true + - ++train_config.use_peft=false + - ++train_config.batching_strategy=custom + - ++train_config.warmup_steps=1000 + - ++train_config.total_steps=100000 + - ++train_config.lr=5e-5 + - ++train_config.validation_interval=50000 + - 
++train_config.batch_size_training=2 + - ++train_config.val_batch_size=2 + - ++train_config.num_workers_dataloader=8 + - ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 + - ++metric=acc + job: + name: finetune_mala_asr + chdir: null + override_dirname: ++dataset_config.add_reverb=false,++dataset_config.dataset=aishell-1,++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/,++dataset_config.encoder_projector_ds_rate=5,++dataset_config.input_type=mel,++dataset_config.llm_name=Qwen2.5-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.noise_file_path=,++dataset_config.normalize=true,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.spec_augmentation=false,++dataset_config.speed_perturb=false,++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/,++metric=acc,++model_config.encoder_dim=1280,++model_config.encoder_name=whisper,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=5,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2.5-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct,++model_config.normalize=true,++train_config.batch_size_training=2,++train_config.batching_strategy=custom,++train_config.enable_ddp=true,++train_config.enable_fsdp=false,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.lr=5e-5,++train_config.model_name=mala_asr,++train_config.num_epochs=5,++train_config.num_workers_dataloader=8,++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138,++train_config.total_steps=100000,++train_config.use_fp16=true,++train_config.use_peft=false,++train_config.val_batch_size=2,++train_config.validation_interval=50000,++train_config.warmup_steps=1000 + id: ??? + num: ??? 
+ config_name: prompt.yaml + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/mala_asr_slidespeech/conf + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /aistor/aispeech/hpc_stor01/home/pengjing00sx/SLAM-LLM/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/overrides.yaml b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/overrides.yaml new file mode 100644 index 00000000..30affcce --- /dev/null +++ b/examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138/.hydra/overrides.yaml @@ -0,0 +1,41 @@ +- ++train_config.enable_fsdp=false +- ++train_config.enable_ddp=true +- ++train_config.use_fp16=true +- ++model_config.llm_name=Qwen2.5-7B-Instruct +- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct +- ++model_config.llm_dim=3584 +- ++model_config.encoder_name=whisper +- ++model_config.encoder_projector_ds_rate=5 +- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt +- ++model_config.encoder_dim=1280 +- ++model_config.encoder_projector=linear +- ++model_config.normalize=true +- ++dataset_config.llm_name=Qwen2.5-7B-Instruct +- ++dataset_config.prompt_style=normal +- ++dataset_config.normalize=true +- ++dataset_config.dataset=aishell-1 +- ++dataset_config.input_type=mel +- ++dataset_config.speed_perturb=false +- ++dataset_config.spec_augmentation=false +- ++dataset_config.add_reverb=false +- ++dataset_config.noise_file_path= +- ++dataset_config.mel_size=128 +- ++dataset_config.pad_or_trim=true +- ++dataset_config.encoder_projector_ds_rate=5 +- ++dataset_config.train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/train/ +- ++dataset_config.dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/aishell-1/asr/dev/ +- ++train_config.model_name=mala_asr +- ++train_config.num_epochs=5 +- ++train_config.freeze_encoder=true +- ++train_config.freeze_llm=true +- ++train_config.use_peft=false +- ++train_config.batching_strategy=custom +- ++train_config.warmup_steps=1000 +- ++train_config.total_steps=100000 +- ++train_config.lr=5e-5 +- ++train_config.validation_interval=50000 +- ++train_config.batch_size_training=2 +- ++train_config.val_batch_size=2 +- ++train_config.num_workers_dataloader=8 +- ++train_config.output_dir=examples/mala_asr_slidespeech/exp/aishell-1/20250227/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_normal_asr_speedfalse_specaugfalse-1138 +- 
++metric=acc diff --git a/examples/mala_asr_slidespeech/finetune_mala_asr_deepspeed.py b/examples/mala_asr_slidespeech/finetune_mala_asr_deepspeed.py new file mode 100644 index 00000000..942316d9 --- /dev/null +++ b/examples/mala_asr_slidespeech/finetune_mala_asr_deepspeed.py @@ -0,0 +1,60 @@ +from slam_llm.pipeline.finetune_deepspeed import main as train +from typing import Optional +import argparse +import hydra +import logging +from dataclasses import dataclass, field +from omegaconf import DictConfig, ListConfig, OmegaConf +from mala_asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig +import sys +# import multiprocessing as mp; +@dataclass +class RunConfig: + dataset_config: DataConfig = field(default_factory=DataConfig) + model_config: ModelConfig = field(default_factory=ModelConfig) + train_config: TrainConfig = field(default_factory=TrainConfig) + log_config: LogConfig = field(default_factory=LogConfig) + fsdp_config: FSDPConfig = field(default_factory=FSDPConfig) + debug: bool = field(default=False, metadata={"help": "Use pdb when true"}) + metric: str = field(default="acc", metadata={"help": "The metric for evaluation"}) + ckpt_path: Optional[str] = field( + default=None, metadata={"help": "The path to projector checkpoint"} + ) + deepspeed_config : str ="" +def parse_deepspeed_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=-1) + + args, unknown_args = parser.parse_known_args() + return args, unknown_args +@hydra.main(config_name=None, version_base=None) +def main_hydra(cfg: DictConfig): + run_config = RunConfig() + cfg = OmegaConf.merge(run_config, cfg) + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + train(kwargs) + + +if __name__ == "__main__": + # mp.set_start_method('spawn', force=True) + + deepspeed_args, remaining_args = parse_deepspeed_args() + sys.argv = [sys.argv[0]] + remaining_args # 仅传递 Hydra 能处理的参数 + main_hydra() \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/mala_asr_config.py b/examples/mala_asr_slidespeech/mala_asr_config.py index ba98ec78..6d8b770b 100644 --- a/examples/mala_asr_slidespeech/mala_asr_config.py +++ b/examples/mala_asr_slidespeech/mala_asr_config.py @@ -9,12 +9,14 @@ class ModelConfig: llm_name: str = "vicuna-7b-v1.5" llm_path: str = "PATH/to/LLAMA/7B" llm_type: str = "decoder_only" - llm_dim: int = 4096 + llm_dim: int = 3584 encoder_name: Optional[str] = None encoder_ds_rate: int = 2 encoder_path: Optional[str] = None + encoder_path_hf: Optional[str] = None encoder_dim: int = 1280 encoder_projector: str = "linear" + qformer_layers : int = 8 encoder_projector_ds_rate: int = 5 modal: str = "audio" normalize: Optional[bool] = field(default=False, metadata={ @@ -24,12 +26,23 @@ class ModelConfig: "help": "whether model is only pretrained or finetuned, used for models such as hubert" }) +# @dataclass +# class PeftConfig: +# peft_method: str = "lora" # None , llama_adapter, prefix +# r: int = 8 +# lora_alpha: int = 32 +# target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj"]) +# bias: str = "none" +# 
task_type: str = "CAUSAL_LM" +# lora_dropout: float = 0.05 +# inference_mode: bool = False + @dataclass class PeftConfig: peft_method: str = "lora" # None , llama_adapter, prefix - r: int = 8 - lora_alpha: int = 32 - target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj" ]) + r: int = 64 + lora_alpha: int = 16 + target_modules: List = field(default_factory=lambda: [ "q_proj", "v_proj", "o_proj", "up_proj","gate_proj","down_proj"]) bias: str = "none" task_type: str = "CAUSAL_LM" lora_dropout: float = 0.05 @@ -84,17 +97,31 @@ class TrainConfig: @dataclass class DataConfig: - dataset: str = "slidespeech_dataset" - file: str = "examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py:get_speech_dataset" - train_scp_file_path: str = "/nfs/yangguanrou.ygr/slidespeech/train_L_95/" - dev_scp_file_path: str = "/nfs/yangguanrou.ygr/slidespeech/dev_oracle_v1/" - test_scp_file_path: str = "/nfs/yangguanrou.ygr/slidespeech/test_oracle_v1/" + # dataset: str = "slidespeech_dataset" + dataset: str = "multitask_dataset" + llm_name: str = "vicuna-7b-v1.5" + prompt_style: str = "normal" # instruct + # file: str = "examples/mala_asr_slidespeech/dataset/slidespeech_dataset.py:get_speech_dataset" + file: str = "examples/mala_asr_slidespeech/dataset/multitask_dataset.py:get_speech_dataset" + speed_perturb : bool = False + spec_augmentation : bool = False + add_noise : bool = False + add_reverb : bool = False + noise_file_path: str = "" + train_scp_file_path: str = "" + dev_scp_file_path: str = "" + test_scp_file_path: str = "" train_split: str = "train" - test_split:str = "val" + dev_split: str = "dev" + test_split:str = "test" + pad_or_trim: bool = True prompt: Optional[str] = None use_ocr: bool = True inference_mode: bool = False + prompt_mode : str = "qwen" lower: bool = False + encoder_ds_rate: int = 2 + encoder_projector_ds_rate: int = 5 fix_length_audio: int = -1 inference_mode:bool = False input_type: str = field(default="raw", metadata={ @@ -112,7 +139,7 @@ class FSDPConfig: mixed_precision: bool = True use_fp16: bool = False # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD - sharding_strategy: ShardingStrategy = "NO_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + sharding_strategy: ShardingStrategy = "SHARD_GRAD_OP" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. 
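Note: the PeftConfig defaults updated above (r=64, lora_alpha=16, LoRA applied to the attention and MLP projections) map one-to-one onto a standard peft LoraConfig. A minimal sketch of that mapping, assuming the peft package is available; it mirrors the dataclass fields rather than reproducing SLAM-LLM's own PEFT wiring:

from peft import LoraConfig

# Values copied from the updated PeftConfig defaults in mala_asr_config.py.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    inference_mode=False,
)
# A model would then typically be wrapped with get_peft_model(llm, lora_config).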
fsdp_activation_checkpointing: bool = True fsdp_cpu_offload: bool = False @@ -122,9 +149,9 @@ class FSDPConfig: @dataclass class LogConfig: use_wandb: bool = False - wandb_dir: str = "/root/test_wandb" + wandb_dir: str = "tmp/test_wandb" wandb_entity_name: str = "project_name" wandb_project_name: str = "project_name" wandb_exp_name: str = "exp_name" - log_file: str = "/root/test.log" + log_file: str = "tmp/test.log" log_interval: int = 5 diff --git a/examples/mala_asr_slidespeech/model/slam_model_mala_asr.py b/examples/mala_asr_slidespeech/model/slam_model_mala_asr.py index 0910d2ed..f544a7ed 100644 --- a/examples/mala_asr_slidespeech/model/slam_model_mala_asr.py +++ b/examples/mala_asr_slidespeech/model/slam_model_mala_asr.py @@ -95,7 +95,7 @@ def inference( ): # inference for asr model - device = kwargs.get("device", "cuda") + device = kwargs.get("device", "npu") if os.path.exists(wav_path): # Audio-Text QA import whisper diff --git a/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR.sh b/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR.sh new file mode 100644 index 00000000..6d0a036c --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR.sh @@ -0,0 +1,133 @@ +#!/bin/bash +#export PYTHONPATH=/root/whisper:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +set -e +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech + +dataset=aishell-1 +prompt_style=instruct # normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech-clean || $dataset == librispeech-other || $dataset == alimeeting ]] +then + # aishell-1:asr hotword + # aishell-2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + dataset_task=prevtext +fi +projector=linear +encoder_name=whisper +sentence=connect +llm_name=Qwen2.5-7B-Instruct +use_peft=true +use_fp16=true +pad_or_trim=true +encoder_projector_ds_rate=5 +ckpt_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/exp/aishell-1/20250211/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padtrue_instruct_asr_speedfalse_specaugfalse-1136/mala_asr_epoch_4_step_7482/model.pt +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/aishell1/20241230/whisper_linear_Qwen2.5-7B-Instruct_loratrue_instruct-1857/mala_asr_epoch_1_step_4000/ + +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-Pt/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt + mel_size=80 + else + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/whisper-large-v3/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/vicuna-7b-v1.5 + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + 
llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2-7B + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path=/hpc_stor01/home/yangui.fang_sx/workingspace/model/Qwen2.5-7B + llm_dim=3584 +else + exit 1 +fi + +if [[ $dataset == "aishell-1" || $dataset == "aishell-2" || $dataset == "alimeeting" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/test/ +elif [[ $dataset == "librispeech-other" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-other/ +elif [[ $dataset == "librispeech-clean" ]] +then + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/test-clean/ +else + test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/test/ +fi + +decode_log=$ckpt_path/decode_${dataset}_${dataset_task}_${prompt_style} +# -m debugpy --listen 5678 --wait-for-client +python $code_dir/inference_mala_asr_batch.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + hydra.run.dir=$ckpt_path \ + ++model_config.llm_name=$llm_name \ + ++model_config.llm_path=$llm_path \ + ++model_config.llm_dim=$llm_dim \ + ++model_config.encoder_name=$encoder_name \ + ++model_config.normalize=true \ + ++model_config.encoder_projector_ds_rate=5 \ + ++model_config.encoder_path=$speech_encoder_path \ + ++model_config.encoder_dim=$encoder_dim \ + ++model_config.encoder_projector=$projector \ + ++dataset_config.llm_name=$llm_name \ + ++dataset_config.prompt_style=$prompt_style \ + ++dataset_config.dataset=$dataset \ + ++dataset_config.pad_or_trim=$pad_or_trim \ + ++dataset_config.test_scp_file_path=$test_scp_file_path \ + ++dataset_config.input_type=$input_type \ + ++dataset_config.mel_size=$mel_size \ + ++dataset_config.inference_mode=true \ + ++train_config.model_name=mala_asr \ + ++train_config.freeze_encoder=true \ + ++train_config.freeze_llm=true \ + ++train_config.use_peft=$use_peft \ + ++train_config.batching_strategy=custom \ + ++train_config.num_epochs=1 \ + ++train_config.val_batch_size=8 \ + ++train_config.num_workers_dataloader=8\ + ++train_config.output_dir=$output_dir \ + ++decode_log=$decode_log \ + ++ckpt_path=$ckpt_path/model.pt + + +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/wenet_compute_cer.py --char=1 -v=1 ${decode_log}_gt ${decode_log}_pred > ${decode_log}_cer +python /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/tools/pyResults/pyResults.py ${decode_log}_gt ${decode_log}_pred > ${decode_log}_ser +# python "/hpc_stor01/home/yangui.fang_sx/workingspace/SLAM-LLM/examples/mala_asr_slidespeech/slam_llm/utils/compute_wer.py" ${decode_log}_gt ${decode_log}_pred ${decode_log}_ser diff --git a/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR_withkeywords_L95.sh b/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR_withkeywords_L95.sh deleted file mode 100644 index c922691b..00000000 --- a/examples/mala_asr_slidespeech/scripts/decode_MaLa-ASR_withkeywords_L95.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -#export PYTHONPATH=/root/whisper:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/mala_asr_slidespeech - -speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 - 
-output_dir=/nfs/yangguanrou.ygr/experiments_slides_wavlm/slides-finetune-wavlm -ckpt_path=$output_dir/asr/3840 -split=test #dev -val_data_path=/nfs/yangguanrou.ygr/slidespeech/${split}_oracle_v1/ -decode_log=$ckpt_path/decode_${split}_beam4 - -# -m debugpy --listen 5678 --wait-for-client -python $code_dir/inference_mala_asr_batch.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - hydra.run.dir=$ckpt_path \ - ++model_config.llm_name="vicuna-7b-v1.5" \ - ++model_config.llm_path=$llm_path \ - ++model_config.llm_dim=4096 \ - ++model_config.encoder_name=wavlm \ - ++model_config.normalize=true \ - ++dataset_config.normalize=true \ - ++model_config.encoder_projector_ds_rate=5 \ - ++model_config.encoder_path=$speech_encoder_path \ - ++model_config.encoder_dim=1024 \ - ++model_config.encoder_projector=cov1d-linear \ - ++dataset_config.dataset=slidespeech_dataset \ - ++dataset_config.use_ocr=true \ - ++dataset_config.dev_scp_file_path=$val_data_path \ - ++dataset_config.input_type=raw \ - ++dataset_config.inference_mode=true \ - ++train_config.model_name=mala_asr \ - ++train_config.freeze_encoder=true \ - ++train_config.freeze_llm=true \ - ++train_config.batching_strategy=custom \ - ++train_config.num_epochs=1 \ - ++train_config.val_batch_size=1 \ - ++train_config.num_workers_dataloader=2 \ - ++train_config.output_dir=$output_dir \ - ++decode_log=$decode_log \ - ++ckpt_path=$ckpt_path/model.pt \ \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR.sh new file mode 100644 index 00000000..9d0fe889 --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech +# multitask +# dataset=alimeeting +# multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == 
false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ 
+++train_config.lr=5e-5 \ +++train_config.validation_interval=50000 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 8 \ + --master_port=29505 \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed.sh new file mode 100644 index 00000000..f2772a94 --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed.sh @@ -0,0 +1,183 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech +# multitask +# dataset=alimeeting +# dataset=multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + 
speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=10 \ +++train_config.val_batch_size=10 \ 
+++train_config.num_workers_dataloader=0 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + deepspeed \ + --num_nodes 1 \ + --num_gpus 8 \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +fi diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh new file mode 100644 index 00000000..12599310 --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh @@ -0,0 +1,198 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false + +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech +# multitask +# dataset=alimeeting +# dataset=multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 
+ fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=1000 \ +++train_config.batch_size_training=10 \ +++train_config.val_batch_size=10 \ +++train_config.num_workers_dataloader=0 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == 
false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --hostfile $HOST_FILE \ + --ssh_port $SSH_PORT \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} \ + + +} + +if [ "${RANK}" = "0" ]; then #只在index为RANK0的POD上启动主训练脚本,其他节点由主节点通过ssh分发任务($RANK由调度系统自动分配) + gen_hostfile #生成分布式训练需要的hostfile + do_train #启动训练 +else + /usr/sbin/sshd -D -p ${SSH_PORT} #其他节点的task,仅前台运行sshd服务,不执行主训练脚本,不要修改 +fi \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_nossh.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_nossh.sh new file mode 100644 index 00000000..1b1cb1dc --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_deepspeed_nossh.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false + +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech +# multitask +# dataset=alimeeting +# dataset=multitask_asr +dataset=aishell-1 +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config_from_k2.json +deepspeed_config=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/conf/ds_config.json +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == 
"true" || $freeze_encoder == false ]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ 
+++train_config.total_steps=100000 \ +++train_config.lr=5e-5 \ +++train_config.validation_interval=1000 \ +++train_config.batch_size_training=10 \ +++train_config.val_batch_size=10 \ +++train_config.num_workers_dataloader=8 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + + +HOST_FILE="/tmp/"${JobID} #生成的hostfile的完整文件名,$JobID调度系统会自动生成 +SSH_PORT=6666 #因调度系统强制普通用户身份起容器,需要将ssh端口指定为大于1024的值 + +gen_hostfile() { #此函数负责生成hostfile, 已跟调度系统对接好,直接使用,不要修改 + echo "${VC_MASTER_HOSTS} slots=${GPU_PER_TASK}" > ${HOST_FILE} + echo "${VC_WORKER_HOSTS}" | awk -F ',' -v gpu_num=$GPU_PER_TASK '{for (i=1; i<=NF; i++) print $i" slots="gpu_num}' >> ${HOST_FILE} +} + +do_train() { + cat $HOST_FILE #训练主入口函数 + /usr/sbin/sshd -p ${SSH_PORT} #在Rank0上后台启动sshd服务,不要修改 + deepspeed \ + --node_rank=$RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT \ + --hostfile $HOST_FILE \ + --no_ssh \ + $code_dir/finetune_mala_asr_deepspeed.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ++deepspeed_config=$deepspeed_config \ + ${hydra_args} +} + +gen_hostfile #生成分布式训练需要的hostfile +do_train #启动训练 \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_multiple_node.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_multiple_node.sh new file mode 100644 index 00000000..be918b0a --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_multiple_node.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# export PYTHONPATH=/root/fairseq:$PYTHONPATH +export ASCEND_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TOKENIZERS_PARALLELISM=false +# export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 +export OMP_NUM_THREADS=1 + +# debug setting for multiple gpus +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +# export TORCH_DISTRIBUTED_DEBUG=INFO + + +run_dir=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/ +cd $run_dir +code_dir=examples/mala_asr_slidespeech +# multitask +# dataset=alimeeting +dataset=multitask_asr +prompt_style=normal #instruct +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + # aishell1:asr hotword + # aisehll2:asr hotword mt + # librispeech:asr prevtext mt + # alimeeting: asr_far_bf asr_near + # gigaspeech: asr + # wenetspeech: asr + dataset_task=asr +fi +projector=linear +encoder_name=whisper +llm_name=Qwen2.5-7B-Instruct +use_peft=false +use_fp16=true +freeze_encoder=true +pad_or_trim=true +encoder_projector_ds_rate=5 +# enhance +# enhance +speed_perturb=false +spec_augmentation=false +add_noise=false +add_reverb=false +# /hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250111/whisper_linear_Qwen2.5-7B-Instruct_lorafalse_padfalse_normal__speedtrue_specaugfalse-1106/mala_asr_epoch_2_step_1299/ +# ckpt_path=/hpc_stor01/home/yangui.fang_sx/workingspace/project/mala_asr_slidespeech_origin/exp/multitask_asr/20250114/whisper_linear_Qwen2.5-7B-Instruct_loratrue_padtrue_normal__speedfalse_specaugfalse-2018_Upper_/mala_asr_epoch_2_step_12299 +if [[ $encoder_name == "whisper" ]] +then + encoder_finetune=false +fi +if [[ $use_peft == "true" || $freeze_encoder == false 
]];then + ckpt_path= +fi + +# Choose Encoder +if [[ $encoder_name == "whisper" ]] +then + if [[ $encoder_finetune == true ]] + then + speech_encoder_path= + mel_size=80 + else + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/whisper/large-v3.pt + mel_size=128 + fi + encoder_dim=1280 + input_type=mel + +elif [[ $encoder_name == "wavlm" ]] +then + speech_encoder_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/wavlm/WavLM-Large.pt + encoder_dim=1024 + input_type=raw + mel_size=128 +else + exit 1 +fi + +# Choose LLM +if [[ $llm_name == "vicuna-7b-v1.5" ]] +then + llm_path= + llm_dim=4096 +elif [[ $llm_name == "Qwen2.5-7B-Instruct" ]] +then + llm_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/model/Qwen2.5-7B-Instruct + llm_dim=3584 +elif [[ $llm_name == "Qwen2-7B" ]] +then + llm_path= + llm_dim=3584 +elif [[ $llm_name == "Qwen2.5-7B" ]] +then + llm_path= + llm_dim=3584 +else + exit 1 +fi + +# Choose Train/Dev/Test +if [[ $dataset == aishell-1 || $dataset == aishell-2 || $dataset == librispeech || $dataset == alimeeting || $dataset == gigaspeech || $dataset == wenetspeech ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/${dataset_task}/dev/ +elif [[ $dataset == "librispeech" ]] +then + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/librispeech/${dataset_task}/dev-other/ +else + train_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/train/ + dev_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/${dataset}/dev/ +fi + +output_dir=${code_dir}/exp/${dataset}/$(date +"%Y%m%d")/${encoder_name}_${projector}_${llm_name}_lora${use_peft}_pad${pad_or_trim}_${prompt_style}_${dataset_task}_speed${speed_perturb}_specaug${spec_augmentation}-$(date +"%H%M") +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=$llm_name \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=$llm_dim \ +++model_config.encoder_name=$encoder_name \ +++model_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_dim=$encoder_dim \ +++model_config.encoder_projector=$projector \ +++model_config.normalize=true \ +++dataset_config.llm_name=$llm_name \ +++dataset_config.prompt_style=$prompt_style \ +++dataset_config.normalize=true \ +++dataset_config.dataset=$dataset \ +++dataset_config.input_type=$input_type \ +++dataset_config.speed_perturb=$speed_perturb \ +++dataset_config.spec_augmentation=$spec_augmentation \ +++dataset_config.add_reverb=$add_reverb \ +++dataset_config.noise_file_path=$noise_file_path \ +++dataset_config.mel_size=$mel_size \ +++dataset_config.pad_or_trim=$pad_or_trim \ +++dataset_config.encoder_projector_ds_rate=$encoder_projector_ds_rate \ +++dataset_config.train_scp_file_path=$train_scp_file_path \ +++dataset_config.dev_scp_file_path=$dev_scp_file_path \ +++train_config.model_name=mala_asr \ +++train_config.num_epochs=5 \ +++train_config.freeze_encoder=$freeze_encoder \ +++train_config.freeze_llm=true \ +++train_config.use_peft=$use_peft \ +++train_config.batching_strategy=custom \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=100000 \ 
+++train_config.lr=5e-5 \ +++train_config.validation_interval=10000 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=2 \ +++train_config.num_workers_dataloader=4 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +" +if [[ $use_peft == "true" || $freeze_encoder == false ]];then + hydra_args+="++ckpt_path=$ckpt_path/model.pt" +fi +# hydra_args+="++ckpt_path=$ckpt_path/model.pt" + +# -m debugpy --listen 5678 --wait-for-client +if [[ $ASCEND_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 2 \ + --nproc_per_node 8 \ + --node_rank=$RANK \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + $code_dir/finetune_mala_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++train_config.use_fp16=$use_fp16 \ + ${hydra_args} +fi diff --git a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_withkeywords_L95.sh b/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_withkeywords_L95.sh deleted file mode 100644 index 60c871c4..00000000 --- a/examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_withkeywords_L95.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -export PYTHONPATH=/root/fairseq:$PYTHONPATH -export CUDA_VISIBLE_DEVICES=0 -export TOKENIZERS_PARALLELISM=false -# export CUDA_LAUNCH_BLOCKING=1 -export OMP_NUM_THREADS=1 - -# debug setting for multiple gpus -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=ALL -# export TORCH_DISTRIBUTED_DEBUG=INFO - -run_dir=/root/SLAM-LLM -cd $run_dir -code_dir=examples/mala_asr_slidespeech - -speech_encoder_path=/nfs/maziyang.mzy/models/wavlm/WavLM-Large.pt -llm_path=/nfs/maziyang.mzy/models/vicuna-7b-v1.5 -output_dir=/root/tmp/finetune_MaLa-ASR_withkeywords_L95-$(date +"%Y%m%d") - -hydra_args=" -hydra.run.dir=$output_dir \ -++model_config.llm_name=vicuna-7b-v1.5 \ -++model_config.llm_path=$llm_path \ -++model_config.llm_dim=4096 \ -++model_config.encoder_name=wavlm \ -++model_config.normalize=true \ -++dataset_config.normalize=true \ -++model_config.encoder_projector_ds_rate=5 \ -++model_config.encoder_path=$speech_encoder_path \ -++model_config.encoder_dim=1024 \ -++model_config.encoder_projector=cov1d-linear \ -++dataset_config.dataset=slidespeech_dataset \ -++dataset_config.input_type=raw \ -++dataset_config.use_ocr=true \ -++train_config.model_name=mala_asr \ -++train_config.num_epochs=5 \ -++train_config.freeze_encoder=true \ -++train_config.freeze_llm=true \ -++train_config.batching_strategy=custom \ -++train_config.warmup_steps=1000 \ -++train_config.total_steps=110000 \ -++train_config.lr=5e-5 \ -++train_config.validation_interval=2000 \ -++train_config.batch_size_training=6 \ -++train_config.val_batch_size=6 \ -++train_config.num_workers_dataloader=2 \ -++train_config.output_dir=$output_dir \ -++metric=acc \ -" - -# -m debugpy --listen 5678 --wait-for-client -if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then - python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_mala_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - $hydra_args -else - torchrun \ - --nnodes 1 \ - --nproc_per_node 4 \ - --master_port=29503 \ - $code_dir/finetune_mala_asr.py \ - --config-path "conf" \ - --config-name "prompt.yaml" \ - ++train_config.enable_fsdp=false \ - ++train_config.enable_ddp=true \ - ++train_config.use_fp16=true \ - $hydra_args 
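The multi-node variants added above all expect the scheduler to supply rendezvous information: finetune_MaLa-ASR_multiple_node.sh reads RANK, MASTER_ADDR and MASTER_PORT for torchrun, while the deepspeed multinode/nossh variants additionally build a hostfile of the form `host slots=N` from VC_MASTER_HOSTS, VC_WORKER_HOSTS and GPU_PER_TASK. As a rough sketch of a manual two-node launch (hostnames, port, device count and the wrapper script name are made-up placeholders; on the real cluster these values come from the scheduler, and the deepspeed variants write their hostfile to /tmp/${JobID} rather than a fixed path):

```bash
#!/bin/bash
# Hypothetical launch.sh, run once per node: "bash launch.sh 0" on node 0, "bash launch.sh 1" on node 1.
# All values below are placeholders standing in for what the scheduler would export.
export MASTER_ADDR=node-0.example.internal   # host of rank 0
export MASTER_PORT=29500
export RANK=$1                               # node rank passed as the first argument
export GPU_PER_TASK=8

# Same "host slots=N" format that gen_hostfile() produces for deepspeed.
cat > /tmp/hostfile <<EOF
node-0.example.internal slots=${GPU_PER_TASK}
node-1.example.internal slots=${GPU_PER_TASK}
EOF

bash examples/mala_asr_slidespeech/scripts/finetune_MaLa-ASR_multiple_node.sh
```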
-fi diff --git a/examples/mala_asr_slidespeech/scripts/hostfile b/examples/mala_asr_slidespeech/scripts/hostfile new file mode 100644 index 00000000..e69de29b diff --git a/examples/mala_asr_slidespeech/scripts/vc b/examples/mala_asr_slidespeech/scripts/vc new file mode 100644 index 00000000..d42d2df8 --- /dev/null +++ b/examples/mala_asr_slidespeech/scripts/vc @@ -0,0 +1,12 @@ +# vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc-base/ai_base-pytorch-for-ascend:cann8.0.0-torch2.1.0-py3.10 -c 1 -m 1G -n 4 -g 1 --cmd "/usr/sbin/sshd -D -p 6666" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 20 -m 100G -n 4 -g 1 --cmd "/usr/sbin/sshd -D -p 6666" +vc submit -p pdgpu-aispeech-ai -i asr-yanguifang-llm-notebook:Qwen2-Audio -c 100 -m 300G -g 2 --cmd "/usr/sbin/sshd -D -p 6666" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 20 -m 100G -n 4 -g 2 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_multiple_node.sh" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 100 -m 200G -n 2 -g 7 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_multiple_node.sh > /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/train.log" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 100 -m 200G -n 2 -g 8 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_deepspeed.sh > /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/train.log" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 160 -m 900G -n 2 -g 8 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_deepspeed.sh > /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/train.log" +vc submit -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.4 -c 160 -m 900G -n 2 -g 8 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_deepspeed.sh > /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/train.log" +vc list -p pdgpu-aispeech-ai --pri +vc describe + +vc submit -t deepspeed --enable-rdma 1 -p pdgpu-aispeech-ai -i hub.szaic.com/hpc/ai_asr-yanguifang-llm-notebook:v1.5 -c 160 -m 900G -n 2 -g 8 --cmd "bash /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/finetune_MaLa-ASR_deepspeed_multinode.sh > /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/mala-asr/scripts/train.log" \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/slam_llm b/examples/mala_asr_slidespeech/slam_llm new file mode 120000 index 00000000..dfce99a2 --- /dev/null +++ b/examples/mala_asr_slidespeech/slam_llm @@ -0,0 +1 @@ +/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/github/SLAM-LLM/src/slam_llm \ No newline at end of file diff --git a/examples/mala_asr_slidespeech/test.py b/examples/mala_asr_slidespeech/test.py new file mode 100644 index 00000000..9f4feea2 --- /dev/null +++ b/examples/mala_asr_slidespeech/test.py @@ -0,0 +1,19 @@ +import torch +import torch_npu +import deepspeed +import torch.distributed as dist +import os +# dist.init_process_group( +# backend='hccl', # 
使用NCCL后端(GPU场景) +# ) +# deepspeed.init_distributed( +# dist_backend='hccl', # 使用NCCL后端(GPU场景) +# ) +local_rank = os.environ["LOCAL_RANK"] +torch.npu.set_device(f"npu:{local_rank}") # 绑定当前NPU +deepspeed.init_distributed( +dist_backend='hccl', # 使用NCCL后端(GPU场景) +) +tensor = torch.tensor([1.0]).npu() +dist.all_reduce(tensor, op=dist.ReduceOp.SUM) +print(f"Rank {dist.get_rank()}: {tensor.item()}") \ No newline at end of file diff --git a/examples/st_covost2/conf/ds_config.json b/examples/st_covost2/conf/ds_config.json old mode 100755 new mode 100644 diff --git a/examples/st_covost2/deepspeed_finetune_asr.py b/examples/st_covost2/deepspeed_finetune_asr.py old mode 100755 new mode 100644 diff --git a/examples/st_covost2/scripts/mmt.sh b/examples/st_covost2/scripts/mmt.sh new file mode 100644 index 00000000..b0f7394d --- /dev/null +++ b/examples/st_covost2/scripts/mmt.sh @@ -0,0 +1,121 @@ +export CUDA_VISIBLE_DEVICES=0,1 +export TOKENIZERS_PARALLELISM=false +export WANDB_MODE=offline +# export HYDRA_FULL_ERROR=1 + +your_code=/code + +your_data=/userhome + + +source=covost_enenzh +checkpoint_dir=${your_data}/speech/data/qwen/asr-pretrain +output_dir=${your_data}/speech/data/qwen/mmt + + +# speech_encoder_path=${your_data}/speech/models/whisper/large-v3.pt +encoder_path_hf=${your_data}/speech/models/whisper-large-v3 +llm_path=${your_data}/speech/models/Qwen2-7B + +train_data_path=${your_data}/speech/data/qwen/train_spt_0926.jsonl +val_data_path=${your_data}/speech/data/qwen/dev_spt_0926.jsonl + + + + +run_dir=${your_code}/SLAM-LLM +cd $run_dir +code_dir=examples/st_covost2 + + + + + + + + + +# 查找以asr_epoch_开头的目录,提取epoch和step,并找出最大的epoch和step +max_epoch=$(ls -d ${checkpoint_dir}/asr_epoch_*_step_* | sed -n 's/.*asr_epoch_\([0-9]*\)_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) +max_step=$(ls -d ${checkpoint_dir}/asr_epoch_${max_epoch}_step_* | sed -n 's/.*asr_epoch_[0-9]*_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) + +# 构建最终的路径 +final_path="${checkpoint_dir}/asr_epoch_${max_epoch}_step_${max_step}" + + +ckpt_name=$final_path/model.pt + +# 使用find命令搜索所有.pt文件,并获取最后修改日期最晚的文件 + + + + + + +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=Qwen \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=3584 \ +++model_config.encoder_name=whisper \ +++model_config.encoder_projector_ds_rate=5 \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_path_hf=$encoder_path_hf \ +++model_config.encoder_dim=1280 \ +++model_config.encoder_projector=q-former \ +++model_config.query_len=80 \ +++dataset_config.dataset=st_dataset \ +++dataset_config.train_data_path=$train_data_path \ +++dataset_config.val_data_path=$val_data_path \ +++dataset_config.input_type=mel \ +++dataset_config.mel_size=128 \ +++dataset_config.fix_length_audio=80 \ +++dataset_config.source=$source \ +++train_config.model_name=asr \ +++train_config.num_epochs=10 \ +++train_config.freeze_encoder=true \ +++train_config.freeze_llm=true \ +++train_config.batching_strategy=custom \ +++train_config.gradient_accumulation_steps=1 \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=1000000 \ +++train_config.lr=1e-4 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=8 \ +++train_config.num_workers_dataloader=16 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +++train_config.use_fp16=false \ +++model_config.ckpt_path=$ckpt_name \ +" + + +# + + + +# -m debugpy --listen 5678 --wait-for-client +if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 
--wait-for-client $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29504 \ + $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++fsdp_config.pure_bf16=true \ + ++log_config.use_wandb=false \ + ++log_config.wandb_project_name=SLAM \ + ++train_config.validation_interval=2000 \ + ++log_config.wandb_exp_name=asr \ + ++train_config.use_peft=false \ + $hydra_args +fi + \ No newline at end of file diff --git a/examples/st_covost2/scripts/srt.sh b/examples/st_covost2/scripts/srt.sh new file mode 100644 index 00000000..482d290f --- /dev/null +++ b/examples/st_covost2/scripts/srt.sh @@ -0,0 +1,115 @@ +export CUDA_VISIBLE_DEVICES=0,1 +export TOKENIZERS_PARALLELISM=false +export WANDB_MODE=offline +# export HYDRA_FULL_ERROR=1 + +your_code=/code +your_data=/userhome + + +source=covost_enzh + +checkpoint_dir=${your_data}/speech/data/qwen/spt-all-7B-mul6 +output_dir=${your_data}/speech/data/qwen/srt + + +# speech_encoder_path=${your_data}/speech/models/whisper/large-v3.pt +encoder_path_hf=${your_data}/speech/models/whisper-large-v3 +llm_path=${your_data}/speech/models/Qwen2-7B + +train_data_path=${your_data}/speech/data/qwen/train_spt_0926.jsonl +val_data_path=${your_data}/speech/data/qwen/test_spt_0926.jsonl + + + + +run_dir=${your_code}/SLAM-LLM +cd $run_dir +code_dir=examples/st_covost2 + + + +# 查找以asr_epoch_开头的目录,提取epoch和step,并找出最大的epoch和step +max_epoch=$(ls -d ${checkpoint_dir}/asr_epoch_*_step_* | sed -n 's/.*asr_epoch_\([0-9]*\)_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) +max_step=$(ls -d ${checkpoint_dir}/asr_epoch_${max_epoch}_step_* | sed -n 's/.*asr_epoch_[0-9]*_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) + +# 构建最终的路径 +final_path="${checkpoint_dir}/asr_epoch_${max_epoch}_step_${max_step}" + + +ckpt_name=$final_path/model.pt + +# 使用find命令搜索所有.pt文件,并获取最后修改日期最晚的文件 + + + + + + +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=Qwen \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=3584 \ +++model_config.encoder_name=whisper \ +++model_config.encoder_projector_ds_rate=5 \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_path_hf=$encoder_path_hf \ +++model_config.encoder_dim=1280 \ +++model_config.encoder_projector=q-former \ +++model_config.query_len=80 \ +++dataset_config.dataset=st_dataset \ +++dataset_config.train_data_path=$train_data_path \ +++dataset_config.val_data_path=$val_data_path \ +++dataset_config.input_type=mel \ +++dataset_config.mel_size=128 \ +++dataset_config.fix_length_audio=80 \ +++dataset_config.source=$source \ +++train_config.model_name=asr \ +++train_config.num_epochs=10 \ +++train_config.freeze_encoder=true \ +++train_config.freeze_llm=true \ +++train_config.batching_strategy=custom \ +++train_config.gradient_accumulation_steps=1 \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=1000000 \ +++train_config.lr=1e-4 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=8 \ +++train_config.num_workers_dataloader=16 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +++train_config.use_fp16=false \ +++model_config.ckpt_path=$ckpt_name \ +" + + +# + + + +# -m debugpy --listen 5678 --wait-for-client +if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ + --config-path "conf" \ + 
--config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + --nproc_per_node 2 \ + --master_port=29504 \ + $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++fsdp_config.pure_bf16=true \ + ++log_config.use_wandb=false \ + ++log_config.wandb_project_name=SLAM \ + ++train_config.validation_interval=2000 \ + ++log_config.wandb_exp_name=asr \ + ++train_config.use_peft=false \ + $hydra_args +fi + \ No newline at end of file diff --git a/examples/st_covost2/scripts/zsrt.sh b/examples/st_covost2/scripts/zsrt.sh new file mode 100644 index 00000000..a781cfdf --- /dev/null +++ b/examples/st_covost2/scripts/zsrt.sh @@ -0,0 +1,120 @@ +export CUDA_VISIBLE_DEVICES=0,1 +export TOKENIZERS_PARALLELISM=false +export WANDB_MODE=offline +# export HYDRA_FULL_ERROR=1 + +your_code=/code + +your_data=/userhome + + +source=all + + +checkpoint_dir=${your_data}/speech/data/qwen/srt +output_dir=${your_data}/speech/data/qwen/final + +# speech_encoder_path=${your_data}/speech/models/whisper/large-v3.pt +encoder_path_hf=${your_data}/speech/models/whisper-large-v3 +llm_path=${your_data}/speech/models/Qwen2-7B + +train_data_path=${your_data}/speech/data/qwen/train_spt_0926.jsonl +val_data_path=${your_data}/speech/data/qwen/dev_spt_0926.jsonl + + +run_dir=${your_code}/SLAM-LLM +cd $run_dir +code_dir=examples/st_covost2 + + + + + + + + + +# 查找以asr_epoch_开头的目录,提取epoch和step,并找出最大的epoch和step +max_epoch=$(ls -d ${checkpoint_dir}/asr_epoch_*_step_* | sed -n 's/.*asr_epoch_\([0-9]*\)_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) +max_step=$(ls -d ${checkpoint_dir}/asr_epoch_${max_epoch}_step_* | sed -n 's/.*asr_epoch_[0-9]*_step_\([0-9]*\).*/\1/p' | sort -n | tail -1) + +# 构建最终的路径 +final_path="${checkpoint_dir}/asr_epoch_${max_epoch}_step_${max_step}" + + +ckpt_name=$final_path/model.pt + +# 使用find命令搜索所有.pt文件,并获取最后修改日期最晚的文件 + + + + + + +hydra_args=" +hydra.run.dir=$output_dir \ +++model_config.llm_name=Qwen \ +++model_config.llm_path=$llm_path \ +++model_config.llm_dim=3584 \ +++model_config.encoder_name=whisper \ +++model_config.encoder_projector_ds_rate=5 \ +++model_config.encoder_path=$speech_encoder_path \ +++model_config.encoder_path_hf=$encoder_path_hf \ +++model_config.encoder_dim=1280 \ +++model_config.encoder_projector=q-former \ +++model_config.query_len=80 \ +++dataset_config.dataset=st_dataset \ +++dataset_config.train_data_path=$train_data_path \ +++dataset_config.val_data_path=$val_data_path \ +++dataset_config.input_type=mel \ +++dataset_config.mel_size=128 \ +++dataset_config.fix_length_audio=80 \ +++dataset_config.source=$source \ +++train_config.model_name=asr \ +++train_config.num_epochs=10 \ +++train_config.freeze_encoder=true \ +++train_config.freeze_llm=true \ +++train_config.batching_strategy=custom \ +++train_config.gradient_accumulation_steps=1 \ +++train_config.warmup_steps=1000 \ +++train_config.total_steps=1000000 \ +++train_config.lr=1e-4 \ +++train_config.batch_size_training=2 \ +++train_config.val_batch_size=8 \ +++train_config.num_workers_dataloader=16 \ +++train_config.output_dir=$output_dir \ +++metric=acc \ +++train_config.use_fp16=false \ +++model_config.ckpt_path=$ckpt_name \ +" + + +# + + + +# -m debugpy --listen 5678 --wait-for-client +if [[ $CUDA_VISIBLE_DEVICES != *","* ]]; then + python -m debugpy --listen 5678 --wait-for-client $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + $hydra_args +else + torchrun \ + --nnodes 1 \ + 
--nproc_per_node 2 \ + --master_port=29504 \ + $code_dir/finetune_asr.py \ + --config-path "conf" \ + --config-name "prompt.yaml" \ + ++train_config.enable_fsdp=false \ + ++train_config.enable_ddp=true \ + ++fsdp_config.pure_bf16=true \ + ++log_config.use_wandb=false \ + ++log_config.wandb_project_name=SLAM \ + ++train_config.validation_interval=2000 \ + ++log_config.wandb_exp_name=asr \ + ++train_config.use_peft=false \ + $hydra_args +fi + \ No newline at end of file diff --git a/examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b.sh b/examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b.sh old mode 100755 new mode 100644 diff --git a/examples/vsr_LRS3/scripts/finetune_avhubert_vo_vicuna_7b.sh b/examples/vsr_LRS3/scripts/finetune_avhubert_vo_vicuna_7b.sh old mode 100755 new mode 100644 diff --git a/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json b/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json new file mode 100644 index 00000000..5c160259 --- /dev/null +++ b/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940", + "binFileSuffix":".o", + "blockDim":2, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"e050193a869bd841cad60168441a622c9aea0f195e27d4a9040dae48818b47c3" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o b/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o new file mode 100644 index 00000000..f5f138e0 Binary files /dev/null and b/kernel_meta/kernel_meta_11566477603104138165/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o differ diff --git a/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o 
b/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_12330616035522133321/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_12829597552062293254/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_14304587336974932360/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git 
a/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_14963073204038372643/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_16748606091404323938/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 
+1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_17928503831894259399/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_1804218815047843948/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json b/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json new file mode 100644 index 00000000..5c160259 --- /dev/null +++ b/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940", + "binFileSuffix":".o", + "blockDim":2, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + 
"sha256":"e050193a869bd841cad60168441a622c9aea0f195e27d4a9040dae48818b47c3" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o b/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o new file mode 100644 index 00000000..f5f138e0 Binary files /dev/null and b/kernel_meta/kernel_meta_18396263391666291777/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o differ diff --git a/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_3323201484900386937/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json b/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json new file mode 100644 index 00000000..5c160259 --- /dev/null +++ b/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940", + "binFileSuffix":".o", + "blockDim":2, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"e050193a869bd841cad60168441a622c9aea0f195e27d4a9040dae48818b47c3" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o b/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o new file mode 100644 index 00000000..f5f138e0 Binary files /dev/null and 
b/kernel_meta/kernel_meta_3730820844868718214/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o differ diff --git a/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_4049179572450833588/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_4752614533847753272/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ 
b/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_4793952418245670806/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_5490242672022926546/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json b/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json new file mode 100644 index 00000000..cd21ad2f --- /dev/null +++ b/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + 
"kernelName":"te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"4086081ca9855caf5d3a7ea288ed3819b78aca18dfd55810c6d57b83d007d2ce" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o b/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o new file mode 100644 index 00000000..554eacee Binary files /dev/null and b/kernel_meta/kernel_meta_5940993691110867528/kernel_meta/te_cast_da6a3d3593ee73900556552b6217dd50ea619e3541897a49f9fa7d45bcb4d4f7.o differ diff --git a/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json b/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json new file mode 100644 index 00000000..5c160259 --- /dev/null +++ b/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940", + "binFileSuffix":".o", + "blockDim":2, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"e050193a869bd841cad60168441a622c9aea0f195e27d4a9040dae48818b47c3" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o b/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o new file mode 100644 index 00000000..f5f138e0 Binary files /dev/null and b/kernel_meta/kernel_meta_6290488197836237565/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o differ diff --git a/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o 
b/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_6359637368554439377/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_8760436142451619538/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git a/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json b/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json new file mode 100644 index 00000000..cc1f1257 --- /dev/null +++ b/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.json @@ -0,0 +1,19 @@ +{ + "KBHit":false, + "binFileName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2", + "binFileSuffix":".o", + "blockDim":1, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null, + null + ], + "sha256":"a12cf2ca8927b2efe2ad85195e1e21e63908a167343a97b339667b66fe5f828e" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o b/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o new file mode 100644 index 00000000..4dc2ef6c Binary files /dev/null and b/kernel_meta/kernel_meta_9683836744857508790/kernel_meta/te_sub_0c293a4e1aff9de01ff84b2682b0a7c619cc8564c326104e4e1bb1237a187ad2.o differ diff --git 
a/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json b/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json new file mode 100644 index 00000000..5c160259 --- /dev/null +++ b/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.json @@ -0,0 +1,17 @@ +{ + "binFileName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940", + "binFileSuffix":".o", + "blockDim":2, + "coreType":"VectorCore", + "deterministic":"ignore", + "intercoreSync":0, + "kernelName":"te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940__kernel0", + "magic":"RT_DEV_BINARY_MAGIC_ELF_AIVEC", + "memoryStamping":[], + "opParaSize":0, + "parameters":[ + null, + null + ], + "sha256":"e050193a869bd841cad60168441a622c9aea0f195e27d4a9040dae48818b47c3" +} \ No newline at end of file diff --git a/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o b/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o new file mode 100644 index 00000000..f5f138e0 Binary files /dev/null and b/kernel_meta/kernel_meta_9690974803620934097/kernel_meta/te_cast_da75ef27832c624999d9f4ed527effc68f95bb6b7102c5710acc4927630a4940.o differ diff --git a/kernel_meta/kernel_meta_temp_11105858433588199599/.__dpc0010000007252a1300001e12 b/kernel_meta/kernel_meta_temp_11105858433588199599/.__dpc0010000007252a1300001e12 new file mode 100644 index 00000000..e69de29b diff --git a/kernel_meta/kernel_meta_temp_14745439082243309897/.__dpc00100000072844ea00001bb7 b/kernel_meta/kernel_meta_temp_14745439082243309897/.__dpc00100000072844ea00001bb7 new file mode 100644 index 00000000..e69de29b diff --git a/scripts/spellcheck.sh b/scripts/spellcheck.sh old mode 100755 new mode 100644 diff --git a/src/slam_llm/datasets/speech_dataset.py b/src/slam_llm/datasets/speech_dataset.py index 799b114f..0ed055c3 100644 --- a/src/slam_llm/datasets/speech_dataset.py +++ b/src/slam_llm/datasets/speech_dataset.py @@ -102,7 +102,7 @@ def __getitem__(self, index): # audio_raw = np.concatenate((np.zeros(random.randint(0, 16000)), audio_raw, np.zeros(random.randint(0, 16000)))).astype(audio_raw.dtype)[:16000*30] audio_mel = whisper.log_mel_spectrogram(audio_raw, n_mels=self.mel_size).permute(1, 0) audio_length = (audio_mel.shape[0] + 1) // 2 # ad-hoc for whisper for 2x downsample from mel to feats - audio_length = audio_length // 5 # ad-hoc for 5x fc downsample + audio_length = audio_length // config.encoder_projector_ds_rate # ad-hoc for 5x fc downsample # audio_length = calculate_output_length_1d(audio_length, 5, 5, 0) # ad-hoc for 5x cov1d downsample if self.fix_length_audio > 0: audio_length = self.fix_length_audio diff --git a/src/slam_llm/models/encoder.py b/src/slam_llm/models/encoder.py index 4c06bbef..93d3af2c 100644 --- a/src/slam_llm/models/encoder.py +++ b/src/slam_llm/models/encoder.py @@ -3,6 +3,9 @@ import torch.nn as nn import torch.nn.functional as F from dataclasses import dataclass +import whisper +from typing import Dict, Iterable, Optional +from torch import Tensor, nn class WhisperWrappedEncoder: @@ -21,20 +24,26 @@ def extract_variable_length_features(self, x: torch.Tensor): # assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio 
shape" # x = (x + self.positional_embedding).to(x.dtype) - x = (x + self.positional_embedding[: x.shape[1]]).to(x.dtype) - + if x.shape[1] < 1500: + x = (x + self.positional_embedding[: x.shape[1]]).to(x.dtype) + else: + length = x.shape[1] + base = 0 + while length > 0: + if length > 1500: + x[:,base:base+1500] += self.positional_embedding + length -= 1500 + base += 1500 + else: + x[:,base:base+length] += self.positional_embedding[:length] + length =-1 + x = x.to(x.dtype) for block in self.blocks: x = block(x) x = self.ln_post(x) return x - if model_config.whisper_decode: - import whisper - whisper_model = whisper.load_model(name=model_config.encoder_path, device='cpu') - whisper_model.encoder.extract_variable_length_features = types.MethodType(extract_variable_length_features, whisper_model.encoder) - return whisper_model - if model_config.encoder_path_hf is not None: from transformers import WhisperModel encoder = WhisperModel.from_pretrained(model_config.encoder_path_hf,torch_dtype=torch.bfloat16).encoder diff --git a/src/slam_llm/models/slam_model.py b/src/slam_llm/models/slam_model.py index 0aa93d34..bf70808c 100644 --- a/src/slam_llm/models/slam_model.py +++ b/src/slam_llm/models/slam_model.py @@ -1,14 +1,16 @@ import os import types import torch +import torch_npu import soundfile as sf import torch.nn as nn import torch.nn.functional as F +from torch.npu.amp import autocast import torch.distributed as dist from typing import List, Optional, Tuple, Union from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModel, AutoModelForSeq2SeqLM, T5ForConditionalGeneration from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training - +from torch.npu.amp import autocast from slam_llm.utils.config_utils import generate_peft_config from slam_llm.utils.train_utils import print_module_size, print_model_size from peft import PeftModel, PeftConfig @@ -259,6 +261,8 @@ def __init__( # tokenizer self.tokenizer = tokenizer + + self.metric = kwargs.get("metric", "acc") self.train_config = train_config @@ -278,8 +282,7 @@ def new_forward(self, input): if isinstance(item, nn.LayerNorm): item.forward = types.MethodType(new_forward, item) - - + @autocast(dtype=torch.bfloat16) def forward(self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, @@ -319,6 +322,10 @@ def forward(self, if self.model_config.encoder_name == "whisper": encoder_outs = self.encoder.extract_variable_length_features(audio_mel.permute(0, 2, 1)) # bs*seq*dim + # encoder_outs = self.encoder(audio_mel.permute(0, 2, 1)) # bs*seq*dim + # output = self.encoder.decode(audio_mel.permute(0, 2, 1)) # bs*seq*dim + # encoder_outs = output.audio_features + # encoder_outs = kwargs.get("encoder_outs", None) if self.model_config.encoder_name == "beats": encoder_outs, audio_mel_post_mask = self.encoder.extract_features(audio_mel, audio_mel_mask) # bs*seq*dim if self.model_config.encoder_name == "eat": diff --git a/src/slam_llm/pipeline/finetune.py b/src/slam_llm/pipeline/finetune.py index 4ced3c51..92f129ac 100644 --- a/src/slam_llm/pipeline/finetune.py +++ b/src/slam_llm/pipeline/finetune.py @@ -1,11 +1,12 @@ # os import os -import fire +# import fire import random import importlib # nn import torch +import torch_npu from transformers.models.llama.modeling_llama import LlamaDecoderLayer # opt @@ -96,7 +97,6 @@ def main(kwargs: DictConfig): datefmt="%Y-%m-%d %H:%M:%S", filemode='w' ) - logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -113,7 +113,7 @@ def 
main(kwargs: DictConfig): # Set the seeds for reproducibility - torch.cuda.manual_seed(train_config.seed) + torch_npu.npu.manual_seed(train_config.seed) torch.manual_seed(train_config.seed) random.seed(train_config.seed) @@ -125,8 +125,13 @@ def main(kwargs: DictConfig): world_size = int(os.environ["WORLD_SIZE"]) logger.info(f"local_rank: {local_rank}, rank: {rank}, world_size: {world_size}") + # Set the seeds for reproducibility + torch.npu.manual_seed(train_config.seed) + torch.manual_seed(train_config.seed) + random.seed(train_config.seed) + if torch.distributed.is_initialized(): - torch.cuda.set_device(local_rank) + torch_npu.npu.set_device(f"npu:{local_rank}") clear_gpu_cache(local_rank) setup_environ_flags(rank) @@ -144,21 +149,21 @@ def main(kwargs: DictConfig): wandb_config={"train_config": train_config, "fsdp_config": fsdp_config, "model_config": model_config, "log_config": log_config} wandb.init(dir=log_config.wandb_dir, entity=log_config.wandb_entity_name, project=log_config.wandb_project_name,name=log_config.wandb_exp_name ,config=wandb_config) - model_factory = get_custom_model_factory(model_config, logger) model, tokenizer = model_factory(train_config, model_config, **kwargs) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device(f"npu:{local_rank}" if torch_npu.npu.is_available() else "cpu") # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled - if (train_config.enable_fsdp or train_config.enable_ddp) and fsdp_config.pure_bf16: - model.to(torch.bfloat16) + # if (train_config.enable_fsdp or train_config.enable_ddp) and fsdp_config.pure_bf16: + # model.to(torch.bfloat16) #setting up FSDP if enable_fsdp is enabled if train_config.enable_fsdp: if not train_config.use_peft and train_config.freeze_layers: freeze_transformer_layers(train_config.num_freeze_layers) + # print(fsdp_config) # from torch.distributed.fsdp import ShardingStrategy # fsdp_config.sharding_strategy = getattr(ShardingStrategy, fsdp_config.sharding_strategy) mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank) @@ -170,16 +175,16 @@ def main(kwargs: DictConfig): cpu_offload=CPUOffload(offload_params=True) if fsdp_config.fsdp_cpu_offload else None, mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None, sharding_strategy=fsdp_config.sharding_strategy, - device_id=torch.cuda.current_device(), + device_id=torch_npu.npu.current_device(), limit_all_gathers=True, sync_module_states=train_config.low_cpu_fsdp, - param_init_fn=lambda module: module.to_empty(device=torch.device("cuda"), recurse=False) + param_init_fn=lambda module: module.to_empty(device=torch.device("npu"), recurse=False) if train_config.low_cpu_fsdp and rank != 0 else None, ) if fsdp_config.fsdp_activation_checkpointing: apply_fsdp_checkpointing(model) elif train_config.enable_ddp: - model = model.cuda(local_rank) + model = model.npu(local_rank) model = DDP(model, device_ids=[local_rank], find_unused_parameters=kwargs.get("train_conf", {}).get("find_unused_parameters", False)) elif not train_config.quantization: @@ -197,15 +202,13 @@ def main(kwargs: DictConfig): dataset_config, split="train", ) - if not (train_config.enable_fsdp or train_config.enable_ddp) or rank == 0: - logger.info(f"--> Training Set Length = {len(dataset_train)}") + dataset_val = get_preprocessed_dataset( tokenizer, dataset_config, split="val", ) - if not (train_config.enable_fsdp or train_config.enable_ddp) or rank == 0: - logger.info(f"--> Validation Set Length = {len(dataset_val)}") + 
if train_config.batching_strategy == "packing": dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length) diff --git a/src/slam_llm/pipeline/finetune_deepspeed.py b/src/slam_llm/pipeline/finetune_deepspeed.py index 8f275faf..1329cf61 100644 --- a/src/slam_llm/pipeline/finetune_deepspeed.py +++ b/src/slam_llm/pipeline/finetune_deepspeed.py @@ -7,6 +7,7 @@ # nn import torch +import torch_npu from transformers.models.llama.modeling_llama import LlamaDecoderLayer # opt @@ -49,7 +50,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf from pathlib import Path -@hydra.main(config_name=None, version_base=None) +@hydra.main(config_name=None, version_base=None) # strict=False allows unknown parameters to be ignored def main_hydra(cfg: DictConfig): def to_plain_list(cfg_item): if isinstance(cfg_item, ListConfig): @@ -113,7 +114,7 @@ def main(kwargs: DictConfig): # Set the seeds for reproducibility - torch.cuda.manual_seed(train_config.seed) + torch_npu.npu.manual_seed(train_config.seed) torch.manual_seed(train_config.seed) random.seed(train_config.seed) @@ -122,7 +123,7 @@ def main(kwargs: DictConfig): world_size = int(os.environ["WORLD_SIZE"]) logger.info(f"local_rank: {local_rank}, rank: {rank}, world_size: {world_size}") - torch.cuda.set_device(local_rank) + torch_npu.npu.set_device(f"npu:{local_rank}") clear_gpu_cache(local_rank) setup_environ_flags(rank) @@ -143,7 +144,7 @@ def main(kwargs: DictConfig): model_factory = get_custom_model_factory(model_config, logger) model, tokenizer = model_factory(train_config, model_config, **kwargs) parameters = filter(lambda p: p.requires_grad, model.parameters()) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device(f"npu:{local_rank}" if torch_npu.npu.is_available() else "cpu") # If you are facing problem from limited memory(<=256GB), you can try to replace the above code with the following code # for i in range(rank): @@ -160,10 +161,13 @@ def main(kwargs: DictConfig): # Initialize the optimizer and learning rate scheduler - model_engine, _, _, _ = deepspeed.initialize( + model_engine, _, _, lr_scheduler = deepspeed.initialize( model=model, model_parameters=parameters, config=deepspeed_config ) - + deepspeed_path = kwargs.get("deepspeed_ckpt_path",None) + if (deepspeed_path != None): + print(f"[Loading] loading model from {deepspeed_path}") + model_engine.load_checkpoint(deepspeed_path,load_module_strict=False) # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled # if (train_config.enable_fsdp or train_config.enable_ddp) and fsdp_config.pure_bf16: @@ -189,15 +193,13 @@ def main(kwargs: DictConfig): dataset_config, split="train", ) - if not (train_config.enable_fsdp or train_config.enable_ddp) or rank == 0: - logger.info(f"--> Training Set Length = {len(dataset_train)}") + dataset_val = get_preprocessed_dataset( tokenizer, dataset_config, split="val", ) - if not (train_config.enable_fsdp or train_config.enable_ddp) or rank == 0: - logger.info(f"--> Validation Set Length = {len(dataset_val)}") + if train_config.batching_strategy == "packing": dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length) @@ -207,10 +209,10 @@ def main(kwargs: DictConfig): train_dataloader = torch.utils.data.DataLoader( dataset_train, num_workers=train_config.num_workers_dataloader, - pin_memory=True, + prefetch_factor = train_config.num_workers_dataloader*2, **train_dl_kwargs, ) - + eval_dataloader = None if train_config.run_validation: if train_config.batching_strategy == "packing": @@ -221,7
+223,6 @@ def main(kwargs: DictConfig): eval_dataloader = torch.utils.data.DataLoader( dataset_val, num_workers=train_config.num_workers_dataloader, - pin_memory=True, **val_dl_kwargs, ) @@ -229,6 +230,7 @@ def main(kwargs: DictConfig): # Start the training process results = train( model_engine, + lr_scheduler, train_dataloader, eval_dataloader, tokenizer, diff --git a/src/slam_llm/pipeline/inference_batch.py b/src/slam_llm/pipeline/inference_batch.py index 1dc03940..94d13313 100644 --- a/src/slam_llm/pipeline/inference_batch.py +++ b/src/slam_llm/pipeline/inference_batch.py @@ -1,8 +1,10 @@ # import fire import random import torch +import torch_npu import logging # import argparse +import inspect from slam_llm.models.slam_model import slam_model # config # from llama_recipes.configs import fsdp_config as FSDP_CONFIG @@ -11,7 +13,7 @@ # from llama_recipes.configs import log_config as LOG_CONFIG from slam_llm.utils.model_utils import get_custom_model_factory -from slam_llm.utils.dataset_utils import get_preprocessed_dataset +from slam_llm.utils.dataset_utils import get_custom_dataset, get_preprocessed_dataset import os import logging from tqdm import tqdm @@ -92,25 +94,22 @@ def main(kwargs: DictConfig): # Set the seeds for reproducibility - torch.cuda.manual_seed(train_config.seed) + torch.npu.manual_seed(train_config.seed) torch.manual_seed(train_config.seed) random.seed(train_config.seed) model_factory = get_custom_model_factory(model_config, logger) model, tokenizer = model_factory(train_config, model_config, **kwargs) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # FIX(MZY): put the whole model to device. + device = torch.device("npu" if torch.npu.is_available() else "cpu") # FIX(MZY): put the whole model to device. model.to(device) model.eval() - - # dataset_config = generate_dataset_config(train_config, kwargs) logger.info("dataset_config: {}".format(dataset_config)) dataset_test = get_preprocessed_dataset( tokenizer, dataset_config, split="test", ) - if not (train_config.enable_fsdp or train_config.enable_ddp) or rank == 0: - logger.info(f"--> Training Set Length = {len(dataset_test)}") + test_dataloader = torch.utils.data.DataLoader( dataset_test, @@ -119,22 +118,27 @@ def main(kwargs: DictConfig): shuffle=False, batch_size=train_config.val_batch_size, drop_last=False, - collate_fn=dataset_test.collator + collate_fn=dataset_test.collator, + # multiprocessing_context=mp.get_context("spawn") ) logger.info("=====================================") pred_path = kwargs.get('decode_log') + "_pred" gt_path = kwargs.get('decode_log') + "_gt" - with open(pred_path, "w") as pred, open(gt_path, "w") as gt: - for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)): - for key in batch.keys(): - batch[key] = batch[key].to(device) if isinstance(batch[key], torch.Tensor) else batch[key] - model_outputs = model.generate(**batch) - output_text = model.tokenizer.batch_decode(model_outputs, add_special_tokens=False, skip_special_tokens=True) - for key, text, target in zip(batch["keys"], output_text, batch["targets"]): - pred.write(key + "\t" + text.replace("\n", " ") + "\n") - gt.write(key + "\t" + target + "\n") + with torch.no_grad(): + with open(pred_path, "w") as pred, open(gt_path, "w") as gt: + for step, batch in tqdm(enumerate(test_dataloader)): + for key in batch.keys(): + batch[key] = batch[key].to(device) if isinstance(batch[key], torch.Tensor) else batch[key] + model_outputs = model.generate(**batch) + if hasattr(model, 'tokenizer'): + 
output_text = model.tokenizer.batch_decode(model_outputs, add_special_tokens=False, skip_special_tokens=True) + else: + output_text = tokenizer.batch_decode(model_outputs, skip_special_tokens=True) + for key, text, target in zip(batch["keys"], output_text, batch["targets"]): + pred.write(key + " " + text.strip() + "\n") + gt.write(key + " " + target + "\n") if __name__ == "__main__": diff --git a/src/slam_llm/pipeline/inference_batch_deepspeed.py b/src/slam_llm/pipeline/inference_batch_deepspeed.py new file mode 100644 index 00000000..35a5b697 --- /dev/null +++ b/src/slam_llm/pipeline/inference_batch_deepspeed.py @@ -0,0 +1,192 @@ +# os +import os +import fire +import deepspeed +import random +import importlib +from tqdm import tqdm +# nn +import torch +import torch_npu +from transformers.models.llama.modeling_llama import LlamaDecoderLayer +from torch.utils.data import DistributedSampler +# opt +import torch.optim as optim +from torch.optim.lr_scheduler import StepLR +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, +) +from torch.nn.parallel import DistributedDataParallel as DDP + +from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload +from slam_llm.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing +import torch.distributed as dist +# config +# from llama_recipes.configs import fsdp_config as FSDP_CONFIG +# from llama_recipes.configs import train_config as TRAIN_CONFIG +# from llama_recipes.configs import model_config as MODEL_CONFIG +# from llama_recipes.configs import log_config as LOG_CONFIG +from slam_llm.data.concatenator import ConcatDataset + +# util +from slam_llm.utils import fsdp_auto_wrap_policy +from slam_llm.utils.config_utils import get_dataloader_kwargs + +from slam_llm.utils.dataset_utils import get_preprocessed_dataset, load_module_from_py_file +from slam_llm.utils.model_utils import get_custom_model_factory +from slam_llm.utils.deepspeed_utils import ( + train, + freeze_transformer_layers, + setup, + setup_environ_flags, + clear_gpu_cache, +) + +import sys +import logging +import wandb + +import hydra +from omegaconf import DictConfig, ListConfig, OmegaConf +from pathlib import Path + +@hydra.main(config_name=None, version_base=None) # strict=False allows unknown parameters to be ignored +def main_hydra(cfg: DictConfig): + def to_plain_list(cfg_item): + if isinstance(cfg_item, ListConfig): + return OmegaConf.to_container(cfg_item, resolve=True) + elif isinstance(cfg_item, DictConfig): + return {k: to_plain_list(v) for k, v in cfg_item.items()} + else: + return cfg_item + + # kwargs = to_plain_list(cfg) + kwargs = cfg + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + + logging.basicConfig(level=log_level) + + if kwargs.get("debug", False): + import pdb; + pdb.set_trace() + + main(kwargs) + + +def main(kwargs: DictConfig): + # Update the configuration for the training and sharding process + # train_config, fsdp_config, model_config, log_config = TRAIN_CONFIG(), FSDP_CONFIG(), MODEL_CONFIG(), LOG_CONFIG() + # update_config((train_config, fsdp_config, model_config, log_config), **kwargs) + + train_config, model_config, log_config, dataset_config = kwargs.train_config, \ + kwargs.model_config, \ + kwargs.log_config, \ + kwargs.dataset_config + del kwargs.train_config + del kwargs.model_config + del kwargs.log_config + del kwargs.dataset_config + + # Set log + if not os.path.exists(os.path.dirname(log_config.log_file)): + os.makedirs(os.path.dirname(log_config.log_file), exist_ok=True) + logging.basicConfig(
level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + filemode='w' + ) + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + file_handler = logging.FileHandler(filename=log_config.log_file, mode='w') + file_handler.setLevel(logging.INFO) + file_formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + file_handler.setFormatter(file_formatter) + + logger.handlers[0].setLevel(logging.INFO) + console_formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + logger.handlers[0].setFormatter(console_formatter) + + logger.addHandler(file_handler) + + + # Set the seeds for reproducibility + torch_npu.npu.manual_seed(train_config.seed) + torch.manual_seed(train_config.seed) + random.seed(train_config.seed) + + local_rank = int(os.environ["LOCAL_RANK"]) + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + logger.info(f"local_rank: {local_rank}, rank: {rank}, world_size: {world_size}") + + deepspeed.init_distributed( + dist_backend='hccl', # use the HCCL backend on Ascend NPU (the NCCL counterpart for GPU) + ) + + if rank == 0: + logger.info("train_config: {}".format(train_config)) + logger.info("model_config: {}".format(model_config)) + logger.info("log_config: {}".format(log_config)) + + # Set wandb + if rank == 0: + if log_config.use_wandb: + if not os.path.exists(log_config.wandb_dir): + os.makedirs(log_config.wandb_dir, exist_ok=True) + wandb_config={"train_config": train_config, "model_config": model_config, "log_config": log_config} + wandb.init(dir=log_config.wandb_dir, entity=log_config.wandb_entity_name, project=log_config.wandb_project_name,name=log_config.wandb_exp_name ,config=wandb_config) + + + model_factory = get_custom_model_factory(model_config, logger) + model, tokenizer = model_factory(train_config, model_config, **kwargs) + device = torch.device(f"npu:{local_rank}" if torch.npu.is_available() else "cpu") # FIX(MZY): put the whole model to device.
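For orientation, the device-selection and seeding pattern used by this new pipeline (and the other NPU-ported pipelines in this changeset) condenses to the short sketch below. It is assembled only from calls already used above (torch_npu, torch.npu.is_available, torch_npu.npu.set_device, torch_npu.npu.manual_seed); it is an illustrative sketch, not part of the committed files.

import os
import random
import torch
import torch_npu  # Ascend adapter: registers the "npu" device type and the torch.npu namespace

def setup_npu_device(seed: int) -> torch.device:
    # Bind this process to the NPU matching its local rank, falling back to CPU.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if torch.npu.is_available():
        torch_npu.npu.set_device(f"npu:{local_rank}")
        device = torch.device(f"npu:{local_rank}")
        torch_npu.npu.manual_seed(seed)  # seed the NPU RNG, as the pipelines above do
    else:
        device = torch.device("cpu")
    # Seed the CPU and Python RNGs as well for reproducibility.
    torch.manual_seed(seed)
    random.seed(seed)
    return device

The model is then moved once with model.to(device), exactly as in the lines that follow.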
+ model.to(device) + model.eval() + logger.info("dataset_config: {}".format(dataset_config)) + dataset_test = get_preprocessed_dataset( + tokenizer, + dataset_config, + split="test", + ) + # sampler = DistributedSampler( + # dataset_test, + # rank=dist.get_rank(), + # num_replicas=dist.get_world_size(), + # ) + test_dataloader = torch.utils.data.DataLoader( + dataset_test, + num_workers=train_config.num_workers_dataloader, + pin_memory=True, + shuffle=False, + batch_size=train_config.val_batch_size, + drop_last=False, + collate_fn=dataset_test.collator, + # sampler=sampler + # multiprocessing_context=mp.get_context("spawn") + ) + + logger.info("=====================================") + pred_path = kwargs.get('decode_log') + f"_pred" + gt_path = kwargs.get('decode_log') + f"_gt" + pred_result = "" + gt_result = "" + with torch.no_grad(): + for step, batch in tqdm(enumerate(test_dataloader)): + for key in batch.keys(): + batch[key] = batch[key].to(device) if isinstance(batch[key], torch.Tensor) else batch[key] + model_outputs = model.generate(**batch) + if hasattr(model, 'tokenizer'): + output_text = model.tokenizer.batch_decode(model_outputs, add_special_tokens=False, skip_special_tokens=True) + else: + output_text = tokenizer.batch_decode(model_outputs, skip_special_tokens=True) + for key, text, target in zip(batch["keys"], output_text, batch["targets"]): + pred_result += key + " " + text.strip() + "\n" + gt_result += key + " " + target + "\n" + with open(pred_path, "a+") as pred, open(gt_path, "a+") as gt: + pred.write(pred_result) + gt.write(gt_result) +if __name__ == "__main__": + main_hydra() \ No newline at end of file diff --git a/src/slam_llm/utils/checkpoint_handler.py b/src/slam_llm/utils/checkpoint_handler.py index afaabef0..bb84d30d 100644 --- a/src/slam_llm/utils/checkpoint_handler.py +++ b/src/slam_llm/utils/checkpoint_handler.py @@ -165,14 +165,18 @@ def save_model_checkpoint( logger.info(f"model checkpoint saved for epoch {epoch} at {save_full_path}\n") -def save_model_checkpoint_deepspeed(model, cfg, checkpoint_name="checkpoint"): +def save_model_checkpoint_deepspeed(model, rank,cfg, checkpoint_name="checkpoint"): logger.info(f"--> saving model ...") save_dir = os.path.join(cfg.output_dir, checkpoint_name) - os.makedirs(save_dir, exist_ok=True) + dist.barrier() + if rank == 0: + os.makedirs(save_dir, exist_ok=True) + dist.barrier() # save_full_path = os.path.join(save_dir, "model.pt") save_full_path = save_dir model.save_checkpoint(save_dir=save_full_path, exclude_frozen_parameters=True) logger.info(f"encoder saved at {save_full_path}") + dist.barrier() def save_model_checkpoint_peft(model, optimizer, rank, cfg, checkpoint_name="checkpoint", save_trainable_only=True): logger.info(f"--> saving model ...") diff --git a/src/slam_llm/utils/compute_wer.py b/src/slam_llm/utils/compute_wer.py old mode 100755 new mode 100644 diff --git a/src/slam_llm/utils/config_utils.py b/src/slam_llm/utils/config_utils.py index b0aadf7d..235f655e 100644 --- a/src/slam_llm/utils/config_utils.py +++ b/src/slam_llm/utils/config_utils.py @@ -68,6 +68,7 @@ def generate_peft_config(train_config): def get_dataloader_kwargs(train_config, dataset, tokenizer, mode): kwargs = {} batch_size = train_config.batch_size_training if mode=="train" else train_config.val_batch_size + # batch_size = None if train_config.batching_strategy == "padding": if train_config.enable_fsdp or train_config.enable_ddp or train_config.enable_deepspeed: kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler( @@ -75,7 +76,8 
@@ def get_dataloader_kwargs(train_config, dataset, tokenizer, mode): batch_size=batch_size, rank=dist.get_rank(), num_replicas=dist.get_world_size(), - shuffle=mode=="train", + shuffle=False + # shuffle=mode=="train", ) else: kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True, shuffle=mode=="train") @@ -86,23 +88,25 @@ def get_dataloader_kwargs(train_config, dataset, tokenizer, mode): dataset, rank=dist.get_rank(), num_replicas=dist.get_world_size(), - shuffle=mode=="train", + shuffle=False + # shuffle=mode=="train", ) kwargs["batch_size"] = batch_size - kwargs["drop_last"] = True + kwargs["drop_last"] = False kwargs["collate_fn"] = default_data_collator else: + print(f"[Rank]{dist.get_rank()},world_size{dist.get_world_size()}") # raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}") - if train_config.enable_fsdp or train_config.enable_ddp or train_config.enable_deepspeed: - kwargs["sampler"] = DistributedSampler( - dataset, - rank=dist.get_rank(), - num_replicas=dist.get_world_size(), - shuffle=mode=="train", - ) + # if train_config.enable_fsdp or train_config.enable_ddp or train_config.enable_deepspeed: + # kwargs["sampler"] = DistributedSampler( + # dataset, + # rank=dist.get_rank(), + # num_replicas=dist.get_world_size(), + # shuffle=False + # # shuffle=mode=="train", + # ) kwargs["batch_size"] = batch_size - kwargs["drop_last"] = True + kwargs["drop_last"] = False kwargs["collate_fn"] = dataset.collator logger.info(f"Using batching strategy: {train_config.batching_strategy}") - return kwargs diff --git a/src/slam_llm/utils/dataset_utils.py b/src/slam_llm/utils/dataset_utils.py index ddc8b3d6..d926bfa0 100644 --- a/src/slam_llm/utils/dataset_utils.py +++ b/src/slam_llm/utils/dataset_utils.py @@ -49,6 +49,13 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): def get_preprocessed_dataset( tokenizer, dataset_config, split: str = "train" ) -> torch.utils.data.Dataset: + def get_split(): + return ( + # dataset_config.train_split + # if split == "train" + # else dataset_config.test_split + split + ) return get_custom_dataset( dataset_config, diff --git a/src/slam_llm/utils/deepspeed_utils.py b/src/slam_llm/utils/deepspeed_utils.py index 6ac5608d..d22d1758 100644 --- a/src/slam_llm/utils/deepspeed_utils.py +++ b/src/slam_llm/utils/deepspeed_utils.py @@ -12,7 +12,7 @@ import functools import hydra import torch -import torch.cuda.nccl as nccl +import torch_npu import torch.distributed as dist from omegaconf import DictConfig from tqdm import tqdm @@ -121,6 +121,7 @@ def byte2mb(x): def train( model, + lr_scheduler, train_dataloader, eval_dataloader, tokenizer, @@ -155,7 +156,7 @@ def train( # scaler = torch.cuda.amp.GradScaler() if train_config.enable_ddp: world_size = int(os.environ["WORLD_SIZE"]) - autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext + autocast = torch_npu.npu.amp.autocast if train_config.use_fp16 else nullcontext train_prep = [] train_loss = [] @@ -174,27 +175,26 @@ def train( model.train() total_loss = 0.0 total_acc = 0.0 - total_length = len(train_dataloader) // gradient_accumulation_steps + # total_length = len(train_dataloader) // gradient_accumulation_steps pbar = tqdm( colour="blue", desc=f"Training Epoch: {epoch+1}", - total=total_length, dynamic_ncols=True, ) for step, batch in enumerate(train_dataloader): for key in batch.keys(): batch[key] = ( - batch[key].to(local_rank).half() + batch[key].to(f"npu:{local_rank}").half() if isinstance(batch[key], torch.Tensor) and 
batch[key].dtype == torch.float32 else ( - batch[key].to(local_rank) + batch[key].to(f"npu:{local_rank}") if isinstance(batch[key], torch.Tensor) else batch[key] ) ) - # with autocast(): - outputs, *rest = model(**batch) + with autocast(dtype=torch.bfloat16): + outputs, *rest = model(**batch) acc = rest[0] if rest else -1 loss = outputs.loss @@ -209,7 +209,7 @@ def train( "train_inner/train_inner_loss": loss, "train_inner/train_inner_accuracy": acc, }, - step=(epoch * total_length + step), + step= step, ) else: wandb.log( @@ -217,7 +217,7 @@ def train( "train_inner/train_inner_loss": loss, "train_inner/train_inner_accuracy": acc, }, - step=(epoch * total_length + step), + step= step, ) total_loss += loss.detach().float() @@ -226,18 +226,17 @@ def train( # deepspeed should handle gradient accumulate model.backward(loss) model.step() + lr_scheduler.step() - if (step + 1) % gradient_accumulation_steps == 0 or step == len( - train_dataloader - ) - 1: + if (step + 1) % gradient_accumulation_steps == 0: pbar.update(1) pbar.set_description( - f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()}, acc: {acc})" + f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step} completed (loss: {loss.detach().float()}, acc: {acc})" ) if ( - (epoch * total_length + step + 1) % train_config.validation_interval + ( step + 1) % train_config.validation_interval == 0 and train_config.run_validation ): @@ -247,11 +246,11 @@ def train( eval_epoch_acc = rest[0] if rest else -1 checkpoint_start_time = time.perf_counter() - if train_config.save_model and (eval_epoch_loss < best_val_loss): - checkpoint_name = f"{train_config.model_name}_epoch_{str(epoch+1)}_step_{step+1}" - save_model_checkpoint_deepspeed( - model, train_config, checkpoint_name - ) + # if train_config.save_model and (eval_epoch_loss < best_val_loss or eval_epoch_acc > best_val_acc): + checkpoint_name = f"{train_config.model_name}_epoch_{str(epoch+1)}_step_{step+1}" + save_model_checkpoint_deepspeed( + model,rank, train_config, checkpoint_name + ) checkpoint_end_time = time.perf_counter() - checkpoint_start_time checkpoint_times.append(checkpoint_end_time) @@ -306,16 +305,17 @@ def train( epoch_end_time = time.perf_counter() - epoch_start_time epoch_times.append(epoch_end_time) # Reducing total_loss across all devices if there's more than one CUDA device - if torch.cuda.device_count() > 1 and ( - train_config.enable_fsdp or train_config.enable_ddp - ): - dist.all_reduce(total_loss, op=dist.ReduceOp.SUM) - dist.all_reduce(total_acc, op=dist.ReduceOp.SUM) - train_epoch_loss = total_loss / len(train_dataloader) - train_epoch_acc = total_acc / len(train_dataloader) - if train_config.enable_fsdp or train_config.enable_ddp: - train_epoch_loss = train_epoch_loss / world_size - train_epoch_acc = train_epoch_acc / world_size + # dist.barrier() + # if torch_npu.npu.device_count() > 1 and ( + # train_config.enable_fsdp or train_config.enable_ddp + # ): + # dist.all_reduce(total_loss, op=dist.ReduceOp.SUM) + # dist.all_reduce(total_acc, op=dist.ReduceOp.SUM) + train_epoch_loss = total_loss / (step+ 1) + train_epoch_acc = total_acc / (step+ 1) + # if train_config.enable_fsdp or train_config.enable_ddp: + # train_epoch_loss = train_epoch_loss / world_size + # train_epoch_acc = train_epoch_acc / world_size train_perplexity = torch.exp(train_epoch_loss) train_prep.append(train_perplexity) @@ -350,7 +350,7 @@ def train( logger.info(f"Max CUDA memory allocated was {memtrace.peak} GB") 
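The epoch-level metrics in the train and evaluation loops above reduce to one small pattern: average the accumulated loss over the number of completed steps on each rank, then, when a process group is initialized (HCCL here), all-reduce the result and divide by the world size before exponentiating for perplexity. A minimal sketch under those assumptions, not part of the committed files:

import torch
import torch.distributed as dist

def epoch_average(total_loss: torch.Tensor, num_steps: int) -> torch.Tensor:
    # Per-rank mean first, mirroring total_loss / (step + 1) in the loops above.
    epoch_loss = total_loss / max(num_steps, 1)
    # Then sum across ranks and normalize by world size, if running distributed.
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(epoch_loss, op=dist.ReduceOp.SUM)
        epoch_loss = epoch_loss / dist.get_world_size()
    return epoch_loss

# Perplexity follows directly from the averaged loss:
# train_perplexity = torch.exp(epoch_average(total_loss, step + 1))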
logger.info(f"Max CUDA memory reserved was {memtrace.max_reserved} GB") logger.info(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB") - logger.info(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}") + logger.info(f"Cuda Malloc retires : {memtrace.npu_malloc_retires}") logger.info( f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB" ) @@ -401,30 +401,29 @@ def evaluation(model, train_config, eval_dataloader, local_rank, tokenizer): Returns: eval_ppl, eval_epoch_loss """ + dist.barrier() world_size = int(os.environ["WORLD_SIZE"]) model.eval() eval_preds = [] eval_loss = 0.0 # Initialize evaluation loss eval_acc = 0.0 autocast = ( - torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext + torch_npu.npu.amp.autocast if train_config.use_fp16 else nullcontext ) # (Fix:MZY): fix expected scalar type mismatch in norm with MemoryTrace() as memtrace: - total_length = len(eval_dataloader) pbar = tqdm( colour="green", desc=f"Evaluating Epoch", - total=total_length, dynamic_ncols=True, ) for step, batch in enumerate(eval_dataloader): for key in batch.keys(): batch[key] = ( - batch[key].to(local_rank).half() + batch[key].to(f"npu:{local_rank}").half() if isinstance(batch[key], torch.Tensor) and batch[key].dtype==torch.float32 else ( - batch[key].to(local_rank) if isinstance(batch[key], torch.Tensor) else batch[key] + batch[key].to(f"npu:{local_rank}") if isinstance(batch[key], torch.Tensor) else batch[key] ) ) # Ensure no gradients are computed for this scope to save memory @@ -446,19 +445,21 @@ def evaluation(model, train_config, eval_dataloader, local_rank, tokenizer): ) pbar.update(1) pbar.set_description( - f"step: {step+1}/{total_length}, eval_loss: {eval_loss/(step+1):.4f}, eval_acc: {eval_acc/(step+1):.4f}" + f"step: {step+1}, eval_loss: {eval_loss/(step+1):.4f}, eval_acc: {eval_acc/(step+1):.4f}" ) # If there's more than one CUDA device, reduce evaluation loss across all devices + if train_config.enable_fsdp or train_config.enable_ddp: + dist.barrier() if ( - torch.cuda.device_count() > 1 + torch_npu.npu.device_count() > 1 ): dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM) dist.all_reduce(eval_acc, op=dist.ReduceOp.SUM) # Compute average loss and perplexity - eval_epoch_loss = eval_loss / len(eval_dataloader) - eval_epoch_acc = eval_acc / len(eval_dataloader) + eval_epoch_loss = eval_loss / (step+ 1) + eval_epoch_acc = eval_acc / (step+ 1) eval_epoch_loss = eval_epoch_loss / world_size eval_epoch_acc = eval_epoch_acc / world_size eval_ppl = torch.exp(eval_epoch_loss) @@ -488,7 +489,7 @@ def check_frozen_layers_peft_model(model): def setup(): """Initialize the process group for distributed training""" - dist.init_process_group("nccl") + dist.init_process_group("hccl") def setup_environ_flags(rank): @@ -512,7 +513,7 @@ def clear_gpu_cache(rank=None): """Clear the GPU cache for all ranks""" if rank == 0: logger.info(f"Clearing GPU cache for all ranks") - torch.cuda.empty_cache() + torch_npu.npu.empty_cache() def get_parameter_dtypes(model): diff --git a/src/slam_llm/utils/memory_utils.py b/src/slam_llm/utils/memory_utils.py index 725f2b0d..67cf7da5 100644 --- a/src/slam_llm/utils/memory_utils.py +++ b/src/slam_llm/utils/memory_utils.py @@ -13,9 +13,9 @@ def byte2gb(x): class MemoryTrace: def __enter__(self): gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() # reset the peak gauge to zero - self.begin = byte2gb(torch.cuda.memory_allocated()) + torch.npu.empty_cache() + 
+        torch.npu.reset_max_memory_allocated()  # reset the peak gauge to zero
+        self.begin = byte2gb(torch.npu.memory_allocated())
         self.process = psutil.Process()
         self.cpu_begin = byte2gb(self.cpu_mem_used())
         self.peak_monitoring = True
@@ -44,17 +44,17 @@ def __exit__(self, *exc):
         self.peak_monitoring = False

         gc.collect()
-        torch.cuda.empty_cache()
-        self.end = byte2gb(torch.cuda.memory_allocated())
-        self.peak = byte2gb(torch.cuda.max_memory_allocated())
-        cuda_info = torch.cuda.memory_stats()
-        self.peak_active_gb = byte2gb(cuda_info["active_bytes.all.peak"])
-        self.cuda_malloc_retires = cuda_info.get("num_alloc_retries", 0)
-        self.peak_active_gb = byte2gb(cuda_info["active_bytes.all.peak"])
-        self.m_cuda_ooms = cuda_info.get("num_ooms", 0)
+        # torch.npu.empty_cache()
+        self.end = byte2gb(torch.npu.memory_allocated())
+        self.peak = byte2gb(torch.npu.max_memory_allocated())
+        npu_info = torch.npu.memory_stats()
+        self.peak_active_gb = byte2gb(npu_info["active_bytes.all.peak"])
+        self.npu_malloc_retires = npu_info.get("num_alloc_retries", 0)
+        self.peak_active_gb = byte2gb(npu_info["active_bytes.all.peak"])
+        self.m_npu_ooms = npu_info.get("num_ooms", 0)
         self.used = byte2gb(self.end - self.begin)
         self.peaked = byte2gb(self.peak - self.begin)
-        self.max_reserved = byte2gb(torch.cuda.max_memory_reserved())
+        self.max_reserved = byte2gb(torch.npu.max_memory_reserved())

         self.cpu_end = self.cpu_mem_used()
         self.cpu_used = byte2gb(self.cpu_end - self.cpu_begin)
diff --git a/src/slam_llm/utils/model_utils.py b/src/slam_llm/utils/model_utils.py
index 8e620890..9e7b1d34 100644
--- a/src/slam_llm/utils/model_utils.py
+++ b/src/slam_llm/utils/model_utils.py
@@ -2,6 +2,7 @@
 from pathlib import Path

 def get_custom_model_factory(model_config, logger):
+
     costom_model_path = model_config.get(
         "file", None
     )
@@ -20,7 +21,6 @@ def get_custom_model_factory(model_config, logger):
         module_path = Path(module_path)
         if not module_path.is_file():
             raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
-
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)
diff --git a/src/slam_llm/utils/preprocess_text.py b/src/slam_llm/utils/preprocess_text.py
old mode 100755
new mode 100644
diff --git a/src/slam_llm/utils/train_utils.py b/src/slam_llm/utils/train_utils.py
index 8f5c34e6..4217a7b1 100644
--- a/src/slam_llm/utils/train_utils.py
+++ b/src/slam_llm/utils/train_utils.py
@@ -10,7 +10,8 @@
 import torch
-import torch.cuda.nccl as nccl
+import torch_npu
+# import torch.npu.nccl as nccl
 import torch.distributed as dist
 from torch.distributed.fsdp import ShardingStrategy
 from torch.distributed.fsdp import StateDictType
@@ -67,13 +68,15 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     # scaler = ShardedGradScaler()
     # elif train_config.use_fp16 and not train_config.enable_fsdp:
     #     scaler = torch.cuda.amp.GradScaler()
+
     if train_config.use_fp16:
-        scaler = torch.cuda.amp.GradScaler()
+        scaler = torch_npu.npu.amp.GradScaler()
+        # scaler = ShardedGradScaler()
     if train_config.enable_fsdp:
         scaler = ShardedGradScaler()
     if train_config.enable_fsdp or train_config.enable_ddp:
         world_size = int(os.environ["WORLD_SIZE"])
-    autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext
+    autocast = torch_npu.npu.amp.autocast if train_config.use_fp16 else nullcontext

     train_prep = []
     train_loss = []
@@ -92,21 +95,20 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
         model.train()
         total_loss = 0.0
         total_acc = 0.0
-        total_length = len(train_dataloader)//gradient_accumulation_steps
-        pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
+        pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", dynamic_ncols=True)
         for step, batch in enumerate(train_dataloader):
             for key in batch.keys():
                 if train_config.enable_fsdp or train_config.enable_ddp:
-                    batch[key] = batch[key].to(local_rank) if isinstance(batch[key], torch.Tensor) else batch[key]
+                    batch[key] = batch[key].to(f"npu:{local_rank}") if isinstance(batch[key], torch.Tensor) else batch[key]
                     if isinstance(batch[key], dict):
                         for k2 in batch[key].keys():
-                            batch[key][k2] = batch[key][k2].to(local_rank) if isinstance(batch[key][k2], torch.Tensor) else batch[key][k2]
+                            batch[key][k2] = batch[key][k2].to(f"npu:{local_rank}") if isinstance(batch[key][k2], torch.Tensor) else batch[key][k2]
                 else:
-                    batch[key] = batch[key].to('cuda:0') if isinstance(batch[key], torch.Tensor) else batch[key]
+                    batch[key] = batch[key].to('npu:0') if isinstance(batch[key], torch.Tensor) else batch[key]
                     if isinstance(batch[key], dict):
                         for k2 in batch[key].keys():
-                            batch[key][k2] = batch[key][k2].to('cuda:0') if isinstance(batch[key][k2], torch.Tensor) else batch[key][k2]
-            with autocast():
+                            batch[key][k2] = batch[key][k2].to('npu:0') if isinstance(batch[key][k2], torch.Tensor) else batch[key][k2]
+            with autocast(dtype=torch.bfloat16):
                 outputs, *rest = model(**batch)
             acc = rest[0] if rest else -1
             loss = outputs.loss
@@ -117,16 +119,16 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
             if log_config.use_wandb and step % log_config.log_interval == 0:
                 if train_config.enable_fsdp or train_config.enable_ddp:
                     if rank==0:
-                        wandb.log({"train_inner/train_inner_loss":loss, "train_inner/train_inner_accuracy":acc}, step=(epoch * total_length + step))
+                        wandb.log({"train_inner/train_inner_loss":loss, "train_inner/train_inner_accuracy":acc}, step=( step))
                 else:
-                    wandb.log({"train_inner/train_inner_loss":loss, "train_inner/train_inner_accuracy":acc}, step=(epoch * total_length + step))
+                    wandb.log({"train_inner/train_inner_loss":loss, "train_inner/train_inner_accuracy":acc}, step=( step))

             total_loss += loss.detach().float()
             total_acc += acc
             if train_config.use_fp16:
                 # if fp16 is enabled, use gradient scaler to handle gradient update
                 scaler.scale(loss).backward()
-                if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                if (step + 1) % gradient_accumulation_steps == 0 :
                     scaler.step(optimizer)
                     scaler.update()
                     if lr_scheduler is not None:
@@ -139,15 +141,15 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     if log_config.use_wandb and step % log_config.log_interval == 0:
                         if train_config.enable_fsdp or train_config.enable_ddp:
                             if rank==0:
-                                wandb.log({"train_inner/lr":current_lr}, step=(epoch * total_length + step))
+                                wandb.log({"train_inner/lr":current_lr}, step=( step))
                         else:
-                            wandb.log({"train_inner/lr":current_lr}, step=(epoch * total_length + step))
+                            wandb.log({"train_inner/lr":current_lr}, step=( step))
                     optimizer.zero_grad()
                     pbar.update(1)
             else:
                 # regular backpropagation when fp16 is not used
                 loss.backward()
-                if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                if (step + 1) % gradient_accumulation_steps == 0 :
                     optimizer.step()
                     if lr_scheduler is not None:
                         lr_scheduler.step()
@@ -159,19 +161,19 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     if log_config.use_wandb and step % log_config.log_interval == 0:
                         if train_config.enable_fsdp or train_config.enable_ddp:
                             if rank==0:
-                                wandb.log({"train_inner/lr":current_lr}, step=(epoch * total_length + step))
+                                wandb.log({"train_inner/lr":current_lr}, step=( step))
                         else:
-                            wandb.log({"train_inner/lr":current_lr}, step=(epoch * total_length + step))
+                            wandb.log({"train_inner/lr":current_lr}, step=( step))
                     optimizer.zero_grad()
                     pbar.update(1)

-            pbar.set_description(f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()}, acc: {acc})")
+            pbar.set_description(f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step} completed (loss: {loss.detach().float()}, acc: {acc})")

-            if (epoch * total_length + step + 1) % train_config.validation_interval == 0 and train_config.run_validation:
+            if ( step + 1) % train_config.validation_interval == 0 and train_config.run_validation:
                 eval_ppl, eval_epoch_loss, *rest = evaluation(model, train_config, eval_dataloader, local_rank, tokenizer)
                 eval_epoch_acc = rest[0] if rest else -1
                 checkpoint_start_time = time.perf_counter()
-                if train_config.save_model and (eval_epoch_loss < best_val_loss):
+                if train_config.save_model and (eval_epoch_loss < best_val_loss or eval_epoch_acc > best_val_acc):
                     checkpoint_name = f"{train_config.model_name}_epoch_{str(epoch+1)}_step_{step+1}"
                     if train_config.enable_fsdp or train_config.enable_ddp:
                         dist.barrier()
@@ -212,6 +214,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     elif not train_config.use_peft and train_config.freeze_llm:
                         logger.info(f"llm is frozen, we are about to save other parts.")
                         if train_config.enable_fsdp:
+                            logger.info(fsdp_config.sharding_strategy)
                             if fsdp_config.sharding_strategy == ShardingStrategy.FULL_SHARD:
                                 save_model_checkpoint_peft_full_shard(
                                     model, optimizer, rank, train_config, epoch=epoch
                                 )
@@ -305,14 +308,14 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     if rank==0:
                         logger.info("=====================================")
                         logger.info(f"Test the file {train_config.run_test_during_validation_file} during validation:")
-                        with autocast():
+                        with autocast(dtype=torch.bfloat16):
                             logger.info(model.inference(train_config.run_test_during_validation_file, train_config.run_test_during_validation_prompt))
                         logger.info("=====================================")
                     dist.barrier()
                 else:
                     logger.info("=====================================")
                     logger.info(f"Test the file {train_config.run_test_during_validation_file} during validation:")
-                    with autocast():
+                    with autocast(dtype=torch.bfloat16):
                         logger.info(model.inference(train_config.run_test_during_validation_file, train_config.run_test_during_validation_prompt))
                     logger.info("=====================================")
         pbar.close()
@@ -320,11 +323,11 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
         epoch_end_time = time.perf_counter()-epoch_start_time
         epoch_times.append(epoch_end_time)
         # Reducing total_loss across all devices if there's more than one CUDA device
-        if torch.cuda.device_count() > 1 and (train_config.enable_fsdp or train_config.enable_ddp):
+        if torch_npu.npu.device_count() > 1 and (train_config.enable_fsdp or train_config.enable_ddp):
             dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
             dist.all_reduce(total_acc, op=dist.ReduceOp.SUM)
-        train_epoch_loss = total_loss / len(train_dataloader)
-        train_epoch_acc = total_acc / len(train_dataloader)
+        train_epoch_loss = total_loss / (step + 1)
+        train_epoch_acc = total_acc / (step + 1)
         if train_config.enable_fsdp or train_config.enable_ddp:
             train_epoch_loss = train_epoch_loss/world_size
             train_epoch_acc = train_epoch_acc/world_size
@@ -352,13 +355,13 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                 logger.info(f"Max CUDA memory allocated was {memtrace.peak} GB")
                 logger.info(f"Max CUDA memory reserved was {memtrace.max_reserved} GB")
                 logger.info(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB")
-                logger.info(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}")
+                logger.info(f"Cuda Malloc retires : {memtrace.npu_malloc_retires}")
                 logger.info(f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB")
         else:
             logger.info(f"Max CUDA memory allocated was {memtrace.peak} GB")
             logger.info(f"Max CUDA memory reserved was {memtrace.max_reserved} GB")
             logger.info(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB")
-            logger.info(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}")
+            logger.info(f"Cuda Malloc retires : {memtrace.npu_malloc_retires}")
             logger.info(f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB")

         # Update the learning rate as needed
@@ -408,22 +411,23 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
     eval_preds = []
     eval_loss = 0.0  # Initialize evaluation loss
     eval_acc = 0.0
-    autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext # (Fix:MZY): fix expected scalar type mismatch in norm
+    autocast = torch_npu.npu.amp.autocast if train_config.use_fp16 else nullcontext # (Fix:MZY): fix expected scalar type mismatch in norm
     with MemoryTrace() as memtrace:
-        total_length = len(eval_dataloader)
-        pbar = tqdm(colour="green", desc=f"Evaluating Epoch", total=total_length, dynamic_ncols=True)
+
+        pbar = tqdm(colour="green", desc=f"Evaluating Epoch", dynamic_ncols=True)
         for step, batch in enumerate(eval_dataloader):
             for key in batch.keys():
                 if train_config.enable_fsdp or train_config.enable_ddp:
-                    batch[key] = batch[key].to(local_rank) if isinstance(batch[key], torch.Tensor) else batch[key]
+                    batch[key] = batch[key].to(f"npu:{local_rank}") if isinstance(batch[key], torch.Tensor) else batch[key]
                 else:
-                    batch[key] = batch[key].to('cuda:0') if isinstance(batch[key], torch.Tensor) else batch[key]
+                    batch[key] = batch[key].to('npu:0') if isinstance(batch[key], torch.Tensor) else batch[key]
             # Ensure no gradients are computed for this scope to save memory
             with torch.no_grad():
                 # Forward pass and compute loss
-                with autocast(): # (Fix:MZY): fix expected scalar type mismatch in norm
-                    outputs, *rest = model(**batch)
+                # with autocast(dtype=torch.bfloat16): # (Fix:MZY): fix expected scalar type mismatch in norm
+                # with autocast(enabled=True): # (Fix:MZY): fix expected scalar type mismatch in norm
+                outputs, *rest = model(**batch)
                 acc = rest[0] if rest else -1
                 loss = outputs.loss
@@ -438,16 +442,16 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
                 except Exception:
                     pass  # vallex does not need to show it's result (we can't view any thing from abstract acoustic token)
             pbar.update(1)
-            pbar.set_description(f"step: {step+1}/{total_length}, eval_loss: {eval_loss/(step+1):.4f}, eval_acc: {eval_acc/(step+1):.4f}")
+            pbar.set_description(f"step: {step+1}, eval_loss: {eval_loss/(step+1):.4f}, eval_acc: {eval_acc/(step+1):.4f}")

     # If there's more than one CUDA device, reduce evaluation loss across all devices
-    if torch.cuda.device_count() > 1 and train_config.enable_fsdp or train_config.enable_ddp:
+    if torch_npu.npu.device_count() > 1 and train_config.enable_fsdp or train_config.enable_ddp:
         dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM)
         dist.all_reduce(eval_acc, op=dist.ReduceOp.SUM)

     # Compute average loss and perplexity
-    eval_epoch_loss = eval_loss / len(eval_dataloader)
-    eval_epoch_acc = eval_acc / len(eval_dataloader)
+    eval_epoch_loss = eval_loss / (step + 1)
+    eval_epoch_acc = eval_acc / (step + 1)
     if train_config.enable_fsdp or train_config.enable_ddp:
         eval_epoch_loss = eval_epoch_loss/world_size
         eval_epoch_acc = eval_epoch_acc/world_size
@@ -477,7 +481,7 @@ def check_frozen_layers_peft_model(model):

 def setup():
     """Initialize the process group for distributed training"""
-    dist.init_process_group("nccl")
+    dist.init_process_group("hccl")


 def setup_environ_flags(rank):
@@ -501,7 +505,7 @@ def clear_gpu_cache(rank=None):
     """Clear the GPU cache for all ranks"""
     if rank == 0:
         logger.info(f"Clearing GPU cache for all ranks")
-    torch.cuda.empty_cache()
+    torch_npu.npu.empty_cache()


 def get_parameter_dtypes(model):
@@ -546,11 +550,11 @@ def get_policies(cfg, rank):
     """Get the policies for mixed precision and fsdp wrapping"""

     verify_bfloat_support = (
-        torch.version.cuda
-        and torch.cuda.is_bf16_supported()
-        and packaging.version.parse(torch.version.cuda).release >= (11, 0)
-        and dist.is_nccl_available()
-        and nccl.version() >= (2, 10)
+        # torch.version.cuda
+        torch_npu.npu.is_bf16_supported()
+        # and packaging.version.parse(torch.version.cuda).release >= (11, 0)
+        # and dist.is_nccl_available()
+        # and nccl.version() >= (2, 10)
     )
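Note on the recurring pattern in the hunks above: every change swaps a CUDA/NCCL entry point for its Ascend NPU counterpart (torch_npu, the "npu" device string, the "hccl" backend) and moves mixed precision to bf16 autocast. The standalone sketch below is not part of the patch; it is a minimal illustration of that substitution, assuming a PyTorch build with torch_npu installed (so the npu device and hccl backend are registered) and a process launched via torchrun (so LOCAL_RANK is set). The helper names are illustrative only.

# Illustrative sketch of the CUDA -> NPU substitutions applied throughout this patch.
import os
import torch
import torch_npu  # registers the "npu" device type and HCCL bindings
import torch.distributed as dist

def setup_npu_rank():
    # Bind this process to its NPU and use HCCL instead of NCCL.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch_npu.npu.set_device(local_rank)
    dist.init_process_group("hccl")
    return local_rank

def forward_one_batch(model, batch, local_rank):
    # Tensors move to "npu:<rank>" rather than a CUDA device index.
    batch = {k: v.to(f"npu:{local_rank}") if isinstance(v, torch.Tensor) else v
             for k, v in batch.items()}
    # bf16 autocast on NPU, mirroring `with autocast(dtype=torch.bfloat16)` above.
    with torch_npu.npu.amp.autocast(dtype=torch.bfloat16):
        outputs = model(**batch)
    return outputs

def report_npu_memory():
    # Same memory-stats keys as torch.cuda.memory_stats(), as used in MemoryTrace above.
    stats = torch.npu.memory_stats()
    peak_gb = torch.npu.max_memory_allocated() / 2**30
    alloc_retries = stats.get("num_alloc_retries", 0)
    torch.npu.empty_cache()
    return peak_gb, alloc_retries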