7 changes: 5 additions & 2 deletions .gitignore
@@ -9,17 +9,20 @@ debug.sh
transformers
wandb/
log/
**/exp/
*.log
outputs/
data/
jobs/
debug/
audio/

extra-info
kernel_meta
examples/s2s/scripts/debug
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
scripts_all
examples/hotwords_librispeech
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
**/fusion_result.*
32 changes: 32 additions & 0 deletions .hydra/config.yaml
@@ -0,0 +1,32 @@
dataset_config:
prompt: 请转写音频为文字
llm_name: Qwen2-7B-Instruct
prompt_style: normal
dataset: slidespeech
pad_or_trim: true
test_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
input_type: raw
mel_size: 128
inference_mode: true
model_config:
llm_name: Qwen2-7B-Instruct
llm_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
llm_dim: 3584
encoder_name: conformer
normalize: true
encoder_projector_ds_rate: 2
encoder_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
encoder_dim: 1280
encoder_projector: linear
train_config:
model_name: firered_asr
freeze_encoder: true
freeze_llm: true
use_peft: false
batching_strategy: custom
num_epochs: 1
val_batch_size: 8
num_workers_dataloader: 8
output_dir: ''
inference_mode: true
decode_log: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
183 changes: 183 additions & 0 deletions .hydra/hydra.yaml
@@ -0,0 +1,183 @@
hydra:
run:
dir: ''
sweep:
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
subdir: ${hydra.job.num}
launcher:
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
sweeper:
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
max_batch_size: null
params: null
help:
app_name: ${hydra.job.name}
header: '${hydra.help.app_name} is powered by Hydra.

'
footer: 'Powered by Hydra (https://hydra.cc)

Use --hydra-help to view Hydra specific help

'
template: '${hydra.help.header}

== Configuration groups ==

Compose your configuration from those groups (group=option)


$APP_CONFIG_GROUPS


== Config ==

Override anything in the config (foo.bar=value)


$CONFIG


${hydra.help.footer}

'
hydra_help:
template: 'Hydra (${hydra.runtime.version})

See https://hydra.cc for more info.


== Flags ==

$FLAGS_HELP


== Configuration groups ==

Compose your configuration from those groups (For example, append hydra/job_logging=disabled
to command line)


$HYDRA_CONFIG_GROUPS


Use ''--cfg hydra'' to Show the Hydra config.

'
hydra_help: ???
hydra_logging:
version: 1
formatters:
simple:
format: '[%(asctime)s][HYDRA] %(message)s'
handlers:
console:
class: logging.StreamHandler
formatter: simple
stream: ext://sys.stdout
root:
level: INFO
handlers:
- console
loggers:
logging_example:
level: DEBUG
disable_existing_loggers: false
job_logging:
version: 1
formatters:
simple:
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
handlers:
console:
class: logging.StreamHandler
formatter: simple
stream: ext://sys.stdout
file:
class: logging.FileHandler
formatter: simple
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
root:
level: INFO
handlers:
- console
- file
disable_existing_loggers: false
env: {}
mode: RUN
searchpath: []
callbacks: {}
output_subdir: .hydra
overrides:
hydra:
- hydra.run.dir=
- hydra.mode=RUN
task:
- ++model_config.llm_name=Qwen2-7B-Instruct
- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
- ++model_config.llm_dim=3584
- ++model_config.encoder_name=conformer
- ++model_config.normalize=true
- ++model_config.encoder_projector_ds_rate=2
- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
- ++model_config.encoder_dim=1280
- ++model_config.encoder_projector=linear
- ++dataset_config.llm_name=Qwen2-7B-Instruct
- ++dataset_config.prompt_style=normal
- ++dataset_config.dataset=slidespeech
- ++dataset_config.pad_or_trim=true
- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
- ++dataset_config.input_type=raw
- ++dataset_config.mel_size=128
- ++dataset_config.inference_mode=true
- ++train_config.model_name=firered_asr
- ++train_config.freeze_encoder=true
- ++train_config.freeze_llm=true
- ++train_config.use_peft=false
- ++train_config.batching_strategy=custom
- ++train_config.num_epochs=1
- ++train_config.val_batch_size=8
- ++train_config.num_workers_dataloader=8
- ++train_config.output_dir=
- ++train_config.inference_mode=true
- ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
job:
name: inference_fireredasr_deepspeed
chdir: null
override_dirname: ++dataset_config.dataset=slidespeech,++dataset_config.inference_mode=true,++dataset_config.input_type=raw,++dataset_config.llm_name=Qwen2-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/,++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal,++model_config.encoder_dim=1280,++model_config.encoder_name=conformer,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=2,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct,++model_config.normalize=true,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.inference_mode=true,++train_config.model_name=firered_asr,++train_config.num_epochs=1,++train_config.num_workers_dataloader=8,++train_config.output_dir=,++train_config.use_peft=false,++train_config.val_batch_size=8
id: ???
num: ???
config_name: prompt.yaml
env_set: {}
env_copy: []
config:
override_dirname:
kv_sep: '='
item_sep: ','
exclude_keys: []
runtime:
version: 1.3.2
version_base: '1.3'
cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM
config_sources:
- path: hydra.conf
schema: pkg
provider: hydra
- path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/conf
schema: file
provider: main
- path: ''
schema: structured
provider: schema
output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM
choices:
hydra/env: default
hydra/callbacks: null
hydra/job_logging: default
hydra/hydra_logging: default
hydra/hydra_help: default
hydra/help: default
hydra/sweeper: basic
hydra/launcher: basic
hydra/output: default
verbose: false
28 changes: 28 additions & 0 deletions .hydra/overrides.yaml
@@ -0,0 +1,28 @@
- ++model_config.llm_name=Qwen2-7B-Instruct
- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
- ++model_config.llm_dim=3584
- ++model_config.encoder_name=conformer
- ++model_config.normalize=true
- ++model_config.encoder_projector_ds_rate=2
- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
- ++model_config.encoder_dim=1280
- ++model_config.encoder_projector=linear
- ++dataset_config.llm_name=Qwen2-7B-Instruct
- ++dataset_config.prompt_style=normal
- ++dataset_config.dataset=slidespeech
- ++dataset_config.pad_or_trim=true
- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
- ++dataset_config.input_type=raw
- ++dataset_config.mel_size=128
- ++dataset_config.inference_mode=true
- ++train_config.model_name=firered_asr
- ++train_config.freeze_encoder=true
- ++train_config.freeze_llm=true
- ++train_config.use_peft=false
- ++train_config.batching_strategy=custom
- ++train_config.num_epochs=1
- ++train_config.val_batch_size=8
- ++train_config.num_workers_dataloader=8
- ++train_config.output_dir=
- ++train_config.inference_mode=true
- ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
68 changes: 68 additions & 0 deletions examples/aispeech_asr/README.md
@@ -0,0 +1,68 @@
# MALA-ASR_SLIDESPEECH

## Guides

[MaLa-ASR](https://www.arxiv.org/abs/2406.05839) is an LLM-based ASR model that can integrate textual keywords extracted from presentation slides to improve recognition of conference content.

![](docs/demo.png)

## Model Architecture

We use the official WavLM-Large model as our speech encoder, the public Vicuna 7B as our large language model decoder, and a simple linear projector consisting of a 1-D convolution layer followed by two linear layers. Refer to the [paper](https://www.arxiv.org/abs/2406.05839) for more details.

![](docs/model.png)
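
For intuition, here is a minimal PyTorch sketch of such a projector. The hidden size, downsampling rate, and the ReLU between the two linear layers are illustrative assumptions rather than the exact values used in this recipe.

```python
import torch
import torch.nn as nn

class LinearProjector(nn.Module):
    """Sketch of the projector described above: a 1-D convolution
    (downsampling the encoder frames in time) followed by two linear layers.
    encoder_dim / llm_dim / ds_rate are placeholders, not the recipe values."""

    def __init__(self, encoder_dim: int = 1024, llm_dim: int = 4096, ds_rate: int = 5):
        super().__init__()
        # Conv1d over the time axis; the stride performs the temporal downsampling.
        self.conv = nn.Conv1d(encoder_dim, encoder_dim, kernel_size=ds_rate, stride=ds_rate)
        self.linear1 = nn.Linear(encoder_dim, 2048)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(2048, llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, encoder_dim) -> (batch, time // ds_rate, llm_dim)
        x = self.conv(x.transpose(1, 2)).transpose(1, 2)
        return self.linear2(self.relu(self.linear1(x)))
```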

## Performance and checkpoints
We only train the linear projector in this recipe.
| Encoder | Projector | LLM | dev | test |
|---|---|---|---|---|
| [WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link) (~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 |


## Data preparation
Refer to the official [SLIDESPEECH CORPUS](https://slidespeech.github.io/).

Specifically, taking `slidespeech_dataset.py` as an example, the dataset requires four files: `my_wav.scp`, `utt2num_samples`, `text`, and `hot_related/ocr_1gram_top50_mmr070_hotwords_list`.

`my_wav.scp` is a list of audio paths. We convert the wav files into Kaldi ark files, so each entry points to an ark file plus a byte offset, e.g. (a sketch of reading such entries back follows the example):
```
ID1 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:22
ID2 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:90445
...
```
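
As an illustration of how such `ark:offset` entries can be read back (the recipe itself may load them differently), a sketch using the `kaldiio` package is shown below; it assumes the ark entries hold wav data written by Kaldi-compatible tools.

```python
import kaldiio

# Lazily open the scp; each entry maps utt_id -> (sample_rate, numpy array).
wavs = kaldiio.load_scp("my_wav.scp")
rate, audio = wavs["ID1"]
print(rate, audio.shape)
```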

To generate this file, you can download the audio wavs from https://www.openslr.org/144/ and the time segments from https://slidespeech.github.io/. The latter site provides the segments, transcription text, and OCR results at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/SlideSpeech/related_files.tar.gz (~1.37GB). You then need to cut the wavs using the timestamps provided in the `segments` file, for example as sketched below.
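
This sketch is illustrative only: the directory layout and the `segments` format (`<utt_id> <rec_id> <start_sec> <end_sec>`) are assumptions, and Kaldi's own segmentation tools are an alternative.

```python
import os
import soundfile as sf

os.makedirs("segmented", exist_ok=True)

# Hypothetical map from recording id to its downloaded wav file.
wav_of_rec = {"rec001": "wavs/rec001.wav"}

with open("segments") as f:
    for line in f:
        utt_id, rec_id, start, end = line.split()
        # Re-reading the full recording per segment is slow but keeps the sketch simple.
        audio, sr = sf.read(wav_of_rec[rec_id])
        piece = audio[int(float(start) * sr):int(float(end) * sr)]
        sf.write(os.path.join("segmented", f"{utt_id}.wav"), piece, sr)
```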


This _related_files.tar.gz_ also provides `text` and a file named `keywords`. The file `keywords` corresponds to `hot_related/ocr_1gram_top50_mmr070_hotwords_list`, which contains the hotword list.

`utt2num_samples` lists the length of each wav in samples and looks like (a sketch for generating it follows the example):
```
ID1 103680
ID2 181600
...
```
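
If you need to regenerate this file yourself, a minimal sketch is shown below; it assumes a plain `wav.scp` with `<utt_id> <wav_path>` entries for the already-segmented wavs (not the ark-offset `my_wav.scp` above).

```python
import soundfile as sf

with open("wav.scp") as scp, open("utt2num_samples", "w") as out:
    for line in scp:
        utt_id, path = line.split(maxsplit=1)
        # soundfile.info reports frames = number of samples per channel.
        out.write(f"{utt_id} {sf.info(path.strip()).frames}\n")
```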

## Decode with checkpoints
```
bash decode_MaLa-ASR_withkeywords_L95.sh
```
Modify the paths, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path`, and `decode_log`, in the script before running it.

## Train a new model

### Use a self-supervised model (such as WavLM) as the encoder
```
bash finetune_MaLa-ASR_withkeywords_L95.sh
```

## Citation
You can refer to the paper for more results.
```
@inproceedings{yang2024malaasr,
title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR},
author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen},
booktitle={Proc. INTERSPEECH},
year={2024},
}
```
38 changes: 38 additions & 0 deletions examples/aispeech_asr/conf/ds_config.json
@@ -0,0 +1,38 @@
{
"train_micro_batch_size_per_gpu": 2,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "AdamW",
"params": {
"lr": 5e-5,
"betas": [0.9, 0.999],
"eps": 1e-06,
"weight_decay": 0.01
}
},
"bf16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 100,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 0.01
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"overlap_comm": true,
"reduce_scatter": true,
"contiguous_gradients": true
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0.00,
"warmup_max_lr": 0.00005,
"warmup_num_steps": 1000
}
},
"checkpoint_activations": false

}