7 changes: 5 additions & 2 deletions .gitignore
@@ -9,17 +9,20 @@ debug.sh
transformers
wandb/
log/
**/exp/
*.log
outputs/
data/
jobs/
debug/
audio/

extra-info
kernel_meta
examples/s2s/scripts/debug
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
scripts_all
examples/hotwords_librispeech
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_debug.sh
**/fusion_result.*
32 changes: 32 additions & 0 deletions .hydra/config.yaml
@@ -0,0 +1,32 @@
dataset_config:
prompt: 请转写音频为文字
llm_name: Qwen2-7B-Instruct
prompt_style: normal
dataset: slidespeech
pad_or_trim: true
test_scp_file_path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
input_type: raw
mel_size: 128
inference_mode: true
model_config:
llm_name: Qwen2-7B-Instruct
llm_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
llm_dim: 3584
encoder_name: conformer
normalize: true
encoder_projector_ds_rate: 2
encoder_path: /aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
encoder_dim: 1280
encoder_projector: linear
train_config:
model_name: firered_asr
freeze_encoder: true
freeze_llm: true
use_peft: false
batching_strategy: custom
num_epochs: 1
val_batch_size: 8
num_workers_dataloader: 8
output_dir: ''
inference_mode: true
decode_log: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
183 changes: 183 additions & 0 deletions .hydra/hydra.yaml
@@ -0,0 +1,183 @@
hydra:
run:
dir: ''
sweep:
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
subdir: ${hydra.job.num}
launcher:
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
sweeper:
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
max_batch_size: null
params: null
help:
app_name: ${hydra.job.name}
header: '${hydra.help.app_name} is powered by Hydra.

'
footer: 'Powered by Hydra (https://hydra.cc)

Use --hydra-help to view Hydra specific help

'
template: '${hydra.help.header}

== Configuration groups ==

Compose your configuration from those groups (group=option)


$APP_CONFIG_GROUPS


== Config ==

Override anything in the config (foo.bar=value)


$CONFIG


${hydra.help.footer}

'
hydra_help:
template: 'Hydra (${hydra.runtime.version})

See https://hydra.cc for more info.


== Flags ==

$FLAGS_HELP


== Configuration groups ==

Compose your configuration from those groups (For example, append hydra/job_logging=disabled
to command line)


$HYDRA_CONFIG_GROUPS


Use ''--cfg hydra'' to Show the Hydra config.

'
hydra_help: ???
hydra_logging:
version: 1
formatters:
simple:
format: '[%(asctime)s][HYDRA] %(message)s'
handlers:
console:
class: logging.StreamHandler
formatter: simple
stream: ext://sys.stdout
root:
level: INFO
handlers:
- console
loggers:
logging_example:
level: DEBUG
disable_existing_loggers: false
job_logging:
version: 1
formatters:
simple:
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
handlers:
console:
class: logging.StreamHandler
formatter: simple
stream: ext://sys.stdout
file:
class: logging.FileHandler
formatter: simple
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
root:
level: INFO
handlers:
- console
- file
disable_existing_loggers: false
env: {}
mode: RUN
searchpath: []
callbacks: {}
output_subdir: .hydra
overrides:
hydra:
- hydra.run.dir=
- hydra.mode=RUN
task:
- ++model_config.llm_name=Qwen2-7B-Instruct
- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
- ++model_config.llm_dim=3584
- ++model_config.encoder_name=conformer
- ++model_config.normalize=true
- ++model_config.encoder_projector_ds_rate=2
- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
- ++model_config.encoder_dim=1280
- ++model_config.encoder_projector=linear
- ++dataset_config.llm_name=Qwen2-7B-Instruct
- ++dataset_config.prompt_style=normal
- ++dataset_config.dataset=slidespeech
- ++dataset_config.pad_or_trim=true
- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
- ++dataset_config.input_type=raw
- ++dataset_config.mel_size=128
- ++dataset_config.inference_mode=true
- ++train_config.model_name=firered_asr
- ++train_config.freeze_encoder=true
- ++train_config.freeze_llm=true
- ++train_config.use_peft=false
- ++train_config.batching_strategy=custom
- ++train_config.num_epochs=1
- ++train_config.val_batch_size=8
- ++train_config.num_workers_dataloader=8
- ++train_config.output_dir=
- ++train_config.inference_mode=true
- ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
job:
name: inference_fireredasr_deepspeed
chdir: null
override_dirname: ++dataset_config.dataset=slidespeech,++dataset_config.inference_mode=true,++dataset_config.input_type=raw,++dataset_config.llm_name=Qwen2-7B-Instruct,++dataset_config.mel_size=128,++dataset_config.pad_or_trim=true,++dataset_config.prompt_style=normal,++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/,++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal,++model_config.encoder_dim=1280,++model_config.encoder_name=conformer,++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar,++model_config.encoder_projector=linear,++model_config.encoder_projector_ds_rate=2,++model_config.llm_dim=3584,++model_config.llm_name=Qwen2-7B-Instruct,++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct,++model_config.normalize=true,++train_config.batching_strategy=custom,++train_config.freeze_encoder=true,++train_config.freeze_llm=true,++train_config.inference_mode=true,++train_config.model_name=firered_asr,++train_config.num_epochs=1,++train_config.num_workers_dataloader=8,++train_config.output_dir=,++train_config.use_peft=false,++train_config.val_batch_size=8
id: ???
num: ???
config_name: prompt.yaml
env_set: {}
env_copy: []
config:
override_dirname:
kv_sep: '='
item_sep: ','
exclude_keys: []
runtime:
version: 1.3.2
version_base: '1.3'
cwd: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM
config_sources:
- path: hydra.conf
schema: pkg
provider: hydra
- path: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/conf
schema: file
provider: main
- path: ''
schema: structured
provider: schema
output_dir: /aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM
choices:
hydra/env: default
hydra/callbacks: null
hydra/job_logging: default
hydra/hydra_logging: default
hydra/hydra_help: default
hydra/help: default
hydra/sweeper: basic
hydra/launcher: basic
hydra/output: default
verbose: false
28 changes: 28 additions & 0 deletions .hydra/overrides.yaml
@@ -0,0 +1,28 @@
- ++model_config.llm_name=Qwen2-7B-Instruct
- ++model_config.llm_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/Qwen2-7B-Instruct
- ++model_config.llm_dim=3584
- ++model_config.encoder_name=conformer
- ++model_config.normalize=true
- ++model_config.encoder_projector_ds_rate=2
- ++model_config.encoder_path=/aistor/aispeech/hpc_stor01/home/pengjing00sx/FireRedASR/pretrained_models/FireRedASR-LLM-L/asr_encoder.pth.tar
- ++model_config.encoder_dim=1280
- ++model_config.encoder_projector=linear
- ++dataset_config.llm_name=Qwen2-7B-Instruct
- ++dataset_config.prompt_style=normal
- ++dataset_config.dataset=slidespeech
- ++dataset_config.pad_or_trim=true
- ++dataset_config.test_scp_file_path=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/data/slidespeech/asr/test/
- ++dataset_config.input_type=raw
- ++dataset_config.mel_size=128
- ++dataset_config.inference_mode=true
- ++train_config.model_name=firered_asr
- ++train_config.freeze_encoder=true
- ++train_config.freeze_llm=true
- ++train_config.use_peft=false
- ++train_config.batching_strategy=custom
- ++train_config.num_epochs=1
- ++train_config.val_batch_size=8
- ++train_config.num_workers_dataloader=8
- ++train_config.output_dir=
- ++train_config.inference_mode=true
- ++decode_log=/aistor/aispeech/hpc_stor01/home/fangyangui/workingspace/project/SLAM-LLM/examples/asr_fireredasr/decode_slidespeech_asr_normal
68 changes: 68 additions & 0 deletions examples/aispeech_asr/README.md
@@ -0,0 +1,68 @@
# MALA-ASR_SLIDESPEECH

## Guides

[MaLa-ASR](https://www.arxiv.org/abs/2406.05839) is an LLM-based ASR model that can integrate textual keywords extracted from presentation slides to improve recognition of conference content.

![](docs/demo.png)

## Model Architecture

We use the official WavLM-Large model as our speech encoder, the public Vicuna 7B as our large language model decoder, and a simple linear projector consisting of a 1-D convolution layer followed by two linear layers. Refer to the [paper](https://www.arxiv.org/abs/2406.05839) for more details.

![](docs/model.png)
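
For intuition, here is a minimal PyTorch sketch of such a projector. The hidden size, downsampling rate, and the ReLU between the two linear layers are illustrative assumptions rather than the exact values used in this recipe.

```python
import torch
import torch.nn as nn

class LinearProjector(nn.Module):
    """Sketch of the projector described above: a 1-D convolution
    (downsampling the encoder frames in time) followed by two linear layers.
    encoder_dim / llm_dim / ds_rate are placeholders, not the recipe values."""

    def __init__(self, encoder_dim: int = 1024, llm_dim: int = 4096, ds_rate: int = 5):
        super().__init__()
        # Conv1d over the time axis; the stride performs the temporal downsampling.
        self.conv = nn.Conv1d(encoder_dim, encoder_dim, kernel_size=ds_rate, stride=ds_rate)
        self.linear1 = nn.Linear(encoder_dim, 2048)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(2048, llm_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, encoder_dim) -> (batch, time // ds_rate, llm_dim)
        x = self.conv(x.transpose(1, 2)).transpose(1, 2)
        return self.linear2(self.relu(self.linear1(x)))
```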

## Performance and checkpoints
We only train the linear projector in this recipe.
| Encoder | Projector | LLM | dev | test |
|---|---|---|---|---|
| [WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link) (~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 |


## Data preparation
Refer to the official [SLIDESPEECH CORPUS](https://slidespeech.github.io/).

Specifically, taking `slidespeech_dataset.py` as an example, the dataset requires four files: `my_wav.scp`, `utt2num_samples`, `text`, and `hot_related/ocr_1gram_top50_mmr070_hotwords_list`.

`my_wav.scp` is a list of audio paths. We convert the wav files into Kaldi ark files, so each entry points to an ark file plus a byte offset, e.g. (a sketch of reading such entries back follows the example):
```
ID1 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:22
ID2 xxx/slidespeech/dev_oracle_v1/data/format.1/data_wav.ark:90445
...
```
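
As an illustration of how such `ark:offset` entries can be read back (the recipe itself may load them differently), a sketch using the `kaldiio` package is shown below; it assumes the ark entries hold wav data written by Kaldi-compatible tools.

```python
import kaldiio

# Lazily open the scp; each entry maps utt_id -> (sample_rate, numpy array).
wavs = kaldiio.load_scp("my_wav.scp")
rate, audio = wavs["ID1"]
print(rate, audio.shape)
```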

To generate this file, you can download the audio wavs from https://www.openslr.org/144/ and the time segments from https://slidespeech.github.io/. The latter site provides the segments, transcription text, and OCR results at https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/SlideSpeech/related_files.tar.gz (~1.37GB). You then need to cut the wavs using the timestamps provided in the `segments` file, for example as sketched below.
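
This sketch is illustrative only: the directory layout and the `segments` format (`<utt_id> <rec_id> <start_sec> <end_sec>`) are assumptions, and Kaldi's own segmentation tools are an alternative.

```python
import os
import soundfile as sf

os.makedirs("segmented", exist_ok=True)

# Hypothetical map from recording id to its downloaded wav file.
wav_of_rec = {"rec001": "wavs/rec001.wav"}

with open("segments") as f:
    for line in f:
        utt_id, rec_id, start, end = line.split()
        # Re-reading the full recording per segment is slow but keeps the sketch simple.
        audio, sr = sf.read(wav_of_rec[rec_id])
        piece = audio[int(float(start) * sr):int(float(end) * sr)]
        sf.write(os.path.join("segmented", f"{utt_id}.wav"), piece, sr)
```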


This _related_files.tar.gz_ also provides `text` and a file named `keywords`. The file `keywords` corresponds to `hot_related/ocr_1gram_top50_mmr070_hotwords_list`, which contains the hotword list.

`utt2num_samples` lists the length of each wav in samples and looks like (a sketch for generating it follows the example):
```
ID1 103680
ID2 181600
...
```
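
If you need to regenerate this file yourself, a minimal sketch is shown below; it assumes a plain `wav.scp` with `<utt_id> <wav_path>` entries for the already-segmented wavs (not the ark-offset `my_wav.scp` above).

```python
import soundfile as sf

with open("wav.scp") as scp, open("utt2num_samples", "w") as out:
    for line in scp:
        utt_id, path = line.split(maxsplit=1)
        # soundfile.info reports frames = number of samples per channel.
        out.write(f"{utt_id} {sf.info(path.strip()).frames}\n")
```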

## Decode with checkpoints
```
bash decode_MaLa-ASR_withkeywords_L95.sh
```
Modify the paths, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path`, and `decode_log`, in the script before running it.

## Train a new model

### Use a self-supervised model (such as WavLM) as the encoder
```
bash finetune_MaLa-ASR_withkeywords_L95.sh
```

## Citation
You can refer to the paper for more results.
```
@inproceedings{yang2024malaasr,
title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR},
author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen},
booktitle={Proc. INTERSPEECH},
year={2024},
}
```
38 changes: 38 additions & 0 deletions examples/aispeech_asr/conf/ds_config.json
@@ -0,0 +1,38 @@
{
"train_micro_batch_size_per_gpu": 2,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "AdamW",
"params": {
"lr": 5e-5,
"betas": [0.9, 0.999],
"eps": 1e-06,
"weight_decay": 0.01
}
},
"bf16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 100,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 0.01
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"overlap_comm": true,
"reduce_scatter": true,
"contiguous_gradients": true
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0.00,
"warmup_max_lr": 0.00005,
"warmup_num_steps": 1000
}
},
"checkpoint_activations": false

}