|
# <img src="assets/bat.png" alt="SELD_SpatialSoundQA" width="25" height="25"> SELD_SpatialSoundQA

This repo hosts the code and models of "[BAT: Learning to Reason about Spatial Sounds with Large Language Models](https://arxiv.org/abs/2402.01591)" [ICML 2024 [bib](https://github.com/X-LANCE/SLAM-LLM/tree/main/examples/seld_spatialsoundqa#citation)].

Check out our [demo page](https://zhishengzheng.com/BAT/) and enjoy a QA game with spatial audio.

## Performance evaluation on **SpatialSoundQA**
We use [Spatial-AST](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/SpatialAST/finetuned.pth) as the audio encoder and [llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) as the LLM backbone. We finetune the model by adding a Q-Former and applying LoRA. To calculate mAP, refer to [calculate_map.py](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/scripts/calculate_map.py).
<img src="assets/performance.png" alt="Performance on SpatialSoundQA">
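For orientation, the mAP reported for the classification task is the mean of per-class average precision over a multi-label prediction matrix. The sketch below is an illustration of that metric, not necessarily the exact logic of the repo's `calculate_map.py`; the function names are ours.

```python
import numpy as np

def average_precision(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """AP for one class: mean precision at the rank of each positive sample."""
    order = np.argsort(-y_score)               # rank predictions, highest score first
    rel = y_true[order]                        # 0/1 relevance in ranked order
    hits = np.cumsum(rel)
    precision_at_k = hits / np.arange(1, len(rel) + 1)
    return float((precision_at_k * rel).sum() / rel.sum())

def mean_average_precision(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """mAP over all classes that have at least one positive sample."""
    aps = [
        average_precision(y_true[:, c], y_score[:, c])
        for c in range(y_true.shape[1])
        if y_true[:, c].sum() > 0
    ]
    return float(np.mean(aps))
```

`y_true` is a `(num_samples, num_classes)` binary ground-truth matrix and `y_score` holds the model's per-class scores; classes with no positives are skipped because AP is undefined for them.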

## Checkpoints
| Encoder | Projector | LLM |
|---|---|---|
| [Spatial-AST](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/SpatialAST/finetuned.pth) | [Q-Former](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/BAT/model.pt) (~73.56M) | [llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b) |

## Demo (Spatial Audio Inference)
Try [`inference.ipynb`](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/inference.ipynb).

## Data preparation
You need to prepare a data manifest in JSONL format; two example entries are shown below.
You can download the SpatialSoundQA dataset from [SpatialAudio](https://huggingface.co/datasets/zhisheng01/SpatialAudio).
```json
{
  "audio_id": "eval/audio/YI-HlrcP6Qg4",
  "reverb_id": "q9vSo1VnCiC/0.npy",
  "audio_id2": null,
  "reverb_id2": null,
  "question_id": 0,
  "question_type": "CLASSIFICATION",
  "question": "Enumerate the sound occurrences in the audio clip.",
  "answer": "accelerating, revving, vroom; car; vehicle"
}

...

{
  "audio_id": "eval/audio/YZX2fVPmUidA",
  "reverb_id": "q9vSo1VnCiC/32.npy",
  "audio_id2": "eval/audio/YjNjUU01quLs",
  "reverb_id2": "q9vSo1VnCiC/31.npy",
  "question_id": 58,
  "question_type": "MIXUP_NONBINARY_DISTANCE",
  "question": "How far away is the sound of the banjo from the sound of the whack, thwack?",
  "answer": "2m"
}
```
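In the manifest each record is one JSON object per line (the pretty-printed examples above are expanded for readability). A minimal loader sketch, with an illustrative function name of our choosing:

```python
import json

def load_spatialsoundqa(jsonl_path: str) -> list[dict]:
    """Read a SpatialSoundQA split: one JSON object per non-empty line."""
    entries = []
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
```

Note that mixup questions (e.g. `MIXUP_NONBINARY_DISTANCE`) carry a second source in `audio_id2`/`reverb_id2`, which are `null` for single-source questions.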

## Train a new model
```bash
cd examples/seld_spatialsoundqa/
bash scripts/finetune_spatial-ast_qformer_llama_2_7b.sh
```

## Decoding with checkpoints
```bash
cd examples/seld_spatialsoundqa/
bash scripts/decode_spatial-ast_qformer_llama_2_7b.sh
```


## TODO
- [x] Decode with checkpoints
- [x] Upload SpatialSoundQA dataset
- [x] Upload pretrained checkpoints
- [x] Update model performance

## Citation
```