diff --git a/examples/seld_spatialsoundqa/README.md b/examples/seld_spatialsoundqa/README.md
index 2cfcb63e..7a032027 100644
--- a/examples/seld_spatialsoundqa/README.md
+++ b/examples/seld_spatialsoundqa/README.md
@@ -15,7 +15,15 @@ Encoder | Projector | LLM
 [Spatial-AST](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/SpatialAST/finetuned.pth) | [Q-former](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/BAT/model.pt)(~73.56M) | [llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b)
 
 ## Demo (Spatial Audio Inference)
-Try [`inference.ipynb`](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/inference.ipynb).
+### Environment setup
+```
+cd SLAM-LLM/examples/seld_spatialsoundqa/
+pip install -r requirements.txt
+cd ../../
+pip install -e .
+```
+
+Then try [`inference.ipynb`](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/inference.ipynb).
 
 ## Data preparation
 
diff --git a/examples/seld_spatialsoundqa/requirements.txt b/examples/seld_spatialsoundqa/requirements.txt
new file mode 100644
index 00000000..b28d845a
--- /dev/null
+++ b/examples/seld_spatialsoundqa/requirements.txt
@@ -0,0 +1,18 @@
+timm==0.9.10
+soundfile
+numpy==1.26.4
+HyperPyYAML==1.2.2
+conformer==0.3.2
+deepspeed==0.14.2; sys_platform == 'linux'
+diffusers==0.27.2
+gradio==5.3.0
+grpcio==1.57.0
+grpcio-tools==1.57.0
+inflect==7.3.1
+matplotlib==3.7.5
+lightning==2.2.4
+wget==3.2
+librosa
+torchaudio==2.3.0
+torchlibrosa
+transformers==4.51.0
diff --git a/examples/seld_spatialsoundqa/seld_config.py b/examples/seld_spatialsoundqa/seld_config.py
index ce69f27f..65741a44 100644
--- a/examples/seld_spatialsoundqa/seld_config.py
+++ b/examples/seld_spatialsoundqa/seld_config.py
@@ -17,6 +17,7 @@ class ModelConfig:
     encoder_projector: str = "q-former"
     encoder_dim: int = 768
     qformer_layers: int = 8
+    query_len: int = 64
 
 @dataclass
 class PeftConfig: