diff --git a/examples/seld_spatialsoundqa/README.md b/examples/seld_spatialsoundqa/README.md
index 2cfcb63e..7a032027 100644
--- a/examples/seld_spatialsoundqa/README.md
+++ b/examples/seld_spatialsoundqa/README.md
@@ -15,7 +15,15 @@ Encoder | Projector | LLM
 [Spatial-AST](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/SpatialAST/finetuned.pth) | [Q-former](https://huggingface.co/datasets/zhisheng01/SpatialAudio/blob/main/BAT/model.pt)(~73.56M) | [llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b)
 
 ## Demo (Spatial Audio Inference)
-Try [`inference.ipynb`](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/inference.ipynb).
+### Environment setup
+```
+cd SLAM-LLM/examples/seld_spatialsoundqa/
+pip install -r requirements.txt
+cd ../../
+pip install -e .
+```
+
+Then try [`inference.ipynb`](https://github.com/X-LANCE/SLAM-LLM/blob/main/examples/seld_spatialsoundqa/inference.ipynb).
 
 ## Data preparation
 
diff --git a/examples/seld_spatialsoundqa/requirements.txt b/examples/seld_spatialsoundqa/requirements.txt
new file mode 100644
index 00000000..b28d845a
--- /dev/null
+++ b/examples/seld_spatialsoundqa/requirements.txt
@@ -0,0 +1,18 @@
+timm==0.9.10
+soundfile
+numpy==1.26.4
+HyperPyYAML==1.2.2
+conformer==0.3.2
+deepspeed==0.14.2; sys_platform == 'linux'
+diffusers==0.27.2
+gradio==5.3.0
+grpcio==1.57.0
+grpcio-tools==1.57.0
+inflect==7.3.1
+matplotlib==3.7.5
+lightning==2.2.4
+wget==3.2
+librosa
+torchaudio==2.3.0
+torchlibrosa
+transformers==4.51.0
diff --git a/examples/seld_spatialsoundqa/seld_config.py b/examples/seld_spatialsoundqa/seld_config.py
index ce69f27f..65741a44 100644
--- a/examples/seld_spatialsoundqa/seld_config.py
+++ b/examples/seld_spatialsoundqa/seld_config.py
@@ -17,6 +17,7 @@ class ModelConfig:
     encoder_projector: str = "q-former"
     encoder_dim: int = 768
     qformer_layers: int = 8
+    query_len: int = 64
 
 @dataclass
 class PeftConfig: