Skip to content

Commit 56fb822

Browse files
committed
10.11
1 parent 752b96e commit 56fb822

File tree

6 files changed

+53
-25
lines changed

6 files changed

+53
-25
lines changed

examples/slam_aac/scripts/clap_refine.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,16 @@ run_dir=/data/wenxi.chen/SLAM-LLM
55
cd $run_dir
66
code_dir=examples/slam_aac
77

8+
clap_dir=/data/xiquan.li/models/clap
9+
inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
10+
output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500
11+
12+
echo "Running CLAP-Refine"
13+
814
# -m debugpy --listen 6666 --wait-for-client
915
python ${code_dir}/utils/clap_refine.py \
1016
--start_beam 2 --end_beam 8 \
11-
--clap_ckpt /data/xiquan.li/models/clap/best_model.pt \
12-
--config /data/xiquan.li/models/clap/clap_config.yaml \
13-
--test_jsonl /data/xiquan.li/data/rz_cap/clotho/test_single.jsonl \
14-
--exp_explorer /data/wenxi.chen/models/clotho
17+
--clap_ckpt $clap_dir/best_model.pt \
18+
--config $clap_dir/clap_config.yaml \
19+
--test_jsonl $inference_data_path \
20+
--exp_explorer $output_dir

examples/slam_aac/scripts/inference_audiocaps_CLAP_Refine.sh

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,21 @@ code_dir=examples/slam_aac
88

99
audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
1010
llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
11+
clap_dir=/data/xiquan.li/models/clap
1112

12-
seed=42
1313
encoder_projector_ds_rate=5
1414

1515
inference_data_path=/data/wenxi.chen/data/audiocaps/new_test.jsonl
1616
output_dir=/data/wenxi.chen/cp/aac_epoch_2_step_182_audiocaps_seed42
1717

18-
# 定义beam范围
18+
# define the beam size range
1919
beam_range=(2 3 4 5 6 7 8)
2020

2121
for num_beams in "${beam_range[@]}"; do
2222
decode_log=$output_dir/decode_beam${num_beams}
2323

24-
if [ -f "$decode_log" ]; then
25-
echo "Decode log $decode_log already exists, skipping this beam size..."
24+
if [ -f "${decode_log}_pred" ]; then
25+
echo "Decode log ${decode_log}_pred already exists, skipping this beam size..."
2626
continue
2727
fi
2828

@@ -55,8 +55,8 @@ for num_beams in "${beam_range[@]}"; do
5555
++train_config.model_name=aac \
5656
++train_config.batching_strategy=custom \
5757
++train_config.num_epochs=1 \
58-
++train_config.val_batch_size=8 \
59-
++train_config.num_workers_dataloader=8 \
58+
++train_config.val_batch_size=4 \
59+
++train_config.num_workers_dataloader=0 \
6060
++train_config.output_dir=$output_dir \
6161
++train_config.freeze_encoder=true \
6262
++train_config.freeze_llm=false \
@@ -66,3 +66,17 @@ for num_beams in "${beam_range[@]}"; do
6666
++decode_log=$decode_log \
6767
++model_config.num_beams=$num_beams
6868
done
69+
70+
# note: to run inference with a model in which only the linear layer was trained, set '++train_config.use_peft=false' and '++train_config.freeze_llm=true'
71+
72+
echo "Running CLAP-Refine"
73+
74+
# -m debugpy --listen 6666 --wait-for-client
75+
python ${code_dir}/utils/clap_refine.py \
76+
--start_beam 2 --end_beam 8 \
77+
--clap_ckpt $clap_dir/best_model.pt \
78+
--config $clap_dir/clap_config.yaml \
79+
--test_jsonl $inference_data_path \
80+
--exp_explorer $output_dir
81+
82+
# bash /data/wenxi.chen/SLAM-LLM/examples/slam_aac/scripts/inference_audiocaps_CLAP_Refine.sh

examples/slam_aac/scripts/inference_audiocaps_bs.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
export CUDA_VISIBLE_DEVICES=1
2+
export CUDA_VISIBLE_DEVICES=0
33
export TOKENIZERS_PARALLELISM=false
44

55
run_dir=/data/wenxi.chen/SLAM-LLM
@@ -9,9 +9,8 @@ code_dir=examples/slam_aac
99
audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
1010
llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
1111

12-
seed=42
1312
encoder_projector_ds_rate=5
14-
num_beams=8
13+
num_beams=4
1514

1615
inference_data_path=/data/wenxi.chen/data/audiocaps/new_test.jsonl
1716
output_dir=/data/wenxi.chen/cp/aac_epoch_2_step_182_audiocaps_seed42

examples/slam_aac/scripts/inference_clotho_CLAP_Refine.sh

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,21 @@ code_dir=examples/slam_aac
88

99
audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
1010
llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
11+
clap_dir=/data/xiquan.li/models/clap
1112

12-
seed=42
1313
encoder_projector_ds_rate=5
1414

1515
inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
1616
output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500
1717

18-
18+
# define the beam size range
1919
beam_range=(2 3 4 5 6 7 8)
2020

2121
for num_beams in "${beam_range[@]}"; do
2222
decode_log=$output_dir/decode_beam${num_beams}
2323

24-
if [ -f "$decode_log" ]; then
25-
echo "Decode log $decode_log already exists, skipping this beam size..."
24+
if [ -f "${decode_log}_pred" ]; then
25+
echo "Decode log ${decode_log}_pred already exists, skipping this beam size..."
2626
continue
2727
fi
2828

@@ -55,8 +55,8 @@ for num_beams in "${beam_range[@]}"; do
5555
++train_config.model_name=aac \
5656
++train_config.batching_strategy=custom \
5757
++train_config.num_epochs=1 \
58-
++train_config.val_batch_size=8 \
59-
++train_config.num_workers_dataloader=8 \
58+
++train_config.val_batch_size=4 \
59+
++train_config.num_workers_dataloader=0 \
6060
++train_config.output_dir=$output_dir \
6161
++train_config.freeze_encoder=true \
6262
++train_config.freeze_llm=false \
@@ -68,4 +68,15 @@ for num_beams in "${beam_range[@]}"; do
6868
done
6969

7070
# note: to run inference with a model in which only the linear layer was trained, set '++train_config.use_peft=false' and '++train_config.freeze_llm=true'
71+
72+
echo "Running CLAP-Refine"
73+
74+
# -m debugpy --listen 6666 --wait-for-client
75+
python ${code_dir}/utils/clap_refine.py \
76+
--start_beam 2 --end_beam 8 \
77+
--clap_ckpt $clap_dir/best_model.pt \
78+
--config $clap_dir/clap_config.yaml \
79+
--test_jsonl $inference_data_path \
80+
--exp_explorer $output_dir
81+
7182
# bash /data/wenxi.chen/SLAM-LLM/examples/slam_aac/scripts/inference_clotho_CLAP_Refine.sh

examples/slam_aac/scripts/inference_clotho_bs.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
export CUDA_VISIBLE_DEVICES=4
2+
export CUDA_VISIBLE_DEVICES=1
33
export TOKENIZERS_PARALLELISM=false
44

55
run_dir=/data/wenxi.chen/SLAM-LLM
@@ -9,9 +9,8 @@ code_dir=examples/slam_aac
99
audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
1010
llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
1111

12-
seed=42
1312
encoder_projector_ds_rate=5
14-
num_beams=8
13+
num_beams=4
1514

1615
inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
1716
output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500

examples/slam_aac/utils/clap_refine.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,7 @@ def encode_audio(dl):
127127
config = args.config
128128
exp_explorer, test_jsonl = args.exp_explorer, args.test_jsonl
129129
start_beam, end_beam = args.start_beam, args.end_beam
130-
cand_files = [f'{exp_explorer}/decode_log_test_clean_beam{i}_repetition_penalty1_pred' for i in range(start_beam, end_beam+1)]
131-
130+
cand_files = [f'{exp_explorer}/decode_beam{i}_pred' for i in range(start_beam, end_beam+1)]
132131

133132
print(f"--Clap re-ranking for beam {start_beam}~{end_beam}--")
134133

@@ -163,7 +162,7 @@ def encode_audio(dl):
163162
best_captions.append(cand_captions[int(indices[args.rank-1][i])][i])
164163

165164
# Write
166-
output_file = exp_explorer + '/' + f"decode_log_test_clean_beam_{start_beam}-{end_beam}_pred"
165+
output_file = exp_explorer + '/' + f"decode_beam{start_beam}-{end_beam}_pred"
167166
with open(output_file, 'w') as f:
168167
for i, caption in enumerate(best_captions):
169168
audio_id = audio_ids[i]

0 commit comments

Comments
 (0)