10.11

cwx-worst-one · cwx-worst-one · commit 56fb82252c46 · 2024-10-11T08:18:44.000Z
diff --git a/examples/slam_aac/scripts/clap_refine.sh b/examples/slam_aac/scripts/clap_refine.sh
@@ -5,10 +5,16 @@ run_dir=/data/wenxi.chen/SLAM-LLM
 cd $run_dir
 code_dir=examples/slam_aac
 
+clap_dir=/data/xiquan.li/models/clap
+inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
+output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500
+
+echo "Running CLAP-Refine"
+
 # -m debugpy --listen 6666 --wait-for-client
 python ${code_dir}/utils/clap_refine.py \
     --start_beam 2 --end_beam 8 \
-    --clap_ckpt /data/xiquan.li/models/clap/best_model.pt \
-    --config /data/xiquan.li/models/clap/clap_config.yaml \
-    --test_jsonl /data/xiquan.li/data/rz_cap/clotho/test_single.jsonl \
-    --exp_explorer /data/wenxi.chen/models/clotho
+    --clap_ckpt $clap_dir/best_model.pt \
+    --config $clap_dir/clap_config.yaml \
+    --test_jsonl $inference_data_path \
+    --exp_explorer $output_dir
diff --git a/examples/slam_aac/scripts/inference_audiocaps_CLAP_Refine.sh b/examples/slam_aac/scripts/inference_audiocaps_CLAP_Refine.sh
@@ -8,21 +8,21 @@ code_dir=examples/slam_aac
 
 audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
 llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
+clap_dir=/data/xiquan.li/models/clap
 
-seed=42
 encoder_projector_ds_rate=5
 
 inference_data_path=/data/wenxi.chen/data/audiocaps/new_test.jsonl
 output_dir=/data/wenxi.chen/cp/aac_epoch_2_step_182_audiocaps_seed42
 
-# 定义beam范围
+# define the beam size range
 beam_range=(2 3 4 5 6 7 8)
 
 for num_beams in "${beam_range[@]}"; do
     decode_log=$output_dir/decode_beam${num_beams}
 
-    if [ -f "$decode_log" ]; then
-        echo "Decode log $decode_log already exists, skipping this beam size..."
+    if [ -f "${decode_log}_pred" ]; then
+        echo "Decode log ${decode_log}_pred already exists, skipping this beam size..."
         continue
     fi
 
@@ -55,8 +55,8 @@ for num_beams in "${beam_range[@]}"; do
         ++train_config.model_name=aac \
         ++train_config.batching_strategy=custom \
         ++train_config.num_epochs=1 \
-        ++train_config.val_batch_size=8 \
-        ++train_config.num_workers_dataloader=8 \
+        ++train_config.val_batch_size=4 \
+        ++train_config.num_workers_dataloader=0 \
         ++train_config.output_dir=$output_dir \
         ++train_config.freeze_encoder=true \
         ++train_config.freeze_llm=false \
@@ -66,3 +66,17 @@ for num_beams in "${beam_range[@]}"; do
         ++decode_log=$decode_log \
         ++model_config.num_beams=$num_beams
 done
+
+# note: to run inference on a model for which only the linear layer was trained, set '++train_config.use_peft=false' and '++train_config.freeze_llm=true'
+
+echo "Running CLAP-Refine"
+
+# -m debugpy --listen 6666 --wait-for-client
+python ${code_dir}/utils/clap_refine.py \
+    --start_beam 2 --end_beam 8 \
+    --clap_ckpt $clap_dir/best_model.pt \
+    --config $clap_dir/clap_config.yaml \
+    --test_jsonl $inference_data_path \
+    --exp_explorer $output_dir
+
+# bash /data/wenxi.chen/SLAM-LLM/examples/slam_aac/scripts/inference_audiocaps_CLAP_Refine.sh
diff --git a/examples/slam_aac/scripts/inference_audiocaps_bs.sh b/examples/slam_aac/scripts/inference_audiocaps_bs.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export CUDA_VISIBLE_DEVICES=1
+export CUDA_VISIBLE_DEVICES=0
 export TOKENIZERS_PARALLELISM=false
 
 run_dir=/data/wenxi.chen/SLAM-LLM
@@ -9,9 +9,8 @@ code_dir=examples/slam_aac
 audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
 llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
 
-seed=42
 encoder_projector_ds_rate=5
-num_beams=8
+num_beams=4
 
 inference_data_path=/data/wenxi.chen/data/audiocaps/new_test.jsonl
 output_dir=/data/wenxi.chen/cp/aac_epoch_2_step_182_audiocaps_seed42
diff --git a/examples/slam_aac/scripts/inference_clotho_CLAP_Refine.sh b/examples/slam_aac/scripts/inference_clotho_CLAP_Refine.sh
@@ -8,21 +8,21 @@ code_dir=examples/slam_aac
 
 audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
 llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
+clap_dir=/data/xiquan.li/models/clap
 
-seed=42
 encoder_projector_ds_rate=5
 
 inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
 output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500
 
-
+# define the beam size range
 beam_range=(2 3 4 5 6 7 8)
 
 for num_beams in "${beam_range[@]}"; do
     decode_log=$output_dir/decode_beam${num_beams}
 
-    if [ -f "$decode_log" ]; then
-        echo "Decode log $decode_log already exists, skipping this beam size..."
+    if [ -f "${decode_log}_pred" ]; then
+        echo "Decode log ${decode_log}_pred already exists, skipping this beam size..."
         continue
     fi
 
@@ -55,8 +55,8 @@ for num_beams in "${beam_range[@]}"; do
         ++train_config.model_name=aac \
         ++train_config.batching_strategy=custom \
         ++train_config.num_epochs=1 \
-        ++train_config.val_batch_size=8 \
-        ++train_config.num_workers_dataloader=8 \
+        ++train_config.val_batch_size=4 \
+        ++train_config.num_workers_dataloader=0 \
         ++train_config.output_dir=$output_dir \
         ++train_config.freeze_encoder=true \
         ++train_config.freeze_llm=false \
@@ -68,4 +68,15 @@ for num_beams in "${beam_range[@]}"; do
 done
 
 # note: to inference model trained the linear layer only, you could set '++train_config.use_peft=false' and 'train_config.freeze_llm=true'
+
+echo "Running CLAP-Refine"
+
+# -m debugpy --listen 6666 --wait-for-client
+python ${code_dir}/utils/clap_refine.py \
+    --start_beam 2 --end_beam 8 \
+    --clap_ckpt $clap_dir/best_model.pt \
+    --config $clap_dir/clap_config.yaml \
+    --test_jsonl $inference_data_path \
+    --exp_explorer $output_dir
+
 # bash /data/wenxi.chen/SLAM-LLM/examples/slam_aac/scripts/inference_clotho_CLAP_Refine.sh
diff --git a/examples/slam_aac/scripts/inference_clotho_bs.sh b/examples/slam_aac/scripts/inference_clotho_bs.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export CUDA_VISIBLE_DEVICES=4
+export CUDA_VISIBLE_DEVICES=1
 export TOKENIZERS_PARALLELISM=false
 
 run_dir=/data/wenxi.chen/SLAM-LLM
@@ -9,9 +9,8 @@ code_dir=examples/slam_aac
 audio_encoder_path=/data/xiquan.li/models/EAT-base_epoch30_ft.pt
 llm_path=/data/xiquan.li/models/vicuna-7b-v1.5
 
-seed=42
 encoder_projector_ds_rate=5
-num_beams=8
+num_beams=4
 
 inference_data_path=/data/wenxi.chen/data/clotho/evaluation_single.jsonl
 output_dir=/data/wenxi.chen/cp/wavcaps_pt_v7_epoch4-clotho_ft-seed10086_btz4_lr8e-6-short_prompt_10w/aac_epoch_1_step_4500
diff --git a/examples/slam_aac/utils/clap_refine.py b/examples/slam_aac/utils/clap_refine.py
@@ -127,8 +127,7 @@ def encode_audio(dl):
     config = args.config
     exp_explorer, test_jsonl = args.exp_explorer, args.test_jsonl
     start_beam, end_beam = args.start_beam, args.end_beam
-    cand_files = [f'{exp_explorer}/decode_log_test_clean_beam{i}_repetition_penalty1_pred' for i in range(start_beam, end_beam+1)]
-
+    cand_files = [f'{exp_explorer}/decode_beam{i}_pred' for i in range(start_beam, end_beam+1)]
 
     print(f"--Clap re-ranking for beam {start_beam}~{end_beam}--")
 
@@ -163,7 +162,7 @@ def encode_audio(dl):
         best_captions.append(cand_captions[int(indices[args.rank-1][i])][i])
 
     # Write
-    output_file = exp_explorer + '/' + f"decode_log_test_clean_beam_{start_beam}-{end_beam}_pred"
+    output_file = exp_explorer + '/' + f"decode_beam{start_beam}-{end_beam}_pred"
     with open(output_file, 'w') as f: 
         for i, caption in enumerate(best_captions): 
             audio_id = audio_ids[i]