#!/bin/bash

# Test-case definitions for the ranking regression checks.
#
# Each test case is described by the entries at the same index of three
# parallel arrays:
#   TEST_NAMES            - human-readable label used in the report
#   TEST_COMMANDS         - shell command line that runs the reranker
#   TEST_EXPECTED_SCORES  - reference nDCG@10 value for that command
#
# Always add cases through register_test so the three arrays cannot drift
# out of step with one another.

TEST_NAMES=()
TEST_COMMANDS=()
TEST_EXPECTED_SCORES=()

#######################################
# Appends one test case to the three parallel arrays.
# Globals:   TEST_NAMES, TEST_COMMANDS, TEST_EXPECTED_SCORES (appended to)
# Arguments: $1 - test label
#            $2 - full command line, stored as a single string
#            $3 - expected nDCG@10 score
# Returns:   0 on success, 1 if not called with exactly three arguments
#######################################
register_test() {
  if (( $# != 3 )); then
    echo "register_test: expected 3 arguments (name, command, score), got $#" >&2
    return 1
  fi
  TEST_NAMES+=("$1")
  TEST_COMMANDS+=("$2")
  TEST_EXPECTED_SCORES+=("$3")
}

register_test "FirstMistral (Alpha, Logits)" \
  "python src/rank_llm/scripts/run_rank_llm.py --model_path=castorini/first_mistral --top_k_candidates=50 --dataset=dl19 --retrieval_method=bm25 --prompt_mode=rank_GPT --context_size=4096 --use_alpha --use_logits --max_queries=3" \
  0.8085

register_test "RZ" \
  "python src/rank_llm/scripts/run_rank_llm.py --model_path=castorini/rank_zephyr_7b_v1_full --top_k_candidates=50 --dataset=dl20 --retrieval_method=SPLADE++_EnsembleDistil_ONNX --prompt_mode=rank_GPT --context_size=4096 --max_queries=3" \
  0.7662

# NOTE(review): the label says "Alpha" but this command does not pass
# --use_alpha (unlike the FirstMistral case) -- confirm which one is intended.
register_test "Qwen (Alpha)" \
  "python src/rank_llm/scripts/run_rank_llm.py --model_path=Qwen/Qwen2.5-7B-Instruct --top_k_candidates=50 --dataset=dl21 --retrieval_method=bm25 --prompt_mode=rank_GPT --context_size=4096 --variable_passages --max_queries=3" \
  0.7157

register_test "Monot5" \
  "python src/rank_llm/scripts/run_rank_llm.py --model_path=castorini/monot5-3b-msmarco-10k --top_k_candidates=50 --dataset=dl22 --retrieval_method=bm25 --prompt_mode=rank_GPT --context_size=4096 --variable_passages --max_queries=3" \
  0.3997

# Unlike the other cases, this one is limited to a single query.
register_test "Duot5" \
  "python src/rank_llm/scripts/run_rank_llm.py --model_path=castorini/duot5-3b-msmarco-10k --top_k_candidates=50 --dataset=dl23 --retrieval_method=bm25 --prompt_mode=rank_GPT --context_size=4096 --variable_passages --max_queries=1" \
  0.7246

# Relative tolerance applied around each expected score; 0.025 means the
# actual score must lie within +/-2.5% of the expected one. Can be overridden
# from the environment.
SCORE_TOLERANCE="${SCORE_TOLERANCE:-0.025}"

#######################################
# Prints the first nDCG@10 value found in a block of command output.
# Looks for "ndcg_cut_10 <blanks> all <blanks> <decimal>" and prints only the
# decimal. Uses a Bash regex so that no GNU-only grep options are required and
# so that at most one value is ever returned.
# Arguments: $1 - captured command output
# Outputs:   the score on stdout
# Returns:   0 if a score was found, 1 otherwise
#######################################
extract_ndcg10() {
  local pattern='ndcg_cut_10[[:blank:]]+all[[:blank:]]+([0-9]+\.[0-9]+)'
  [[ "$1" =~ $pattern ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

#######################################
# Checks whether a score lies inside the inclusive tolerance band
# [expected * (1 - tol), expected * (1 + tol)].
# Arguments: $1 - actual score
#            $2 - expected score
#            $3 - relative tolerance (optional, defaults to SCORE_TOLERANCE)
# Returns:   0 if inside the band, 1 otherwise
#######################################
score_within_tolerance() {
  awk -v actual="$1" -v expected="$2" -v tol="${3:-${SCORE_TOLERANCE}}" 'BEGIN {
    # Tiny slack so that values sitting exactly on a bound are not rejected
    # because of binary floating-point rounding.
    eps = 1e-9
    lo = expected * (1 - tol)
    hi = expected * (1 + tol)
    exit !((actual + 0) >= lo - eps && (actual + 0) <= hi + eps)
  }'
}

#######################################
# Runs every registered test command and compares its nDCG@10 score with the
# expected value.
# Globals:   TEST_NAMES, TEST_COMMANDS, TEST_EXPECTED_SCORES (read)
# Outputs:   one progress line and one PASS/FAIL/ERROR line per test on
#            stdout; diagnostics for failed extractions on stderr
# Returns:   0 if every test passed, 1 if any test failed, produced no score,
#            or the test arrays have different lengths
#######################################
run_regression_tests() {
  local total="${#TEST_NAMES[@]}"
  if (( total != ${#TEST_COMMANDS[@]} || total != ${#TEST_EXPECTED_SCORES[@]} )); then
    echo "TEST_NAMES, TEST_COMMANDS and TEST_EXPECTED_SCORES must have the same length" >&2
    return 1
  fi

  local idx name cmd expected output rc score
  local failures=0

  for idx in "${!TEST_NAMES[@]}"; do
    name="${TEST_NAMES[$idx]}"
    cmd="${TEST_COMMANDS[$idx]}"
    expected="${TEST_EXPECTED_SCORES[$idx]}"

    echo "Running $name..."

    # eval is needed because each command is stored as one string; the strings
    # are expected to be trusted, hard-coded entries of TEST_COMMANDS.
    output=$(eval "$cmd" 2>&1)
    rc=$?

    if ! score=$(extract_ndcg10 "$output"); then
      echo "❌ ERROR: Could not extract nDCG@10 score for '$name'"
      # Surface the end of the output so the cause can be diagnosed.
      {
        printf 'Command exited with status %d; last lines of its output:\n' "$rc"
        printf '%s\n' "$output" | tail -n 20
      } >&2
      failures=$((failures + 1))
      continue
    fi

    if score_within_tolerance "$score" "$expected"; then
      echo "$name: PASS ✅ (Actual Score: $score, Expected Score: $expected)"
    else
      echo "$name: FAIL ❌ (Actual Score: $score, Expected Score: $expected)"
      failures=$((failures + 1))
    fi
  done

  if (( failures > 0 )); then
    echo "$failures of $total test(s) did not pass." >&2
    return 1
  fi
  return 0
}

# Propagate failures through the exit status so CI can detect regressions.
run_regression_tests || exit 1