Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
6047669
Add MAEB task selection method with correlation and clustering
isaac-chung Jan 4, 2026
d1d4521
Enhance MAEB task selection with domain/category/language preservation
isaac-chung Jan 4, 2026
9d6b459
Rearrange MAEB notebook: clustering before task selection + outlier-b…
isaac-chung Jan 4, 2026
236534a
Fix iterative task removal to skip protected pairs instead of stopping
isaac-chung Jan 4, 2026
8eb681f
Add model performance heatmap after results DataFrame
isaac-chung Jan 4, 2026
b05918f
Add MAEB(audio-text) benchmark and improve task selection notebook
isaac-chung Jan 5, 2026
f1688c9
Expand MAEB(audio-text) to full collection and fix msclap revision
isaac-chung Jan 5, 2026
61ed764
add missing files
isaac-chung Jan 5, 2026
217399b
Add LaTeX table generation script for MAEB benchmarks
isaac-chung Jan 5, 2026
8b023dc
make lint
isaac-chung Jan 5, 2026
627b11c
Add MAEB benchmark with all 96 audio modality tasks
isaac-chung Jan 6, 2026
fb98995
Fix UrbanSound8k task class naming conflict
isaac-chung Jan 6, 2026
3147c20
Upgrade datasets to v4+ for Python 3.14 compatibility
isaac-chung Jan 6, 2026
b244226
Track uv.lock in repository
isaac-chung Jan 6, 2026
fb7061a
Export SpeechCommandsZeroshotClassificationv02 task
isaac-chung Jan 6, 2026
411a4ce
Fix JamAlt task class naming to match metadata names
isaac-chung Jan 6, 2026
e1c1d64
Revert "Track uv.lock in repository"
isaac-chung Jan 6, 2026
aebf51d
Revert "Upgrade datasets to v4+ for Python 3.14 compatibility"
isaac-chung Jan 6, 2026
24d11a0
Add MAEB benchmarks to leaderboard with Audio tab
isaac-chung Jan 7, 2026
5c541ec
Merge remote-tracking branch 'origin/maeb' into maeb-task-selection
isaac-chung Jan 7, 2026
0d989b5
Update MAEB_AUDIO_TEXT benchmark and contacts
isaac-chung Jan 7, 2026
20193f1
Remove zero-shot tasks from MAEB(audio) benchmark
isaac-chung Jan 7, 2026
bf49fb3
fix metadata test for SpeechCommandsZeroshotv0.02
isaac-chung Jan 7, 2026
643f674
Merge origin/maeb into maeb-task-selection
isaac-chung Jan 7, 2026
27b0dfb
add 2 draft benchmarks and fix dep python issue
isaac-chung Jan 7, 2026
8c94b75
add MAEB(audio-text, lite)
isaac-chung Jan 7, 2026
87254f6
[MAEB] Refactor audio benchmarks for model coverage optimization
isaac-chung Jan 7, 2026
3af5e6e
Fix leaderboard prerun using gr.update() objects instead of raw values
isaac-chung Jan 7, 2026
6494a1e
Fix leaderboard race conditions and remove AudioCapsMiniReranking
isaac-chung Jan 7, 2026
f8a5ede
Update LaTeX table generator to use lite benchmarks
isaac-chung Jan 8, 2026
e172ae3
Fix radar chart task type matching for Any2Any tasks
isaac-chung Jan 8, 2026
bde282e
Add MAEB evaluation time analysis notebook
isaac-chung Jan 8, 2026
54f5e02
Merge maeb branch into maeb-task-selection
isaac-chung Jan 9, 2026
7906a8c
Add MAEB(audio-text, extended) and refine MAEB(audio-text, lite)
isaac-chung Jan 9, 2026
e26491c
Add CLI script to calculate benchmark eval times
isaac-chung Jan 9, 2026
77b234b
Add script to compute lite vs extended benchmark correlations
isaac-chung Jan 9, 2026
5091e5a
Merge branch 'maeb' into maeb-task-selection
isaac-chung Jan 9, 2026
247ade3
Fix UrbanSound8kClassification import in zxx __init__.py
isaac-chung Jan 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ leaderboard-test-all:

run-leaderboard:
@echo "--- 🚀 Running leaderboard locally ---"
uv run --extra leaderboard python -m mteb.leaderboard.app
uv run --no-sync --extra leaderboard python -m mteb.leaderboard.app

format-citations:
@echo "--- 🧹 Formatting citations ---"
Expand Down
10 changes: 10 additions & 0 deletions mteb/benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
JMTEB_LITE_V1,
JMTEB_V2,
LONG_EMBED,
MAEB,
MAEB_AUDIO_EXTENDED,
MAEB_AUDIO_LITE,
MAEB_AUDIO_TEXT_EXTENDED,
MAEB_AUDIO_TEXT_LITE,
MIEB_ENG,
MIEB_IMG,
MIEB_LITE,
Expand Down Expand Up @@ -80,6 +85,11 @@
"JMTEB_LITE_V1",
"JMTEB_V2",
"LONG_EMBED",
"MAEB",
"MAEB_AUDIO_EXTENDED",
"MAEB_AUDIO_LITE",
"MAEB_AUDIO_TEXT_EXTENDED",
"MAEB_AUDIO_TEXT_LITE",
"MIEB_ENG",
"MIEB_IMG",
"MIEB_LITE",
Expand Down
318 changes: 318 additions & 0 deletions mteb/benchmarks/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2728,3 +2728,321 @@
""",
contacts=["lsz05"],
)

# Full MAEB collection: every task name below is resolved through get_tasks().
# The per-group counts in the comments add up to 96
# (35 + 10 + 5 + 5 + 5 + 18 + 18), which is the number quoted in the description.
# NOTE(review): JamAltArtistA2ARetrieval is an audio-to-audio task and is not
# named among the 7 task types listed in the description — confirm whether the
# description should mention it.
MAEB = Benchmark(
    name="MAEB",
    display_name="MAEB, Full",
    icon="https://raw.githubusercontent.com/DennisSuitters/LibreICONS/master/svg/libre-gui-activity.svg",
    tasks=get_tasks(
        tasks=[
            # Classification (35)
            "AmbientAcousticContext",
            "AudioSet",
            "AudioSetMini",
            "BeijingOpera",
            "BirdCLEF",
            "BirdSet",
            "CommonLanguageAgeDetection",
            "CommonLanguageGenderDetection",
            "CommonLanguageLanguageDetection",
            "CREMA_D",
            "ESC50",
            "FSD2019Kaggle",
            "FSD50K",
            "FSDD",
            "GTZANGenre",
            "GunshotTriangulation",
            "IEMOCAPEmotion",
            "IEMOCAPGender",
            "LibriCount",
            "MInDS14",
            "MridinghamStroke",
            "MridinghamTonic",
            "NSynth",
            "SIBFLEURS",
            "SpeechCommands",
            "SpokeNEnglish",
            "SpokenQAForIC",
            "TUTAcousticScenes",
            "UrbanSound8k",
            "VocalSound",
            "VoxCelebSA",
            "VoxLingua107_Top10",
            "VoxPopuliAccentID",
            "VoxPopuliGenderID",
            "VoxPopuliLanguageID",
            # Clustering (10)
            "AmbientAcousticContextClustering",
            "CREMA_DClustering",
            "ESC50Clustering",
            "GTZANGenreClustering",
            "MusicGenreClustering",
            "VehicleSoundClustering",
            "VoiceGenderClustering",
            "VoxCelebClustering",
            "VoxPopuliAccentClustering",
            "VoxPopuliGenderClustering",
            # PairClassification (5)
            "CREMADPairClassification",
            "ESC50PairClassification",
            "NMSQAPairClassification",
            "VocalSoundPairClassification",
            "VoxPopuliAccentPairClassification",
            # Reranking (5)
            "ESC50AudioReranking",
            "FSDnoisy18kAudioReranking",
            "GTZANAudioReranking",
            "UrbanSound8KAudioReranking",
            "VocalSoundAudioReranking",
            # Zeroshot Classification (5)
            "ESC50_Zeroshot",
            "RavdessZeroshot",
            "SpeechCommandsZeroshotv0.01",
            "SpeechCommandsZeroshotv0.02",
            "UrbanSound8kZeroshot",
            # Audio-to-Text Retrieval (17) + Audio-to-Audio Retrieval (1)
            "AudioCapsA2TRetrieval",
            "AudioSetStrongA2TRetrieval",
            "ClothoA2TRetrieval",
            "CMUArcticA2TRetrieval",
            "CommonVoice17A2TRetrieval",
            "CommonVoice21A2TRetrieval",
            "EmoVDBA2TRetrieval",
            "FleursA2TRetrieval",
            "GigaSpeechA2TRetrieval",
            "HiFiTTSA2TRetrieval",
            "JamAltArtistA2ARetrieval",  # the single audio-to-audio entry
            "JamAltLyricA2TRetrieval",
            "JLCorpusA2TRetrieval",
            "LibriTTSA2TRetrieval",
            "MACSA2TRetrieval",
            "MusicCapsA2TRetrieval",
            "SoundDescsA2TRetrieval",
            "UrbanSound8KA2TRetrieval",
            # Text-to-Audio Retrieval (18)
            "AudioCapsT2ARetrieval",
            "AudioSetStrongT2ARetrieval",
            "ClothoT2ARetrieval",
            "CMUArcticT2ARetrieval",
            "CommonVoice17T2ARetrieval",
            "CommonVoice21T2ARetrieval",
            "EmoVDBT2ARetrieval",
            "FleursT2ARetrieval",
            "GigaSpeechT2ARetrieval",
            "HiFiTTST2ARetrieval",
            "JamAltLyricT2ARetrieval",
            "JLCorpusT2ARetrieval",
            "LibriTTST2ARetrieval",
            "MACST2ARetrieval",
            "MusicCapsT2ARetrieval",
            "SoundDescsT2ARetrieval",
            "SpokenSQuADT2ARetrieval",
            "UrbanSound8KT2ARetrieval",
        ]
    ),
    description="""MAEB is the full Massive Audio Embedding Benchmark (v1), containing 96 tasks with audio modality across 7 task types: classification, clustering, pair classification, reranking, zero-shot classification, audio-to-text retrieval, and text-to-audio retrieval.""",
    reference=None,
    citation="",
    contacts=["AdnanElAssadi56", "isaac-chung", "KennethEnevoldsen", "Samoed"],
)

# Lightweight audio-only MAEB variant. The 18 task names below are resolved
# through get_tasks(); each of them also appears in MAEB(audio, extended).
# Group sizes: 2 + 5 + 5 + 5 + 1 = 18, matching the count in the description.
# NOTE(review): the "44 models" coverage figure in the description cannot be
# checked from this file — confirm against the task-selection results.
MAEB_AUDIO_LITE = Benchmark(
    name="MAEB(audio, lite)",
    display_name="Audio, Lite",
    icon="https://raw.githubusercontent.com/DennisSuitters/LibreICONS/master/svg/libre-gui-activity.svg",
    tasks=get_tasks(
        tasks=[
            # AudioMultilabelClassification (2)
            "FSD50K",
            "SIBFLEURS",
            # AudioClassification (5)
            "VoxPopuliAccentID",
            "MInDS14",
            "VoxPopuliGenderID",
            "BeijingOpera",
            "AmbientAcousticContext",
            # AudioReranking (5)
            "ESC50AudioReranking",
            "UrbanSound8KAudioReranking",
            "GTZANAudioReranking",
            "FSDnoisy18kAudioReranking",
            "VocalSoundAudioReranking",
            # AudioClustering (5)
            "VoiceGenderClustering",
            "VoxPopuliAccentClustering",
            "AmbientAcousticContextClustering",
            "VoxCelebClustering",
            "VoxPopuliGenderClustering",
            # AudioPairClassification (1)
            "VoxPopuliAccentPairClassification",
        ]
    ),
    description="""MAEB(audio, lite) is a lightweight audio-only benchmark with 18 tasks optimized for maximum model coverage (44 models). Tasks span 5 task types: classification, multilabel classification, reranking, clustering, and pair classification. Selected by greedy task addition while maximizing the number of models with complete results.""",
    reference=None,
    citation="",
    contacts=["AdnanElAssadi56", "isaac-chung", "KennethEnevoldsen", "Samoed"],
)

# Extended audio-only MAEB variant with 53 task names resolved through
# get_tasks(). Within each group the entries that also belong to
# MAEB(audio, lite) come first, followed by the extended-only additions.
# Group sizes: 4 + 28 + 5 + 10 + 5 + 1 = 53, matching the description.
# NOTE(review): the "38 models" coverage figure in the description cannot be
# checked from this file — confirm against the task-selection results.
MAEB_AUDIO_EXTENDED = Benchmark(
    name="MAEB(audio, extended)",
    display_name="Audio, Extended",
    icon="https://raw.githubusercontent.com/DennisSuitters/LibreICONS/master/svg/libre-gui-activity.svg",
    tasks=get_tasks(
        tasks=[
            # AudioMultilabelClassification (4)
            "FSD50K",
            "SIBFLEURS",
            "FSD2019Kaggle",
            "AudioSetMini",
            # AudioClassification (28)
            "VoxPopuliAccentID",
            "MInDS14",
            "VoxPopuliGenderID",
            "BeijingOpera",
            "AmbientAcousticContext",
            "CREMA_D",
            "VoxCelebSA",
            "TUTAcousticScenes",
            "NSynth",
            "VocalSound",
            "VoxLingua107_Top10",
            "ESC50",
            "CommonLanguageAgeDetection",
            "IEMOCAPEmotion",
            "CommonLanguageLanguageDetection",
            "CommonLanguageGenderDetection",
            "IEMOCAPGender",
            "SpokeNEnglish",
            "FSDD",
            "LibriCount",
            "GTZANGenre",
            "BirdCLEF",
            "VoxPopuliLanguageID",
            "MridinghamStroke",
            "GunshotTriangulation",
            "SpeechCommands",
            "MridinghamTonic",
            "BirdSet",
            # AudioReranking (5) - same five tasks as the lite benchmark
            "ESC50AudioReranking",
            "UrbanSound8KAudioReranking",
            "GTZANAudioReranking",
            "FSDnoisy18kAudioReranking",
            "VocalSoundAudioReranking",
            # AudioClustering (10)
            "VoiceGenderClustering",
            "VoxPopuliAccentClustering",
            "AmbientAcousticContextClustering",
            "VoxCelebClustering",
            "VoxPopuliGenderClustering",
            "VehicleSoundClustering",
            "MusicGenreClustering",
            "ESC50Clustering",
            "CREMA_DClustering",
            "GTZANGenreClustering",
            # AudioPairClassification (5)
            "VoxPopuliAccentPairClassification",
            "ESC50PairClassification",
            "NMSQAPairClassification",
            "VocalSoundPairClassification",
            "CREMADPairClassification",
            # Audio2AudioRetrieval (1)
            "JamAltArtistA2ARetrieval",
        ]
    ),
    description="""MAEB(audio, extended) is an extended audio-only benchmark with 53 tasks supporting 38 models with complete results. Tasks span 6 task types: classification (28), multilabel classification (4), reranking (5), clustering (10), pair classification (5), and audio-to-audio retrieval (1). This benchmark provides broader task diversity while maintaining good model coverage.""",
    reference=None,
    citation="",
    contacts=["AdnanElAssadi56", "isaac-chung", "KennethEnevoldsen", "Samoed"],
)

# Lightweight audio-text MAEB variant with 17 task names resolved through
# get_tasks(): 13 text-to-audio retrieval tasks and 4 zero-shot classification
# tasks. Only the v0.02 SpeechCommands zero-shot task is listed here.
# NOTE(review): the "10 models" coverage figure and the model families named
# in the description cannot be checked from this file — confirm separately.
MAEB_AUDIO_TEXT_LITE = Benchmark(
    name="MAEB(audio-text, lite)",
    display_name="Audio-Text, Lite",
    icon="https://raw.githubusercontent.com/DennisSuitters/LibreICONS/master/svg/libre-gui-activity.svg",
    tasks=get_tasks(
        tasks=[
            # Any2AnyRetrieval - Text to Audio (13)
            "AudioCapsT2ARetrieval",
            "AudioSetStrongT2ARetrieval",
            "CMUArcticT2ARetrieval",
            "EmoVDBT2ARetrieval",
            "GigaSpeechT2ARetrieval",
            "HiFiTTST2ARetrieval",
            "JLCorpusT2ARetrieval",
            "JamAltLyricT2ARetrieval",
            "LibriTTST2ARetrieval",
            "MACST2ARetrieval",
            "MusicCapsT2ARetrieval",
            "SpokenSQuADT2ARetrieval",
            "UrbanSound8KT2ARetrieval",
            # AudioZeroshotClassification (4)
            "ESC50_Zeroshot",
            "RavdessZeroshot",
            "SpeechCommandsZeroshotv0.02",
            "UrbanSound8kZeroshot",
        ]
    ),
    description="""MAEB(audio-text, lite) is a lightweight benchmark for audio-text cross-modal models with 17 tasks optimized for maximum model coverage (10 models). Tasks span 2 task types: text-to-audio retrieval (13) and zero-shot classification (4). Models include CLAP variants, wav2clip, and speecht5.""",
    reference=None,
    citation="",
    contacts=["AdnanElAssadi56", "isaac-chung", "KennethEnevoldsen", "Samoed"],
)

# Extended audio-text MAEB variant with 36 task names resolved through
# get_tasks(). It contains all 17 tasks of MAEB(audio-text, lite) (13
# text-to-audio retrieval + 4 zero-shot) and adds 19 more: 12 audio-to-text
# retrieval counterparts, SpeechCommandsZeroshotv0.01, and the Clotho, Fleurs
# and CommonVoice 21 retrieval pairs.
# Retrieval entries: 12 + 13 + 6 = 31; zero-shot entries: 5.
MAEB_AUDIO_TEXT_EXTENDED = Benchmark(
    name="MAEB(audio-text, extended)",
    display_name="Audio-Text, Extended",
    icon="https://raw.githubusercontent.com/DennisSuitters/LibreICONS/master/svg/libre-gui-activity.svg",
    tasks=get_tasks(
        tasks=[
            # === Lite tasks and their audio-to-text counterparts ===
            # Any2AnyRetrieval - Audio to Text (12). SpokenSQuADT2ARetrieval is a
            # text-to-audio lite task; it stays in this position so the task
            # order is unchanged.
            "AudioCapsA2TRetrieval",
            "AudioSetStrongA2TRetrieval",
            "CMUArcticA2TRetrieval",
            "EmoVDBA2TRetrieval",
            "GigaSpeechA2TRetrieval",
            "HiFiTTSA2TRetrieval",
            "JLCorpusA2TRetrieval",
            "JamAltLyricA2TRetrieval",
            "LibriTTSA2TRetrieval",
            "MACSA2TRetrieval",
            "MusicCapsA2TRetrieval",
            "SpokenSQuADT2ARetrieval",  # text-to-audio; no A2T counterpart listed
            "UrbanSound8KA2TRetrieval",
            # Any2AnyRetrieval - Text to Audio (12, plus SpokenSQuAD above = 13 lite tasks)
            "AudioCapsT2ARetrieval",
            "AudioSetStrongT2ARetrieval",
            "CMUArcticT2ARetrieval",
            "EmoVDBT2ARetrieval",
            "GigaSpeechT2ARetrieval",
            "HiFiTTST2ARetrieval",
            "JLCorpusT2ARetrieval",
            "JamAltLyricT2ARetrieval",
            "LibriTTST2ARetrieval",
            "MACST2ARetrieval",
            "MusicCapsT2ARetrieval",
            "UrbanSound8KT2ARetrieval",
            # AudioZeroshotClassification (5): the 4 lite tasks plus v0.01
            "ESC50_Zeroshot",
            "RavdessZeroshot",
            "SpeechCommandsZeroshotv0.01",
            "SpeechCommandsZeroshotv0.02",
            "UrbanSound8kZeroshot",
            # === Extended additions ===
            # Audio captioning (2)
            "ClothoA2TRetrieval",
            "ClothoT2ARetrieval",
            # Multilingual - Fleurs (2) - 102 languages
            "FleursA2TRetrieval",
            "FleursT2ARetrieval",
            # Multilingual - CommonVoice 21 (2) - 82+ languages
            "CommonVoice21A2TRetrieval",
            "CommonVoice21T2ARetrieval",
        ]
    ),
    description="""MAEB(audio-text, extended) is an extended benchmark for audio-text cross-modal models with 36 tasks. Includes all 17 tasks from the lite version plus their audio-to-text retrieval counterparts, SpeechCommandsZeroshotv0.01, Clotho (audio captioning), Fleurs (102 languages), and CommonVoice 21 (82+ languages) for comprehensive multilingual coverage. Task types: audio-text retrieval (31) and zero-shot classification (5).""",
    reference=None,
    citation="",
    contacts=["AdnanElAssadi56", "isaac-chung", "KennethEnevoldsen", "Samoed"],
)
Loading