Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ bash scripts/evaluate_claude.sh
bash scripts/evaluate_gemini.sh
```

* Evaluate MiniMax models (MiniMax-M2.5, MiniMax-M2.5-highspeed with 204K context)
```
bash scripts/evaluate_minimax.sh
```

* Evaluate models available on Huggingface
```
bash scripts/evaluate_hf_llm.sh
Expand Down
39 changes: 39 additions & 0 deletions global_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import time
import sys
import os
import re
import httpx

import google.generativeai as genai
from anthropic import Anthropic
Expand All @@ -13,6 +15,9 @@ def get_openai_embedding(texts, model="text-embedding-ada-002"):
texts = [text.replace("\n", " ") for text in texts]
return np.array([openai.Embedding.create(input = texts, model=model)['data'][i]['embedding'] for i in range(len(texts))])

def set_minimax_key():
    """No-op placeholder kept for API symmetry with the other providers.

    The MiniMax key is read from the MINIMAX_API_KEY environment variable
    at request time, so nothing needs to be configured here.
    """

def set_anthropic_key():
pass

Expand Down Expand Up @@ -79,6 +84,40 @@ def run_claude(query, max_new_tokens, model_name):
return message.content[0].text


def run_minimax(query, max_new_tokens, model_name, temperature=0):
    """Run a MiniMax chat model via its OpenAI-compatible HTTP API.

    Args:
        query: user prompt sent as a single user message.
        max_new_tokens: generation budget passed as ``max_tokens``.
        model_name: lowercase CLI model name (e.g. 'minimax-m2.5'); mapped
            to the API's canonical casing, unknown names are forwarded as-is.
        temperature: sampling temperature; clamped into MiniMax's valid
            range (0.0, 1.0].

    Returns:
        The completion text with any <think>...</think> reasoning removed.

    Raises:
        httpx.HTTPStatusError: on a non-2xx API response.
    """
    # Map lowercase CLI names onto the API's canonical model identifiers.
    canonical_names = {
        'minimax-m2.5': "MiniMax-M2.5",
        'minimax-m2.5-highspeed': "MiniMax-M2.5-highspeed",
        'minimax-m2.7': "MiniMax-M2.7",
    }
    api_model_name = canonical_names.get(model_name, model_name)

    url = "https://api.minimax.io/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {os.environ.get('MINIMAX_API_KEY', '')}",
        "Content-Type": "application/json",
    }
    # MiniMax temperature must be in (0.0, 1.0]; a single clamp covers both
    # the non-positive case (-> 0.01) and the > 1.0 case (-> 1.0).
    clamped_temp = min(max(temperature, 0.01), 1.0)
    payload = {
        "model": api_model_name,
        "messages": [{"role": "user", "content": query}],
        "max_tokens": max_new_tokens,
        "temperature": clamped_temp,
    }
    response = httpx.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    # content may be null when the model emits only reasoning; treat as "".
    text = data["choices"][0]["message"]["content"] or ""
    # Strip thinking tags if present (M2.5 models may include them)
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
    return text


def run_gemini(model, content: str, max_tokens: int = 0):

try:
Expand Down
3 changes: 3 additions & 0 deletions scripts/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,8 @@ export GOOGLE_API_KEY=
# Anthropic API Key
export ANTHROPIC_API_KEY=

# MiniMax API Key
export MINIMAX_API_KEY=

# HuggingFace Token
export HF_TOKEN=
12 changes: 12 additions & 0 deletions scripts/evaluate_minimax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# sets necessary environment variables
source scripts/env.sh

# Evaluate MiniMax-M2.5
# Variable expansions are quoted so paths containing spaces survive word splitting.
python3 task_eval/evaluate_qa.py \
    --data-file "$DATA_FILE_PATH" --out-file "$OUT_DIR/$QA_OUTPUT_FILE" \
    --model minimax-m2.5 --batch-size 10

# Evaluate MiniMax-M2.5-highspeed (204K context, faster inference)
python3 task_eval/evaluate_qa.py \
    --data-file "$DATA_FILE_PATH" --out-file "$OUT_DIR/$QA_OUTPUT_FILE" \
    --model minimax-m2.5-highspeed --batch-size 10
10 changes: 8 additions & 2 deletions task_eval/evaluate_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
import os, json
from tqdm import tqdm
import argparse
from global_methods import set_openai_key, set_anthropic_key, set_gemini_key
from global_methods import set_openai_key, set_anthropic_key, set_gemini_key, set_minimax_key
from task_eval.evaluation import eval_question_answering
from task_eval.evaluation_stats import analyze_aggr_acc
from task_eval.gpt_utils import get_gpt_answers
from task_eval.claude_utils import get_claude_answers
from task_eval.gemini_utils import get_gemini_answers
from task_eval.minimax_utils import get_minimax_answers
from task_eval.hf_llm_utils import init_hf_model, get_hf_answers

import numpy as np
Expand Down Expand Up @@ -56,7 +57,10 @@ def main():
model_name = "models/gemini-1.0-pro-latest"

gemini_model = genai.GenerativeModel(model_name)


elif 'minimax' in args.model:
set_minimax_key()

elif any([model_name in args.model for model_name in ['gemma', 'llama', 'mistral']]):
hf_pipeline, hf_model_name = init_hf_model(args)

Expand Down Expand Up @@ -90,6 +94,8 @@ def main():
answers = get_claude_answers(data, out_data, prediction_key, args)
elif 'gemini' in args.model:
answers = get_gemini_answers(gemini_model, data, out_data, prediction_key, args)
elif 'minimax' in args.model:
answers = get_minimax_answers(data, out_data, prediction_key, args)
elif any([model_name in args.model for model_name in ['gemma', 'llama', 'mistral']]):
answers = get_hf_answers(data, out_data, args, hf_pipeline, hf_model_name)
else:
Expand Down
203 changes: 203 additions & 0 deletions task_eval/minimax_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import random
import os, json
from tqdm import tqdm
import time
from global_methods import run_minimax


# Context-window sizes (in tokens) for each supported MiniMax model.
MAX_LENGTH = {
    'minimax-m2.5': 204000,
    'minimax-m2.5-highspeed': 204000,
    'minimax-m2.7': 1000000,
}
# Generation budget reserved per question when sizing max_tokens.
PER_QA_TOKEN_BUDGET = 50

# Prompt for a single free-form question (batch size 1).
QA_PROMPT = """
Based on the above context, write an answer in the form of a short phrase for the following question. Answer with exact words from the context whenever possible.

Question: {} Short answer:
"""

# Prompt for a single adversarial (category 5) multiple-choice question.
QA_PROMPT_CAT_5 = """
Based on the above context, answer the following question.

Question: {} Short answer:
"""

# Prompt for a numbered batch of questions; the model must reply with a
# JSON dict keyed by the question number as a string.
QA_PROMPT_BATCH = """
Based on the above conversations, write short answers for each of the following questions in a few words. Write the answers in the form of a json dictionary where each entry contains the string format of question number as 'key' and the short answer as value. Use single-quote characters for named entities. Answer with exact words from the conversations whenever possible.

"""

# Conversation header naming the two speakers (fixes the "wriiten" typo).
CONV_START_PROMPT = "Below is a conversation between two people: {} and {}. The conversation takes place over multiple days and the date of each conversation is written at the beginning of the conversation.\n\n"


def process_ouput(text):
    """Parse the JSON dictionary embedded in a model response.

    Trims any leading text before the first '{' and any trailing text
    after the last '}' before decoding.

    Raises:
        json.JSONDecodeError: when no '{' is present or the payload is not
            valid JSON. The original code raised IndexError on empty input
            and ValueError (from str.index) on missing '{', which escaped
            the caller's `except json.decoder.JSONDecodeError` retry loop.
    """
    text = text.strip()
    start = text.find('{')
    if start == -1:
        # Same exception type json.loads raises, so callers retrying on
        # decode errors also cover "no JSON object at all".
        raise json.JSONDecodeError('no JSON object in model output', text, 0)
    end = text.rfind('}')
    if end > start:
        # Drop trailing chatter after the JSON object (would otherwise
        # trigger an "Extra data" decode error).
        text = text[start:end + 1]
    else:
        text = text[start:].strip()

    return json.loads(text)


def get_cat_5_answer(model_prediction, answer_key):
    """Resolve a multiple-choice model prediction to its answer text.

    A one-character reply ('a'/'b') or a three-character reply ('(a)'/'(b)')
    selects the corresponding entry of answer_key; anything longer is
    returned as-is (stripped and lowercased).
    """
    pred = model_prediction.strip().lower()
    if len(pred) == 1:
        return answer_key['a'] if 'a' in pred else answer_key['b']
    if len(pred) == 3:
        return answer_key['a'] if '(a)' in pred else answer_key['b']
    return pred


def get_input_context(data, num_question_tokens, model, args):
    """Render every session in *data* into one dated conversation transcript.

    Sessions are emitted in ascending numeric order, each preceded by a
    'DATE: ... CONVERSATION:' header; within a session, turns appear in
    their original order. num_question_tokens, model and args are accepted
    for signature compatibility but unused here.
    """
    context = ''
    halt = False
    session_ids = [int(key.split('_')[-1])
                   for key in data
                   if 'session' in key and 'date_time' not in key]
    for sid in range(min(session_ids), max(session_ids) + 1):
        session_key = 'session_%s' % sid
        if session_key not in data:
            continue
        context += "\n\n"
        # Walk the session backwards and prepend each turn, which leaves
        # the turns in chronological order ahead of the accumulated text.
        for utterance in data[session_key][::-1]:
            piece = utterance['speaker'] + ' said, "' + utterance['text'] + '"' + '\n'
            if "blip_caption" in utterance:
                piece += ' and shared %s.' % utterance["blip_caption"]
            piece += '\n'
            context = piece + context
        context = '\nDATE: ' + data['session_%s_date_time' % sid] + '\n' + 'CONVERSATION:\n' + context
        if halt:
            break

    return context


def get_minimax_answers(in_data, out_data, prediction_key, args):
    """Answer every QA item in in_data with a MiniMax model, writing results
    into out_data['qa'][i][prediction_key] and returning out_data.

    Questions are processed in batches of args.batch_size. Category-2
    questions get a date-format hint appended; category-5 (adversarial)
    questions are converted to a randomized two-option multiple choice.
    Items that already carry prediction_key are skipped unless
    args.overwrite is set. RAG paths (args.rag_mode / args.use_rag) are
    not implemented.
    """

    assert len(in_data['qa']) == len(out_data['qa']), (len(in_data['qa']), len(out_data['qa']))

    # start instruction prompt
    # NOTE(review): assumes session_1 contains at least two distinct
    # speakers; a single-speaker session would raise IndexError here.
    speakers_names = list(set([d['speaker'] for d in in_data['conversation']['session_1']]))
    start_prompt = CONV_START_PROMPT.format(speakers_names[0], speakers_names[1])
    start_tokens = 100  # rough token allowance for the header prompt

    if args.rag_mode:
        raise NotImplementedError
    else:
        context_database, query_vectors = None, None

    for batch_start_idx in tqdm(range(0, len(in_data['qa']), args.batch_size), desc='Generating answers'):

        # Per-batch state: the question strings, the qa indices they map to,
        # and bookkeeping for category-5 multiple-choice items.
        questions = []
        include_idxs = []
        cat_5_idxs = []      # positions within `questions` that are category 5
        cat_5_answers = []   # option-letter -> answer-text map, parallel to cat_5_idxs
        for i in range(batch_start_idx, batch_start_idx + args.batch_size):

            if i >= len(in_data['qa']):
                break

            qa = in_data['qa'][i]

            # Skip items already answered unless --overwrite was given.
            if prediction_key not in out_data['qa'][i] or args.overwrite:
                include_idxs.append(i)
            else:
                continue

            if qa['category'] == 2:
                # Temporal questions: steer the model toward the DATE headers.
                questions.append(qa['question'] + ' Use DATE of CONVERSATION to answer with an approximate date.')
            elif qa['category'] == 5:
                # Adversarial questions: present the true answer and a
                # "not mentioned" distractor in random (a)/(b) order.
                question = qa['question'] + " Select the correct answer: (a) {} (b) {}. "
                if random.random() < 0.5:
                    question = question.format('Not mentioned in the conversation', qa['answer'])
                    answer = {'a': 'Not mentioned in the conversation', 'b': qa['answer']}
                else:
                    question = question.format(qa['answer'], 'Not mentioned in the conversation')
                    answer = {'b': 'Not mentioned in the conversation', 'a': qa['answer']}

                cat_5_idxs.append(len(questions))
                questions.append(question)
                cat_5_answers.append(answer)
            else:
                questions.append(qa['question'])

        # Whole batch already answered: nothing to send.
        if questions == []:
            continue

        context_ids = None
        if args.use_rag:
            raise NotImplementedError
        else:
            # Number the questions 0..n-1 so the model can key its JSON reply.
            question_prompt = QA_PROMPT_BATCH + "\n".join(["%s: %s" % (k, q) for k, q in enumerate(questions)])
            num_question_tokens = 100
            query_conv = get_input_context(in_data['conversation'], num_question_tokens + start_tokens, None, args)
            query_conv = start_prompt + query_conv

        if args.batch_size == 1:

            # Single-question mode: free-form prompt, or the category-5
            # variant when this item is multiple choice.
            query = query_conv + '\n\n' + QA_PROMPT.format(questions[0]) if len(cat_5_idxs) == 0 else query_conv + '\n\n' + QA_PROMPT_CAT_5.format(questions[0])
            answer = run_minimax(query, PER_QA_TOKEN_BUDGET, args.model)

            if len(cat_5_idxs) > 0:
                answer = get_cat_5_answer(answer, cat_5_answers[0])

            out_data['qa'][include_idxs[0]][prediction_key] = answer.strip()
            if args.use_rag:
                out_data['qa'][include_idxs[0]][prediction_key + '_context'] = context_ids

        else:
            # Batched mode: one request for the whole numbered question list.
            query = query_conv + '\n' + question_prompt

            # Retry up to 5 times when the reply is not parseable JSON.
            # NOTE(review): if run_minimax raises a non-JSONDecodeError on the
            # very first attempt, `answer` is never bound and the loop below
            # would raise NameError — confirm whether that path is reachable.
            trials = 0
            while trials < 5:
                try:
                    trials += 1
                    answer = run_minimax(query, PER_QA_TOKEN_BUDGET * args.batch_size, args.model)
                    # Normalize markdown fences / escaped quotes before parsing.
                    answer = answer.replace('\\"', "'").replace('json', '').replace('`', '').strip()
                    answers = process_ouput(answer.strip())
                    break
                except json.decoder.JSONDecodeError:
                    pass

            # Distribute the parsed answers back onto their qa items, with
            # progressively looser fallbacks when parsing/keying fails.
            for k, idx in enumerate(include_idxs):
                try:
                    # Primary path: JSON dict keyed by the question number as a string.
                    answers = process_ouput(answer.strip())
                    if k in cat_5_idxs:
                        predicted_answer = get_cat_5_answer(answers[str(k)], cat_5_answers[cat_5_idxs.index(k)])
                        out_data['qa'][idx][prediction_key] = predicted_answer
                    else:
                        try:
                            out_data['qa'][idx][prediction_key] = str(answers[str(k)]).replace('(a)', '').replace('(b)', '').strip()
                        except:
                            # Value was itself a dict: join its values.
                            out_data['qa'][idx][prediction_key] = ', '.join([str(n) for n in list(answers[str(k)].values())])
                except:
                    try:
                        # Fallback 1: raw json.loads with integer keys.
                        answers = json.loads(answer.strip())
                        if k in cat_5_idxs:
                            predicted_answer = get_cat_5_answer(answers[k], cat_5_answers[cat_5_idxs.index(k)])
                            out_data['qa'][idx][prediction_key] = predicted_answer
                        else:
                            out_data['qa'][idx][prediction_key] = answers[k].replace('(a)', '').replace('(b)', '').strip()
                    except:
                        # Fallback 2: treat the k-th line of the raw reply as the answer.
                        if k in cat_5_idxs:
                            predicted_answer = get_cat_5_answer(answer.strip(), cat_5_answers[cat_5_idxs.index(k)])
                            out_data['qa'][idx][prediction_key] = predicted_answer
                        else:
                            out_data['qa'][idx][prediction_key] = json.loads(answer.strip().replace('(a)', '').replace('(b)', '').split('\n')[k])[0]

    return out_data
Empty file added tests/__init__.py
Empty file.
Loading