Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ bash scripts/evaluate_claude.sh
bash scripts/evaluate_gemini.sh
```

* Evaluate MiniMax models (MiniMax-M2.5, MiniMax-M2.5-highspeed with 204K context)
```
bash scripts/evaluate_minimax.sh
```

* Evaluate models available on Huggingface
```
bash scripts/evaluate_hf_llm.sh
Expand Down
39 changes: 39 additions & 0 deletions global_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import time
import sys
import os
import re
import httpx

import google.generativeai as genai
from anthropic import Anthropic
Expand All @@ -13,6 +15,9 @@ def get_openai_embedding(texts, model="text-embedding-ada-002"):
texts = [text.replace("\n", " ") for text in texts]
return np.array([openai.Embedding.create(input = texts, model=model)['data'][i]['embedding'] for i in range(len(texts))])

def set_minimax_key():
    """No-op placeholder kept for API symmetry with the other providers.

    The MiniMax key is read from the MINIMAX_API_KEY environment variable
    at request time, so nothing needs to be configured here.
    """

def set_anthropic_key():
pass

Expand Down Expand Up @@ -79,6 +84,40 @@ def run_claude(query, max_new_tokens, model_name):
return message.content[0].text


def run_minimax(query, max_new_tokens, model_name, temperature=0):
    """Run a MiniMax chat model via its OpenAI-compatible HTTP API.

    Args:
        query: user prompt sent as a single user message.
        max_new_tokens: generation budget passed as ``max_tokens``.
        model_name: lowercase CLI model name (e.g. 'minimax-m2.5'); mapped
            to the API's canonical casing, unknown names are forwarded as-is.
        temperature: sampling temperature; clamped into MiniMax's valid
            range (0.0, 1.0].

    Returns:
        The completion text with any <think>...</think> reasoning removed.

    Raises:
        httpx.HTTPStatusError: on a non-2xx API response.
    """
    # Map lowercase CLI names onto the API's canonical model identifiers.
    canonical_names = {
        'minimax-m2.5': "MiniMax-M2.5",
        'minimax-m2.5-highspeed': "MiniMax-M2.5-highspeed",
        'minimax-m2.7': "MiniMax-M2.7",
    }
    api_model_name = canonical_names.get(model_name, model_name)

    url = "https://api.minimax.io/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {os.environ.get('MINIMAX_API_KEY', '')}",
        "Content-Type": "application/json",
    }
    # MiniMax temperature must be in (0.0, 1.0]; a single clamp covers both
    # the non-positive case (-> 0.01) and the > 1.0 case (-> 1.0).
    clamped_temp = min(max(temperature, 0.01), 1.0)
    payload = {
        "model": api_model_name,
        "messages": [{"role": "user", "content": query}],
        "max_tokens": max_new_tokens,
        "temperature": clamped_temp,
    }
    response = httpx.post(url, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    data = response.json()
    # content may be null when the model emits only reasoning; treat as "".
    text = data["choices"][0]["message"]["content"] or ""
    # Strip thinking tags if present (M2.5 models may include them)
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
    return text


def run_gemini(model, content: str, max_tokens: int = 0):

try:
Expand Down
3 changes: 3 additions & 0 deletions scripts/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,8 @@ export GOOGLE_API_KEY=
# Anthropic API Key
export ANTHROPIC_API_KEY=

# MiniMax API Key
export MINIMAX_API_KEY=

# HuggingFace Token
export HF_TOKEN=
12 changes: 12 additions & 0 deletions scripts/evaluate_minimax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# sets necessary environment variables
source scripts/env.sh

# Evaluate MiniMax-M2.5
# Variable expansions are quoted so paths containing spaces survive word splitting.
python3 task_eval/evaluate_qa.py \
    --data-file "$DATA_FILE_PATH" --out-file "$OUT_DIR/$QA_OUTPUT_FILE" \
    --model minimax-m2.5 --batch-size 10

# Evaluate MiniMax-M2.5-highspeed (204K context, faster inference)
python3 task_eval/evaluate_qa.py \
    --data-file "$DATA_FILE_PATH" --out-file "$OUT_DIR/$QA_OUTPUT_FILE" \
    --model minimax-m2.5-highspeed --batch-size 10
10 changes: 8 additions & 2 deletions task_eval/evaluate_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
import os, json
from tqdm import tqdm
import argparse
from global_methods import set_openai_key, set_anthropic_key, set_gemini_key
from global_methods import set_openai_key, set_anthropic_key, set_gemini_key, set_minimax_key
from task_eval.evaluation import eval_question_answering
from task_eval.evaluation_stats import analyze_aggr_acc
from task_eval.gpt_utils import get_gpt_answers
from task_eval.claude_utils import get_claude_answers
from task_eval.gemini_utils import get_gemini_answers
from task_eval.minimax_utils import get_minimax_answers
from task_eval.hf_llm_utils import init_hf_model, get_hf_answers

import numpy as np
Expand Down Expand Up @@ -56,7 +57,10 @@ def main():
model_name = "models/gemini-1.0-pro-latest"

gemini_model = genai.GenerativeModel(model_name)


elif 'minimax' in args.model:
set_minimax_key()

elif any([model_name in args.model for model_name in ['gemma', 'llama', 'mistral']]):
hf_pipeline, hf_model_name = init_hf_model(args)

Expand Down Expand Up @@ -90,6 +94,8 @@ def main():
answers = get_claude_answers(data, out_data, prediction_key, args)
elif 'gemini' in args.model:
answers = get_gemini_answers(gemini_model, data, out_data, prediction_key, args)
elif 'minimax' in args.model:
answers = get_minimax_answers(data, out_data, prediction_key, args)
elif any([model_name in args.model for model_name in ['gemma', 'llama', 'mistral']]):
answers = get_hf_answers(data, out_data, args, hf_pipeline, hf_model_name)
else:
Expand Down
203 changes: 203 additions & 0 deletions task_eval/minimax_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import random
import os, json
from tqdm import tqdm
import time
from global_methods import run_minimax


# Context-window sizes (in tokens) for each supported MiniMax model.
MAX_LENGTH = {
    'minimax-m2.5': 204000,
    'minimax-m2.5-highspeed': 204000,
    'minimax-m2.7': 1000000,
}
# Generation budget reserved per question when sizing max_tokens.
PER_QA_TOKEN_BUDGET = 50

# Prompt for a single free-form question (batch size 1).
QA_PROMPT = """
Based on the above context, write an answer in the form of a short phrase for the following question. Answer with exact words from the context whenever possible.

Question: {} Short answer:
"""

# Prompt for a single adversarial (category 5) multiple-choice question.
QA_PROMPT_CAT_5 = """
Based on the above context, answer the following question.

Question: {} Short answer:
"""

# Prompt for a numbered batch of questions; the model must reply with a
# JSON dict keyed by the question number as a string.
QA_PROMPT_BATCH = """
Based on the above conversations, write short answers for each of the following questions in a few words. Write the answers in the form of a json dictionary where each entry contains the string format of question number as 'key' and the short answer as value. Use single-quote characters for named entities. Answer with exact words from the conversations whenever possible.

"""

# Conversation header naming the two speakers (fixes the "wriiten" typo).
CONV_START_PROMPT = "Below is a conversation between two people: {} and {}. The conversation takes place over multiple days and the date of each conversation is written at the beginning of the conversation.\n\n"


def process_ouput(text):
    """Parse the JSON dictionary embedded in a model response.

    Trims any leading text before the first '{' and any trailing text
    after the last '}' before decoding.

    Raises:
        json.JSONDecodeError: when no '{' is present or the payload is not
            valid JSON. The original code raised IndexError on empty input
            and ValueError (from str.index) on missing '{', which escaped
            the caller's `except json.decoder.JSONDecodeError` retry loop.
    """
    text = text.strip()
    start = text.find('{')
    if start == -1:
        # Same exception type json.loads raises, so callers retrying on
        # decode errors also cover "no JSON object at all".
        raise json.JSONDecodeError('no JSON object in model output', text, 0)
    end = text.rfind('}')
    if end > start:
        # Drop trailing chatter after the JSON object (would otherwise
        # trigger an "Extra data" decode error).
        text = text[start:end + 1]
    else:
        text = text[start:].strip()

    return json.loads(text)


def get_cat_5_answer(model_prediction, answer_key):
    """Resolve a multiple-choice model prediction to its answer text.

    A one-character reply ('a'/'b') or a three-character reply ('(a)'/'(b)')
    selects the corresponding entry of answer_key; anything longer is
    returned as-is (stripped and lowercased).
    """
    pred = model_prediction.strip().lower()
    if len(pred) == 1:
        return answer_key['a'] if 'a' in pred else answer_key['b']
    if len(pred) == 3:
        return answer_key['a'] if '(a)' in pred else answer_key['b']
    return pred


def get_input_context(data, num_question_tokens, model, args):
    """Render every session in *data* into one dated conversation transcript.

    Sessions are emitted in ascending numeric order, each preceded by a
    'DATE: ... CONVERSATION:' header; within a session, turns appear in
    their original order. num_question_tokens, model and args are accepted
    for signature compatibility but unused here.
    """
    context = ''
    halt = False
    session_ids = [int(key.split('_')[-1])
                   for key in data
                   if 'session' in key and 'date_time' not in key]
    for sid in range(min(session_ids), max(session_ids) + 1):
        session_key = 'session_%s' % sid
        if session_key not in data:
            continue
        context += "\n\n"
        # Walk the session backwards and prepend each turn, which leaves
        # the turns in chronological order ahead of the accumulated text.
        for utterance in data[session_key][::-1]:
            piece = utterance['speaker'] + ' said, "' + utterance['text'] + '"' + '\n'
            if "blip_caption" in utterance:
                piece += ' and shared %s.' % utterance["blip_caption"]
            piece += '\n'
            context = piece + context
        context = '\nDATE: ' + data['session_%s_date_time' % sid] + '\n' + 'CONVERSATION:\n' + context
        if halt:
            break

    return context


def get_minimax_answers(in_data, out_data, prediction_key, args):
    """Answer every QA item in in_data with a MiniMax model, writing results
    into out_data['qa'][i][prediction_key] and returning out_data.

    Questions are processed in batches of args.batch_size. Category-2
    questions get a date-format hint appended; category-5 (adversarial)
    questions are converted to a randomized two-option multiple choice.
    Items that already carry prediction_key are skipped unless
    args.overwrite is set. RAG paths (args.rag_mode / args.use_rag) are
    not implemented.
    """

    assert len(in_data['qa']) == len(out_data['qa']), (len(in_data['qa']), len(out_data['qa']))

    # start instruction prompt
    # NOTE(review): assumes session_1 contains at least two distinct
    # speakers; a single-speaker session would raise IndexError here.
    speakers_names = list(set([d['speaker'] for d in in_data['conversation']['session_1']]))
    start_prompt = CONV_START_PROMPT.format(speakers_names[0], speakers_names[1])
    start_tokens = 100  # rough token allowance for the header prompt

    if args.rag_mode:
        raise NotImplementedError
    else:
        context_database, query_vectors = None, None

    for batch_start_idx in tqdm(range(0, len(in_data['qa']), args.batch_size), desc='Generating answers'):

        # Per-batch state: the question strings, the qa indices they map to,
        # and bookkeeping for category-5 multiple-choice items.
        questions = []
        include_idxs = []
        cat_5_idxs = []      # positions within `questions` that are category 5
        cat_5_answers = []   # option-letter -> answer-text map, parallel to cat_5_idxs
        for i in range(batch_start_idx, batch_start_idx + args.batch_size):

            if i >= len(in_data['qa']):
                break

            qa = in_data['qa'][i]

            # Skip items already answered unless --overwrite was given.
            if prediction_key not in out_data['qa'][i] or args.overwrite:
                include_idxs.append(i)
            else:
                continue

            if qa['category'] == 2:
                # Temporal questions: steer the model toward the DATE headers.
                questions.append(qa['question'] + ' Use DATE of CONVERSATION to answer with an approximate date.')
            elif qa['category'] == 5:
                # Adversarial questions: present the true answer and a
                # "not mentioned" distractor in random (a)/(b) order.
                question = qa['question'] + " Select the correct answer: (a) {} (b) {}. "
                if random.random() < 0.5:
                    question = question.format('Not mentioned in the conversation', qa['answer'])
                    answer = {'a': 'Not mentioned in the conversation', 'b': qa['answer']}
                else:
                    question = question.format(qa['answer'], 'Not mentioned in the conversation')
                    answer = {'b': 'Not mentioned in the conversation', 'a': qa['answer']}

                cat_5_idxs.append(len(questions))
                questions.append(question)
                cat_5_answers.append(answer)
            else:
                questions.append(qa['question'])

        # Whole batch already answered: nothing to send.
        if questions == []:
            continue

        context_ids = None
        if args.use_rag:
            raise NotImplementedError
        else:
            # Number the questions 0..n-1 so the model can key its JSON reply.
            question_prompt = QA_PROMPT_BATCH + "\n".join(["%s: %s" % (k, q) for k, q in enumerate(questions)])
            num_question_tokens = 100
            query_conv = get_input_context(in_data['conversation'], num_question_tokens + start_tokens, None, args)
            query_conv = start_prompt + query_conv

        if args.batch_size == 1:

            # Single-question mode: free-form prompt, or the category-5
            # variant when this item is multiple choice.
            query = query_conv + '\n\n' + QA_PROMPT.format(questions[0]) if len(cat_5_idxs) == 0 else query_conv + '\n\n' + QA_PROMPT_CAT_5.format(questions[0])
            answer = run_minimax(query, PER_QA_TOKEN_BUDGET, args.model)

            if len(cat_5_idxs) > 0:
                answer = get_cat_5_answer(answer, cat_5_answers[0])

            out_data['qa'][include_idxs[0]][prediction_key] = answer.strip()
            if args.use_rag:
                out_data['qa'][include_idxs[0]][prediction_key + '_context'] = context_ids

        else:
            # Batched mode: one request for the whole numbered question list.
            query = query_conv + '\n' + question_prompt

            # Retry up to 5 times when the reply is not parseable JSON.
            # NOTE(review): if run_minimax raises a non-JSONDecodeError on the
            # very first attempt, `answer` is never bound and the loop below
            # would raise NameError — confirm whether that path is reachable.
            trials = 0
            while trials < 5:
                try:
                    trials += 1
                    answer = run_minimax(query, PER_QA_TOKEN_BUDGET * args.batch_size, args.model)
                    # Normalize markdown fences / escaped quotes before parsing.
                    answer = answer.replace('\\"', "'").replace('json', '').replace('`', '').strip()
                    answers = process_ouput(answer.strip())
                    break
                except json.decoder.JSONDecodeError:
                    pass

            # Distribute the parsed answers back onto their qa items, with
            # progressively looser fallbacks when parsing/keying fails.
            for k, idx in enumerate(include_idxs):
                try:
                    # Primary path: JSON dict keyed by the question number as a string.
                    answers = process_ouput(answer.strip())
                    if k in cat_5_idxs:
                        predicted_answer = get_cat_5_answer(answers[str(k)], cat_5_answers[cat_5_idxs.index(k)])
                        out_data['qa'][idx][prediction_key] = predicted_answer
                    else:
                        try:
                            out_data['qa'][idx][prediction_key] = str(answers[str(k)]).replace('(a)', '').replace('(b)', '').strip()
                        except:
                            # Value was itself a dict: join its values.
                            out_data['qa'][idx][prediction_key] = ', '.join([str(n) for n in list(answers[str(k)].values())])
                except:
                    try:
                        # Fallback 1: raw json.loads with integer keys.
                        answers = json.loads(answer.strip())
                        if k in cat_5_idxs:
                            predicted_answer = get_cat_5_answer(answers[k], cat_5_answers[cat_5_idxs.index(k)])
                            out_data['qa'][idx][prediction_key] = predicted_answer
                        else:
                            out_data['qa'][idx][prediction_key] = answers[k].replace('(a)', '').replace('(b)', '').strip()
                    except:
                        # Fallback 2: treat the k-th line of the raw reply as the answer.
                        if k in cat_5_idxs:
                            predicted_answer = get_cat_5_answer(answer.strip(), cat_5_answers[cat_5_idxs.index(k)])
                            out_data['qa'][idx][prediction_key] = predicted_answer
                        else:
                            out_data['qa'][idx][prediction_key] = json.loads(answer.strip().replace('(a)', '').replace('(b)', '').split('\n')[k])[0]

    return out_data
Empty file added tests/__init__.py
Empty file.
Loading