Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion tests/pytest/datasets/gmail_inbox.jsonl

This file was deleted.

15 changes: 15 additions & 0 deletions tests/pytest/datasets/klavis_mcp_test.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "How many Notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "In the Notion Toronto guide, help me find a pizza restaurant that offers takeout." } ], "ground_truth": "Pizzeria Badiali" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Japan Travel Planner page and help me calculate how much I spent on accommodation." } ], "ground_truth": "$373.63" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check the Japan Travel Planner page and help me check how many Tokyo attractions I've visited." } ], "ground_truth": "2" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to find how many presses did we have during 2018. You can find the presses in company wiki." } ], "ground_truth": "3" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion tools to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items under training & upskilling category."} ], "ground_truth": "4" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How much available time (in hours) do I have during my business hours on the next working day?" } ], "ground_truth": "2 hour" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" }
{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's business day?" } ], "ground_truth": "5" }

2 changes: 1 addition & 1 deletion tests/pytest/mcp_configurations/klavis_strata_mcp.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"mcpServers": {
"klavis-strata": {
"url": "https://strata.klavis.ai/mcp/",
"authorization": "Bearer ${KLAVIS_API_KEY}"
"authorization": "Bearer ${KLAVIS_AUTH_TOKEN}"
}
}
}
7 changes: 5 additions & 2 deletions tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
class ResponseFormat(BaseModel):
score: float


'''
Setup for the Notion test cases: duplicate the page at
https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819
into your own Notion workspace before running this test.
'''
@evaluation_test(
input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"],
input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"],
rollout_processor=AgentRolloutProcessor(),
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
mode="pointwise",
Expand Down
Loading