Enhance math evaluation scoring by introducing weighted contributions… #32

Workflow file for this run

	# Display name of the workflow.
	name: Python CI

	# Runs are grouped by workflow name + git ref; when a new run starts in a
	# group, the run already in progress for that group is cancelled.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	# Triggers: pushes and pull requests targeting main, plus manual dispatch.
	# Pushes/PRs whose changed files all match a paths-ignore pattern are skipped.
	# NOTE(review): "*.md" matches only Markdown files at the repository root
	# (`*` does not cross `/`); use "**.md" if nested Markdown files should be
	# ignored as well.
	on:
	  push:
	    branches: [main]
	    paths-ignore:
	      - "docs/**"
	      - "*.md"
	  pull_request:
	    branches: [main]
	    paths-ignore:
	      - "docs/**"
	      - "*.md"
	  workflow_dispatch:

	# Job graph: lint-and-type-check gates the three test jobs (via `needs`);
	# upload-coverage waits for all three test jobs and publishes the coverage
	# artifacts they stored.
	jobs:
	  lint-and-type-check:
	    name: Lint & Type Check
	    runs-on: ubuntu-latest
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0  # Fetch all history for all tags and branches

	      - name: Set up Python 3.12
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.12"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v6
	        with:
	          enable-cache: true

	      - name: Install the project
	        run: uv sync --locked --all-extras --dev

	      - name: Install tau2 for testing
	        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

	      # --exit-zero makes flake8 report findings without failing the job.
	      - name: Lint with flake8
	        run: >-
	          uv run flake8 eval_protocol tests examples scripts
	          --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics

	      - name: Type check with mypy
	        run: uv run mypy eval_protocol

	  test-core:
	    name: Core Tests (Python ${{ matrix.python-version }})
	    runs-on: ubuntu-latest
	    needs: lint-and-type-check
	    strategy:
	      # Let every Python version finish even if one of them fails.
	      fail-fast: false
	      matrix:
	        # Quoted so "3.10" is not parsed as the float 3.1.
	        python-version: ["3.10", "3.11", "3.12"]

	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0  # Fetch all history for all tags and branches

	      - name: Set up Python ${{ matrix.python-version }}
	        uses: actions/setup-python@v5
	        with:
	          python-version: ${{ matrix.python-version }}

	      - name: Install uv
	        uses: astral-sh/setup-uv@v6
	        with:
	          enable-cache: true

	      - name: Install the project
	        run: uv sync --locked --all-extras --dev

	      - name: Install tau2 for testing
	        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

	      - name: Run Core Tests with pytest-xdist
	        env:
	          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
	          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
	          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
	          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
	        run: |
	          # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
	          uv run pytest \
	            -n auto \
	            --ignore=tests/test_batch_evaluation.py \
	            --ignore=tests/pytest/test_frozen_lake.py \
	            --ignore=tests/pytest/test_lunar_lander.py \
	            --ignore=tests/pytest/test_tau_bench_airline.py \
	            --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10

	      # One artifact per matrix entry so the uploads do not collide.
	      - name: Store coverage file
	        uses: actions/upload-artifact@v4
	        with:
	          name: coverage-core-${{ matrix.python-version }}
	          path: coverage.xml
	          retention-days: 1

	  test-batch-evaluation:
	    name: Batch Evaluation Tests
	    runs-on: ubuntu-latest
	    needs: lint-and-type-check
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0  # Fetch all history for all tags and branches

	      - name: Set up Python 3.12
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.12"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v6
	        with:
	          enable-cache: true

	      - name: Install the project
	        run: uv sync --locked --all-extras --dev

	      - name: Install tau2 for testing
	        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

	      - name: Run Batch Evaluation Tests
	        env:
	          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
	          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
	          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
	          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
	        run: |
	          # Run only this specific test file, WITHOUT xdist
	          uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10

	      - name: Store coverage file
	        uses: actions/upload-artifact@v4
	        with:
	          name: coverage-batch-eval
	          path: coverage.xml
	          retention-days: 1

	  test-mcp-e2e:
	    name: MCP End-to-End Tests
	    runs-on: ubuntu-latest
	    needs: lint-and-type-check
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0  # Fetch all history for all tags and branches

	      - name: Set up Python 3.12
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.12"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v6
	        with:
	          enable-cache: true

	      - name: Install the project
	        run: uv sync --locked --all-extras --dev

	      - name: Install tau2 for testing
	        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

	      # NOTE(review): no step in this job, as shown, runs tests or produces
	      # coverage.xml, so this upload has nothing to store. Confirm whether the
	      # MCP test step was removed on purpose.
	      - name: Store coverage file
	        uses: actions/upload-artifact@v4
	        with:
	          name: coverage-mcp-e2e
	          path: coverage.xml
	          retention-days: 1

	  upload-coverage:
	    name: Upload Coverage
	    runs-on: ubuntu-latest
	    needs: [test-core, test-batch-evaluation, test-mcp-e2e]
	    steps:
	      # No artifact name given: every artifact of this run is downloaded
	      # under coverage-artifacts/.
	      - name: Download all coverage artifacts
	        uses: actions/download-artifact@v4
	        with:
	          path: coverage-artifacts

	      # v3 of this action targets the retired Node 16 runtime; the inputs
	      # used below are unchanged in v5.
	      - name: Upload coverage to Codecov
	        uses: codecov/codecov-action@v5
	        with:
	          token: ${{ secrets.CODECOV_TOKEN }}
	          directory: ./coverage-artifacts/
	          # A Codecov upload failure is reported but does not fail the job.
	          fail_ci_if_error: false
	          verbose: true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Enhance math evaluation scoring by introducing weighted contributions… #32

Workflow file

Enhance math evaluation scoring by introducing weighted contributions… #32

Uh oh!

Workflow file for this run