# Skip to content
#
# Enhance math evaluation scoring by introducing weighted contributions… #32
#
# Enhance math evaluation scoring by introducing weighted contributions…
#
# Enhance math evaluation scoring by introducing weighted contributions… #32
#
# Workflow file for this run

# Continuous-integration workflow: lints and type-checks the project, runs the
# test suites, and uploads the resulting coverage reports.
name: Python CI

# One active run per workflow + ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Least privilege for GITHUB_TOKEN: the jobs in this workflow only need to
# read the repository (checkout); Codecov authenticates with its own token.
permissions:
  contents: read

on:
  # Runs for pushes to main, except changes limited to docs/ or Markdown files.
  # Note: in GitHub filter patterns `*` does not match `/`, so "*.md" only
  # covers Markdown files at the repository root.
  push:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - "*.md"
  # Same filters for pull requests that target main.
  pull_request:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - "*.md"
  # Allows the workflow to be started manually.
  workflow_dispatch:
jobs:
  # Gate job: every test job below declares `needs: lint-and-type-check`.
  lint-and-type-check:
    name: Lint & Type Check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install the project
        run: uv sync --locked --all-extras --dev
      # NOTE(review): tau2 is installed from the moving `main` branch, so the
      # installed revision can change between runs; consider pinning a commit.
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
      # NOTE(review): `--exit-zero` makes flake8 always succeed, so this step
      # reports findings but can never fail the job; confirm this is intended.
      - name: Lint with flake8
        run: >-
          uv run flake8 eval_protocol tests examples scripts
          --count --exit-zero --max-complexity=10 --max-line-length=88
          --statistics
      - name: Type check with mypy
        run: uv run mypy eval_protocol

  # Main test suite, run once per Python version in the matrix.
  test-core:
    name: Core Tests (Python ${{ matrix.python-version }})
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    strategy:
      # Let the remaining matrix entries finish even when one of them fails.
      fail-fast: false
      matrix:
        # Quoted so that "3.10" is not parsed as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install the project
        run: uv sync --locked --all-extras --dev
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
      - name: Run Core Tests with pytest-xdist
        env:
          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
          uv run pytest \
            -n auto \
            --ignore=tests/test_batch_evaluation.py \
            --ignore=tests/pytest/test_frozen_lake.py \
            --ignore=tests/pytest/test_lunar_lander.py \
            --ignore=tests/pytest/test_tau_bench_airline.py \
            --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
      # The artifact name carries the Python version so that the matrix
      # entries do not collide with each other.
      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-core-${{ matrix.python-version }}
          path: coverage.xml
          retention-days: 1

  # Runs tests/test_batch_evaluation.py on its own, without pytest-xdist
  # (the core job ignores that file).
  test-batch-evaluation:
    name: Batch Evaluation Tests
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install the project
        run: uv sync --locked --all-extras --dev
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
      - name: Run Batch Evaluation Tests
        env:
          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
        run: |
          # Run only this specific test file, WITHOUT xdist
          uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10
      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-batch-eval
          path: coverage.xml
          retention-days: 1

  # NOTE(review): as written, this job installs the project but contains no
  # step that runs tests or writes coverage.xml, so the upload step below has
  # nothing to store. Confirm whether the MCP end-to-end test step was
  # removed or disabled on purpose.
  test-mcp-e2e:
    name: MCP End-to-End Tests
    runs-on: ubuntu-latest
    needs: lint-and-type-check
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all tags and branches
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: Install the project
        run: uv sync --locked --all-extras --dev
      - name: Install tau2 for testing
        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
      - name: Store coverage file
        uses: actions/upload-artifact@v4
        with:
          name: coverage-mcp-e2e
          path: coverage.xml
          retention-days: 1

  # Collects the coverage artifacts stored by the test jobs and sends them to
  # Codecov.
  upload-coverage:
    name: Upload Coverage
    runs-on: ubuntu-latest
    needs: [test-core, test-batch-evaluation, test-mcp-e2e]
    steps:
      # No artifact name is given, so every artifact of this run is downloaded
      # under coverage-artifacts/.
      - name: Download all coverage artifacts
        uses: actions/download-artifact@v4
        with:
          path: coverage-artifacts
      # Uses the current major version of the Codecov action; the inputs
      # below are unchanged from the previous configuration.
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          directory: ./coverage-artifacts/
          # A failed upload must not fail the workflow.
          fail_ci_if_error: false
          verbose: true