Handle direct calls to evaluation_test-decorated functions

dphuang2 · dphuang2 · commit 179312d0f562 · 2025-08-03T21:36:13.000-07:00
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -259,10 +259,101 @@ def wrapper_body(**kwargs):
 
             return create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
 
-        wrapper = create_wrapper_with_signature()
-        wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(wrapper)
-        wrapper.original_evaluation_test_func = test_func
+        # Create the pytest wrapper
+        pytest_wrapper = create_wrapper_with_signature()
+        pytest_wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(pytest_wrapper)
 
-        return wrapper
+        def create_dual_mode_wrapper() -> Callable:
+            """
+            Creates a wrapper that supports both pytest parameterized execution and direct function calls.
+
+            This wrapper enables the decorated evaluation test function to be used in two ways:
+            1. As a pytest test (via pytest.mark.parametrize) with full parameterization
+            2. As a direct function call with EvaluationRow data for programmatic use
+
+            A call is treated as direct when its arguments match one of these shapes:
+            - pointwise mode: a single EvaluationRow, passed positionally or as ``row=...``
+            - any other mode: a list of EvaluationRow, passed positionally or as ``rows=...``
+            A positional direct call must not carry keyword arguments; a keyword direct call
+            forwards all of its keyword arguments to the test function unchanged. Every other
+            call is delegated to the parametrized pytest wrapper.
+
+            Returns:
+                A callable that can handle both pytest test execution and direct function calls.
+                It is a coroutine function when the test function is async, and it carries the
+                pytest wrapper's metadata plus ``original_evaluation_test_func``.
+            """
+            import asyncio
+            import functools
+
+            def _is_single_row(value) -> bool:
+                """Return True when value is one EvaluationRow (pointwise input)."""
+                return isinstance(value, EvaluationRow)
+
+            def _is_row_list(value) -> bool:
+                """Return True when value is a list holding only EvaluationRow items (batch input)."""
+                return isinstance(value, list) and all(isinstance(r, EvaluationRow) for r in value)
+
+            def _direct_call_kwargs(args, kwargs):
+                """
+                Work out whether a call is a direct call to the test function.
+
+                Shared by the sync and async wrappers so both accept the same call shapes.
+
+                Returns:
+                    The keyword arguments to pass to the test function for a direct call,
+                    or None when the call should go to the pytest wrapper.
+                """
+                if mode == "pointwise":
+                    param_name, matches = "row", _is_single_row
+                else:
+                    param_name, matches = "rows", _is_row_list
+
+                # Positional form: exactly one matching argument and no keywords
+                if len(args) == 1 and not kwargs and matches(args[0]):
+                    return {param_name: args[0]}
+
+                # Keyword form: no positionals and a matching ``row``/``rows`` keyword
+                if not args and param_name in kwargs and matches(kwargs[param_name]):
+                    return kwargs
+
+                # Anything else is left for the pytest wrapper
+                return None
+
+            # Check if the test function is async
+            is_async = asyncio.iscoroutinefunction(test_func)
+
+            if is_async:
+
+                async def dual_mode_wrapper(*args, **kwargs):
+                    # Direct call: await the test coroutine with the detected arguments
+                    direct_kwargs = _direct_call_kwargs(args, kwargs)
+                    if direct_kwargs is not None:
+                        return await test_func(**direct_kwargs)
+
+                    # If not a direct call, use the pytest wrapper (its result is returned un-awaited)
+                    return pytest_wrapper(*args, **kwargs)
+
+            else:
+
+                def dual_mode_wrapper(*args, **kwargs):
+                    # Direct call: invoke the test function with the detected arguments
+                    direct_kwargs = _direct_call_kwargs(args, kwargs)
+                    if direct_kwargs is not None:
+                        return test_func(**direct_kwargs)
+
+                    # If not a direct call, use the pytest wrapper
+                    return pytest_wrapper(*args, **kwargs)
+
+            # Copy all attributes from the pytest wrapper to our dual mode wrapper
+            functools.update_wrapper(dual_mode_wrapper, pytest_wrapper)
+            dual_mode_wrapper.original_evaluation_test_func = test_func
+
+            return dual_mode_wrapper
+
+        # Create the dual mode wrapper
+        dual_mode_wrapper = create_dual_mode_wrapper()
+
+        return dual_mode_wrapper
 
     return decorator
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
@@ -24,8 +24,17 @@ def execute_function(func: Callable, **kwargs) -> Any:
         # Handle async functions with proper event loop management
         try:
             loop = asyncio.get_event_loop()
-            if not loop.is_closed():
-                # Use existing loop
+            if loop.is_running():
+                # Event loop is already running, create a task and wait for it
+                task = loop.create_task(func(**kwargs))
+                # Run func via asyncio.run in a worker thread; run_until_complete cannot be used on a running loop
+                import concurrent.futures
+
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(asyncio.run, func(**kwargs))
+                    results = future.result()
+            elif not loop.is_closed():
+                # Use existing loop that's not running
                 task = loop.create_task(func(**kwargs))
                 results = loop.run_until_complete(task)
             else:
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
@@ -1,5 +1,8 @@
+import asyncio
 from typing import List
 
+import pytest
+
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
 from examples.math_example.main import evaluate as math_evaluate
@@ -19,3 +22,47 @@
 async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return rows
+
+
+@evaluation_test(
+    input_messages=[
+        [
+            Message(role="user", content="What is the capital of France?"),
+        ],
+    ],
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    mode="pointwise",
+)
+async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow:
+    """Pointwise evaluation over a single inline prompt; returns the row unchanged."""
+    return row
+
+
+@pytest.mark.asyncio
+async def test_pytest_async_main():
+    """
+    Calling the decorated list-of-rows test directly with a list of
+    EvaluationRow returns the test function's own result.
+    """
+    # One single-message row is enough to exercise the direct-call path
+    question = Message(role="user", content="What is the capital of France?")
+    input_rows = [EvaluationRow(messages=[question])]
+
+    # The list is passed as the only positional argument
+    output_rows = await test_pytest_async(input_rows)
+
+    assert output_rows == input_rows
+
+
+@pytest.mark.asyncio
+async def test_pytest_async_pointwise_main():
+    """
+    Calling the decorated pointwise test directly with a single
+    EvaluationRow returns the test function's own result.
+    """
+    question = Message(role="user", content="What is the capital of France?")
+    input_row = EvaluationRow(messages=[question])
+
+    output_row = await test_pytest_async_pointwise(input_row)
+
+    assert output_row == input_row