3636 EvaluationInputParam ,
3737 EvaluationTestMode ,
3838 InputMessagesParam ,
39+ InputRowsParam ,
3940 ModelParam ,
4041 RolloutProcessorConfig ,
4142 RolloutProcessorInputParam ,
@@ -238,6 +239,7 @@ def evaluation_test( # noqa: C901
238239 completion_params : List [CompletionParams ],
239240 input_messages : Optional [List [InputMessagesParam ]] = None ,
240241 input_dataset : Optional [List [DatasetPathParam ]] = None ,
242+ input_rows : Optional [List [InputRowsParam ]] = None ,
241243 dataset_adapter : Callable [[List [Dict [str , Any ]]], Dataset ] = default_dataset_adapter ,
242244 rollout_processor : RolloutProcessor = NoOpRolloutProcessor (),
243245 evaluation_test_kwargs : Optional [List [EvaluationInputParam ]] = None ,
@@ -299,6 +301,9 @@ def evaluation_test( # noqa: C901
299301 input_dataset: Paths to JSONL datasets. This is useful if you have a
300302 dataset already. Provide a dataset_adapter to convert the input dataset
301303 to a list of EvaluationRows if you have a custom dataset format.
304+ input_rows: Pre-constructed EvaluationRow objects to use directly. This is useful
305+ when you want to provide EvaluationRow objects with custom metadata, input_messages,
306+ or other fields already populated. Will be passed as "input_rows" to the test function.
302307 dataset_adapter: Function to convert the input dataset to a list of
303308 EvaluationRows. This is useful if you have a custom dataset format.
304309 completion_params: Generation parameters for the rollout.
@@ -413,33 +418,42 @@ async def execute_with_params(
413418 # Calculate all possible combinations of parameters
414419 if mode == "groupwise" :
415420 combinations = generate_parameter_combinations (
416- input_dataset , None , input_messages , evaluation_test_kwargs , max_dataset_rows , combine_datasets
421+ input_dataset ,
422+ None ,
423+ input_messages ,
424+ input_rows ,
425+ evaluation_test_kwargs ,
426+ max_dataset_rows ,
427+ combine_datasets ,
417428 )
418429 else :
419430 combinations = generate_parameter_combinations (
420431 input_dataset ,
421432 completion_params ,
422433 input_messages ,
434+ input_rows ,
423435 evaluation_test_kwargs ,
424436 max_dataset_rows ,
425437 combine_datasets ,
426438 )
427439 if len (combinations ) == 0 :
428440 raise ValueError (
429- "No combinations of parameters were found. Please provide at least a model and one of input_dataset or input_messages ."
441+ "No combinations of parameters were found. Please provide at least a model and one of input_dataset, input_messages, or input_rows ."
430442 )
431443
432444 # Create parameter tuples for pytest.mark.parametrize
433445 param_tuples = []
434446 for combo in combinations :
435- dataset , cp , messages , etk = combo
447+ dataset , cp , messages , rows , etk = combo
436448 param_tuple = []
437449 if input_dataset is not None :
438450 param_tuple .append (dataset )
439451 if completion_params is not None :
440452 param_tuple .append (cp )
441453 if input_messages is not None :
442454 param_tuple .append (messages )
455+ if input_rows is not None :
456+ param_tuple .append (rows )
443457 if evaluation_test_kwargs is not None :
444458 param_tuple .append (etk )
445459 param_tuples .append (tuple (param_tuple ))
@@ -452,6 +466,8 @@ async def execute_with_params(
452466 test_param_names .append ("completion_params" )
453467 if input_messages is not None :
454468 test_param_names .append ("input_messages" )
469+ if input_rows is not None :
470+ test_param_names .append ("input_rows" )
455471 if evaluation_test_kwargs is not None :
456472 test_param_names .append ("evaluation_test_kwargs" )
457473
@@ -500,8 +516,11 @@ def _log_eval_error(
500516 else :
501517 # Multiple rows: list of List[Message]
502518 data = [EvaluationRow (messages = m ) for m in im ]
519+ elif "input_rows" in kwargs and kwargs ["input_rows" ] is not None :
520+ # Use pre-constructed EvaluationRow objects directly
521+ data = kwargs ["input_rows" ]
503522 else :
504- raise ValueError ("No input dataset or input messages provided" )
523+ raise ValueError ("No input dataset, input messages, or input rows provided" )
505524
506525 for row in data :
507526 # generate a stable row_id for each row
0 commit comments