diff --git a/README.md b/README.md index f4a88a9..b66941b 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Try a single-turn evaluation: ```sh -cbl single-turn \ +cbl eval single-turn \ --threshold 0.75 \ --variations 2 \ --maximum-iteration-layers 2 \ @@ -33,7 +33,7 @@ cbl single-turn \ Try a multi-turn evaluation: ```sh -cbl multi-turn \ +cbl eval multi-turn \ --threshold 0.95 \ --max-turns 8 \ --test-case-groups suicidal_ideation \ @@ -50,17 +50,17 @@ Click [here](mailto:team@circuitbreakerlabs.ai?subject=Getting%20Set%20Up&body=I ### Flags and Options -You can see the available options and flags for `cbl` with `cbl help` or for a subcommand with `cbl help <subcommand>`. +You can see the available options and flags for `cbl` with `cbl help`, for evaluation commands with `cbl eval help`, or for a specific evaluation type with `cbl eval help <evaluation>`. ### Syntax The syntax for `cbl` is: ```sh -cbl --top-level-arg1 <evaluation> --evaluation-arg1 <provider> --provider-arg1 +cbl --top-level-arg1 eval <evaluation> --evaluation-arg1 <provider> --provider-arg1 ``` -where `<evaluation>` and `<provider>` are subcommands. +where `eval`, `<evaluation>`, and `<provider>` are subcommands. The available evaluation types are `single-turn` and `multi-turn`. The available providers are `ollama`, `openai`, and `custom`. 
@@ -71,6 +71,7 @@ The following would run a single-turn evaluation against a custom OpenAI finetun ```sh cbl \ --output-file result.json \ + eval \ single-turn \ # evaluation type --threshold 0.3 \ --variations 3 \ diff --git a/src/cli/args.rs b/src/cli/args.rs index f825418..ef1584d 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -43,7 +43,7 @@ pub struct Args { headers: Vec, #[command(subcommand)] - pub evaluation: EvaluationCommand, + pub command: Command, } impl Args { @@ -52,6 +52,15 @@ impl Args { } } +#[derive(Subcommand, Debug)] +pub enum Command { + /// Run evaluations + Eval { + #[command(subcommand)] + evaluation: EvaluationCommand, + }, +} + #[derive(Subcommand, Debug)] pub enum EvaluationCommand { /// Run single-turn evaluation @@ -115,6 +124,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -133,23 +143,50 @@ mod tests { .expect("single-turn args should parse"); #[allow(clippy::match_wildcard_for_single_variants)] - match args.evaluation { - super::EvaluationCommand::SingleTurn { request, .. } => { - assert!((request.threshold - 0.5).abs() < f32::EPSILON); - assert_eq!(request.variations, 2); - assert_eq!(request.maximum_iteration_layers, 2); - assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]); - } - _ => panic!("expected single-turn command"), + match args.command { + super::Command::Eval { evaluation } => match evaluation { + super::EvaluationCommand::SingleTurn { request, .. 
} => { + assert!((request.threshold - 0.5).abs() < f32::EPSILON); + assert_eq!(request.variations, 2); + assert_eq!(request.maximum_iteration_layers, 2); + assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]); + } + _ => panic!("expected single-turn command"), + }, } } + #[test] + fn rejects_legacy_top_level_evaluation_commands() { + let err = Args::try_parse_from([ + "cbl", + "--cbl-api-key", + "cbl-key", + "single-turn", + "--threshold", + "0.5", + "--variations", + "2", + "--maximum-iteration-layers", + "2", + "openai", + "--api-key", + "openai-key", + "--model", + "gpt-4.1-nano", + ]) + .expect_err("legacy top-level evaluation command should be rejected"); + + assert_eq!(err.kind(), ErrorKind::InvalidSubcommand); + } + #[test] fn rejects_out_of_range_threshold() { let err = Args::try_parse_from([ "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "1.5", @@ -180,6 +217,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "-0.1", @@ -210,6 +248,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -240,6 +279,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -270,6 +310,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -288,11 +329,13 @@ mod tests { .expect("zero iteration layers should parse"); #[allow(clippy::match_wildcard_for_single_variants)] - match args.evaluation { - super::EvaluationCommand::SingleTurn { request, .. } => { - assert_eq!(request.maximum_iteration_layers, 0); - } - _ => panic!("expected single-turn command"), + match args.command { + super::Command::Eval { evaluation } => match evaluation { + super::EvaluationCommand::SingleTurn { request, .. 
} => { + assert_eq!(request.maximum_iteration_layers, 0); + } + _ => panic!("expected single-turn command"), + }, } } @@ -302,6 +345,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -332,6 +376,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -362,6 +407,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", @@ -378,13 +424,15 @@ mod tests { .expect("multi-turn args should parse"); #[allow(clippy::match_wildcard_for_single_variants)] - match args.evaluation { - super::EvaluationCommand::MultiTurn { request, .. } => { - assert!((request.threshold - 0.5).abs() < f32::EPSILON); - assert_eq!(request.max_turns, 4); - assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]); - } - _ => panic!("expected multi-turn command"), + match args.command { + super::Command::Eval { evaluation } => match evaluation { + super::EvaluationCommand::MultiTurn { request, .. 
} => { + assert!((request.threshold - 0.5).abs() < f32::EPSILON); + assert_eq!(request.max_turns, 4); + assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]); + } + _ => panic!("expected multi-turn command"), + }, } } @@ -394,6 +442,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", @@ -422,6 +471,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", @@ -450,6 +500,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", @@ -478,6 +529,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -503,6 +555,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", @@ -526,6 +579,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "single-turn", "--threshold", "0.5", @@ -553,6 +607,7 @@ mod tests { "cbl", "--cbl-api-key", "cbl-key", + "eval", "multi-turn", "--threshold", "0.5", diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 306ef36..aebdd9f 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -3,4 +3,4 @@ mod args; mod headers; mod version; -pub use args::{Args, EvaluationCommand, ProviderCommand}; +pub use args::{Args, Command, EvaluationCommand, ProviderCommand}; diff --git a/src/main.rs b/src/main.rs index 645bc23..4dba309 100644 --- a/src/main.rs +++ b/src/main.rs @@ -48,7 +48,11 @@ async fn main() -> Result<(), Box> { let headers = cli_args.headers(); - let provider_command = match &cli_args.evaluation { + let evaluation = match &cli_args.command { + cli::Command::Eval { evaluation } => evaluation, + }; + + let provider_command = match evaluation { cli::EvaluationCommand::SingleTurn { provider, .. } | cli::EvaluationCommand::MultiTurn { provider, .. 
} => provider, }; @@ -67,32 +71,34 @@ async fn main() -> Result<(), Box> { let websocket = websockets::connect( &cli_args.cbl_api_base_url, - (&cli_args.evaluation).into(), + evaluation.into(), &cli_args.cbl_api_key, ) .await?; - match cli_args.evaluation { - cli::EvaluationCommand::SingleTurn { request, .. } => { - run_single_turn_evaluation( - websocket, - provider, - request, - cli_args.log_mode, - cli_args.output_file, - ) - .await?; - } - cli::EvaluationCommand::MultiTurn { request, .. } => { - run_multi_turn_evaluation( - websocket, - provider, - request, - cli_args.log_mode, - cli_args.output_file, - ) - .await?; - } + match cli_args.command { + cli::Command::Eval { evaluation } => match evaluation { + cli::EvaluationCommand::SingleTurn { request, .. } => { + run_single_turn_evaluation( + websocket, + provider, + request, + cli_args.log_mode, + cli_args.output_file, + ) + .await?; + } + cli::EvaluationCommand::MultiTurn { request, .. } => { + run_multi_turn_evaluation( + websocket, + provider, + request, + cli_args.log_mode, + cli_args.output_file, + ) + .await?; + } + }, } Ok(())