Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
Try a single-turn evaluation:

```sh
cbl single-turn \
cbl eval single-turn \
--threshold 0.75 \
--variations 2 \
--maximum-iteration-layers 2 \
Expand All @@ -33,7 +33,7 @@ cbl single-turn \
Try a multi-turn evaluation:

```sh
cbl multi-turn \
cbl eval multi-turn \
--threshold 0.95 \
--max-turns 8 \
--test-case-groups suicidal_ideation \
Expand All @@ -50,17 +50,17 @@ Click [here](mailto:team@circuitbreakerlabs.ai?subject=Getting%20Set%20Up&body=I

### Flags and Options

You can see the available options and flags for `cbl` with `cbl help` or for a subcommand with `cbl <subcommand> help`.
You can see the available options and flags for `cbl` with `cbl help`, for evaluation commands with `cbl eval help`, or for a specific evaluation type with `cbl eval <evaluation_type> help`.

### Syntax

The syntax for `cbl` is:

```sh
cbl --top-level-arg1 <evaluation_type> --evaluation-arg1 <provider> --provider-arg1
cbl --top-level-arg1 eval <evaluation_type> --evaluation-arg1 <provider> --provider-arg1
```

where `<evaluation_type>` and `<provider>` are subcommands.
where `eval`, `<evaluation_type>`, and `<provider>` are subcommands.

The available evaluation types are `single-turn` and `multi-turn`. The available providers are `ollama`, `openai`, and `custom`.

Expand All @@ -71,6 +71,7 @@ The following would run a single-turn evaluation against a custom OpenAI finetun
```sh
cbl \
--output-file result.json \
eval \
single-turn \ # evaluation type
--threshold 0.3 \
--variations 3 \
Expand Down
97 changes: 76 additions & 21 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub struct Args {
headers: Vec<Headers>,

#[command(subcommand)]
pub evaluation: EvaluationCommand,
pub command: Command,
}

impl Args {
Expand All @@ -52,6 +52,15 @@ impl Args {
}
}

// Top-level subcommand of the CLI, stored in `Args::command`.
//
// Evaluation types are not accepted directly after the global flags; they are
// nested one level down under `eval` (i.e. `cbl eval <evaluation_type> ...`).
//
// NOTE: plain `//` comments are used here deliberately. clap's derive macros
// turn `///` doc comments into user-visible help text, so only the variant's
// existing `///` line below is part of the CLI output.
#[derive(Subcommand, Debug)]
pub enum Command {
/// Run evaluations
Eval {
// The concrete evaluation to run, parsed as a nested subcommand.
#[command(subcommand)]
evaluation: EvaluationCommand,
},
}

#[derive(Subcommand, Debug)]
pub enum EvaluationCommand {
/// Run single-turn evaluation
Expand Down Expand Up @@ -115,6 +124,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand All @@ -133,23 +143,50 @@ mod tests {
.expect("single-turn args should parse");

#[allow(clippy::match_wildcard_for_single_variants)]
match args.evaluation {
super::EvaluationCommand::SingleTurn { request, .. } => {
assert!((request.threshold - 0.5).abs() < f32::EPSILON);
assert_eq!(request.variations, 2);
assert_eq!(request.maximum_iteration_layers, 2);
assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]);
}
_ => panic!("expected single-turn command"),
match args.command {
super::Command::Eval { evaluation } => match evaluation {
super::EvaluationCommand::SingleTurn { request, .. } => {
assert!((request.threshold - 0.5).abs() < f32::EPSILON);
assert_eq!(request.variations, 2);
assert_eq!(request.maximum_iteration_layers, 2);
assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]);
}
_ => panic!("expected single-turn command"),
},
}
}

// Evaluation types are only reachable through the `eval` subcommand, so the
// old invocation shape (evaluation type right after the global flags) must
// fail to parse with an invalid-subcommand error.
#[test]
fn rejects_legacy_top_level_evaluation_commands() {
    // Same arguments as a valid single-turn run, minus the leading `eval`.
    let legacy_argv = [
        "cbl",
        "--cbl-api-key",
        "cbl-key",
        "single-turn",
        "--threshold",
        "0.5",
        "--variations",
        "2",
        "--maximum-iteration-layers",
        "2",
        "openai",
        "--api-key",
        "openai-key",
        "--model",
        "gpt-4.1-nano",
    ];

    let parse_error = Args::try_parse_from(legacy_argv)
        .expect_err("legacy top-level evaluation command should be rejected");

    assert_eq!(parse_error.kind(), ErrorKind::InvalidSubcommand);
}

#[test]
fn rejects_out_of_range_threshold() {
let err = Args::try_parse_from([
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"1.5",
Expand Down Expand Up @@ -180,6 +217,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"-0.1",
Expand Down Expand Up @@ -210,6 +248,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -240,6 +279,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -270,6 +310,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand All @@ -288,11 +329,13 @@ mod tests {
.expect("zero iteration layers should parse");

#[allow(clippy::match_wildcard_for_single_variants)]
match args.evaluation {
super::EvaluationCommand::SingleTurn { request, .. } => {
assert_eq!(request.maximum_iteration_layers, 0);
}
_ => panic!("expected single-turn command"),
match args.command {
super::Command::Eval { evaluation } => match evaluation {
super::EvaluationCommand::SingleTurn { request, .. } => {
assert_eq!(request.maximum_iteration_layers, 0);
}
_ => panic!("expected single-turn command"),
},
}
}

Expand All @@ -302,6 +345,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -332,6 +376,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -362,6 +407,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand All @@ -378,13 +424,15 @@ mod tests {
.expect("multi-turn args should parse");

#[allow(clippy::match_wildcard_for_single_variants)]
match args.evaluation {
super::EvaluationCommand::MultiTurn { request, .. } => {
assert!((request.threshold - 0.5).abs() < f32::EPSILON);
assert_eq!(request.max_turns, 4);
assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]);
}
_ => panic!("expected multi-turn command"),
match args.command {
super::Command::Eval { evaluation } => match evaluation {
super::EvaluationCommand::MultiTurn { request, .. } => {
assert!((request.threshold - 0.5).abs() < f32::EPSILON);
assert_eq!(request.max_turns, 4);
assert_eq!(request.test_case_groups, vec!["suicidal_ideation"]);
}
_ => panic!("expected multi-turn command"),
},
}
}

Expand All @@ -394,6 +442,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -422,6 +471,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -450,6 +500,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -478,6 +529,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand All @@ -503,6 +555,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand All @@ -526,6 +579,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"single-turn",
"--threshold",
"0.5",
Expand Down Expand Up @@ -553,6 +607,7 @@ mod tests {
"cbl",
"--cbl-api-key",
"cbl-key",
"eval",
"multi-turn",
"--threshold",
"0.5",
Expand Down
2 changes: 1 addition & 1 deletion src/cli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ mod args;
mod headers;
mod version;

pub use args::{Args, EvaluationCommand, ProviderCommand};
pub use args::{Args, Command, EvaluationCommand, ProviderCommand};
52 changes: 29 additions & 23 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {

let headers = cli_args.headers();

let provider_command = match &cli_args.evaluation {
let evaluation = match &cli_args.command {
cli::Command::Eval { evaluation } => evaluation,
};

let provider_command = match evaluation {
cli::EvaluationCommand::SingleTurn { provider, .. }
| cli::EvaluationCommand::MultiTurn { provider, .. } => provider,
};
Expand All @@ -67,32 +71,34 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {

let websocket = websockets::connect(
&cli_args.cbl_api_base_url,
(&cli_args.evaluation).into(),
evaluation.into(),
&cli_args.cbl_api_key,
)
.await?;

match cli_args.evaluation {
cli::EvaluationCommand::SingleTurn { request, .. } => {
run_single_turn_evaluation(
websocket,
provider,
request,
cli_args.log_mode,
cli_args.output_file,
)
.await?;
}
cli::EvaluationCommand::MultiTurn { request, .. } => {
run_multi_turn_evaluation(
websocket,
provider,
request,
cli_args.log_mode,
cli_args.output_file,
)
.await?;
}
match cli_args.command {
cli::Command::Eval { evaluation } => match evaluation {
cli::EvaluationCommand::SingleTurn { request, .. } => {
run_single_turn_evaluation(
websocket,
provider,
request,
cli_args.log_mode,
cli_args.output_file,
)
.await?;
}
cli::EvaluationCommand::MultiTurn { request, .. } => {
run_multi_turn_evaluation(
websocket,
provider,
request,
cli_args.log_mode,
cli_args.output_file,
)
.await?;
}
},
}

Ok(())
Expand Down
Loading