"""
Trace Export & Offline Cascade Simulation
Demonstrates how to simulate different cascade configurations offline,
without making any real API calls. Useful for tuning quality thresholds,
comparing model tiers, and projecting cost savings.
Usage:
python examples/trace_simulation.py
"""
from cascadeflow import ModelConfig, SimulationResult, simulate

# Sample queries representing a realistic workload mix
queries = [
    # Simple queries (should stay on draft model)
    "What is Python?",
    "Hello, how are you?",
    "What's the capital of France?",
    "Convert 100 degrees Fahrenheit to Celsius",
    # Moderate queries
    "Explain the difference between REST and GraphQL APIs",
    "Write a Python function to find the nth Fibonacci number",
    "What are the pros and cons of microservices architecture?",
    # Complex queries (should escalate to verifier)
    "Prove that the square root of 2 is irrational",
    "Explain the proof of Gödel's incompleteness theorem",
    "Derive the Navier-Stokes equations from conservation of momentum",
]

# Define two model configurations to compare
config_a = [
    ModelConfig(name="gpt-4o-mini", provider="openai", cost=0.000375),
    ModelConfig(name="gpt-4o", provider="openai", cost=0.005),
]
config_b = [
    ModelConfig(name="llama-3.1-8b", provider="groq", cost=0.00005),
    ModelConfig(name="gpt-4o", provider="openai", cost=0.005),
]


def print_result(label: str, result: SimulationResult) -> None:
    print(f"\n{'=' * 50}")
    print(f" {label}")
    print(f"{'=' * 50}")
    print(f" Queries: {result.total_queries}")
    print(f" Projected cost: ${result.projected_cost:.6f}")
    print(f" Escalation rate: {result.escalation_rate:.1%}")
    print(f" Model usage: {result.model_distribution}")
    print(f" Complexity: {result.complexity_distribution}")
# Simulate with different quality thresholds
print("Comparing quality thresholds with GPT-4o-mini -> GPT-4o cascade:\n")
for threshold in [0.4, 0.7, 0.9]:
    result = simulate(queries=queries, models=config_a, quality_threshold=threshold)
    print_result(f"Threshold = {threshold}", result)

# Compare two different draft models
print("\n\nComparing draft models (threshold=0.7):\n")
result_a = simulate(queries=queries, models=config_a, quality_threshold=0.7)
result_b = simulate(queries=queries, models=config_b, quality_threshold=0.7)
print_result("GPT-4o-mini (draft) -> GPT-4o", result_a)
print_result("Llama-3.1-8b (draft) -> GPT-4o", result_b)

diff = result_b.compare(result_a)
print(
    f"\nSwitching to Llama-3.1-8b saves ${-diff['cost_change']:.6f} "
    f"({-diff['cost_change_pct']:.1f}%)"
)

# Per-query breakdown
print("\n\nPer-query breakdown (config B, threshold=0.7):")
print(f"{'Query':<50} {'Complexity':<10} {'Model':<20} {'Cost':<10}")
print("-" * 90)
for entry in result_b.per_query:
    print(
        f"{entry.query:<50} {entry.complexity:<10} "
        f"{entry.projected_model:<20} ${entry.projected_cost:.6f}"
    )
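
# Optional follow-on: persist the breakdown for later offline analysis. This is
# a minimal sketch using only the stdlib and the result attributes exercised
# above; the output path and JSON schema are illustrative choices, not part of
# the cascadeflow API. The local import keeps this tail block self-contained.
import json

export = {
    "total_queries": result_b.total_queries,
    "projected_cost": result_b.projected_cost,
    "escalation_rate": result_b.escalation_rate,
    "per_query": [
        {
            "query": entry.query,
            # str() in case complexity is an enum rather than a plain string
            "complexity": str(entry.complexity),
            "projected_model": entry.projected_model,
            "projected_cost": entry.projected_cost,
        }
        for entry in result_b.per_query
    ],
}
with open("trace_simulation_results.json", "w") as f:
    json.dump(export, f, indent=2)
print("\nWrote trace_simulation_results.json")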