-
Notifications
You must be signed in to change notification settings - Fork 96
Expand file tree
/
Copy pathbasic_usage.py
More file actions
391 lines (332 loc) · 15.2 KB
/
basic_usage.py
File metadata and controls
391 lines (332 loc) · 15.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
"""
cascadeflow - Basic Usage Example
The simplest way to get started with cascadeflow. This example demonstrates:
- Setting up a two-tier cascade (cheap → expensive)
- Processing queries with automatic quality-based routing
- Cost tracking and savings calculation
- Different complexity levels (simple → complex queries)
Requirements:
- cascadeflow[all]
- OpenAI API key
Setup:
pip install cascadeflow[all]
export OPENAI_API_KEY="your-key-here"
python examples/basic_usage.py
What You'll Learn:
1. How to configure a basic cascade
2. How cascadeflow automatically routes queries
3. How to track costs and savings
4. How different query complexities are handled
Expected Output:
- Simple queries: GPT-4o-mini draft accepted, GPT-4o skipped
- Complex queries: Direct to GPT-4o OR draft rejected and escalated
- Token-based cost comparison showing realistic 40-60% savings
Note on Costs:
Costs are calculated using actual token-based pricing from OpenAI:
- GPT-4o-mini: ~$0.000375 per 1K tokens (blended input/output)
- GPT-4o: ~$0.0025 per 1K tokens (blended input/output)
Savings depend on your query mix and response lengths.
Note on Latency:
95% of latency comes from provider API calls, NOT from cascadeflow!
- Provider API: 95% (waiting for OpenAI/Anthropic/etc to respond)
- cascadeflow overhead: 5% (routing, quality checks, etc.)
To reduce latency:
1. Choose faster providers (Groq is 5-10x faster than OpenAI)
2. Use streaming for perceived speed improvement
3. Don't worry about cascade overhead (it's minimal)
Documentation:
For complete setup instructions and detailed explanations, see:
docs/guides/quickstart.md
"""
import asyncio
import os
from cascadeflow import CascadeAgent, ModelConfig
# ---------------------------------------------------------------------------
# Demo tunables (hoisted from inline magic numbers so they are easy to tweak)
# ---------------------------------------------------------------------------

# Rough words -> tokens conversion factor (~1.3 tokens per English word).
# Used only for the all-GPT-4o baseline cost estimate.
_TOKENS_PER_WORD = 1.3

# GPT-4o blended input/output price per 1K tokens, used to estimate what the
# same token volume would cost if every query went straight to GPT-4o.
_GPT4O_COST_PER_1K = 0.00625

# Test queries ranging from simple to complex.  "expected" and "reason" are
# informational only -- they are printed for the reader and never compared
# against the cascade's actual routing decision.
TEST_QUERIES = [
    # SIMPLE queries - should stay on GPT-4o-mini
    {
        "query": "What color is the sky?",
        "expected": "gpt-4o-mini",
        "reason": "Simple factual question - cheap model handles easily",
    },
    {
        "query": "What's the capital of France?",
        "expected": "gpt-4o-mini",
        "reason": "Simple factual - cheap model knows this",
    },
    {
        "query": "Translate 'hello' to Spanish",
        "expected": "gpt-4o-mini",
        "reason": "Simple translation - cheap model sufficient",
    },
    # MODERATE queries - might escalate
    {
        "query": "Explain the difference between lists and tuples in Python",
        "expected": "gpt-4o-mini",
        "reason": "Moderate complexity - cheap model likely handles it",
    },
    {
        "query": "Write a function to reverse a string in Python",
        "expected": "gpt-4o-mini",
        "reason": "Standard coding task - cheap model can do it",
    },
    # COMPLEX queries - likely escalate to GPT-4o
    {
        "query": "Explain quantum entanglement and its implications for quantum computing in detail",
        "expected": "gpt-4o",
        "reason": "Complex scientific topic - needs better model",
    },
    {
        "query": "Design a microservices architecture for a large-scale e-commerce platform with high availability",
        "expected": "gpt-4o",
        "reason": "Complex architecture design - benefits from GPT-4o",
    },
    {
        "query": "Analyze the philosophical implications of consciousness and free will in the context of determinism",
        "expected": "gpt-4o",
        "reason": "Deep philosophical analysis - needs sophisticated reasoning",
    },
]


def _require_api_key() -> bool:
    """Return True when OPENAI_API_KEY is set; otherwise print setup help."""
    if os.getenv("OPENAI_API_KEY"):
        return True
    print("\n❌ Error: OPENAI_API_KEY environment variable not set")
    print("\nThis example requires an OpenAI API key to run.")
    print("\nSetup:")
    print(" 1. Get your API key from: https://platform.openai.com/api-keys")
    print(" 2. Set the environment variable:")
    print(" export OPENAI_API_KEY='sk-...'")
    print(" 3. Run this example:")
    print(" python examples/basic_usage.py")
    return False


def _build_agent() -> "CascadeAgent":
    """Configure the two-tier cascade: cheap drafter first, GPT-4o verifier."""
    return CascadeAgent(
        models=[
            # Cheap model - tries first
            ModelConfig(
                name="gpt-4o-mini",
                provider="openai",
                cost=0.000375,  # $0.375 per 1M tokens (blended estimate)
                quality_threshold=0.7,  # Accept if confidence >= 70%
            ),
            # Expensive model - only if needed
            ModelConfig(
                name="gpt-4o",
                provider="openai",
                cost=0.00625,  # $6.25 per 1M tokens (blended estimate)
                quality_threshold=0.95,  # Very high quality
            ),
        ]
    )


def _report_result(result, model_used: str) -> None:
    """Pretty-print one cascade result: model(s) used, cost, latency, status.

    All fields are read defensively with getattr because result objects from
    different cascade paths may not carry every attribute.
    """
    tier = "Tier 1 (Cheap)" if model_used == "gpt-4o-mini" else "Tier 2 (Expensive)"
    icon = "💚" if model_used == "gpt-4o-mini" else "💛"
    print("✅ Result:")

    cascaded = getattr(result, "cascaded", False)
    draft_accepted = getattr(result, "draft_accepted", False)

    # Show actual model(s) used with clear status.
    if draft_accepted:
        # Only the draft model was used.
        print(f" {icon} Model Used: gpt-4o-mini only ({tier})")
    elif cascaded and not getattr(result, "draft_accepted", True):
        # Draft was explicitly rejected: both models were used.
        print(" 💚💛 Models Used: gpt-4o-mini + gpt-4o (Both Tiers)")
    else:
        # Direct routing (no cascade attempted).
        print(f" {icon} Model Used: {result.model_used} ({tier})")

    # Cost (defaults to 0.0 if the field is missing).
    cost = getattr(result, "total_cost", 0.0)
    print(f" 💰 Cost: ${cost:.6f}")

    # Latency breakdown - use the library-provided fields.
    total_latency = getattr(result, "latency_ms", 0.0)
    draft_latency = getattr(result, "draft_latency_ms", 0.0)
    verifier_latency = getattr(result, "verifier_latency_ms", 0.0)
    # cascade_overhead_ms is computed by the library:
    # - Draft accepted: 0ms (we saved verifier time)
    # - Draft rejected: full draft_latency_ms (wasted drafter attempt)
    # - Direct route: 0ms (no cascade)
    cascade_overhead = getattr(result, "cascade_overhead_ms", 0.0)

    print(" ⚡ Latency Breakdown:")
    print(f" Total: {total_latency:.0f}ms")
    if cascaded and not draft_accepted:
        # Draft was rejected - drafter time was wasted.
        print(f" ├─ Drafter (wasted): {draft_latency:.0f}ms")
        print(f" └─ Verifier: {verifier_latency:.0f}ms")
        print(f" ⚠️ Cascade overhead: +{cascade_overhead:.0f}ms (drafter was rejected)")
    elif cascaded and draft_accepted:
        # Draft was accepted - we saved the verifier time.
        print(f" └─ Drafter only: {draft_latency:.0f}ms")
        print(" ✅ Cascade overhead: 0ms (verifier skipped)")
    else:
        # Direct route - no cascade overhead.
        print(f" └─ Provider API: {total_latency:.0f}ms (direct route)")
        print(" ✅ Cascade overhead: 0ms (direct route)")

    print(f" 📊 Complexity: {getattr(result, 'complexity', 'unknown')}")

    # Cascade status, stated plainly.
    if cascaded:
        if draft_accepted:
            print(" ✅ Draft Accepted: GPT-4o-mini response passed quality check")
            print(" 💡 Verifier Skipped: GPT-4o was not called (cost saved!)")
        else:
            print(" ❌ Draft Rejected: Quality check failed, escalated to GPT-4o")
            print(" 💸 Both Models Used: Paid for GPT-4o-mini + GPT-4o")
    else:
        print(" 🎯 Direct Route: Query sent directly to GPT-4o (no cascade)")

    # First 100 characters of the response, newlines flattened.
    # BUGFIX: read ``content`` via getattr so a result without it cannot
    # crash here (the token-estimate path already treats it as optional).
    response_preview = getattr(result, "content", "")[:100].replace("\n", " ")
    print(f" 📝 Response: {response_preview}...")
    print()


async def _run_single_query(agent, index: int, total: int, test: dict, stats: dict) -> float:
    """Run one test query through the cascade and report on it.

    Updates *stats* in place and returns the estimated token count
    (query + response) consumed, for the all-GPT-4o baseline estimate.
    """
    print(f"{'─' * 80}")
    print(f"Query {index}/{total}")
    print(f"{'─' * 80}")
    print(f"❓ Question: {test['query']}")
    print(f"🎯 Expected: {test['expected']}")
    print(f"💡 Why: {test['reason']}")
    print()

    # Run the query through the cascade.
    result = await agent.run(test["query"], max_tokens=150)

    # Normalize the reported model name to one of our two tiers.
    model_used = "gpt-4o-mini" if "4o-mini" in result.model_used.lower() else "gpt-4o"

    # Update per-model and aggregate cost statistics.
    stats[model_used]["count"] += 1
    stats[model_used]["cost"] += result.total_cost
    stats["total_cost"] += result.total_cost

    # Track how the cascade behaved for this query.
    if getattr(result, "cascaded", False):
        if getattr(result, "draft_accepted", False):
            stats["draft_accepted"] += 1
        else:
            stats["draft_rejected"] += 1
    else:
        stats["direct_routing"] += 1

    _report_result(result, model_used)

    # Estimate tokens for the baseline (approximate ~1.3 tokens per word).
    query_tokens = len(test["query"].split()) * _TOKENS_PER_WORD
    content = getattr(result, "content", None)
    if content is not None:
        response_tokens = len(content.split()) * _TOKENS_PER_WORD
    else:
        response_tokens = 100  # Default estimate when no content is available
    return query_tokens + response_tokens


def _print_cost_analysis(stats: dict, total_queries: int, all_gpt4_tokens: float) -> float:
    """Print the cost-analysis section and return the savings percentage."""
    print("=" * 80)
    print("💰 COST ANALYSIS")
    print("=" * 80)
    print()

    gpt4mini_count = stats["gpt-4o-mini"]["count"]
    gpt4o_count = stats["gpt-4o"]["count"]
    gpt4mini_pct = (gpt4mini_count / total_queries) * 100
    gpt4o_pct = (gpt4o_count / total_queries) * 100

    print("📊 Query Distribution:")
    print(f" GPT-4o-mini: {gpt4mini_count}/{total_queries} ({gpt4mini_pct:.0f}%)")
    print(f" GPT-4o: {gpt4o_count}/{total_queries} ({gpt4o_pct:.0f}%)")
    print()
    print("🔄 Cascade Behavior:")
    print(f" Draft Accepted: {stats['draft_accepted']} (verifier skipped)")
    print(f" Draft Rejected: {stats['draft_rejected']} (both models used)")
    print(f" Direct Routing: {stats['direct_routing']} (no cascade)")
    print()
    print("💵 Cost Breakdown:")
    print(f" GPT-4o-mini: ${stats['gpt-4o-mini']['cost']:.6f}")
    print(f" GPT-4o: ${stats['gpt-4o']['cost']:.6f}")
    print(f" Total Cost: ${stats['total_cost']:.6f}")
    print()

    # Baseline: what the same token volume would have cost on GPT-4o alone.
    all_gpt4o_cost = (all_gpt4_tokens / 1000) * _GPT4O_COST_PER_1K
    savings = all_gpt4o_cost - stats["total_cost"]
    # Guard against a zero baseline (e.g. every query failed to respond).
    savings_pct = (savings / all_gpt4o_cost * 100) if all_gpt4o_cost > 0 else 0.0

    print("💎 Savings Compared to All-GPT-4o (Token-Based):")
    print(f" All-GPT-4o Estimate: ${all_gpt4o_cost:.6f}")
    print(f" cascadeflow Cost: ${stats['total_cost']:.6f}")
    print(f" 💰 SAVINGS: ${savings:.6f} ({savings_pct:.1f}%)")
    print()
    print(f" ℹ️ Note: Savings based on actual token usage (~{int(all_gpt4_tokens)} tokens)")
    print(" Your savings will vary based on query complexity and response length.")
    print()

    # Extrapolate to a realistic monthly scale.
    print("📈 Extrapolated to 10,000 Queries/Month:")
    if all_gpt4_tokens > 0:
        scale_factor = 10_000 / total_queries
        monthly_cascade = stats["total_cost"] * scale_factor
        monthly_gpt4o = all_gpt4o_cost * scale_factor
        monthly_savings = monthly_gpt4o - monthly_cascade
        print(f" All-GPT-4o: ${monthly_gpt4o:,.2f}/month")
        print(f" cascadeflow: ${monthly_cascade:,.2f}/month")
        print(f" 💵 SAVE: ${monthly_savings:,.2f}/month")
    print()
    return savings_pct


async def main():
    """
    Basic cascadeflow usage - the simplest possible example.

    Walks through four steps: configure a two-tier cascade, run a mix of
    simple/moderate/complex queries through it, analyze token-based costs
    against an all-GPT-4o baseline, and summarize the takeaways.
    """
    # Check for the required API key before doing anything else.
    if not _require_api_key():
        return

    print("=" * 80)
    print("🌊 CASCADEFLOW - BASIC USAGE EXAMPLE")
    print("=" * 80)
    print()
    print("This example shows how cascadeflow automatically routes queries")
    print("between a cheap model (GPT-4o-mini) and expensive model (GPT-4o).")
    print()
    print("💡 Key Concept: cascadeflow uses TOKEN-BASED pricing, not flat rates.")
    print(" This means costs depend on how long your queries and responses are.")
    print()

    # ========================================================================
    # STEP 1: Configure Your Cascade
    # ========================================================================
    print("📋 Step 1: Configuring cascade with two models...")
    print()
    agent = _build_agent()
    print(" ✅ Tier 1: gpt-4o-mini (~$0.375/1M tokens) - Tries first")
    print(" ✅ Tier 2: gpt-4o (~$6.25/1M tokens) - Escalates if needed")
    print()

    # ========================================================================
    # STEP 2: Test with Different Query Types
    # ========================================================================
    print("📝 Step 2: Testing with various query types...\n")
    stats = {
        "gpt-4o-mini": {"count": 0, "cost": 0.0},
        "gpt-4o": {"count": 0, "cost": 0.0},
        "total_cost": 0.0,
        "draft_accepted": 0,
        "draft_rejected": 0,
        "direct_routing": 0,
    }
    all_gpt4_tokens = 0.0  # Estimated tokens, for the baseline calculation
    total_queries = len(TEST_QUERIES)
    for i, test in enumerate(TEST_QUERIES, 1):
        all_gpt4_tokens += await _run_single_query(agent, i, total_queries, test, stats)

    # ========================================================================
    # STEP 3: Show Cost Analysis
    # ========================================================================
    savings_pct = _print_cost_analysis(stats, total_queries, all_gpt4_tokens)

    # ========================================================================
    # STEP 4: Key Takeaways
    # ========================================================================
    print("=" * 80)
    print("🎯 KEY TAKEAWAYS")
    print("=" * 80)
    print()
    print("✅ What You Learned:")
    print(" 1. cascadeflow automatically routes queries by complexity")
    print(" 2. Simple queries use cheap models (GPT-4o-mini)")
    print(" 3. Complex queries escalate to expensive models (GPT-4o)")
    print(" 4. When draft is accepted, verifier is SKIPPED (saves cost!)")
    print(" 5. Token-based pricing means actual costs depend on query/response length")
    print(f" 6. You achieved {savings_pct:.1f}% savings on this query mix")
    print()
    print("🚀 Next Steps:")
    print(" • Try with your own queries")
    print(" • Adjust quality_threshold to tune cascade behavior")
    print(" • Add more models (Ollama for local, Groq for free)")
    print(" • Monitor your own query patterns and optimize")
    print(" • Deploy to production")
    print()
    print("📚 Resources:")
    print(" • Full Guide: docs/guides/quickstart.md")
    print(" • API Reference: docs/api/")
    print(" • GitHub: https://github.com/lemony-ai/cascadeflow")
    print()
    print("=" * 80)
if __name__ == "__main__":
    # Script entry point: run the async example to completion on a fresh
    # asyncio event loop.
    asyncio.run(main())