-
Notifications
You must be signed in to change notification settings - Fork 96
Expand file tree
/
Copy pathbasic_usage.py
More file actions
391 lines (332 loc) · 15.2 KB
/
basic_usage.py
File metadata and controls
391 lines (332 loc) · 15.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
"""
cascadeflow - Basic Usage Example
The simplest way to get started with cascadeflow. This example demonstrates:
- Setting up a two-tier cascade (cheap → expensive)
- Processing queries with automatic quality-based routing
- Cost tracking and savings calculation
- Different complexity levels (simple → complex queries)
Requirements:
- cascadeflow[all]
- OpenAI API key
Setup:
pip install cascadeflow[all]
export OPENAI_API_KEY="your-key-here"
python examples/basic_usage.py
What You'll Learn:
1. How to configure a basic cascade
2. How cascadeflow automatically routes queries
3. How to track costs and savings
4. How different query complexities are handled
Expected Output:
- Simple queries: GPT-4o-mini draft accepted, GPT-4o skipped
- Complex queries: Direct to GPT-4o OR draft rejected and escalated
- Token-based cost comparison showing realistic 40-60% savings
Note on Costs:
Costs are calculated using actual token-based pricing from OpenAI:
- GPT-4o-mini: ~$0.000375 per 1K tokens (blended input/output)
- GPT-4o: ~$0.0025 per 1K tokens (blended input/output)
Savings depend on your query mix and response lengths.
Note on Latency:
95% of latency comes from provider API calls, NOT from cascadeflow!
- Provider API: 95% (waiting for OpenAI/Anthropic/etc to respond)
- cascadeflow overhead: 5% (routing, quality checks, etc.)
To reduce latency:
1. Choose faster providers (Groq is 5-10x faster than OpenAI)
2. Use streaming for perceived speed improvement
3. Don't worry about cascade overhead (it's minimal)
Documentation:
For complete setup instructions and detailed explanations, see:
docs/guides/quickstart.md
"""
import asyncio
import os
from cascadeflow import CascadeAgent, ModelConfig
# ---------------------------------------------------------------------------
# Demo tunables (hoisted from inline magic numbers so they are easy to tweak)
# ---------------------------------------------------------------------------

# Rough words -> tokens conversion factor (~1.3 tokens per English word).
# Used only for the all-GPT-4o baseline cost estimate.
_TOKENS_PER_WORD = 1.3

# GPT-4o blended input/output price per 1K tokens, used to estimate what the
# same token volume would cost if every query went straight to GPT-4o.
_GPT4O_COST_PER_1K = 0.00625

# Test queries ranging from simple to complex.  "expected" and "reason" are
# informational only -- they are printed for the reader and never compared
# against the cascade's actual routing decision.
TEST_QUERIES = [
    # SIMPLE queries - should stay on GPT-4o-mini
    {
        "query": "What color is the sky?",
        "expected": "gpt-4o-mini",
        "reason": "Simple factual question - cheap model handles easily",
    },
    {
        "query": "What's the capital of France?",
        "expected": "gpt-4o-mini",
        "reason": "Simple factual - cheap model knows this",
    },
    {
        "query": "Translate 'hello' to Spanish",
        "expected": "gpt-4o-mini",
        "reason": "Simple translation - cheap model sufficient",
    },
    # MODERATE queries - might escalate
    {
        "query": "Explain the difference between lists and tuples in Python",
        "expected": "gpt-4o-mini",
        "reason": "Moderate complexity - cheap model likely handles it",
    },
    {
        "query": "Write a function to reverse a string in Python",
        "expected": "gpt-4o-mini",
        "reason": "Standard coding task - cheap model can do it",
    },
    # COMPLEX queries - likely escalate to GPT-4o
    {
        "query": "Explain quantum entanglement and its implications for quantum computing in detail",
        "expected": "gpt-4o",
        "reason": "Complex scientific topic - needs better model",
    },
    {
        "query": "Design a microservices architecture for a large-scale e-commerce platform with high availability",
        "expected": "gpt-4o",
        "reason": "Complex architecture design - benefits from GPT-4o",
    },
    {
        "query": "Analyze the philosophical implications of consciousness and free will in the context of determinism",
        "expected": "gpt-4o",
        "reason": "Deep philosophical analysis - needs sophisticated reasoning",
    },
]


def _require_api_key() -> bool:
    """Return True when OPENAI_API_KEY is set; otherwise print setup help."""
    if os.getenv("OPENAI_API_KEY"):
        return True
    print("\n❌ Error: OPENAI_API_KEY environment variable not set")
    print("\nThis example requires an OpenAI API key to run.")
    print("\nSetup:")
    print(" 1. Get your API key from: https://platform.openai.com/api-keys")
    print(" 2. Set the environment variable:")
    print(" export OPENAI_API_KEY='sk-...'")
    print(" 3. Run this example:")
    print(" python examples/basic_usage.py")
    return False


def _build_agent() -> "CascadeAgent":
    """Configure the two-tier cascade: cheap drafter first, GPT-4o verifier."""
    return CascadeAgent(
        models=[
            # Cheap model - tries first
            ModelConfig(
                name="gpt-4o-mini",
                provider="openai",
                cost=0.000375,  # $0.375 per 1M tokens (blended estimate)
                quality_threshold=0.7,  # Accept if confidence >= 70%
            ),
            # Expensive model - only if needed
            ModelConfig(
                name="gpt-4o",
                provider="openai",
                cost=0.00625,  # $6.25 per 1M tokens (blended estimate)
                quality_threshold=0.95,  # Very high quality
            ),
        ]
    )


def _report_result(result, model_used: str) -> None:
    """Pretty-print one cascade result: model(s) used, cost, latency, status.

    All fields are read defensively with getattr because result objects from
    different cascade paths may not carry every attribute.
    """
    tier = "Tier 1 (Cheap)" if model_used == "gpt-4o-mini" else "Tier 2 (Expensive)"
    icon = "💚" if model_used == "gpt-4o-mini" else "💛"
    print("✅ Result:")

    cascaded = getattr(result, "cascaded", False)
    draft_accepted = getattr(result, "draft_accepted", False)

    # Show actual model(s) used with clear status.
    if draft_accepted:
        # Only the draft model was used.
        print(f" {icon} Model Used: gpt-4o-mini only ({tier})")
    elif cascaded and not getattr(result, "draft_accepted", True):
        # Draft was explicitly rejected: both models were used.
        print(" 💚💛 Models Used: gpt-4o-mini + gpt-4o (Both Tiers)")
    else:
        # Direct routing (no cascade attempted).
        print(f" {icon} Model Used: {result.model_used} ({tier})")

    # Cost (defaults to 0.0 if the field is missing).
    cost = getattr(result, "total_cost", 0.0)
    print(f" 💰 Cost: ${cost:.6f}")

    # Latency breakdown - use the library-provided fields.
    total_latency = getattr(result, "latency_ms", 0.0)
    draft_latency = getattr(result, "draft_latency_ms", 0.0)
    verifier_latency = getattr(result, "verifier_latency_ms", 0.0)
    # cascade_overhead_ms is computed by the library:
    # - Draft accepted: 0ms (we saved verifier time)
    # - Draft rejected: full draft_latency_ms (wasted drafter attempt)
    # - Direct route: 0ms (no cascade)
    cascade_overhead = getattr(result, "cascade_overhead_ms", 0.0)

    print(" ⚡ Latency Breakdown:")
    print(f" Total: {total_latency:.0f}ms")
    if cascaded and not draft_accepted:
        # Draft was rejected - drafter time was wasted.
        print(f" ├─ Drafter (wasted): {draft_latency:.0f}ms")
        print(f" └─ Verifier: {verifier_latency:.0f}ms")
        print(f" ⚠️ Cascade overhead: +{cascade_overhead:.0f}ms (drafter was rejected)")
    elif cascaded and draft_accepted:
        # Draft was accepted - we saved the verifier time.
        print(f" └─ Drafter only: {draft_latency:.0f}ms")
        print(" ✅ Cascade overhead: 0ms (verifier skipped)")
    else:
        # Direct route - no cascade overhead.
        print(f" └─ Provider API: {total_latency:.0f}ms (direct route)")
        print(" ✅ Cascade overhead: 0ms (direct route)")

    print(f" 📊 Complexity: {getattr(result, 'complexity', 'unknown')}")

    # Cascade status, stated plainly.
    if cascaded:
        if draft_accepted:
            print(" ✅ Draft Accepted: GPT-4o-mini response passed quality check")
            print(" 💡 Verifier Skipped: GPT-4o was not called (cost saved!)")
        else:
            print(" ❌ Draft Rejected: Quality check failed, escalated to GPT-4o")
            print(" 💸 Both Models Used: Paid for GPT-4o-mini + GPT-4o")
    else:
        print(" 🎯 Direct Route: Query sent directly to GPT-4o (no cascade)")

    # First 100 characters of the response, newlines flattened.
    # BUGFIX: read ``content`` via getattr so a result without it cannot
    # crash here (the token-estimate path already treats it as optional).
    response_preview = getattr(result, "content", "")[:100].replace("\n", " ")
    print(f" 📝 Response: {response_preview}...")
    print()


async def _run_single_query(agent, index: int, total: int, test: dict, stats: dict) -> float:
    """Run one test query through the cascade and report on it.

    Updates *stats* in place and returns the estimated token count
    (query + response) consumed, for the all-GPT-4o baseline estimate.
    """
    print(f"{'─' * 80}")
    print(f"Query {index}/{total}")
    print(f"{'─' * 80}")
    print(f"❓ Question: {test['query']}")
    print(f"🎯 Expected: {test['expected']}")
    print(f"💡 Why: {test['reason']}")
    print()

    # Run the query through the cascade.
    result = await agent.run(test["query"], max_tokens=150)

    # Normalize the reported model name to one of our two tiers.
    model_used = "gpt-4o-mini" if "4o-mini" in result.model_used.lower() else "gpt-4o"

    # Update per-model and aggregate cost statistics.
    stats[model_used]["count"] += 1
    stats[model_used]["cost"] += result.total_cost
    stats["total_cost"] += result.total_cost

    # Track how the cascade behaved for this query.
    if getattr(result, "cascaded", False):
        if getattr(result, "draft_accepted", False):
            stats["draft_accepted"] += 1
        else:
            stats["draft_rejected"] += 1
    else:
        stats["direct_routing"] += 1

    _report_result(result, model_used)

    # Estimate tokens for the baseline (approximate ~1.3 tokens per word).
    query_tokens = len(test["query"].split()) * _TOKENS_PER_WORD
    content = getattr(result, "content", None)
    if content is not None:
        response_tokens = len(content.split()) * _TOKENS_PER_WORD
    else:
        response_tokens = 100  # Default estimate when no content is available
    return query_tokens + response_tokens


def _print_cost_analysis(stats: dict, total_queries: int, all_gpt4_tokens: float) -> float:
    """Print the cost-analysis section and return the savings percentage."""
    print("=" * 80)
    print("💰 COST ANALYSIS")
    print("=" * 80)
    print()

    gpt4mini_count = stats["gpt-4o-mini"]["count"]
    gpt4o_count = stats["gpt-4o"]["count"]
    gpt4mini_pct = (gpt4mini_count / total_queries) * 100
    gpt4o_pct = (gpt4o_count / total_queries) * 100

    print("📊 Query Distribution:")
    print(f" GPT-4o-mini: {gpt4mini_count}/{total_queries} ({gpt4mini_pct:.0f}%)")
    print(f" GPT-4o: {gpt4o_count}/{total_queries} ({gpt4o_pct:.0f}%)")
    print()
    print("🔄 Cascade Behavior:")
    print(f" Draft Accepted: {stats['draft_accepted']} (verifier skipped)")
    print(f" Draft Rejected: {stats['draft_rejected']} (both models used)")
    print(f" Direct Routing: {stats['direct_routing']} (no cascade)")
    print()
    print("💵 Cost Breakdown:")
    print(f" GPT-4o-mini: ${stats['gpt-4o-mini']['cost']:.6f}")
    print(f" GPT-4o: ${stats['gpt-4o']['cost']:.6f}")
    print(f" Total Cost: ${stats['total_cost']:.6f}")
    print()

    # Baseline: what the same token volume would have cost on GPT-4o alone.
    all_gpt4o_cost = (all_gpt4_tokens / 1000) * _GPT4O_COST_PER_1K
    savings = all_gpt4o_cost - stats["total_cost"]
    # Guard against a zero baseline (e.g. every query failed to respond).
    savings_pct = (savings / all_gpt4o_cost * 100) if all_gpt4o_cost > 0 else 0.0

    print("💎 Savings Compared to All-GPT-4o (Token-Based):")
    print(f" All-GPT-4o Estimate: ${all_gpt4o_cost:.6f}")
    print(f" cascadeflow Cost: ${stats['total_cost']:.6f}")
    print(f" 💰 SAVINGS: ${savings:.6f} ({savings_pct:.1f}%)")
    print()
    print(f" ℹ️ Note: Savings based on actual token usage (~{int(all_gpt4_tokens)} tokens)")
    print(" Your savings will vary based on query complexity and response length.")
    print()

    # Extrapolate to a realistic monthly scale.
    print("📈 Extrapolated to 10,000 Queries/Month:")
    if all_gpt4_tokens > 0:
        scale_factor = 10_000 / total_queries
        monthly_cascade = stats["total_cost"] * scale_factor
        monthly_gpt4o = all_gpt4o_cost * scale_factor
        monthly_savings = monthly_gpt4o - monthly_cascade
        print(f" All-GPT-4o: ${monthly_gpt4o:,.2f}/month")
        print(f" cascadeflow: ${monthly_cascade:,.2f}/month")
        print(f" 💵 SAVE: ${monthly_savings:,.2f}/month")
    print()
    return savings_pct


async def main():
    """
    Basic cascadeflow usage - the simplest possible example.

    Walks through four steps: configure a two-tier cascade, run a mix of
    simple/moderate/complex queries through it, analyze token-based costs
    against an all-GPT-4o baseline, and summarize the takeaways.
    """
    # Check for the required API key before doing anything else.
    if not _require_api_key():
        return

    print("=" * 80)
    print("🌊 CASCADEFLOW - BASIC USAGE EXAMPLE")
    print("=" * 80)
    print()
    print("This example shows how cascadeflow automatically routes queries")
    print("between a cheap model (GPT-4o-mini) and expensive model (GPT-4o).")
    print()
    print("💡 Key Concept: cascadeflow uses TOKEN-BASED pricing, not flat rates.")
    print(" This means costs depend on how long your queries and responses are.")
    print()

    # ========================================================================
    # STEP 1: Configure Your Cascade
    # ========================================================================
    print("📋 Step 1: Configuring cascade with two models...")
    print()
    agent = _build_agent()
    print(" ✅ Tier 1: gpt-4o-mini (~$0.375/1M tokens) - Tries first")
    print(" ✅ Tier 2: gpt-4o (~$6.25/1M tokens) - Escalates if needed")
    print()

    # ========================================================================
    # STEP 2: Test with Different Query Types
    # ========================================================================
    print("📝 Step 2: Testing with various query types...\n")
    stats = {
        "gpt-4o-mini": {"count": 0, "cost": 0.0},
        "gpt-4o": {"count": 0, "cost": 0.0},
        "total_cost": 0.0,
        "draft_accepted": 0,
        "draft_rejected": 0,
        "direct_routing": 0,
    }
    all_gpt4_tokens = 0.0  # Estimated tokens, for the baseline calculation
    total_queries = len(TEST_QUERIES)
    for i, test in enumerate(TEST_QUERIES, 1):
        all_gpt4_tokens += await _run_single_query(agent, i, total_queries, test, stats)

    # ========================================================================
    # STEP 3: Show Cost Analysis
    # ========================================================================
    savings_pct = _print_cost_analysis(stats, total_queries, all_gpt4_tokens)

    # ========================================================================
    # STEP 4: Key Takeaways
    # ========================================================================
    print("=" * 80)
    print("🎯 KEY TAKEAWAYS")
    print("=" * 80)
    print()
    print("✅ What You Learned:")
    print(" 1. cascadeflow automatically routes queries by complexity")
    print(" 2. Simple queries use cheap models (GPT-4o-mini)")
    print(" 3. Complex queries escalate to expensive models (GPT-4o)")
    print(" 4. When draft is accepted, verifier is SKIPPED (saves cost!)")
    print(" 5. Token-based pricing means actual costs depend on query/response length")
    print(f" 6. You achieved {savings_pct:.1f}% savings on this query mix")
    print()
    print("🚀 Next Steps:")
    print(" • Try with your own queries")
    print(" • Adjust quality_threshold to tune cascade behavior")
    print(" • Add more models (Ollama for local, Groq for free)")
    print(" • Monitor your own query patterns and optimize")
    print(" • Deploy to production")
    print()
    print("📚 Resources:")
    print(" • Full Guide: docs/guides/quickstart.md")
    print(" • API Reference: docs/api/")
    print(" • GitHub: https://github.com/lemony-ai/cascadeflow")
    print()
    print("=" * 80)
if __name__ == "__main__":
    # Script entry point: run the async example to completion on a fresh
    # asyncio event loop.
    asyncio.run(main())