From 7feca566a9fa1522485cc190ee4fa19c0953d92a Mon Sep 17 00:00:00 2001 From: Enreign Date: Wed, 11 Mar 2026 20:29:51 +0100 Subject: [PATCH 1/2] Add token/cost estimation (Step 15) and 4 new PM tool integrations Token estimation computes per-task token consumption and optional API cost across economy/standard/premium model tiers. Four new tracker mappings (Asana, Azure DevOps, Zenhub, Shortcut) bring supported tools to 10. Bumps version to 0.4.0. Co-Authored-By: Claude Opus 4.6 --- README.md | 9 +- SKILL.md | 7 +- references/formulas.md | 87 +++++++++++++++++++- references/output-schema.md | 107 ++++++++++++++++++++++-- references/questionnaire.md | 9 ++ tests/test_formulas.py | 159 ++++++++++++++++++++++++++++++++++++ 6 files changed, 361 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3d5028c..1381ecd 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ Research-backed formulas. PERT statistics. Calibration feedback loops. Zero depe - Supports **single tasks or batches** (paste 5 issues or 500) - Produces **PERT expected values** with confidence bands, not just ranges - Separates **"expected"** from **"committed"** estimates at your chosen confidence level -- Outputs in formats ready for **Linear, JIRA, ClickUp, GitHub Issues, Monday, and GitLab** +- Estimates **token consumption and API cost** per model tier (economy/standard/premium) +- Outputs in formats ready for **Linear, JIRA, ClickUp, GitHub Issues, Monday, GitLab, Asana, Azure DevOps, Zenhub, and Shortcut** - Includes a **calibration system** to improve accuracy over time with actuals ## Quick Start @@ -247,7 +248,7 @@ Estimates can be output in two modes for any supported tracker: | **Embedded** (default) | Markdown table in description/body | None | | **Native** | Maps to tracker-specific fields | Custom fields | -**Supported:** Linear, JIRA, ClickUp, GitHub Issues, Monday, GitLab +**Supported:** Linear, JIRA, ClickUp, GitHub Issues, Monday, GitLab, Asana, Azure DevOps, 
Zenhub, Shortcut Embedded mode works everywhere immediately. Native mode requires custom fields for agent-specific metrics. @@ -343,7 +344,7 @@ Evaluation prompts per the [Claude Skills 2.0](https://claude.com/blog/improving | `eval-quick.md` | Quick path produces valid PERT output with minimal input | | `eval-hybrid.md` | Detailed path handles multi-team, confidence levels, org overhead | | `eval-batch.md` | Batch mode with mixed types, dependencies, and rollup | -| `eval-regression.md` | 6 baseline cases to detect drift after formula changes | +| `eval-regression.md` | 8 baseline cases to detect drift after formula changes | Run evals after any change to formulas, frameworks, or the skill workflow. @@ -354,7 +355,7 @@ Run evals after any change to formulas, frameworks, or the skill workflow. Contributions welcome — see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. Key areas: - **Calibration data** — Share anonymized estimated vs. actual results to improve default ratios -- **Tracker mappings** — Additional tracker support (Asana, Notion, Shortcut, etc.) +- **Tracker mappings** — Additional tracker support (Notion, Basecamp, etc.) - **Task types** — New multipliers for work categories not yet covered - **Formulas** — Improvements backed by data or research - **Evals** — Additional test cases, especially edge cases diff --git a/SKILL.md b/SKILL.md index 798da2a..a8f84bf 100644 --- a/SKILL.md +++ b/SKILL.md @@ -4,7 +4,7 @@ description: "Adapts to your team's working mode — human-only, hybrid, or agen license: MIT metadata: author: Enreign - version: "0.3.0" + version: "0.4.0" --- # Progressive Estimation @@ -131,7 +131,8 @@ The computation pipeline: 6. Apply cone of uncertainty spread to widen/narrow range 7. Compute PERT expected value and standard deviation 8. Apply confidence multiplier for committed estimate -9. Check anti-pattern guards and generate warnings +9. Compute token & cost estimates (Step 15) +10. 
Check anti-pattern guards and generate warnings If the user requests a standalone deterministic calculator, generate one from `formulas.md` in their preferred language. The generated script must: @@ -183,7 +184,7 @@ Then provide: - Tracker-formatted output (if requested) Ask which tracker and mode: -- **Tracker**: Linear, JIRA, ClickUp, GitHub Issues, Monday, GitLab, or generic +- **Tracker**: Linear, JIRA, ClickUp, GitHub Issues, Monday, GitLab, Asana, Azure DevOps, Zenhub, Shortcut, or generic - **Mode**: Native fields or embedded in description (default: embedded) For batch output, produce a summary table first, then rollup, then warnings, diff --git a/references/formulas.md b/references/formulas.md index 4d78d5c..9b02e3f 100644 --- a/references/formulas.md +++ b/references/formulas.md @@ -21,6 +21,8 @@ standalone calculator scripts in any language. | confidence_level | 50/80/90 | 80 | — | | definition_phase | concept/requirements/design/ready | ready | — | | org_size | solo-startup/growth/enterprise | solo-startup | — | +| model_tier | economy/standard/premium or specific model | standard | — | +| show_cost | boolean | false | — | ## Lookup Tables @@ -164,6 +166,52 @@ enterprise: 1.3 (formal review, compliance, multi-team coordination) Applied to human time only (planning, review, fix), not agent time. 
+### Tokens Per Round (thousands, by complexity × maturity) + +``` + S M L XL +exploratory: 8k 15k 25k 40k +partial: 6k 12k 20k 35k +mostly-automated: 5k 10k 18k 30k +``` + +### Output Token Ratio (by complexity) + +``` +S: 0.25 M: 0.28 L: 0.30 XL: 0.35 +``` + +### Model Pricing (per 1M tokens, USD — last verified March 2026) + +Representative models so users can pick the closest match: + +``` +Model Input Output Tier +───────────────────────────────────────────────────── +GPT-4o Mini $0.15 $0.60 economy +Gemini 2.5 Flash $0.30 $2.50 economy +Claude Haiku 4.5 $1.00 $5.00 economy +Gemini 2.5 Pro $1.25 $10.00 standard +GPT-4o $2.50 $10.00 standard +Claude Sonnet 4.6 $3.00 $15.00 standard +Claude Opus 4.6 $5.00 $25.00 premium +GPT-5 $1.25 $10.00 premium (capability, not price) +``` + +For the tier-based formula, use these representative rates: + +``` + Input Output +economy: $0.50 $2.50 (Haiku, GPT-4o-mini, Gemini Flash) +standard: $2.50 $12.00 (Sonnet, GPT-4o, Gemini 2.5 Pro) +premium: $5.00 $25.00 (Opus, GPT-5) +``` + +Note: "Premium" reflects capability tier (best available models), not +necessarily highest price. GPT-5 is premium-capability at standard pricing. +Pricing changes frequently — check provider pages before committing to +cost-based decisions. 
+ ## Formulas ### Step 1: Agent Rounds @@ -300,6 +348,29 @@ communication_overhead = 0.15 × (num_humans - 1) adjusted_human_time = adjusted_human_time × (1 + communication_overhead) ``` +### Step 15: Token & Cost Estimation + +``` +tokens_per_round = tokens_per_round_table[complexity][maturity] +output_ratio = output_token_ratio[complexity] + +total_tokens_min = adjusted_rounds_min × tokens_per_round × num_agents +total_tokens_max = adjusted_rounds_max × tokens_per_round × num_agents + +input_tokens_min = total_tokens_min × (1 - output_ratio) +input_tokens_max = total_tokens_max × (1 - output_ratio) +output_tokens_min = total_tokens_min × output_ratio +output_tokens_max = total_tokens_max × output_ratio + +token_midpoint = (total_tokens_min + total_tokens_max) / 2 +pert_expected_tokens = (total_tokens_min + 4 × token_midpoint + total_tokens_max) / 6 + +# Cost (only if show_cost == true) +cost_min = (input_tokens_min × input_price + output_tokens_min × output_price) / 1_000_000 +cost_max = (input_tokens_max × input_price + output_tokens_max × output_price) / 1_000_000 +pert_expected_cost = (cost_min + 4 × (cost_min + cost_max) / 2 + cost_max) / 6 +``` + ## Anti-Pattern Guards After computing estimates, check for these patterns and append warnings: @@ -427,7 +498,16 @@ Every estimation must produce these canonical fields: "humans": int, "agents": int }, - "story_points": int | null + "story_points": int | null, + "token_estimate": { + "total_tokens": { "min": int, "max": int }, + "input_tokens": { "min": int, "max": int }, + "output_tokens": { "min": int, "max": int }, + "pert_expected_tokens": int, + "model_tier": "economy" | "standard" | "premium", + "cost_usd": { "min": float, "max": float } | null, + "pert_expected_cost_usd": float | null + } } ``` @@ -444,7 +524,10 @@ For batch, wrap in: "critical_path": string[], "task_count": int, "size_distribution": { "S": int, "M": int, "L": int, "XL": int }, - "warnings": string[] + "warnings": string[], + "total_tokens": 
int, + "pert_expected_tokens": int, + "total_cost_usd": float | null } } ``` diff --git a/references/output-schema.md b/references/output-schema.md index 3a94b08..f1af76b 100644 --- a/references/output-schema.md +++ b/references/output-schema.md @@ -40,7 +40,7 @@ Output format adapts to the detected cooperation mode: Single task: ``` -Expected: ~4 hrs | Committed (80%): ~5.5 hrs | 10-26 agent rounds + 3 hrs human | Risk: medium | Size: M +Expected: ~4 hrs | Committed (80%): ~5.5 hrs | 10-26 agent rounds (~180k tokens) + 3 hrs human | Risk: medium | Size: M ``` Batch: @@ -90,6 +90,7 @@ Ask the user: "Native fields or embedded in description? (default: embedded)" | committed_hours | Custom field | "Committed Estimate (hrs)" | | confidence_level | Custom field | "Confidence %" | | priority | Priority | 1-4 mapping | +| token_estimate | Custom field | "Est. Tokens" | **Embedded:** ```markdown @@ -107,10 +108,17 @@ Ask the user: "Native fields or embedded in description? (default: embedded)" | **Expected (PERT)** | **~4 hrs** | | **Committed (80%)** | **~5.5 hrs** | | Confidence Band (68%) | 3.4-5.0 hrs | +| Token Estimate | ~180k tokens | +| Model Tier | standard | +| Est. Cost | ~$1.20 | | Risk | medium | | Team | 1 human, 1 agent | ``` +Token Estimate and Model Tier always appear in the breakdown table. +Est. Cost only appears if `show_cost == true`. +Cost does NOT appear in the one-line summary (too noisy). + ### Canonical → JIRA **Native:** @@ -126,6 +134,7 @@ Ask the user: "Native fields or embedded in description? (default: embedded)" | human_review_minutes | Custom field | number type | | pert_expected_hours | Custom field | "Expected Estimate (hrs)" | | labels | Labels | array | +| token_estimate | Custom field | "Est. Tokens" (number) | **Embedded:** Same markdown table in Description field. @@ -143,6 +152,7 @@ Ask the user: "Native fields or embedded in description? 
(default: embedded)" | agent_rounds | Custom field | number | | human_review_minutes | Custom field | number | | priority | Priority | 1-4 | +| token_estimate | Custom field | "Est. Tokens" (number) | **Embedded:** Same markdown table in Description field. @@ -160,6 +170,7 @@ Ask the user: "Native fields or embedded in description? (default: embedded)" | agent_rounds | Body section | no custom fields | | human_review_minutes | Body section | no custom fields | | labels | Labels | — | +| token_estimate | Body section | no custom fields | **Embedded:** Markdown table in issue Body. This is the recommended mode for GitHub Issues since it has no custom field support. @@ -180,6 +191,7 @@ for GitHub Issues since it has no custom field support. | human_review_minutes | Numbers column | "Review (min)" | | priority | Priority column | — | | labels | Tags column | — | +| token_estimate | Numbers column | "Est. Tokens" | **Embedded:** Markdown in Updates or Long Text column. @@ -198,22 +210,101 @@ for GitHub Issues since it has no custom field support. | agent_rounds | Description section | no custom fields in free tier | | human_review_minutes | Description section | — | | labels | Labels | scoped labels supported | +| token_estimate | Description section | no custom fields in free tier | **Embedded:** Markdown table in Description. Use `/estimate` quick action for time tracking integration. 
+### Canonical → Asana + +**Native:** +| Canonical Field | Asana Field | Notes | +|----------------|------------|-------| +| title | Task Name | — | +| complexity | Custom field (Dropdown) | "Size" — S/M/L/XL | +| committed_hours | Custom field (Number) | "Committed Estimate (hrs)" | +| pert_expected_hours | Custom field (Number) | "Expected (hrs)" | +| risk_level | Custom field (Dropdown) | "Risk" — low/medium/high | +| risk_notes | Description | appended | +| subtasks | Subtasks | native | +| agent_rounds | Custom field (Number) | "Agent Rounds" | +| human_review_minutes | Custom field (Number) | "Review (min)" | +| token_estimate | Custom field (Number) | "Est. Tokens" | + +**Embedded:** Markdown in Description. Quirks: custom fields are +project-scoped; time tracking is paid. + +### Canonical → Azure DevOps + +**Native:** +| Canonical Field | ADO Field | Notes | +|----------------|----------|-------| +| title | Title | — | +| complexity | Tags | `Size:M` | +| committed_hours | Original Estimate | hours (native) | +| pert_expected_hours | Custom field (Decimal) | "Expected Estimate (hrs)" | +| risk_level | Tags | `Risk:medium` | +| risk_notes | Description | HTML — use `` | +| subtasks | Child work items | parent-child link | +| agent_rounds | Custom field (Integer) | "Agent Rounds" | +| story_points | Story Points | native on User Story | +| token_estimate | Custom field (Integer) | "Est. Tokens" | + +**Embedded:** HTML table in Description (ADO uses HTML, not markdown). +Quirks: custom fields via Process customization; work item types matter +(User Story vs Task). 
+ +### Canonical → Zenhub + +**Native:** +| Canonical Field | Zenhub Field | Notes | +|----------------|-------------|-------| +| title | Issue Title | GitHub Issue title | +| complexity | Label | `size/M` (GitHub label) | +| committed_hours | Estimate | Zenhub story points field | +| pert_expected_hours | Body section | no custom fields | +| risk_level | Label | `risk/medium` (GitHub label) | +| risk_notes | Body | — | +| subtasks | Task list | `- [ ]` in body, or child issues | +| agent_rounds | Body section | no custom fields | +| story_points | Estimate | native Zenhub field (points) | +| token_estimate | Body section | no custom fields | + +**Embedded:** Markdown in GitHub Issue body (recommended). Quirks: Zenhub +layers on top of GitHub Issues — uses GitHub labels + body for most data; +Estimate field is points-only; Epics are cross-repo issue collections. + +### Canonical → Shortcut + +**Native:** +| Canonical Field | Shortcut Field | Notes | +|----------------|---------------|-------| +| title | Story Name | — | +| complexity | Label | `size:M` | +| committed_hours | Custom field (Number) | "Committed (hrs)" | +| pert_expected_hours | Custom field (Number) | "Expected (hrs)" | +| risk_level | Label | `risk:medium` | +| risk_notes | Description | markdown supported | +| subtasks | Tasks (within Story) | checklist-style | +| agent_rounds | Custom field (Number) | "Agent Rounds" | +| story_points | Estimate | native field (points) | +| token_estimate | Custom field (Number) | "Est. Tokens" | + +**Embedded:** Markdown in Description. Quirks: custom fields on Team plan+; +native Estimate is points not hours; Stories have Tasks (checklist items). 
+ ## Batch Output Format ### Summary Table (Always First) ``` -| # | Task | Size | Type | Rounds | Agent | Human | Expected | Committed (80%) | Risk | Deps | -|---|------|------|------|--------|-------|-------|----------|-----------------|------|------| -| 1 | Auth service | M | coding | 10-26 | 20-78m | 2-3h | ~4h | ~5.5h | med | — | -| 2 | Payment | L | coding | 26-65 | 52-195m | 4-8h | ~8h | ~11h | high | #1 | -| 3 | DB migration | L | data-mig | 26-65 | 52-195m | 4-8h | ~16h | ~22h | high | — | -|---|------|------|------|--------|-------|-------|----------|-----------------|------|------| -| | **Totals** | | | | | | **~28h** | **~38.5h** | | | +| # | Task | Size | Type | Rounds | Agent | Human | Tokens | Expected | Committed (80%) | Risk | Deps | +|---|------|------|------|--------|-------|-------|--------|----------|-----------------|------|------| +| 1 | Auth service | M | coding | 10-26 | 20-78m | 2-3h | ~180k | ~4h | ~5.5h | med | — | +| 2 | Payment | L | coding | 26-65 | 52-195m | 4-8h | ~520k | ~8h | ~11h | high | #1 | +| 3 | DB migration | L | data-mig | 26-65 | 52-195m | 4-8h | ~520k | ~16h | ~22h | high | — | +|---|------|------|------|--------|-------|-------|--------|----------|-----------------|------|------| +| | **Totals** | | | | | | **~1.2M** | **~28h** | **~38.5h** | | | ``` ### Rollup Block diff --git a/references/questionnaire.md b/references/questionnaire.md index a743629..de2ca6e 100644 --- a/references/questionnaire.md +++ b/references/questionnaire.md @@ -119,6 +119,14 @@ All quick-path questions, plus: - Enterprise (50+ people) — formal review, compliance, multi-team coordination (1.3x) 13. **Dependencies**: "Is this blocked by or blocking other tasks?" → dependency graph for sequencing +14. **Model & cost**: "Which model tier are you using, and do you want cost estimates?" 
+ → `model_tier`, `show_cost` + - Economy (Haiku, GPT-4o Mini, Gemini Flash) — cheapest + - Standard (Sonnet, GPT-4o, Gemini 2.5 Pro) — default + - Premium (Opus, GPT-5) — most capable + - Or name a specific model from the pricing table + - Show cost: yes/no (default: no) + - If user names a specific model, map to its tier for the formula ## Detailed Path — Batch @@ -159,6 +167,7 @@ User can mark overrides or approve the whole table at once. | Definition phase | spread_multiplier | ready (1.0x) | asked | | Organization context | org_overhead | solo-startup (1.0x) | asked | | Dependencies | sequencing | none | asked | +| Model & cost | model_tier, show_cost | standard, false | asked | ## Input Formats Accepted (Batch) diff --git a/tests/test_formulas.py b/tests/test_formulas.py index aa86d62..89f118f 100644 --- a/tests/test_formulas.py +++ b/tests/test_formulas.py @@ -193,6 +193,77 @@ def estimate( } +# ── Token Estimation ────────────────────────────────────────── + +TOKENS_PER_ROUND = { + "exploratory": {"S": 8000, "M": 15000, "L": 25000, "XL": 40000}, + "partial": {"S": 6000, "M": 12000, "L": 20000, "XL": 35000}, + "mostly-automated": {"S": 5000, "M": 10000, "L": 18000, "XL": 30000}, +} + +OUTPUT_TOKEN_RATIO = { + "S": 0.25, + "M": 0.28, + "L": 0.30, + "XL": 0.35, +} + +TIER_PRICING = { + "economy": {"input": 0.50, "output": 2.50}, + "standard": {"input": 2.50, "output": 12.00}, + "premium": {"input": 5.00, "output": 25.00}, +} + + +def estimate_tokens( + complexity, + maturity="partial", + num_agents=1, + model_tier="standard", + show_cost=False, + risk_coefficient=1.3, + domain_familiarity=1.0, +): + """Step 15: Token & cost estimation.""" + base_min, base_max = BASE_ROUNDS[complexity] + rounds_min = round(base_min * risk_coefficient * domain_familiarity) + rounds_max = round(base_max * risk_coefficient * domain_familiarity) + + tpr = TOKENS_PER_ROUND[maturity][complexity] + output_ratio = OUTPUT_TOKEN_RATIO[complexity] + + total_tokens_min = rounds_min * tpr * 
num_agents + total_tokens_max = rounds_max * tpr * num_agents + + input_tokens_min = total_tokens_min * (1 - output_ratio) + input_tokens_max = total_tokens_max * (1 - output_ratio) + output_tokens_min = total_tokens_min * output_ratio + output_tokens_max = total_tokens_max * output_ratio + + token_midpoint = (total_tokens_min + total_tokens_max) / 2 + pert_expected_tokens = (total_tokens_min + 4 * token_midpoint + total_tokens_max) / 6 + + result = { + "total_tokens": {"min": total_tokens_min, "max": total_tokens_max}, + "input_tokens": {"min": input_tokens_min, "max": input_tokens_max}, + "output_tokens": {"min": output_tokens_min, "max": output_tokens_max}, + "pert_expected_tokens": pert_expected_tokens, + "model_tier": model_tier, + "cost_usd": None, + "pert_expected_cost_usd": None, + } + + if show_cost: + pricing = TIER_PRICING[model_tier] + cost_min = (input_tokens_min * pricing["input"] + output_tokens_min * pricing["output"]) / 1_000_000 + cost_max = (input_tokens_max * pricing["input"] + output_tokens_max * pricing["output"]) / 1_000_000 + pert_expected_cost = (cost_min + 4 * (cost_min + cost_max) / 2 + cost_max) / 6 + result["cost_usd"] = {"min": cost_min, "max": cost_max} + result["pert_expected_cost_usd"] = pert_expected_cost + + return result + + # ── Regression Tests ─────────────────────────────────────────── class TestCase1TrivialS(unittest.TestCase): @@ -390,5 +461,93 @@ def test_committed_ratio(self): self.assertAlmostEqual(ratio, 1.8, places=2) +class TestCase7TokenMath(unittest.TestCase): + """Case 7: Token estimation math for M coding task, partial maturity.""" + + def setUp(self): + self.t = estimate_tokens( + complexity="M", maturity="partial", num_agents=1, + model_tier="standard", show_cost=True, + ) + + def test_total_tokens_range(self): + # M partial: 12k tokens/round, rounds 10-26 + # min = 10 * 12000 = 120000, max = 26 * 12000 = 312000 + self.assertEqual(self.t["total_tokens"]["min"], 120000) + 
self.assertEqual(self.t["total_tokens"]["max"], 312000)
+
+    def test_output_ratio(self):
+        # M output ratio = 0.28
+        self.assertAlmostEqual(
+            self.t["output_tokens"]["min"] / self.t["total_tokens"]["min"], 0.28
+        )
+
+    def test_input_output_sum(self):
+        self.assertAlmostEqual(
+            self.t["input_tokens"]["min"] + self.t["output_tokens"]["min"],
+            self.t["total_tokens"]["min"],
+        )
+        self.assertAlmostEqual(
+            self.t["input_tokens"]["max"] + self.t["output_tokens"]["max"],
+            self.t["total_tokens"]["max"],
+        )
+
+    def test_pert_between_min_max(self):
+        self.assertGreaterEqual(
+            self.t["pert_expected_tokens"], self.t["total_tokens"]["min"]
+        )
+        self.assertLessEqual(
+            self.t["pert_expected_tokens"], self.t["total_tokens"]["max"]
+        )
+
+    def test_cost_present(self):
+        self.assertIsNotNone(self.t["cost_usd"])
+        self.assertIsNotNone(self.t["pert_expected_cost_usd"])
+
+    def test_cost_min_less_than_max(self):
+        self.assertLess(self.t["cost_usd"]["min"], self.t["cost_usd"]["max"])
+
+    def test_cost_math(self):
+        # Standard tier: input=$2.50/M, output=$12.00/M
+        # min: input=120000*0.72=86400, output=120000*0.28=33600
+        # cost_min = (86400*2.50 + 33600*12.00) / 1_000_000
+        expected_cost_min = (86400 * 2.50 + 33600 * 12.00) / 1_000_000
+        self.assertAlmostEqual(self.t["cost_usd"]["min"], expected_cost_min, places=4)
+
+
+class TestCase8TokenScaling(unittest.TestCase):
+    """Case 8: Token scaling — XL > L > M > S, multi-agent multiplies."""
+
+    def test_scaling_by_complexity(self):
+        sizes = ["S", "M", "L", "XL"]
+        tokens = []
+        for s in sizes:
+            t = estimate_tokens(complexity=s, maturity="partial")
+            tokens.append(t["total_tokens"]["min"])
+        for i in range(len(tokens) - 1):
+            self.assertLess(tokens[i], tokens[i + 1])
+
+    def test_multi_agent_multiplier(self):
+        t1 = estimate_tokens(complexity="M", maturity="partial", num_agents=1)
+        t3 = estimate_tokens(complexity="M", maturity="partial", num_agents=3)
+        self.assertEqual(
+            t3["total_tokens"]["min"], t1["total_tokens"]["min"] * 3
+        )
+
+    
def test_no_cost_by_default(self): + t = estimate_tokens(complexity="M", maturity="partial", show_cost=False) + self.assertIsNone(t["cost_usd"]) + self.assertIsNone(t["pert_expected_cost_usd"]) + + def test_premium_more_expensive_than_economy(self): + te = estimate_tokens( + complexity="M", maturity="partial", model_tier="economy", show_cost=True + ) + tp = estimate_tokens( + complexity="M", maturity="partial", model_tier="premium", show_cost=True + ) + self.assertLess(te["cost_usd"]["max"], tp["cost_usd"]["max"]) + + if __name__ == "__main__": unittest.main() From be889b272842a11aaff21d6107a2ce0bfc1f5a92 Mon Sep 17 00:00:00 2001 From: Enreign Date: Wed, 11 Mar 2026 20:39:33 +0100 Subject: [PATCH 2/2] Integrate token estimation into main pipeline and expand test coverage Wire estimate_tokens() into estimate() as Step 15 so the core estimator returns token_estimate in its output. Add TestCase9 for integration verification, maturity variation test, and PERT cost formula test. 44 tests now pass (was 36). Co-Authored-By: Claude Opus 4.6 --- tests/test_formulas.py | 81 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/tests/test_formulas.py b/tests/test_formulas.py index 89f118f..875101f 100644 --- a/tests/test_formulas.py +++ b/tests/test_formulas.py @@ -89,8 +89,10 @@ def estimate( confidence_level=80, definition_phase="ready", org_size="solo-startup", + model_tier="standard", + show_cost=False, ): - """Run the full 14-step estimation pipeline. Returns dict with all fields.""" + """Run the full 15-step estimation pipeline. 
Returns dict with all fields.""" # Step 1: Agent Rounds base_min, base_max = BASE_ROUNDS[complexity] @@ -167,6 +169,17 @@ def estimate( human_time_min = human_time_min / num_humans * (1 + communication_overhead) human_time_max = human_time_max / num_humans * (1 + communication_overhead) + # Step 15: Token & Cost Estimation + token_est = estimate_tokens( + complexity=complexity, + maturity=maturity, + num_agents=num_agents, + model_tier=model_tier, + show_cost=show_cost, + risk_coefficient=risk_coefficient, + domain_familiarity=domain_familiarity, + ) + return { "complexity": complexity, "task_type": task_type, @@ -190,6 +203,7 @@ def estimate( "committed_hours": {"min": committed_min / 60, "max": committed_max / 60}, "spread_multiplier": spread, "definition_phase": definition_phase, + "token_estimate": token_est, } @@ -548,6 +562,71 @@ def test_premium_more_expensive_than_economy(self): ) self.assertLess(te["cost_usd"]["max"], tp["cost_usd"]["max"]) + def test_maturity_variation(self): + """Exploratory should produce more tokens than mostly-automated.""" + t_exp = estimate_tokens(complexity="M", maturity="exploratory") + t_auto = estimate_tokens(complexity="M", maturity="mostly-automated") + self.assertGreater( + t_exp["total_tokens"]["min"], t_auto["total_tokens"]["min"] + ) + self.assertGreater( + t_exp["total_tokens"]["max"], t_auto["total_tokens"]["max"] + ) + + def test_pert_expected_cost_math(self): + """PERT expected cost = (cost_min + 4*midpoint + cost_max) / 6.""" + t = estimate_tokens( + complexity="M", maturity="partial", model_tier="standard", show_cost=True + ) + cost_min = t["cost_usd"]["min"] + cost_max = t["cost_usd"]["max"] + expected_pert_cost = (cost_min + 4 * (cost_min + cost_max) / 2 + cost_max) / 6 + self.assertAlmostEqual( + t["pert_expected_cost_usd"], expected_pert_cost, places=6 + ) + + +class TestCase9TokenIntegration(unittest.TestCase): + """Case 9: Token estimate is integrated into the main estimate() pipeline.""" + + def setUp(self): + 
self.r = estimate( + complexity="M", task_type="coding", maturity="partial", + model_tier="standard", show_cost=True, + ) + + def test_token_estimate_present(self): + self.assertIn("token_estimate", self.r) + + def test_token_estimate_structure(self): + te = self.r["token_estimate"] + self.assertIn("total_tokens", te) + self.assertIn("input_tokens", te) + self.assertIn("output_tokens", te) + self.assertIn("pert_expected_tokens", te) + self.assertIn("model_tier", te) + self.assertIn("cost_usd", te) + self.assertIn("pert_expected_cost_usd", te) + + def test_token_estimate_uses_same_rounds(self): + """Token estimate should use the same adjusted rounds as the main estimate.""" + te = self.r["token_estimate"] + rounds_min = self.r["agent_rounds"]["min"] + rounds_max = self.r["agent_rounds"]["max"] + tpr = TOKENS_PER_ROUND["partial"]["M"] + self.assertEqual(te["total_tokens"]["min"], rounds_min * tpr) + self.assertEqual(te["total_tokens"]["max"], rounds_max * tpr) + + def test_model_tier_passthrough(self): + self.assertEqual(self.r["token_estimate"]["model_tier"], "standard") + + def test_cost_present_when_show_cost(self): + self.assertIsNotNone(self.r["token_estimate"]["cost_usd"]) + + def test_cost_absent_when_not_show_cost(self): + r = estimate(complexity="M", show_cost=False) + self.assertIsNone(r["token_estimate"]["cost_usd"]) + if __name__ == "__main__": unittest.main()