From f55ac7e5be18e0f3edd2295642578a9f0ba0c955 Mon Sep 17 00:00:00 2001 From: Michael Sitarzewski Date: Mon, 9 Mar 2026 15:22:22 -0500 Subject: [PATCH] Add follow-up questions, revision citations, CLI persistence, provider updates Follow-up questions: generate_followups() uses cheapest model with JSON mode to suggest 3 post-consensus follow-up questions. Stored in followups_json on Thread model, displayed as clickable items in ConsensusNav/ThreadNav sidebar. _run_consensus returns 8-tuple (added followups), all callers updated. Revision citations: handle_revise() now accepts tool_registry + web_search, extracts citations. revision_citations on ConsensusContext + RoundResult, persisted to DB, included in Sources nav. CLI persistence: persist_consensus() saves full round history to DB from CLI. Top-level --rounds/--challengers cascade to subcommands. _parse_challengers() accepts int count or comma-separated model refs. Calibration date filters: category + since/until inputs on CalibrationDashboard. OpenAI: reasoning_effort high for GPT-5.x (no tools), gpt-5.2 catalog entry. Perplexity: retry for APIConnectionError (2 attempts, 1s delay). Alembic: DUH_DATABASE_URL env var overrides alembic.ini. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 3 + alembic/env.py | 14 +- memory-bank/activeContext.md | 119 ++++---- memory-bank/progress.md | 47 +++- memory-bank/tasks/2026-03/README.md | 32 +++ memory-bank/toc.md | 2 +- src/duh/api/routes/ask.py | 2 + src/duh/api/routes/threads.py | 12 + src/duh/api/routes/ws.py | 41 ++- src/duh/cli/app.py | 253 ++++++++++++++++-- src/duh/consensus/handlers.py | 96 ++++++- src/duh/consensus/machine.py | 5 + src/duh/mcp/server.py | 1 + src/duh/memory/migrations.py | 5 + src/duh/memory/models.py | 3 + src/duh/providers/catalog.py | 1 + src/duh/providers/openai.py | 17 ++ src/duh/providers/perplexity.py | 36 ++- tests/unit/test_cli.py | 8 +- tests/unit/test_cli_batch.py | 15 +- tests/unit/test_cli_display.py | 142 ++++++++++ tests/unit/test_cli_tools.py | 10 +- tests/unit/test_cli_voting.py | 2 +- tests/unit/test_mcp_server.py | 2 +- tests/unit/test_providers_openai.py | 4 +- web/src/__tests__/stores.test.ts | 45 ++++ web/src/api/types.ts | 2 + .../calibration/CalibrationDashboard.tsx | 79 +++++- web/src/components/consensus/ConsensusNav.tsx | 31 ++- .../components/consensus/ConsensusPanel.tsx | 1 + web/src/components/threads/ThreadNav.tsx | 35 +++ web/src/stores/calibration.ts | 20 +- web/src/stores/consensus.ts | 6 + 33 files changed, 962 insertions(+), 129 deletions(-) diff --git a/.gitignore b/.gitignore index ca4bd90..1952a59 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ memory-bank/setup.md web/node_modules/ web/dist/ +# npm wrapper +npm/like-duh/node_modules/ + diff --git a/alembic/env.py b/alembic/env.py index b24a771..7c71480 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -35,9 +35,17 @@ def _expand_url(section: dict[str, str]) -> dict[str, str]: return section +def _resolve_url() -> str: + """Return database URL from env var, falling back to alembic.ini.""" + env_url = os.environ.get("DUH_DATABASE_URL") + if env_url: + return env_url + return config.get_main_option("sqlalchemy.url") or "" + + def run_migrations_offline() -> None: """Run migrations in 'offline' mode.""" - url = config.get_main_option("sqlalchemy.url") + url = _resolve_url() context.configure( url=url, target_metadata=target_metadata, @@ -59,6 +67,8 @@ def do_run_migrations(connection) -> None: # type: ignore[no-untyped-def] async def run_async_migrations() -> None: """Run migrations in 'online' mode with async engine.""" section = _expand_url(config.get_section(config.config_ini_section, {})) + section["sqlalchemy.url"] = _resolve_url() + section = _expand_url(section) connectable = async_engine_from_config( section, prefix="sqlalchemy.", @@ -74,6 +84,8 @@ async def run_async_migrations() -> None: def run_migrations_online() -> None: """Run migrations in 'online' mode (sync or async).""" section = _expand_url(config.get_section(config.config_ini_section, {})) + section["sqlalchemy.url"] = _resolve_url() + section = _expand_url(section) url = section.get("sqlalchemy.url", "") if _is_async_url(url): diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md index 15e18e7..2dd36b4 100644 --- a/memory-bank/activeContext.md +++ b/memory-bank/activeContext.md @@ -1,79 +1,84 @@ # Active Context -**Last Updated**: 2026-03-08 -**Current Phase**: `question-refinement` branch — pre-consensus question refinement, native web search, citations, tools-by-default -**Next Action**: Branch in progress, uncommitted changes staged +**Last Updated**: 2026-03-09 +**Current Phase**: Post PR #14 merge — follow-up questions, revision citations, CLI persistence, calibration filters, provider updates +**Next Action**: Commit and push uncommitted work to new branch -## Latest Work (2026-03-08) +## Latest Work (2026-03-09) -### Question Refinement -- Pre-consensus clarification step: analyze question → ask clarifying questions → enrich with answers → proceed to consensus -- `src/duh/consensus/refine.py` — `analyze_question()` + `enrich_question()`, uses MOST EXPENSIVE model (not cheapest) -- API: `POST /api/refine` → `RefineResponse{needs_refinement, questions[]}`, `POST /api/enrich` → `EnrichResponse{enriched_question}` -- CLI: `duh ask --refine "question"` — interactive `click.prompt()` loop, default `--no-refine` -- Frontend: consensus store `'refining'` status, `submitQuestion` → refine → clarify → enrich → `startConsensus` -- `RefinementPanel.tsx` — tabbed UI inside GlassPanel, checkmarks on answered tabs, Skip + Start Consensus buttons -- Graceful fallback: any failure → proceed to consensus with original question +### Follow-up Questions (new end-to-end feature) +- `generate_followups()` in `src/duh/consensus/handlers.py:930` — uses cheapest model with JSON mode to suggest 3 follow-up questions after consensus completes +- Prompt asks for different angles: deeper technical detail, practical implications, risks/edge cases, related decisions +- `followups` field added to `ConsensusContext` in `machine.py` +- `_run_consensus` returns 8-tuple now (was 7): `(decision, confidence, rigor, dissent, cost, overview, citations, followups)` +- All callers updated: CLI ask, CLI auto, CLI decompose, CLI batch, REST API, WebSocket, MCP server +- **Persistence**: `followups_json` TEXT column on Thread model + SQLite auto-migration in `ensure_schema()` +- **Thread detail API**: returns `followups` parsed from `followups_json` +- **WebSocket**: sends `followups` in `complete` event, persists via `_persist_consensus` +- **Frontend**: `ConsensusNav` + `ThreadNav` show clickable follow-ups in Disclosure section + - Clicking a follow-up calls `submitQuestion()` to start a new consensus + - `consensus.ts` store: `followups` state, included in reset + - `types.ts`: `followups` on `ThreadDetail` and `WSComplete` -### Native Provider Web Search -- Providers use server-side search instead of DDG proxy when `config.tools.web_search.native` is true -- `web_search: bool` param added to `ModelProvider.send()` protocol -- Anthropic: `web_search_20250305` server tool in tools[] -- Google: `GoogleSearch()` grounding (replaces function tools — can't coexist) -- Mistral: `{"type": "web_search"}` appended to tools -- OpenAI: `web_search_options={}` only for `_SEARCH_MODELS` set; others fall back to DDG -- Perplexity: no-op (always searches natively) -- `tool_augmented_send`: filters DDG `web_search` tool when native=True, passes flag to provider +### Revision Citations (enhancement to existing citation system) +- `revision_citations` field added to both `ConsensusContext` and `RoundResult` in `machine.py` +- `handle_revise()` now accepts `tool_registry` + `web_search` params — enables tool-augmented revision with web search +- `handle_revise()` extracts citations from response into `ctx.revision_citations` +- `handle_propose()` now extracts `proposal_citations` directly in handler (moved from ws.py) +- WebSocket sends revision citations in REVISE `phase_complete` event +- `_persist_consensus` saves revision citations to DB as `citations_json` on reviser contribution +- `ConsensusPanel.tsx` passes `revisionCitations` to REVISE phase card +- `ConsensusNav.tsx` includes revision citations in Sources section (role: 'revise') +- `_run_consensus` citation collection now includes revision citations from both round history and current round -### Citations — Persisted + Domain-Grouped -- `Citation` dataclass (url, title, snippet) on `ModelResponse.citations` -- Extraction per provider: Anthropic (`web_search_tool_result`), Google (grounding metadata), Perplexity (`response.citations`) -- **Persistence**: `citations_json` TEXT column on `Contribution` model, SQLite auto-migration via `ensure_schema()` -- `proposal_citations` tracked on `ConsensusContext` → archived to `RoundResult` → persisted via `_persist_consensus` -- Thread detail API returns `citations` on `ContributionResponse` -- **Domain-grouped Sources nav**: ConsensusNav (live) + ThreadNav (stored) group citations by hostname - - Nested Disclosure: outer "Sources (17)" → inner "wikipedia.org (3)" → P/C/R role badges per citation - - P (green) = propose, C (amber) = challenge, R (blue) = revise -- `CitationList` shared component for inline display below content +### CLI Enhancements +- Top-level `--rounds` and `--challengers` options on `cli()` group cascade to subcommands (subcommand wins if both set) +- `_parse_challengers()` accepts either int count or comma-separated model refs (e.g. `3` or `openai:gpt-5,google:gemini-2.5-pro`) +- `challenger_count` param flows through `_run_consensus` → `select_challengers(count=N)` +- **CLI DB persistence**: new `persist_consensus()` function in `app.py` — CLI `ask` command now persists full consensus round history to DB (proposals, challenges, revisions, citations, decisions, overview, followups) +- `_ask_async` creates DB factory via `_create_db()`, disposes engine in `finally` block +- Top-level `--rounds` also cascades into `batch` subcommand -### Anthropic Streaming + max_tokens -- `AnthropicProvider.send()` now uses streaming internally via `_collect_stream()` — avoids 10-minute timeout -- `max_tokens` bumped from 16384 → 32768 across all 6 handler defaults (propose, challenge, revise, commit, voting, decomposition) -- Citations are part of the value — truncating them undermines trust +### Calibration Date Filters (frontend) +- `CalibrationDashboard.tsx`: category dropdown + since/until date inputs + Apply button +- `INTENT_CATEGORIES` constant: `['factual', 'technical', 'creative', 'judgment', 'strategic']` +- `calibration.ts` store: `since`/`until` state + `setSince`/`setUntil` setters, passed to API call +- Store tests: 4 new tests for date filter state and API param passing -### Parallel Challenge Streaming -- `_stream_challenges()` in `ws.py` uses `asyncio.as_completed()` to send each challenge result to the frontend as it finishes -- Previously: all challengers ran in parallel but results were batched after all completed -- Now: first challenger to respond appears immediately in the UI +### Provider Updates +- **OpenAI**: `_REASONING_EFFORT_MODELS` set (gpt-5, gpt-5-mini, gpt-5-nano, gpt-5.2, gpt-5.4) — sends `reasoning_effort: "high"` when no function tools present (incompatible with tools on /v1/chat/completions) +- **OpenAI**: also sends `reasoning_effort: "high"` in structured output path (`_send_structured`) +- **OpenAI**: `gpt-5.2` added to `NO_TEMPERATURE_MODELS` in `catalog.py` +- **Perplexity**: retry logic for `APIConnectionError` — 2 attempts, 1s delay between retries +- **Perplexity**: `APIConnectionError` mapped to `ProviderTimeoutError` -### Tools Enabled by Default -- `web_search` tool wired through CLI, REST, and WebSocket paths by default -- Provider tool format fix: `tool_augmented_send` builds generic `{name, description, parameters}` — each provider transforms to native format in `send()` +### Infrastructure +- `alembic/env.py`: `DUH_DATABASE_URL` env var overrides `alembic.ini` — `_resolve_url()` used in offline, online sync, and online async migration paths +- `.gitignore`: `npm/like-duh/node_modules/` added -### Sidebar UX -- New-question button (Heroicons pencil-square) + collapsible sidebar toggle -- Shell manages `desktopSidebarOpen` (default true) + `mobileSidebarOpen` separately -- TopBar shows sidebar toggle when desktop sidebar collapsed or always on mobile - -### Test Results -- 1641 Python tests + 194 Vitest tests (1835 total) -- Build clean, all tests pass +### Test Updates +- All test files updated for 8-tuple `_run_consensus` return value +- `test_cli_display.py`: new `TestShowCitations` class (8 tests — empty, single, dedup, grouping, sort, title fallback, no-url skip, numbered) +- `test_cli_display.py`: new `TestShowFinalDecisionOverview` class (2 tests — shows/hides overview panel) +- `test_cli_tools.py`: mock return values corrected from 4-tuple to 8-tuple +- `test_providers_openai.py`: test switched from `gpt-5.2` to `gpt-4o` (since 5.2 now has special reasoning_effort behavior) +- `stores.test.ts`: 4 new calibration date filter tests +- `test_cli_batch.py`, `test_cli_voting.py`, `test_mcp_server.py`: 8-tuple updates --- ## Current State -- **Branch `question-refinement`** — in progress, not yet merged -- **1641 Python tests + 194 Vitest tests** (1835 total) -- All previous features intact (v0.1–v0.6) -- Prior work merged: z-index fix, GPT-5.4, .env docs, password reset +- **Branch `main`** — uncommitted changes across 29 files (+828/-63) +- All previous features intact (v0.1-v0.6, question-refinement PR #13, messaging-refinement PR #14) +- Prior merged: question refinement, native web search, citations, tools-by-default, sidebar UX, README rewrite, CLI citation display ## Open Questions (Still Unresolved) - Licensing (MIT vs Apache 2.0) - Output licensing for multi-provider synthesized content -- Vector search solution for SQLite (sqlite-vss vs ChromaDB vs FAISS) — v1.0 decision +- Vector search solution for SQLite (sqlite-vss vs ChromaDB vs FAISS) -- v1.0 decision - Client library packaging: monorepo `client/` dir vs separate repo? - MCP server transport: stdio vs SSE vs streamable HTTP? -- Hosted demo economics (try.duh.dev) — deferred to post-1.0 -- A2A protocol — deferred to post-1.0 +- Hosted demo economics (try.duh.dev) -- deferred to post-1.0 +- A2A protocol -- deferred to post-1.0 diff --git a/memory-bank/progress.md b/memory-bank/progress.md index dd0e044..db0c127 100644 --- a/memory-bank/progress.md +++ b/memory-bank/progress.md @@ -4,9 +4,31 @@ --- -## Current State: Post v0.6.0 — `question-refinement` Branch In Progress +## Current State: Post PR #14 — Follow-ups, Revision Citations, CLI Persistence -### Question Refinement + Native Web Search + Citations (2026-03-08) +### Follow-up Questions + Revision Citations + CLI Persistence + Provider Updates (2026-03-09) + +- **Follow-up questions**: `generate_followups()` uses cheapest model w/ JSON mode to suggest 3 follow-up questions after consensus + - `followups` on ConsensusContext, `followups_json` TEXT on Thread model + migration + - `_run_consensus` now returns 8-tuple (was 7, added `followups`) + - All callers updated: CLI, REST, WS, MCP, batch, decompose + - Frontend: clickable follow-ups in ConsensusNav + ThreadNav (Disclosure), triggers new consensus + - WS `complete` event includes `followups`, thread detail API returns them +- **Revision citations**: `handle_revise()` now accepts `tool_registry` + `web_search`, extracts citations + - `revision_citations` on ConsensusContext + RoundResult, persisted to DB + - `handle_propose()` now extracts proposal_citations directly in handler + - WS sends revision citations in REVISE phase, ConsensusNav includes them in Sources +- **CLI persistence**: new `persist_consensus()` in `app.py` — CLI `ask` saves full round history to DB + - `_ask_async` creates DB factory, disposes engine in finally block +- **CLI enhancements**: top-level `--rounds` and `--challengers` cascade to subcommands + - `_parse_challengers()` accepts int count or comma-separated model refs +- **Calibration date filters**: frontend category + since/until date inputs on CalibrationDashboard +- **OpenAI**: `reasoning_effort: "high"` for GPT-5.x models (when no tools), gpt-5.2 in NO_TEMPERATURE_MODELS +- **Perplexity**: retry logic for APIConnectionError (2 attempts, 1s delay) +- **Alembic**: `DUH_DATABASE_URL` env var overrides alembic.ini +- Tests: new TestShowCitations (8), TestShowFinalDecisionOverview (2), calibration date filter tests (4), all 8-tuple updates + +### Question Refinement + Native Web Search + Citations (2026-03-08, merged PR #13 + #14) - **Question refinement**: pre-consensus clarification step (analyze → clarify → enrich → consensus) - `src/duh/consensus/refine.py`, API routes (`/api/refine`, `/api/enrich`), CLI `--refine` flag @@ -224,9 +246,18 @@ Phase 0 benchmark framework — fully functional, pilot-tested on 5 questions. | 2026-03-07 | GPT-5.4 added to model catalog (1M ctx, $2.50/$15.00, no-temperature) | Done | | 2026-03-07 | .env.example updated with provider API key placeholders | Done | | 2026-03-07 | README updated with all provider env vars | Done | -| 2026-03-08 | Question refinement (analyze → clarify → enrich → consensus) | In Progress | -| 2026-03-08 | Native provider web search (Anthropic/Google/Mistral/OpenAI/Perplexity) | In Progress | -| 2026-03-08 | Citations extraction + frontend CitationList + ConsensusNav Sources | In Progress | -| 2026-03-08 | Tools enabled by default (web_search wired through CLI/REST/WS) | In Progress | -| 2026-03-08 | Provider tool format fix (generic → native transform per provider) | In Progress | -| 2026-03-08 | Sidebar UX (new-question button, collapsible toggle) | In Progress | +| 2026-03-08 | Question refinement (analyze → clarify → enrich → consensus) | Done (PR #13) | +| 2026-03-08 | Native provider web search (Anthropic/Google/Mistral/OpenAI/Perplexity) | Done (PR #13) | +| 2026-03-08 | Citations extraction + frontend CitationList + ConsensusNav Sources | Done (PR #13) | +| 2026-03-08 | Tools enabled by default (web_search wired through CLI/REST/WS) | Done (PR #13) | +| 2026-03-08 | Provider tool format fix (generic → native transform per provider) | Done (PR #13) | +| 2026-03-08 | Sidebar UX (new-question button, collapsible toggle) | Done (PR #13) | +| 2026-03-08 | README rewrite + CLI citation display (7-tuple _run_consensus) | Done (PR #14) | +| 2026-03-09 | Follow-up questions (generate, persist, display, clickable) | In Progress | +| 2026-03-09 | Revision citations (handle_revise with tools/search, persist, display) | In Progress | +| 2026-03-09 | CLI DB persistence (persist_consensus, _ask_async DB factory) | In Progress | +| 2026-03-09 | CLI top-level --rounds/--challengers cascade + _parse_challengers | In Progress | +| 2026-03-09 | Calibration date filters (frontend category/since/until) | In Progress | +| 2026-03-09 | OpenAI reasoning_effort for GPT-5.x, gpt-5.2 catalog | In Progress | +| 2026-03-09 | Perplexity retry logic for APIConnectionError | In Progress | +| 2026-03-09 | Alembic DUH_DATABASE_URL env var support | In Progress | diff --git a/memory-bank/tasks/2026-03/README.md b/memory-bank/tasks/2026-03/README.md index 028b13b..4850729 100644 --- a/memory-bank/tasks/2026-03/README.md +++ b/memory-bank/tasks/2026-03/README.md @@ -9,6 +9,38 @@ - Files: `mail.py`, `auth.py`, `schema.py`, `loader.py`, `LoginPage.tsx`, `ResetPasswordPage.tsx`, `TopBar.tsx` - See: [070307_password-reset.md](./070307_password-reset.md) +## 2026-03-08: Question Refinement + Native Web Search + Citations (PR #13 + #14) +- Pre-consensus question refinement: analyze → clarify → enrich → consensus +- Native provider web search (Anthropic/Google/Mistral/OpenAI/Perplexity) +- Citations: extraction per provider, persistence, domain-grouped Sources nav with P/C/R badges +- Tools enabled by default (web_search wired through CLI, REST, WS) +- Sidebar UX: new-question button + collapsible toggle +- Anthropic streaming + parallel challenge streaming + max_tokens 32768 +- README rewrite: repositioned as AI infrastructure, CLI citation display +- `_run_consensus` 7-tuple return (added citations) +- 1641 Python + 194 Vitest tests (1835 total) +- Files: refine.py, handlers.py, machine.py, ws.py, ask.py, threads.py, app.py, all providers, ConsensusNav.tsx, ThreadNav.tsx, CitationList.tsx, RefinementPanel.tsx, consensus.ts, types.ts + +## 2026-03-09: Follow-ups + Revision Citations + CLI Persistence + Provider Updates +- **Follow-up questions**: `generate_followups()` — cheapest model, JSON mode, 3 questions post-consensus + - `followups` on ConsensusContext, `followups_json` on Thread model + migration + - `_run_consensus` 8-tuple return (added followups), all callers updated + - Frontend: clickable follow-ups in ConsensusNav + ThreadNav Disclosure, triggers new consensus +- **Revision citations**: `handle_revise()` accepts tool_registry + web_search, extracts citations + - `revision_citations` on ConsensusContext + RoundResult, persisted to DB + - `handle_propose()` extracts proposal_citations directly in handler + - WS sends revision citations in REVISE phase, ConsensusPanel passes to phase card +- **CLI persistence**: `persist_consensus()` saves full round history to DB from CLI + - `_ask_async` creates DB factory, disposes engine in finally +- **CLI options**: top-level `--rounds`/`--challengers` cascade to subcommands + - `_parse_challengers()`: int count or comma-separated model refs +- **Calibration filters**: category + since/until date inputs on CalibrationDashboard +- **OpenAI**: `reasoning_effort: "high"` for GPT-5.x (no tools), gpt-5.2 in NO_TEMPERATURE_MODELS +- **Perplexity**: retry for APIConnectionError (2 attempts, 1s delay) +- **Alembic**: `DUH_DATABASE_URL` env var overrides alembic.ini +- Tests: TestShowCitations (8), TestShowFinalDecisionOverview (2), calibration date tests (4), all 8-tuple updates +- Files: handlers.py, machine.py, app.py, ws.py, ask.py, threads.py, models.py, migrations.py, mcp/server.py, openai.py, perplexity.py, catalog.py, alembic/env.py, CalibrationDashboard.tsx, ConsensusNav.tsx, ConsensusPanel.tsx, ThreadNav.tsx, calibration.ts, consensus.ts, types.ts, + 7 test files + ## 2026-03-07: Z-index Fix + GPT-5.4 + .env Docs - Fixed z-index stacking contexts trapping dropdowns (Shell z-10, TopBar z-20 removed) - Added CSS z-index tokens (`--z-background`, `--z-dropdown`, `--z-overlay`, `--z-modal`) diff --git a/memory-bank/toc.md b/memory-bank/toc.md index 66f86ef..8356e53 100644 --- a/memory-bank/toc.md +++ b/memory-bank/toc.md @@ -4,7 +4,7 @@ - [projectbrief.md](./projectbrief.md) — Vision, tenets, architecture, build sequence - [techContext.md](./techContext.md) — Tech stack decisions with rationale (Python, Docker, SQLAlchemy, frontend, tools, etc.) - [decisions.md](./decisions.md) — Architectural decisions with context, alternatives, and consequences (26 ADRs) -- [activeContext.md](./activeContext.md) — Current state, question-refinement branch in progress +- [activeContext.md](./activeContext.md) — Current state, post PR #14 — follow-ups, revision citations, CLI persistence - [progress.md](./progress.md) — Milestone tracking, what's built, what's next - [competitive-landscape.md](./competitive-landscape.md) — Research on existing tools, frameworks, and academic work - [quick-start.md](./quick-start.md) — Session entry point, v0.5 complete, key file references diff --git a/src/duh/api/routes/ask.py b/src/duh/api/routes/ask.py index 3dd634b..6b11c8a 100644 --- a/src/duh/api/routes/ask.py +++ b/src/duh/api/routes/ask.py @@ -112,6 +112,7 @@ async def _handle_consensus( # type: ignore[no-untyped-def] cost, _overview, _citations, + _followups, ) = await _run_consensus( body.question, config, @@ -192,6 +193,7 @@ async def _handle_decompose(body: AskRequest, config, pm) -> AskResponse: # typ cost, _overview, _citations, + _followups, ) = await _run_consensus(body.question, config, pm) return AskResponse( decision=decision, diff --git a/src/duh/api/routes/threads.py b/src/duh/api/routes/threads.py index 1fcb537..e6c3a59 100644 --- a/src/duh/api/routes/threads.py +++ b/src/duh/api/routes/threads.py @@ -56,6 +56,7 @@ class ThreadDetailResponse(BaseModel): status: str created_at: str turns: list[TurnResponse] = Field(default_factory=list) + followups: list[str] = Field(default_factory=list) class ThreadListResponse(BaseModel): @@ -189,12 +190,23 @@ def _build_thread_detail(thread: object) -> ThreadDetailResponse: ) ) + # Parse followups from JSON + followups_raw = getattr(thread, "followups_json", None) + followups: list[str] = [] + if followups_raw: + import contextlib + import json as _json + + with contextlib.suppress(ValueError, TypeError): + followups = _json.loads(followups_raw) + return ThreadDetailResponse( thread_id=thread.id, # type: ignore[attr-defined] question=thread.question, # type: ignore[attr-defined] status=thread.status, # type: ignore[attr-defined] created_at=thread.created_at.isoformat(), # type: ignore[attr-defined] turns=turns, + followups=followups, ) diff --git a/src/duh/api/routes/ws.py b/src/duh/api/routes/ws.py index eb27d38..0a52a28 100644 --- a/src/duh/api/routes/ws.py +++ b/src/duh/api/routes/ws.py @@ -145,17 +145,13 @@ async def _stream_consensus( tool_registry=tool_registry, web_search=use_native_search, ) - propose_citations = [ - {"url": c.url, "title": c.title} for c in (propose_resp.citations or []) - ] - ctx.proposal_citations = propose_citations await ws.send_json( { "type": "phase_complete", "phase": "PROPOSE", "content": ctx.proposal or "", "truncated": propose_resp.finish_reason != "stop", - "citations": propose_citations if propose_citations else None, + "citations": ctx.proposal_citations or None, } ) @@ -193,13 +189,16 @@ async def _stream_consensus( "round": ctx.current_round, } ) - revise_resp = await handle_revise(ctx, pm) + revise_resp = await handle_revise( + ctx, pm, tool_registry=tool_registry, web_search=use_native_search + ) await ws.send_json( { "type": "phase_complete", "phase": "REVISE", "content": ctx.revision or "", "truncated": revise_resp.finish_reason != "stop", + "citations": ctx.revision_citations or None, } ) @@ -221,8 +220,11 @@ async def _stream_consensus( sm.transition(ConsensusState.COMPLETE) - # Generate executive overview (best-effort) + # Generate executive overview and follow-up questions (best-effort) await generate_overview(ctx, pm) + from duh.consensus.handlers import generate_followups + + await generate_followups(ctx, pm) # Persist to DB if available thread_id: str | None = None @@ -230,7 +232,11 @@ async def _stream_consensus( if db_factory is not None: try: thread_id = await _persist_consensus( - db_factory, question, ctx.round_history, ctx.overview + db_factory, + question, + ctx.round_history, + ctx.overview, + followups=ctx.followups or None, ) except Exception: logger.exception("Failed to persist consensus thread") @@ -245,6 +251,7 @@ async def _stream_consensus( "cost": pm.total_cost, "thread_id": thread_id, "overview": ctx.overview, + "followups": ctx.followups if ctx.followups else None, } ) await ws.close() @@ -347,6 +354,7 @@ async def _persist_consensus( question: str, round_history: list[RoundResult], overview: str | None = None, + followups: list[str] | None = None, ) -> str: """Persist consensus round history to the database. @@ -392,8 +400,20 @@ async def _persist_consensus( ch.content, citations_json=ch_cit, ) + rev_cit = None + if rr.revision_citations: + rev_cit = json.dumps( + [ + {"url": c["url"], "title": c.get("title")} + for c in rr.revision_citations + ] + ) await repo.add_contribution( - turn.id, rr.proposal_model, "reviser", rr.revision + turn.id, + rr.proposal_model, + "reviser", + rr.revision, + citations_json=rev_cit, ) await repo.save_decision( turn.id, @@ -407,5 +427,8 @@ async def _persist_consensus( if overview: await repo.save_thread_summary(thread.id, overview, "overview") + if followups: + thread.followups_json = json.dumps(followups) + await session.commit() return str(thread.id) diff --git a/src/duh/cli/app.py b/src/duh/cli/app.py index c0bf7e7..0b3b22c 100644 --- a/src/duh/cli/app.py +++ b/src/duh/cli/app.py @@ -25,6 +25,7 @@ from duh.cli.display import ConsensusDisplay from duh.config.schema import DuhConfig + from duh.consensus.machine import RoundResult from duh.memory.models import Thread, Vote from duh.providers.base import ModelInfo from duh.providers.manager import ProviderManager @@ -40,6 +41,21 @@ def _error(msg: str) -> None: sys.exit(1) +def _parse_challengers( + value: str | None, +) -> tuple[list[str] | None, int | None]: + """Parse --challengers as int (count) or comma-separated model refs. + + Returns (model_list, count) — exactly one will be set, or both None. + """ + if not value: + return None, None + try: + return None, int(value) + except ValueError: + return value.split(","), None + + def _load_config(config_path: str | None) -> DuhConfig: """Load config with user-friendly error handling.""" try: @@ -199,6 +215,104 @@ def _setup_tools(config: DuhConfig) -> ToolRegistry | None: return registry +async def persist_consensus( + db_factory: async_sessionmaker[AsyncSession], + question: str, + round_history: list[RoundResult], + overview: str | None = None, + followups: list[str] | None = None, +) -> str: + """Persist full consensus round history to the database. + + Saves proposals, challenger responses, revisions, citations, + decisions, and overview — the same rich format used by the web UI. + + Returns the new thread ID. + """ + import json as _json + + from duh.memory.repository import MemoryRepository + + async with db_factory() as session: + repo = MemoryRepository(session) + thread = await repo.create_thread(question) + thread.status = "complete" + + for rr in round_history: + turn = await repo.create_turn(thread.id, rr.round_number, "COMMIT") + + # Proposal with citations + proposal_cit = None + if rr.proposal_citations: + proposal_cit = _json.dumps( + [ + {"url": c["url"], "title": c.get("title")} + for c in rr.proposal_citations + ] + ) + await repo.add_contribution( + turn.id, + rr.proposal_model, + "proposer", + rr.proposal, + citations_json=proposal_cit, + ) + + # Challenger responses with citations + for ch in rr.challenges: + ch_cit = None + if ch.citations: + ch_cit = _json.dumps( + [ + {"url": c["url"], "title": c.get("title")} + for c in ch.citations + ] + ) + await repo.add_contribution( + turn.id, + ch.model_ref, + "challenger", + ch.content, + citations_json=ch_cit, + ) + + # Revision with citations + rev_cit = None + if rr.revision_citations: + rev_cit = _json.dumps( + [ + {"url": c["url"], "title": c.get("title")} + for c in rr.revision_citations + ] + ) + await repo.add_contribution( + turn.id, + rr.proposal_model, + "reviser", + rr.revision, + citations_json=rev_cit, + ) + + # Decision + await repo.save_decision( + turn.id, + thread.id, + rr.decision, + rr.confidence, + rigor=rr.rigor, + dissent=rr.dissent, + ) + + if overview: + await repo.save_thread_summary(thread.id, overview, "overview") + + if followups: + thread.followups_json = _json.dumps(followups) + + await session.commit() + return str(thread.id) + + async def _run_consensus( question: str, config: DuhConfig, @@ -209,13 +323,23 @@ async def _run_consensus( panel: list[str] | None = None, proposer_override: str | None = None, challengers_override: list[str] | None = None, + challenger_count: int | None = None, web_search: bool = False, + db_factory: async_sessionmaker[AsyncSession] | None = None, ) -> tuple[ - str, float, float, str | None, float, str | None, list[dict[str, str | None]] + str, + float, + float, + str | None, + float, + str | None, + list[dict[str, str | None]], + list[str], ]: """Run the full consensus loop. - Returns (decision, confidence, rigor, dissent, total_cost, overview, citations). + Returns (decision, confidence, rigor, dissent, total_cost, overview, + citations, followups). """ from duh.consensus.convergence import check_convergence from duh.consensus.handlers import ( @@ -272,7 +396,10 @@ async def _run_consensus( # CHALLENGE sm.transition(ConsensusState.CHALLENGE) challengers = challengers_override or select_challengers( - pm, proposer, panel=effective_panel + pm, + proposer, + panel=effective_panel, + **({"count": challenger_count} if challenger_count else {}), ) if display: detail = f"{len(challengers)} models" @@ -299,10 +426,17 @@ async def _run_consensus( if display: reviser = ctx.proposal_model or proposer with display.phase_status("REVISE", reviser): - await handle_revise(ctx, pm) + await handle_revise( + ctx, + pm, + tool_registry=tool_registry, + web_search=web_search, + ) display.show_revise(ctx.revision_model or reviser, ctx.revision or "") else: - await handle_revise(ctx, pm) + await handle_revise( + ctx, pm, tool_registry=tool_registry, web_search=web_search + ) # COMMIT sm.transition(ConsensusState.COMMIT) @@ -327,8 +461,11 @@ async def _run_consensus( sm.transition(ConsensusState.COMPLETE) - # Generate executive overview (best-effort) + # Generate executive overview and follow-up questions (best-effort) await generate_overview(ctx, pm) + from duh.consensus.handlers import generate_followups + + await generate_followups(ctx, pm) # Show tool usage if any if display and ctx.tool_calls_log: @@ -340,10 +477,27 @@ async def _run_consensus( all_citations.extend(rr.proposal_citations) for ch in rr.challenges: all_citations.extend(ch.citations) + all_citations.extend(rr.revision_citations) # Include current round (may not be archived yet) all_citations.extend(ctx.proposal_citations) for ch in ctx.challenges: all_citations.extend(ch.citations) + all_citations.extend(ctx.revision_citations) + + # Persist full round history if DB available + if db_factory is not None: + try: + await persist_consensus( + db_factory, + question, + ctx.round_history, + overview=ctx.overview, + followups=ctx.followups or None, + ) + except Exception: + import logging as _logging + + _logging.getLogger(__name__).exception("Failed to persist consensus thread") return ( ctx.decision or "", @@ -353,6 +507,7 @@ async def _run_consensus( pm.total_cost, ctx.overview, all_citations, + ctx.followups, ) @@ -368,14 +523,32 @@ async def _run_consensus( default=None, help="Path to config file.", ) +@click.option( + "--rounds", + type=int, + default=None, + help="Max consensus rounds (overrides config).", +) +@click.option( + "--challengers", + default=None, + help="Count or model refs (e.g. 3 or openai:gpt-5,google:gemini-2.5-pro).", +) @click.pass_context -def cli(ctx: click.Context, config_path: str | None) -> None: +def cli( + ctx: click.Context, + config_path: str | None, + rounds: int | None, + challengers: str | None, +) -> None: """duh - Multi-model consensus engine. Ask multiple LLMs, get one answer they agree on. """ ctx.ensure_object(dict) ctx.obj["config_path"] = config_path + ctx.obj["rounds"] = rounds + ctx.obj["challengers"] = challengers if ctx.invoked_subcommand is None: click.echo(ctx.get_help()) @@ -416,7 +589,7 @@ def cli(ctx: click.Context, config_path: str | None) -> None: @click.option( "--challengers", default=None, - help="Override challengers (comma-separated model refs).", + help="Count or model refs (e.g. 3 or openai:gpt-5,google:gemini-2.5-pro).", ) @click.option( "--panel", @@ -447,6 +620,11 @@ def ask( and produces a revised consensus decision. """ config = _load_config(ctx.obj["config_path"]) + + # Top-level options cascade into subcommand (subcommand wins) + rounds = rounds or ctx.obj.get("rounds") + challengers = challengers or ctx.obj.get("challengers") + if rounds is not None: config.general.max_rounds = rounds @@ -456,7 +634,7 @@ def ask( # Parse model selection overrides panel_list = panel.split(",") if panel else None - challengers_list = challengers.split(",") if challengers else None + challengers_list, challenger_count = _parse_challengers(challengers) # Question refinement (pre-consensus clarification) if refine: @@ -498,13 +676,14 @@ def ask( panel=panel_list, proposer_override=proposer, challengers_override=challengers_list, + challenger_count=challenger_count, ) ) except DuhError as e: _error(str(e)) return # unreachable - decision, confidence, rigor, dissent, cost, overview, citations = result + decision, confidence, rigor, dissent, cost, overview, citations, _followups = result from duh.cli.display import ConsensusDisplay @@ -547,8 +726,16 @@ async def _ask_async( panel: list[str] | None = None, proposer_override: str | None = None, challengers_override: list[str] | None = None, + challenger_count: int | None = None, ) -> tuple[ - str, float, float, str | None, float, str | None, list[dict[str, str | None]] + str, + float, + float, + str | None, + float, + str | None, + list[dict[str, str | None]], + list[str], ]: """Async implementation for the ask command.""" from duh.cli.display import ConsensusDisplay @@ -563,19 +750,25 @@ async def _ask_async( tool_registry = _setup_tools(config) use_native_search = config.tools.enabled and config.tools.web_search.native + factory, engine = await _create_db(config) display = ConsensusDisplay() display.start() - return await _run_consensus( - question, - config, - pm, - display=display, - tool_registry=tool_registry, - panel=panel, - proposer_override=proposer_override, - challengers_override=challengers_override, - web_search=use_native_search, - ) + try: + return await _run_consensus( + question, + config, + pm, + display=display, + tool_registry=tool_registry, + panel=panel, + proposer_override=proposer_override, + challengers_override=challengers_override, + challenger_count=challenger_count, + web_search=use_native_search, + db_factory=factory, + ) + finally: + await engine.dispose() async def _ask_voting_async( @@ -666,6 +859,7 @@ async def _ask_auto_async( cost, overview, citations, + _followups, ) = await _run_consensus(question, config, pm, display=display) display.show_final_decision( decision, confidence, rigor, cost, dissent, overview=overview @@ -743,7 +937,16 @@ async def _ask_decompose_async( # Single-subtask optimization: skip synthesis if len(subtask_specs) == 1: result = await _run_consensus(question, config, pm, display=display) - decision, confidence, rigor, dissent, cost, overview, citations = result + ( + decision, + confidence, + rigor, + dissent, + cost, + overview, + citations, + _followups, + ) = result display.show_final_decision( decision, confidence, rigor, cost, dissent, overview=overview ) @@ -2260,6 +2463,9 @@ def batch( (each line is {"question": "..."}). """ config = _load_config(ctx.obj["config_path"]) + + # Top-level --rounds cascades into batch + rounds = rounds or ctx.obj.get("rounds") if rounds is not None: config.general.max_rounds = rounds @@ -2397,6 +2603,7 @@ async def _batch_async( _cost, _overview, _citations, + _fups, ) = await _run_consensus(question, config, pm) q_cost = pm.total_cost - cost_before diff --git a/src/duh/consensus/handlers.py b/src/duh/consensus/handlers.py index 848845a..fac7376 100644 --- a/src/duh/consensus/handlers.py +++ b/src/duh/consensus/handlers.py @@ -328,6 +328,10 @@ async def handle_propose( # Update context ctx.proposal = response.content ctx.proposal_model = model_ref + ctx.proposal_citations = [ + {"url": c.url, "title": c.title, "snippet": c.snippet} + for c in (response.citations or []) + ] return response @@ -631,6 +635,8 @@ async def handle_revise( *, temperature: float = 0.7, max_tokens: int = 32768, + tool_registry: ToolRegistry | None = None, + web_search: bool = False, ) -> ModelResponse: """Execute the REVISE phase of consensus. @@ -680,9 +686,27 @@ async def handle_revise( messages = build_revise_prompt(ctx, max_tokens=max_tokens) provider, model_id = provider_manager.get_provider(reviser_ref) - response = await provider.send( - messages, model_id, max_tokens=max_tokens, temperature=temperature - ) + if tool_registry is not None: + from duh.tools.augmented_send import tool_augmented_send + + response = await tool_augmented_send( + provider, + model_id, + messages, + tool_registry, + max_tokens=max_tokens, + temperature=temperature, + web_search=web_search, + ) + _log_tool_calls(ctx, response, "revise") + else: + response = await provider.send( + messages, + model_id, + max_tokens=max_tokens, + temperature=temperature, + web_search=web_search, + ) # Record cost model_info = provider_manager.get_model_info(reviser_ref) @@ -691,6 +715,10 @@ async def handle_revise( # Update context ctx.revision = response.content ctx.revision_model = reviser_ref + ctx.revision_citations = [ + {"url": c.url, "title": c.title, "snippet": c.snippet} + for c in (response.citations or []) + ] return response @@ -902,3 +930,65 @@ async def generate_overview( return None except Exception: return None + + +async def generate_followups( + ctx: ConsensusContext, + provider_manager: ProviderManager, + *, + count: int = 3, +) -> list[str]: + """Generate suggested follow-up questions based on the consensus. + + Uses the cheapest model with JSON mode to produce follow-up + questions that dig deeper into the decision, explore gaps, or + investigate related angles the user might care about. + + Returns an empty list on failure so callers can gracefully degrade. + """ + import json as _json + + models = provider_manager.list_all_models() + if not models: + return [] + + cheapest = min(models, key=lambda m: m.input_cost_per_mtok) + provider, model_id = provider_manager.get_provider(cheapest.model_ref) + + challenges_summary = "" + for ch in ctx.challenges: + challenges_summary += f"\n- [{ch.model_ref}] ({ch.framing}): {ch.content[:200]}" + + prompt = ( + f"Given this consensus decision, suggest {count} follow-up questions " + "the user should ask next. Each question should explore a different " + "angle: deeper technical detail, practical implications, risks or " + "edge cases, or related decisions that follow from this one.\n\n" + "Make questions specific and actionable, not generic. They should " + "build on the debate that happened, not repeat it.\n\n" + f"Original question: {ctx.question}\n" + f"Decision: {ctx.decision}\n" + f"Confidence: {ctx.confidence:.0%}\n" + f"Key challenges:{challenges_summary or ' (none)'}\n" + f"Dissent: {ctx.dissent or 'None'}\n\n" + f'Return JSON: {{"questions": ["question1", "question2", ...]}}' + ) + + try: + response = await provider.send( + [PromptMessage(role="user", content=prompt)], + model_id, + max_tokens=1000, + temperature=0.7, + response_format="json", + ) + provider_manager.record_usage(cheapest, response.usage) + data = _json.loads(response.content) + questions = data.get("questions", []) + if isinstance(questions, list): + followups = [q for q in questions if isinstance(q, str)][:count] + ctx.followups = followups + return followups + return [] + except Exception: + return [] diff --git a/src/duh/consensus/machine.py b/src/duh/consensus/machine.py index 941dbf5..f0c6ae3 100644 --- a/src/duh/consensus/machine.py +++ b/src/duh/consensus/machine.py @@ -58,6 +58,7 @@ class RoundResult: rigor: float = 0.0 dissent: str | None = None proposal_citations: tuple[dict[str, str | None], ...] = () + revision_citations: tuple[dict[str, str | None], ...] = () @dataclass(frozen=True, slots=True) @@ -92,11 +93,13 @@ class ConsensusContext: challenges: list[ChallengeResult] = field(default_factory=list) revision: str | None = None revision_model: str | None = None + revision_citations: list[dict[str, str | None]] = field(default_factory=list) decision: str | None = None confidence: float = 0.0 rigor: float = 0.0 dissent: str | None = None overview: str | None = None + followups: list[str] = field(default_factory=list) converged: bool = False # History @@ -122,6 +125,7 @@ def _clear_round_data(self) -> None: self.challenges = [] self.revision = None self.revision_model = None + self.revision_citations = [] self.decision = None self.confidence = 0.0 self.rigor = 0.0 @@ -142,6 +146,7 @@ def _archive_round(self) -> None: rigor=self.rigor, dissent=self.dissent, proposal_citations=tuple(self.proposal_citations), + revision_citations=tuple(self.revision_citations), ) ) diff --git a/src/duh/mcp/server.py b/src/duh/mcp/server.py index 53b3e9b..db7e826 100644 --- a/src/duh/mcp/server.py +++ b/src/duh/mcp/server.py @@ -143,6 +143,7 @@ async def _handle_ask(args: dict) -> list[TextContent]: # type: ignore[type-arg cost, _overview, _citations, + _followups, ) = await _run_consensus(question, config, pm) return [ TextContent( diff --git a/src/duh/memory/migrations.py b/src/duh/memory/migrations.py index be2f8f6..c455590 100644 --- a/src/duh/memory/migrations.py +++ b/src/duh/memory/migrations.py @@ -69,3 +69,8 @@ async def ensure_schema(engine: AsyncEngine) -> None: await conn.exec_driver_sql( "ALTER TABLE threads ADD COLUMN slug VARCHAR(200) DEFAULT NULL" ) + if "followups_json" not in thread_cols: + logger.info("Adding 'followups_json' column to threads table") + await conn.exec_driver_sql( + "ALTER TABLE threads ADD COLUMN followups_json TEXT DEFAULT NULL" + ) diff --git a/src/duh/memory/models.py b/src/duh/memory/models.py index 56286d0..bcdbc10 100644 --- a/src/duh/memory/models.py +++ b/src/duh/memory/models.py @@ -75,6 +75,9 @@ class Thread(Base): user_id: Mapped[str | None] = mapped_column( ForeignKey("users.id"), nullable=True, index=True, default=None ) + followups_json: Mapped[str | None] = mapped_column( + Text, nullable=True, default=None + ) created_at: Mapped[datetime] = mapped_column(DateTime, default=_utcnow) updated_at: Mapped[datetime] = mapped_column( DateTime, default=_utcnow, onupdate=_utcnow diff --git a/src/duh/providers/catalog.py b/src/duh/providers/catalog.py index a6cbd32..93927b9 100644 --- a/src/duh/providers/catalog.py +++ b/src/duh/providers/catalog.py @@ -241,6 +241,7 @@ "gpt-5", "gpt-5-mini", "gpt-5-nano", + "gpt-5.2", "gpt-5.4", } diff --git a/src/duh/providers/openai.py b/src/duh/providers/openai.py index 82b70a7..9df11e4 100644 --- a/src/duh/providers/openai.py +++ b/src/duh/providers/openai.py @@ -43,6 +43,16 @@ "gpt-4o-mini-search-preview", } +# GPT-5.x models support reasoning_effort but don't always reason by default. +# Dedicated reasoning models (o3, o3-mini, o4-mini) always reason at full capacity. +_REASONING_EFFORT_MODELS: set[str] = { + "gpt-5", + "gpt-5-mini", + "gpt-5-nano", + "gpt-5.2", + "gpt-5.4", +} + def _map_error(e: openai.APIError) -> Exception: """Map OpenAI SDK errors to duh error hierarchy.""" @@ -160,6 +170,11 @@ async def send( } for t in tools ] + else: + # reasoning_effort is incompatible with function tools on + # /v1/chat/completions for gpt-5.x models. + if model_id in _REASONING_EFFORT_MODELS: + kwargs["reasoning_effort"] = "high" if web_search and model_id in _SEARCH_MODELS: kwargs["web_search_options"] = {} @@ -229,6 +244,8 @@ async def stream( } if model_id not in _NO_TEMPERATURE_MODELS: kwargs["temperature"] = temperature + if model_id in _REASONING_EFFORT_MODELS: + kwargs["reasoning_effort"] = "high" if stop_sequences: kwargs["stop"] = stop_sequences diff --git a/src/duh/providers/perplexity.py b/src/duh/providers/perplexity.py index 9aea342..8000996 100644 --- a/src/duh/providers/perplexity.py +++ b/src/duh/providers/perplexity.py @@ -2,7 +2,9 @@ from __future__ import annotations +import asyncio import contextlib +import logging import time from typing import TYPE_CHECKING, Any @@ -30,6 +32,11 @@ from duh.providers.base import PromptMessage +logger = logging.getLogger(__name__) + +_MAX_RETRIES = 2 +_RETRY_DELAY = 1.0 # seconds + PROVIDER_ID = "perplexity" _KNOWN_MODELS = MODEL_CATALOG[PROVIDER_ID] _DEFAULT_CAPS = PROVIDER_CAPS[PROVIDER_ID] @@ -47,6 +54,8 @@ def _map_error(e: openai.APIError) -> Exception: with contextlib.suppress(ValueError): retry_after = float(raw) return ProviderRateLimitError(PROVIDER_ID, retry_after=retry_after) + if isinstance(e, openai.APIConnectionError): + return ProviderTimeoutError(PROVIDER_ID, f"Connection error: {e}") if isinstance(e, openai.APITimeoutError): return ProviderTimeoutError(PROVIDER_ID, str(e)) if isinstance(e, openai.InternalServerError): @@ -153,10 +162,29 @@ async def send( ] start = time.monotonic() - try: - response = await self._client.chat.completions.create(**kwargs) - except openai.APIError as e: - raise _map_error(e) from e + last_exc: openai.APIError | None = None + response = None + for attempt in range(_MAX_RETRIES): + try: + response = await self._client.chat.completions.create(**kwargs) + break + except openai.APIConnectionError as e: + last_exc = e + if attempt < _MAX_RETRIES - 1: + logger.warning( + "Perplexity connection error (attempt %d/%d), " + "retrying in %.0fs: %s", + attempt + 1, + _MAX_RETRIES, + _RETRY_DELAY, + e, + ) + await asyncio.sleep(_RETRY_DELAY) + except openai.APIError as e: + raise _map_error(e) from e + if response is None: + assert last_exc is not None + raise _map_error(last_exc) from last_exc latency_ms = (time.monotonic() - start) * 1000 diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 4158647..5c1d800 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -77,6 +77,7 @@ def test_displays_decision( 0.0042, None, [], + [], ) result = runner.invoke(cli, ["ask", "What database?"]) @@ -105,6 +106,7 @@ def test_displays_dissent( 0.01, None, [], + [], ) result = runner.invoke(cli, ["ask", "What database?"]) @@ -125,7 +127,7 @@ def test_no_dissent_when_none( from duh.config.schema import DuhConfig mock_config.return_value = DuhConfig() - mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, []) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [], []) result = runner.invoke(cli, ["ask", "Question?"]) @@ -144,7 +146,7 @@ def test_rounds_option( config = DuhConfig() mock_config.return_value = config - mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, []) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [], []) result = runner.invoke(cli, ["ask", "--rounds", "5", "Question?"]) @@ -645,7 +647,7 @@ def test_ask_full_loop(self, runner: CliRunner) -> None: async def fake_ask( question: str, cfg: Any, **kwargs: Any - ) -> tuple[str, float, float, str | None, float, str | None]: + ) -> tuple[str, float, float, str | None, float, str | None, list[dict[str, str | None]], list[str]]: pm = ProviderManager() await pm.register(provider) from duh.cli.app import _run_consensus diff --git a/tests/unit/test_cli_batch.py b/tests/unit/test_cli_batch.py index df261e3..22f4a14 100644 --- a/tests/unit/test_cli_batch.py +++ b/tests/unit/test_cli_batch.py @@ -460,10 +460,11 @@ async def fake_consensus( float, str | None, list[dict[str, str | None]], + list[str], ]: nonlocal consensus_called consensus_called = True - return ("Use SQLite.", 0.85, 1.0, None, 0.01, None, []) + return ("Use SQLite.", 0.85, 1.0, None, 0.01, None, [], []) with ( patch("duh.cli.app.load_config", return_value=config), @@ -554,8 +555,8 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, float, str | None, float, str | None]: - return ("Answer.", 0.9, 1.0, None, 0.01, None, []) + ) -> tuple[str, float, float, str | None, float, str | None, list[dict[str, str | None]], list[str]]: + return ("Answer.", 0.9, 1.0, None, 0.01, None, [], []) with ( patch("duh.cli.app.load_config", return_value=config), @@ -609,12 +610,12 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, float, str | None, float, str | None]: + ) -> tuple[str, float, float, str | None, float, str | None, list[dict[str, str | None]], list[str]]: nonlocal call_count call_count += 1 if question == "Q2": raise RuntimeError("Provider timeout") - return ("Answer.", 0.9, 1.0, None, 0.01, None, []) + return ("Answer.", 0.9, 1.0, None, 0.01, None, [], []) with ( patch("duh.cli.app.load_config", return_value=config), @@ -658,10 +659,10 @@ async def fake_consensus( pm: Any, display: Any = None, tool_registry: Any = None, - ) -> tuple[str, float, float, str | None, float, str | None]: + ) -> tuple[str, float, float, str | None, float, str | None, list[dict[str, str | None]], list[str]]: if question == "Q2": raise RuntimeError("Model unavailable") - return ("Answer.", 0.9, 1.0, None, 0.01, None, []) + return ("Answer.", 0.9, 1.0, None, 0.01, None, [], []) with ( patch("duh.cli.app.load_config", return_value=config), diff --git a/tests/unit/test_cli_display.py b/tests/unit/test_cli_display.py index 5c72456..d8a3534 100644 --- a/tests/unit/test_cli_display.py +++ b/tests/unit/test_cli_display.py @@ -386,3 +386,145 @@ def test_complete_round_display(self) -> None: # Stats assert "3 models" in out assert "$0.0500" in out + + +# ── show_citations ─────────────────────────────────────────── + + +class TestShowCitations: + def test_empty_citations_no_output(self) -> None: + display, buf = _make_display() + display.show_citations([]) + assert _output(buf) == "" + + def test_single_citation(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://example.com/page", "title": "Example Page"}, + ] + ) + out = _output(buf) + assert "Sources" in out + assert "(1)" in out + assert "Example Page" in out + assert "https://example.com/page" in out + + def test_deduplicates_by_url(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://example.com/a", "title": "Page A"}, + {"url": "https://example.com/a", "title": "Page A duplicate"}, + {"url": "https://example.com/b", "title": "Page B"}, + ] + ) + out = _output(buf) + assert "(2)" in out # 2 unique URLs + assert "Page A" in out + assert "Page A duplicate" not in out + assert "Page B" in out + + def test_groups_by_hostname(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://docs.python.org/a", "title": "Doc A"}, + {"url": "https://docs.python.org/b", "title": "Doc B"}, + {"url": "https://example.com/x", "title": "Example"}, + ] + ) + out = _output(buf) + assert "(3)" in out + # Both python.org docs should appear (grouped by host, sorted by count) + assert "Doc A" in out + assert "Doc B" in out + assert "Example" in out + + def test_sorts_groups_by_count_descending(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://a.com/1", "title": "A1"}, + {"url": "https://b.com/1", "title": "B1"}, + {"url": "https://b.com/2", "title": "B2"}, + {"url": "https://b.com/3", "title": "B3"}, + ] + ) + out = _output(buf) + # b.com has 3 citations, should appear first (lower index numbers) + b1_pos = out.index("B1") + a1_pos = out.index("A1") + assert b1_pos < a1_pos + + def test_title_falls_back_to_hostname(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://example.com/page", "title": None}, + ] + ) + out = _output(buf) + assert "example.com" in out + + def test_no_url_entries_skipped(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "", "title": "No URL"}, + {"url": None, "title": "Also no URL"}, + ] + ) + # All entries have empty/None URLs, so nothing to show + assert _output(buf) == "" + + def test_mixed_valid_and_empty_urls(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "", "title": "Skip me"}, + {"url": "https://example.com/real", "title": "Real"}, + ] + ) + out = _output(buf) + assert "(1)" in out + assert "Real" in out + assert "Skip me" not in out + + def test_numbered_entries(self) -> None: + display, buf = _make_display() + display.show_citations( + [ + {"url": "https://a.com/1", "title": "First"}, + {"url": "https://a.com/2", "title": "Second"}, + ] + ) + out = _output(buf) + assert "[1]" in out + assert "[2]" in out + + +# ── show_final_decision with overview ──────────────────────── + + +class TestShowFinalDecisionOverview: + def test_shows_overview_when_provided(self) -> None: + display, buf = _make_display() + display.show_final_decision( + "Decision text.", + 0.9, + 1.0, + 0.05, + None, + overview="Executive summary here.", + ) + out = _output(buf) + assert "Executive Overview" in out + assert "Executive summary here." in out + assert "Decision text." in out + + def test_no_overview_panel_when_none(self) -> None: + display, buf = _make_display() + display.show_final_decision("Answer.", 1.0, 1.0, 0.0, None, overview=None) + out = _output(buf) + assert "Executive Overview" not in out diff --git a/tests/unit/test_cli_tools.py b/tests/unit/test_cli_tools.py index adce8ff..c1bd586 100644 --- a/tests/unit/test_cli_tools.py +++ b/tests/unit/test_cli_tools.py @@ -123,7 +123,7 @@ def test_tools_flag_enables_tools( ) -> None: config = DuhConfig(tools=ToolsConfig(enabled=False)) mock_config.return_value = config - mock_run.return_value = ("Answer", 0.9, None, 0.01) + mock_run.return_value = ("Answer", 0.9, 1.0, None, 0.01, None, [], []) runner.invoke(cli, ["ask", "--tools", "test question"]) # After CLI processes --tools flag, config should be overridden @@ -139,7 +139,7 @@ def test_no_tools_flag_disables_tools( ) -> None: config = DuhConfig(tools=ToolsConfig(enabled=True)) mock_config.return_value = config - mock_run.return_value = ("Answer", 0.9, None, 0.01) + mock_run.return_value = ("Answer", 0.9, 1.0, None, 0.01, None, [], []) runner.invoke(cli, ["ask", "--no-tools", "test question"]) assert config.tools.enabled is False @@ -154,7 +154,7 @@ def test_no_flag_preserves_config( ) -> None: config = DuhConfig(tools=ToolsConfig(enabled=True)) mock_config.return_value = config - mock_run.return_value = ("Answer", 0.9, None, 0.01) + mock_run.return_value = ("Answer", 0.9, 1.0, None, 0.01, None, [], []) runner.invoke(cli, ["ask", "test question"]) # Should remain True since no flag was passed @@ -242,7 +242,7 @@ def test_tools_enabled_passes_registry( config = DuhConfig(tools=ToolsConfig(enabled=True)) mock_config.return_value = config mock_providers.return_value.list_all_models.return_value = ["model1"] - mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01, None) + mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01, None, [], []) runner.invoke(cli, ["ask", "test question"]) @@ -263,7 +263,7 @@ def test_tools_disabled_passes_none( config = DuhConfig(tools=ToolsConfig(enabled=False)) mock_config.return_value = config mock_providers.return_value.list_all_models.return_value = ["model1"] - mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01, None) + mock_consensus.return_value = ("Answer", 0.9, 1.0, None, 0.01, None, [], []) runner.invoke(cli, ["ask", "test question"]) diff --git a/tests/unit/test_cli_voting.py b/tests/unit/test_cli_voting.py index 6ce5f85..df1e0ef 100644 --- a/tests/unit/test_cli_voting.py +++ b/tests/unit/test_cli_voting.py @@ -147,7 +147,7 @@ def test_default_protocol_is_consensus( from duh.config.schema import DuhConfig mock_config.return_value = DuhConfig() - mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, []) + mock_run.return_value = ("Answer.", 1.0, 1.0, None, 0.0, None, [], []) result = runner.invoke(cli, ["ask", "Question?"]) assert result.exit_code == 0 diff --git a/tests/unit/test_mcp_server.py b/tests/unit/test_mcp_server.py index 7cc2745..e405b1a 100644 --- a/tests/unit/test_mcp_server.py +++ b/tests/unit/test_mcp_server.py @@ -177,7 +177,7 @@ async def test_consensus_protocol(self) -> None: patch( "duh.cli.app._run_consensus", new_callable=AsyncMock, - return_value=("Use SQLite.", 0.9, 1.0, "minor dissent", 0.05, None, []), + return_value=("Use SQLite.", 0.9, 1.0, "minor dissent", 0.05, None, [], []), ), ): result = await _handle_ask({"question": "What DB?", "rounds": 2}) diff --git a/tests/unit/test_providers_openai.py b/tests/unit/test_providers_openai.py index 2b7c401..9317e9d 100644 --- a/tests/unit/test_providers_openai.py +++ b/tests/unit/test_providers_openai.py @@ -239,13 +239,13 @@ async def test_passes_params_to_sdk(self): ] await provider.send( msgs, - "gpt-5.2", + "gpt-4o", max_tokens=1000, temperature=0.3, stop_sequences=["STOP"], ) call_kwargs = client.chat.completions.create.call_args.kwargs - assert call_kwargs["model"] == "gpt-5.2" + assert call_kwargs["model"] == "gpt-4o" assert call_kwargs["max_completion_tokens"] == 1000 assert call_kwargs["temperature"] == 0.3 assert call_kwargs["stop"] == ["STOP"] diff --git a/web/src/__tests__/stores.test.ts b/web/src/__tests__/stores.test.ts index ead0eb0..36a152e 100644 --- a/web/src/__tests__/stores.test.ts +++ b/web/src/__tests__/stores.test.ts @@ -409,6 +409,8 @@ describe('useCalibrationStore', () => { loading: false, error: null, category: null, + since: null, + until: null, }) }) @@ -472,4 +474,47 @@ describe('useCalibrationStore', () => { useCalibrationStore.getState().setCategory(null) expect(useCalibrationStore.getState().category).toBeNull() }) + + it('setSince updates since filter', () => { + useCalibrationStore.getState().setSince('2025-01-01') + expect(useCalibrationStore.getState().since).toBe('2025-01-01') + + useCalibrationStore.getState().setSince(null) + expect(useCalibrationStore.getState().since).toBeNull() + }) + + it('setUntil updates until filter', () => { + useCalibrationStore.getState().setUntil('2025-12-31') + expect(useCalibrationStore.getState().until).toBe('2025-12-31') + + useCalibrationStore.getState().setUntil(null) + expect(useCalibrationStore.getState().until).toBeNull() + }) + + it('fetchCalibration passes filters to API', async () => { + mockedApi.calibration.mockResolvedValue({ + buckets: [], + total_decisions: 0, + total_with_outcomes: 0, + overall_accuracy: 0, + ece: 0, + }) + + useCalibrationStore.getState().setCategory('factual') + useCalibrationStore.getState().setSince('2025-06-01') + useCalibrationStore.getState().setUntil('2025-06-30') + await useCalibrationStore.getState().fetchCalibration() + + expect(mockedApi.calibration).toHaveBeenCalledWith({ + category: 'factual', + since: '2025-06-01', + until: '2025-06-30', + }) + }) + + it('has correct initial state for date filters', () => { + const state = useCalibrationStore.getState() + expect(state.since).toBeNull() + expect(state.until).toBeNull() + }) }) diff --git a/web/src/api/types.ts b/web/src/api/types.ts index ffcd29c..e96e475 100644 --- a/web/src/api/types.ts +++ b/web/src/api/types.ts @@ -135,6 +135,7 @@ export interface ThreadDetail { status: string created_at: string turns: Turn[] + followups?: string[] } export interface RecallResult { @@ -302,6 +303,7 @@ export interface WSComplete { cost: number thread_id: string | null overview: string | null + followups: string[] | null } export interface WSChallengeError { diff --git a/web/src/components/calibration/CalibrationDashboard.tsx b/web/src/components/calibration/CalibrationDashboard.tsx index 502a25e..ab4de36 100644 --- a/web/src/components/calibration/CalibrationDashboard.tsx +++ b/web/src/components/calibration/CalibrationDashboard.tsx @@ -1,6 +1,8 @@ -import { useEffect } from 'react' +import { useCallback, useEffect } from 'react' import { useCalibrationStore } from '@/stores' +const INTENT_CATEGORIES = ['factual', 'technical', 'creative', 'judgment', 'strategic'] + function eceRating(ece: number): { label: string; color: string } { if (ece < 0.05) return { label: 'Excellent', color: 'var(--color-success, #22c55e)' } if (ece < 0.1) return { label: 'Good', color: 'var(--color-primary)' } @@ -17,13 +19,44 @@ export function CalibrationDashboard() { ece, loading, error, + category, + since, + until, fetchCalibration, + setCategory, + setSince, + setUntil, } = useCalibrationStore() useEffect(() => { fetchCalibration() }, [fetchCalibration]) + const handleCategoryChange = useCallback( + (e: React.ChangeEvent) => { + setCategory(e.target.value || null) + }, + [setCategory], + ) + + const handleSinceChange = useCallback( + (e: React.ChangeEvent) => { + setSince(e.target.value || null) + }, + [setSince], + ) + + const handleUntilChange = useCallback( + (e: React.ChangeEvent) => { + setUntil(e.target.value || null) + }, + [setUntil], + ) + + const handleApplyFilters = useCallback(() => { + fetchCalibration() + }, [fetchCalibration]) + if (loading) { return (
@@ -52,6 +85,50 @@ export function CalibrationDashboard() { outcomes.

+ {/* Filters */} +
+ + + + +
+ {/* Metric cards */}
diff --git a/web/src/components/consensus/ConsensusNav.tsx b/web/src/components/consensus/ConsensusNav.tsx index c1247f8..15d431a 100644 --- a/web/src/components/consensus/ConsensusNav.tsx +++ b/web/src/components/consensus/ConsensusNav.tsx @@ -55,7 +55,7 @@ function shortModel(model: string): string { } export function ConsensusNav() { - const { status, rounds, currentRound, currentPhase } = useConsensusStore() + const { status, rounds, currentRound, currentPhase, followups, submitQuestion } = useConsensusStore() if (rounds.length === 0) return null @@ -76,6 +76,9 @@ export function ConsensusNav() { if (!seen.has(c.url)) { seen.add(c.url); tagged.push({ ...c, role: 'challenge' }) } } } + for (const c of round.revisionCitations ?? []) { + if (!seen.has(c.url)) { seen.add(c.url); tagged.push({ ...c, role: 'revise' }) } + } } // Group by hostname const groups = new Map() @@ -216,6 +219,32 @@ export function ConsensusNav() {
)} + {followups?.length > 0 && ( +
+ + Follow up ({followups.length}) + + } + defaultOpen + > +
    + {followups.map((q, i) => ( +
  • + +
  • + ))} +
+
+
+ )} + ) diff --git a/web/src/components/consensus/ConsensusPanel.tsx b/web/src/components/consensus/ConsensusPanel.tsx index e2e2897..9fbeb46 100644 --- a/web/src/components/consensus/ConsensusPanel.tsx +++ b/web/src/components/consensus/ConsensusPanel.tsx @@ -120,6 +120,7 @@ export function ConsensusPanel() { collapsible={isCompletedRound} defaultOpen={false} truncated={round.truncated.includes('REVISE')} + citations={round.revisionCitations} /> )} diff --git a/web/src/components/threads/ThreadNav.tsx b/web/src/components/threads/ThreadNav.tsx index 9c88bc9..0b5e594 100644 --- a/web/src/components/threads/ThreadNav.tsx +++ b/web/src/components/threads/ThreadNav.tsx @@ -1,4 +1,5 @@ import { GlassPanel, Disclosure } from '@/components/shared' +import { useConsensusStore } from '@/stores/consensus' import { useThreadsStore } from '@/stores/threads' import type { Citation } from '@/api/types' @@ -145,7 +146,41 @@ export function ThreadNav() {
)} + + ) } + +function FollowupSection({ followups }: { followups?: string[] }) { + const submitQuestion = useConsensusStore((s) => s.submitQuestion) + + if (!followups || followups.length === 0) return null + + return ( +
+ + Follow up ({followups.length}) + + } + defaultOpen + > +
    + {followups.map((q, i) => ( +
  • + +
  • + ))} +
+
+
+ ) +} diff --git a/web/src/stores/calibration.ts b/web/src/stores/calibration.ts index efce8f1..083f410 100644 --- a/web/src/stores/calibration.ts +++ b/web/src/stores/calibration.ts @@ -13,10 +13,14 @@ interface CalibrationState { // Filters category: string | null + since: string | null + until: string | null // Actions fetchCalibration: () => Promise setCategory: (category: string | null) => void + setSince: (since: string | null) => void + setUntil: (until: string | null) => void } export const useCalibrationStore = create((set, get) => ({ @@ -29,13 +33,17 @@ export const useCalibrationStore = create((set, get) => ({ error: null, category: null, + since: null, + until: null, fetchCalibration: async () => { set({ loading: true, error: null }) try { - const { category } = get() - const params: { category?: string } = {} + const { category, since, until } = get() + const params: { category?: string; since?: string; until?: string } = {} if (category) params.category = category + if (since) params.since = since + if (until) params.until = until const data = await api.calibration(params) set({ @@ -54,4 +62,12 @@ export const useCalibrationStore = create((set, get) => ({ setCategory: (category) => { set({ category }) }, + + setSince: (since) => { + set({ since }) + }, + + setUntil: (until) => { + set({ until }) + }, })) diff --git a/web/src/stores/consensus.ts b/web/src/stores/consensus.ts index 290e7b4..baa2111 100644 --- a/web/src/stores/consensus.ts +++ b/web/src/stores/consensus.ts @@ -29,6 +29,7 @@ export interface RoundData { challenges: ChallengeEntry[] reviser: string | null revision: string | null + revisionCitations?: Citation[] | null confidence: number | null rigor: number | null dissent: string | null @@ -56,6 +57,7 @@ interface ConsensusState { cost: number | null threadId: string | null overview: string | null + followups: string[] // Refinement clarifyingQuestions: ClarifyingQuestion[] @@ -106,6 +108,7 @@ const initialState = { cost: null as number | null, threadId: null as string | null, overview: null as string | null, + followups: [] as string[], clarifyingQuestions: [] as ClarifyingQuestion[], clarificationAnswers: {} as Record, pendingRounds: 3, @@ -203,6 +206,7 @@ export const useConsensusStore = create((set, get) => ({ cost: null, threadId: null, overview: null, + followups: [], clarifyingQuestions: [], clarificationAnswers: {}, }) @@ -292,6 +296,7 @@ function handleEvent( if (event.truncated) update.truncated = [...round.truncated, 'PROPOSE'] } else if (event.phase === 'REVISE') { update.revision = event.content ?? null + update.revisionCitations = event.citations ?? null if (event.truncated) update.truncated = [...round.truncated, 'REVISE'] } @@ -353,6 +358,7 @@ function handleEvent( cost: event.cost, threadId: event.thread_id ?? null, overview: event.overview ?? null, + followups: event.followups ?? [], }) break }