From 3c64cca27a0fd2d2a5a43b1829c85a15bc19787b Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Sat, 18 Apr 2026 18:44:12 -0700 Subject: [PATCH] feat(a2a): opt in to confidence-v1 + blast-v1 + hitl-mode-v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Declares three additional workstacean extensions on Quinn's agent card: - confidence-v1: model self-reports confidence via <confidence> and <confidence_explanation> tags in OUTPUT_FORMAT_INSTRUCTIONS. Parsed from the accumulated stream in _chat_langgraph_stream and emitted as a DataPart on the terminal artifact (mirrors the cost-v1 wiring). success flag is derived from the terminal state so FAILED + high confidence correctly registers as a calibration-warning signal on the workstacean side. - blast-v1: per-skill scope of effect (self | project | repo). Card-only policy metadata — planner + HITL policy read it to gate high-impact work. Radii chosen to match what each skill actually does (qa_report = self, pr_review = repo, board_audit + bug_triage = project). - hitl-mode-v1: per-skill approval policy. All four Quinn skills declared — qa_report autonomous, the rest notification. None of Quinn's skills today warrant veto/gated/compound (pr_review leaves comments but doesn't merge; bug_triage files a reversible backlog feature). 
Closes #27 Co-Authored-By: Claude Opus 4.7 (1M context) --- a2a_handler.py | 86 +++++++++++++++++++++++++++++++++++ graph/output_format.py | 55 ++++++++++++++++++++-- server.py | 63 ++++++++++++++++++++++++- tests/test_a2a_handler.py | 79 ++++++++++++++++++++++++++++++++ tests/test_a2a_integration.py | 78 +++++++++++++++++++++++++++++++ tests/test_output_format.py | 72 +++++++++++++++++++++++++++++ 6 files changed, 429 insertions(+), 4 deletions(-) diff --git a/a2a_handler.py b/a2a_handler.py index aa8c45a..b4fdc78 100644 --- a/a2a_handler.py +++ b/a2a_handler.py @@ -73,6 +73,16 @@ # Ref: protoWorkstacean/docs/extensions/cost-v1.md COST_MIME = "application/vnd.protolabs.cost-v1+json" +# Confidence-v1: self-reported confidence score + explanation on the terminal +# artifact. Workstacean's confidence interceptor reads +# result.data.confidence (clamped to [0, 1]) and optional +# result.data.confidenceExplanation, records a ConfidenceSample, and +# publishes autonomous.confidence.{agent}.{skill}. Planner L0 reads +# avgConfidenceOnSuccess alongside cost for candidate ranking. +# Schema: {"confidence": float, "confidenceExplanation": str?, "success": bool} +# Ref: protoWorkstacean/docs/extensions/confidence-v1.md +CONFIDENCE_MIME = "application/vnd.protolabs.confidence-v1+json" + # ── Data types ──────────────────────────────────────────────────────────────── @@ -117,6 +127,12 @@ class TaskRecord: # cost interceptor (protoWorkstacean#372) can record per-skill samples. # Shape: {"input_tokens": int, "output_tokens": int, "total_tokens": int} usage: dict = field(default_factory=lambda: {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + # Self-reported confidence for the confidence-v1 DataPart. Set by the + # producer when it parses a tag out of the model's final + # output. Clamped to [0, 1] on write; None when the model didn't + # report one (the interceptor no-ops in that case). 
+ confidence: float | None = None + confidence_explanation: str | None = None # ── asyncio primitives (not serialised) ── _cancel_event: asyncio.Event = field(default_factory=asyncio.Event, repr=False) _update_event: asyncio.Event = field(default_factory=asyncio.Event, repr=False) @@ -234,6 +250,32 @@ async def add_usage(self, task_id: str, input_tokens: int, output_tokens: int) - record.usage["input_tokens"] + record.usage["output_tokens"] ) + async def set_confidence( + self, + task_id: str, + confidence: float, + explanation: str | None = None, + ) -> None: + """Record the agent's self-reported confidence for this task. + + Called once from the producer when it parses a tag + out of the model's final output. Emitted on the terminal artifact + under the confidence-v1 MIME so Workstacean's confidence + interceptor can record per-skill samples. + + Confidence is clamped to [0, 1] defensively — the workstacean-side + interceptor also clamps, but we do it here too so the emitted + DataPart is always in-spec. + """ + clamped = max(0.0, min(1.0, float(confidence))) + async with self._lock: + record = self._tasks.get(task_id) + if record is None: + return + record.confidence = clamped + if explanation and isinstance(explanation, str): + record.confidence_explanation = explanation.strip() or None + async def cancel_if_not_terminal(self, task_id: str) -> TaskRecord | None: """Atomically cancel a task iff it's not already terminal. @@ -348,9 +390,37 @@ def _terminal_artifact_parts(record: TaskRecord) -> list[dict]: "data": cost_data, "metadata": {"mimeType": COST_MIME}, }) + confidence_data = _confidence_payload(record) + if confidence_data is not None: + parts.append({ + "kind": "data", + "data": confidence_data, + "metadata": {"mimeType": CONFIDENCE_MIME}, + }) return parts +def _confidence_payload(record: TaskRecord) -> dict | None: + """Build the confidence-v1 payload for a terminal record, or None if the + agent didn't self-report a confidence score this run. 
+ + ``success`` is derived from the terminal state — COMPLETED is the only + truthy case (CANCELED and FAILED both count as not-a-success for + OutcomeAnalysis's purposes). The interceptor pairs confidence with + success when recording samples; reporting a confidence on a FAILED run + is exactly the "high-confidence failure" calibration signal. + """ + if record.confidence is None: + return None + payload: dict = { + "confidence": record.confidence, + "success": record.state == COMPLETED, + } + if record.confidence_explanation: + payload["confidenceExplanation"] = record.confidence_explanation + return payload + + def _cost_payload(record: TaskRecord) -> dict | None: """Build the cost-v1 payload for a terminal record, or None if no cost-relevant data is available. @@ -786,6 +856,22 @@ async def _run_task_background( output_tokens=payload.get("output_tokens", 0), ) + elif event_type == "confidence": + # Self-reported confidence parsed from the model's final + # output. Stored on the record and emitted on the terminal + # artifact under the confidence-v1 MIME for Workstacean's + # confidence interceptor. + if isinstance(payload, dict) and "confidence" in payload: + try: + await _store.set_confidence( + task_id, + confidence=float(payload["confidence"]), + explanation=payload.get("explanation"), + ) + except (TypeError, ValueError): + # Bad payload — skip rather than crash the run. + pass + elif event_type == "done": record = await _store.update_state( task_id, diff --git a/graph/output_format.py b/graph/output_format.py index d799a7d..32bd719 100644 --- a/graph/output_format.py +++ b/graph/output_format.py @@ -41,15 +41,28 @@ The user-facing answer. This is what lands in the A2A artifact / Discord / Gradio chat. Be clean, scannable, markdown-formatted. + 0.85 + + One short sentence on why this score — what made you sure or unsure. + Rules: -- Always emit both tags, in that order, exactly once. 
-- Never include literal `` or `` markers inside the - user-facing content. +- Always emit `` and ``, in that order, exactly once. +- Never include literal `` / `` / `` / + `` markers inside the user-facing content. - Keep tool-calling deliberation in ``. Keep only the finished, customer-ready answer in ``. - If you must defer or ask for clarification, put the question inside `` too — the user never sees ``. + +Confidence (required on terminal responses): +- `` is a number in [0, 1] — your self-assessed confidence + that the `` is correct and complete. Calibrate honestly: a + 0.9 should mean you'd bet on it; a 0.5 means roughly a coin flip. +- `` is one short sentence on what drove the + score — spec clarity, tool-result completeness, edge cases unchecked. +- Omit both tags when you're only calling tools (no final answer yet). + Include them once, on the turn that contains the final ``. """.strip() @@ -59,6 +72,16 @@ _THINK_RE = re.compile(r"[\s\S]*?", re.IGNORECASE) _ORPHAN_THINK_OPEN_RE = re.compile(r"[\s\S]*$", re.IGNORECASE) _ORPHAN_THINK_CLOSE_RE = re.compile(r"\s*", re.IGNORECASE) +_CONFIDENCE_RE = re.compile( + r"\s*(-?[\d.]+)\s*", re.IGNORECASE, +) +_CONFIDENCE_EXPLANATION_RE = re.compile( + r"([\s\S]*?)", re.IGNORECASE, +) +_CONFIDENCE_ANY_RE = re.compile( + r"[\s\S]*?", + re.IGNORECASE, +) def _strip_reasoning(text: str) -> str: @@ -73,9 +96,35 @@ def _strip_reasoning(text: str) -> str: text = _ORPHAN_THINK_CLOSE_RE.sub("", text) text = _SCRATCH_RE.sub("", text) text = _ORPHAN_SCRATCH_OPEN_RE.sub("", text) + text = _CONFIDENCE_ANY_RE.sub("", text) return text +def extract_confidence(text: str) -> tuple[float | None, str | None]: + """Pull ``(confidence, explanation)`` out of a complete model response. + + Returns ``(None, None)`` if the model didn't emit a `` tag. + Clamps confidence to [0, 1]. 
Unparseable numbers return ``None`` so + ``_chat_langgraph_stream`` emits no confidence event — the workstacean + interceptor no-ops on missing confidence, which is the correct + fallback for a malformed self-report. + """ + m = _CONFIDENCE_RE.search(text) + if not m: + return None, None + try: + value = float(m.group(1)) + except ValueError: + return None, None + value = max(0.0, min(1.0, value)) + explanation_m = _CONFIDENCE_EXPLANATION_RE.search(text) + explanation = None + if explanation_m: + cleaned = explanation_m.group(1).strip() + explanation = cleaned or None + return value, explanation + + def extract_output(text: str) -> str: """Return the user-facing content from a complete model response. diff --git a/server.py b/server.py index af1d4d7..3d0d474 100644 --- a/server.py +++ b/server.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Any -from graph.output_format import extract_output +from graph.output_format import extract_confidence, extract_output # chat_ui pulls in gradio, which the server needs at runtime but which would # otherwise block anyone from importing tiny helpers (e.g. _build_agent_card) @@ -476,6 +476,17 @@ async def _chat_langgraph_stream(message: str, session_id: str, *, caller_trace: "output_tokens": int(usage.get("output_tokens", 0) or 0), }) + # Self-reported confidence, if the model emitted + # tags. Pulled from the raw text BEFORE extract_output strips + # reasoning markers. Emitted ahead of "done" so the a2a handler + # stamps it on the terminal artifact. 
+ confidence, explanation = extract_confidence(accumulated_raw) + if confidence is not None: + yield ("confidence", { + "confidence": confidence, + "explanation": explanation, + }) + yield ("done", extract_output(accumulated_raw)) except GeneratorExit: @@ -819,6 +830,56 @@ def _build_agent_card(host: str) -> dict: { "uri": "https://proto-labs.ai/a2a/ext/cost-v1", }, + # confidence-v1: Quinn emits a self-reported confidence score + # on the terminal artifact when the model includes + # / tags (see + # graph/output_format.py). Workstacean's confidence + # interceptor records per-(agent, skill) samples so planner + # ranking can weight by avgConfidenceOnSuccess and + # OutcomeAnalysis can flag high-confidence-failure clusters. + # Ref: docs/extensions/confidence-v1.md in protoWorkstacean. + { + "uri": "https://proto-labs.ai/a2a/ext/confidence-v1", + }, + # blast-v1: per-skill scope of effect so HITL policy + + # planner can apply stricter gates to higher-impact work. + # Read-side only (no response payload). Radii here align + # with what Quinn actually does in each skill handler — + # don't over-declare, the planner uses this for tiebreaking. + # + # Ref: docs/extensions/blast-v1.md in protoWorkstacean. + { + "uri": "https://proto-labs.ai/a2a/ext/blast-v1", + "params": { + "skills": { + "qa_report": {"radius": "self"}, + "board_audit": {"radius": "project"}, + "pr_review": {"radius": "repo"}, + "bug_triage": {"radius": "project"}, + }, + }, + }, + # hitl-mode-v1: per-skill approval policy. Composes with + # blast-v1 so higher-blast work can be gated independently + # of goal-level config. All Quinn skills are safe enough to + # run without blocking gates today — bug_triage files a + # backlog feature (reversible), board_audit + qa_report + # are read-only, pr_review posts review comments but never + # merges. Notification mode surfaces the action to the + # originating surface without blocking. + # + # Ref: docs/extensions/hitl-mode-v1.md in protoWorkstacean. 
+ { + "uri": "https://proto-labs.ai/a2a/ext/hitl-mode-v1", + "params": { + "skills": { + "qa_report": {"mode": "autonomous"}, + "board_audit": {"mode": "notification"}, + "pr_review": {"mode": "notification"}, + "bug_triage": {"mode": "notification"}, + }, + }, + }, ], }, "defaultInputModes": ["text/plain"], diff --git a/tests/test_a2a_handler.py b/tests/test_a2a_handler.py index 7ec250d..3ca61c9 100644 --- a/tests/test_a2a_handler.py +++ b/tests/test_a2a_handler.py @@ -839,6 +839,85 @@ async def test_store_add_usage_ignores_zero_payloads(store): assert fetched.usage == {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} +# ── confidence-v1 ───────────────────────────────────────────────────────────── + + +def test_terminal_artifact_emits_confidence_v1_when_set(): + """When the producer set a confidence score on the record, the terminal + artifact carries a confidence-v1 DataPart with confidence + success + + optional explanation. Workstacean's confidence interceptor extracts + this onto result.data for per-(agent, skill) ConfidenceSamples.""" + from a2a_handler import CONFIDENCE_MIME, _terminal_artifact_parts + record = _make_record(state=COMPLETED, accumulated_text="hi") + record.confidence = 0.82 + record.confidence_explanation = "Spec unambiguous; all tests pass." + parts = _terminal_artifact_parts(record) + conf = next( + (p for p in parts if p.get("metadata", {}).get("mimeType") == CONFIDENCE_MIME), + None, + ) + assert conf is not None, "confidence-v1 DataPart missing" + assert conf["data"]["confidence"] == 0.82 + assert conf["data"]["confidenceExplanation"] == "Spec unambiguous; all tests pass." 
+ assert conf["data"]["success"] is True + + +def test_terminal_artifact_confidence_success_is_false_on_failure(): + """FAILED and CANCELED terminal states must report success=False so the + interceptor can classify high-confidence failures — the whole point + of calibration tracking.""" + from a2a_handler import CONFIDENCE_MIME, _terminal_artifact_parts + record = _make_record(state=FAILED, accumulated_text="") + record.confidence = 0.9 + parts = _terminal_artifact_parts(record) + conf = next( + (p for p in parts if p.get("metadata", {}).get("mimeType") == CONFIDENCE_MIME), + None, + ) + assert conf is not None + assert conf["data"]["success"] is False + # explanation omitted when not set — avoids empty string noise. + assert "confidenceExplanation" not in conf["data"] + + +def test_terminal_artifact_omits_confidence_v1_when_not_reported(): + """No tag in the model output → no DataPart. The + interceptor no-ops on absent confidence, so emitting an empty payload + would just be noise.""" + from a2a_handler import CONFIDENCE_MIME, _terminal_artifact_parts + record = _make_record(state=COMPLETED, accumulated_text="hi") + # confidence defaults to None + parts = _terminal_artifact_parts(record) + conf = next( + (p for p in parts if p.get("metadata", {}).get("mimeType") == CONFIDENCE_MIME), + None, + ) + assert conf is None + + +@pytest.mark.asyncio +async def test_store_set_confidence_clamps_to_unit_interval(store): + """Misbehaving models can emit 1.2 or -0.3. 
We clamp to [0, 1] here + too (the workstacean interceptor also clamps, but defence-in-depth + keeps the DataPart in-spec on the wire).""" + record = _make_record() + await store.create(record) + await store.set_confidence("test-task-id", confidence=1.5, explanation="over") + fetched = await store.get("test-task-id") + assert fetched.confidence == 1.0 + assert fetched.confidence_explanation == "over" + await store.set_confidence("test-task-id", confidence=-0.2) + fetched = await store.get("test-task-id") + assert fetched.confidence == 0.0 + + +@pytest.mark.asyncio +async def test_store_set_confidence_ignores_missing_task(store): + """set_confidence on an unknown task id must no-op, not raise.""" + await store.set_confidence("no-such-task", confidence=0.5) + # No exception = pass + + def test_task_to_response_has_kind_discriminator(): """A2A spec: Task objects carry kind='task'. This is what message/send / tasks/get / initial stream frame all return.""" diff --git a/tests/test_a2a_integration.py b/tests/test_a2a_integration.py index 7ff5cba..4dfad30 100644 --- a/tests/test_a2a_integration.py +++ b/tests/test_a2a_integration.py @@ -118,6 +118,84 @@ def test_agent_card_declares_cost_v1_extension() -> None: ) +def test_agent_card_declares_confidence_v1_extension() -> None: + """Quinn emits a confidence-v1 DataPart when the model self-reports + via tags. The declaration tells Workstacean's confidence + interceptor to expect the payload on result.data.""" + from server import _build_agent_card + + card = _build_agent_card("quinn:7870") + exts = card["capabilities"].get("extensions", []) + conf_ext = next( + (e for e in exts + if e.get("uri") == "https://proto-labs.ai/a2a/ext/confidence-v1"), + None, + ) + assert conf_ext is not None, ( + "Missing confidence-v1 extension — planner ranking won't get " + "avgConfidenceOnSuccess samples from Quinn." 
+ ) + + +def test_agent_card_declares_blast_v1_with_real_skills() -> None: + """blast-v1 is card-only policy metadata — HITL policy + planner read + the radius to decide gating. Every radius must map to a real skill in + the card (over-declaring invents skills the planner can't actually + route to); every radius must be one of the five valid values.""" + from server import _build_agent_card + + card = _build_agent_card("quinn:7870") + exts = card["capabilities"].get("extensions", []) + blast_ext = next( + (e for e in exts + if e.get("uri") == "https://proto-labs.ai/a2a/ext/blast-v1"), + None, + ) + assert blast_ext is not None, "Missing blast-v1 extension declaration." + + declared = blast_ext.get("params", {}).get("skills", {}) + assert declared, "blast-v1 declared but no skills mapped." + + real_skill_ids = {s["id"] for s in card["skills"]} + valid_radii = {"self", "project", "repo", "fleet", "public"} + for skill_id, entry in declared.items(): + assert skill_id in real_skill_ids, ( + f"blast-v1 declares '{skill_id}' but no such skill on card" + ) + assert entry.get("radius") in valid_radii, ( + f"blast-v1 '{skill_id}' has invalid radius {entry.get('radius')!r}" + ) + + +def test_agent_card_declares_hitl_mode_v1_with_real_skills() -> None: + """hitl-mode-v1 is card-only approval policy. HITL plugin reads the + mode to select the rendering path. Validate every declared skill + exists on the card and every mode is one of the five valid values.""" + from server import _build_agent_card + + card = _build_agent_card("quinn:7870") + exts = card["capabilities"].get("extensions", []) + hitl_ext = next( + (e for e in exts + if e.get("uri") == "https://proto-labs.ai/a2a/ext/hitl-mode-v1"), + None, + ) + assert hitl_ext is not None, "Missing hitl-mode-v1 extension declaration." + + declared = hitl_ext.get("params", {}).get("skills", {}) + assert declared, "hitl-mode-v1 declared but no skills mapped." 
+ + real_skill_ids = {s["id"] for s in card["skills"]} + valid_modes = {"autonomous", "notification", "veto", "gated", "compound"} + for skill_id, entry in declared.items(): + assert skill_id in real_skill_ids, ( + f"hitl-mode-v1 declares '{skill_id}' but no such skill on card" + ) + assert entry.get("mode") in valid_modes, ( + f"hitl-mode-v1 '{skill_id}' has invalid mode {entry.get('mode')!r}" + ) + + # ── Worldstate-delta-v1 runtime emission ───────────────────────────────────── diff --git a/tests/test_output_format.py b/tests/test_output_format.py index 148761f..396c2da 100644 --- a/tests/test_output_format.py +++ b/tests/test_output_format.py @@ -18,6 +18,7 @@ from graph.output_format import ( OUTPUT_FORMAT_INSTRUCTIONS, _strip_reasoning, + extract_confidence, extract_output, ) @@ -94,3 +95,74 @@ def test_instructions_mention_both_tags(): """Sanity check — the prompt fragment must teach both tags.""" assert "" in OUTPUT_FORMAT_INSTRUCTIONS assert "" in OUTPUT_FORMAT_INSTRUCTIONS + + +# ── confidence-v1 parsing ──────────────────────────────────────────────────── + + +def test_extract_confidence_happy_path(): + text = ( + "think" + "answer" + "0.82" + "Spec clear; tests pass." + ) + conf, expl = extract_confidence(text) + assert conf == 0.82 + assert expl == "Spec clear; tests pass." + + +def test_extract_confidence_returns_none_when_absent(): + """Model didn't self-report — both values must be None so the stream + skips the confidence event entirely.""" + assert extract_confidence("answer") == (None, None) + + +def test_extract_confidence_clamps_out_of_range(): + """Miscalibrated models occasionally emit >1 or <0. 
Clamp so the + DataPart on the wire stays in-spec.""" + high, _ = extract_confidence("1.3") + low, _ = extract_confidence("-0.4") + assert high == 1.0 + assert low == 0.0 + + +def test_extract_confidence_unparseable_returns_none(): + """Garbage like probably → None, not a crash.""" + conf, expl = extract_confidence("probably") + assert conf is None + assert expl is None + + +def test_extract_confidence_explanation_optional(): + """A tag without an accompanying explanation still + yields the score (explanation is optional per the spec).""" + conf, expl = extract_confidence("0.5") + assert conf == 0.5 + assert expl is None + + +def test_extract_output_strips_confidence_tags(): + """Confidence markers must not leak into the user-facing output — + they're metadata for the interceptor, not prose for the reader.""" + text = ( + "the answer" + "0.9" + "why" + ) + # extract_output reads the body directly; the extra tags + # live outside it and simply shouldn't survive the reasoning strip + # if someone concatenates without wrapping. + assert extract_output(text) == "the answer" + + # Mixed bag — no , just tags at the end. _strip_reasoning + # should drop confidence entirely. + tailing = "plain answer0.7" + assert extract_output(tailing) == "plain answer" + + +def test_instructions_mention_confidence_tags(): + """The prompt must teach so the model actually emits + the score — the whole extension is dead weight without this.""" + assert "" in OUTPUT_FORMAT_INSTRUCTIONS + assert "" in OUTPUT_FORMAT_INSTRUCTIONS