diff --git a/.cspell.json b/.cspell.json index 9c2501fa5..a05105c0b 100644 --- a/.cspell.json +++ b/.cspell.json @@ -24,7 +24,8 @@ "**/Cargo.lock", "CHANGELOG.md", "logs/**", - "docs/docusaurus/build/**" + "docs/docusaurus/build/**", + "beval/**/results/**" ], "ignoreRegExpList": [ "/#.*/g", @@ -62,9 +63,11 @@ "general-technical" ], "words": [ + "agentic", "atheris", "behaviour", "behavioural", + "beval", "brainwriting", "clusterfuzzlite", "collab", @@ -72,12 +75,13 @@ "figjam", "hideable", "learning", + "parseable", "smol", "subcat", "whiteboarding", + "wireframes", "ˈpræksɪs", - "πρᾶξις", - "agentic" + "πρᾶξις" ], "reporters": [ "default", diff --git a/.github/workflows/beval.yml b/.github/workflows/beval.yml new file mode 100644 index 000000000..f3be9a56f --- /dev/null +++ b/.github/workflows/beval.yml @@ -0,0 +1,87 @@ +name: Behavioral Evaluation (beval) + +on: + workflow_call: + secrets: + COPILOT_TOKEN: + required: true + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +jobs: + evaluate: + runs-on: ubuntu-latest + timeout-minutes: 30 + + env: + AGENT_REPO_ROOT: ${{ github.workspace }} + + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install GitHub Copilot CLI + run: | + npm ci --prefix beval + echo "${{ github.workspace }}/beval/node_modules/.bin" >> "$GITHUB_PATH" + + - name: Install beval + # beval is hosted under a personal account (vyta) while an org-owned + # home is evaluated. The install is pinned to a specific commit SHA to + # mitigate supply-chain risk in the interim. 
+ run: pip install --no-cache-dir "beval[all] @ git+https://github.com/vyta/beval.git@a2effa10cec1b06c394811587fede0070174d589#subdirectory=python" + + - name: Start agent (TCP) + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }} + run: | + copilot --acp --port 3000 & + for i in $(seq 1 30); do + nc -z 127.0.0.1 3000 && break + echo "Waiting for agent to start ($i)..." + sleep 2 + done + nc -z 127.0.0.1 3000 || { echo "Agent failed to start"; exit 1; } + + - name: Start judge (TCP) + env: + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }} + run: | + copilot --acp --port 3001 & + for i in $(seq 1 30); do + nc -z 127.0.0.1 3001 && break + echo "Waiting for judge to start ($i)..." + sleep 2 + done + nc -z 127.0.0.1 3001 || { echo "Judge failed to start"; exit 1; } + + - name: Run evaluations + run: | + beval \ + -c beval/dt-coach/eval.config.yaml \ + run \ + --cases beval/dt-coach/cases/ \ + --agent beval/dt-coach/agent.yaml \ + -m validation \ + -o beval/dt-coach/results/results.json + + - name: Upload results + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: beval-results-${{ github.run_id }} + path: beval/dt-coach/results/ + retention-days: 30 diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index a6cb0d9d8..31aaff26a 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -55,12 +55,22 @@ jobs: WTFPL, LicenseRef-scancode-unicode # Packages with compound SPDX expressions containing GPL or MPL # from bundled code; distributed licenses are permissive. + # @github/copilot uses a non-SPDX proprietary license + # (LicenseRef-bad-see-license-in-license.md); it is GitHub's own + # CLI toolchain, deliberately used in beval.yml. # pkg:npm/hve-core is the private root package (never published to npm). 
allow-dependencies-licenses: >- pkg:pypi/lxml, pkg:pypi/typing-extensions, pkg:npm/dompurify, pkg:npm/lunr-languages, + pkg:npm/%40github/copilot, + pkg:npm/%40github/copilot-darwin-arm64, + pkg:npm/%40github/copilot-darwin-x64, + pkg:npm/%40github/copilot-linux-arm64, + pkg:npm/%40github/copilot-linux-x64, + pkg:npm/%40github/copilot-win32-arm64, + pkg:npm/%40github/copilot-win32-x64, pkg:npm/hve-core show-openssf-scorecard: true warn-on-openssf-scorecard-level: 3 diff --git a/beval/dt-coach/agent.yaml b/beval/dt-coach/agent.yaml new file mode 100644 index 000000000..a82398922 --- /dev/null +++ b/beval/dt-coach/agent.yaml @@ -0,0 +1,20 @@ +name: dt-coach +description: > + Design Thinking Coach — a conversational coaching agent that guides teams + through the 9 Design Thinking for HVE methods using a Think/Speak/Empower + philosophy. +protocol: acp +connection: + transport: tcp + host: ${AGENT_HOST:-127.0.0.1} + port: ${AGENT_PORT:-3000} + cwd: ${AGENT_REPO_ROOT:-.} + model: ${AGENT_MODEL:-claude-opus-4.6-1m} +init_prompt: "Launch .github/agents/design-thinking/dt-coach.agent.md" +timeout: 120 +retry: + max_attempts: 2 + backoff: 5.0 +metadata: + domain: design-thinking + version: "0.1" diff --git a/beval/dt-coach/cases/coaching-behaviors.yaml b/beval/dt-coach/cases/coaching-behaviors.yaml new file mode 100644 index 000000000..cdec72b98 --- /dev/null +++ b/beval/dt-coach/cases/coaching-behaviors.yaml @@ -0,0 +1,130 @@ +background: + category: coaching-behaviors + given: + domain: design-thinking + +cases: + # ── Think / Speak / Empower philosophy ────────────────────────── + + - id: think_speak_empower_pattern + name: Response follows Think/Speak/Empower structure + tags: [philosophy, core] + given: + query: > + Our team has been struggling with a legacy inventory system. Users + keep asking for a dashboard, but we're not sure that's the real + problem. Can you help us figure out what to do? 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + shares an observation or insight conversationally (e.g. "I'm + noticing..." or "This makes me think...") and ends with a + choice or open question that empowers the user to decide what + to explore next, rather than giving a directive or action plan + + - id: short_conversational_responses + name: Keep responses concise — no methodology lectures + tags: [conversation-style, core] + given: + query: > + What is design thinking and how does it work? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 2000] + - the answer should be: > + gives a brief, conversational explanation without delivering a + long methodology lecture or listing all 9 methods in detail; + keeps the response focused and asks what the user wants to + work on rather than comprehensively explaining the framework + + - id: empowers_with_choices + name: End with choices not directives + tags: [philosophy, core] + given: + query: > + We just finished interviewing 8 stakeholders. What do we do now? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + presents options or asks a question that lets the user choose + the next step rather than issuing a single directive like + "you should do X"; the response ends with something like + "does that resonate?" or "want to explore that or move forward?" 
+ + # ── Coaching boundaries ───────────────────────────────────────── + + - id: collaborate_not_execute + name: Work WITH users, not FOR them + tags: [boundaries, core] + given: + query: > + Can you create a stakeholder map for our project? The key people + are the VP of Operations, two plant managers, a shift supervisor, + and the IT director. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + does NOT simply produce a finished stakeholder map; instead + guides the user to co-create it by asking about relationships, + influence levels, or perspectives that would make the map + more useful + + - id: no_prescriptive_solutions + name: Do not prescribe specific solutions to user problems + tags: [boundaries, core] + given: + query: > + Our factory floor workers are ignoring the new safety checklist app. + Adoption is at 15%. How do we fix this? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + does NOT jump to prescribing a specific fix like "add + gamification" or "simplify the UI"; instead helps the user + explore WHY adoption is low by asking questions about user + context, pain points, or assumptions that haven't been tested + + - id: never_make_users_feel_foolish + name: Stay curious and supportive when users are confused + tags: [boundaries, tone] + given: + query: > + I don't really understand what input synthesis means. We just have + a bunch of interview notes and I'm not sure what to do with them. + This feels overwhelming. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + responds with empathy and curiosity, normalizing the feeling + of being overwhelmed; does NOT lecture about synthesis + methodology but instead offers a small, manageable starting + point and reassures the user diff --git a/beval/dt-coach/cases/method-guidance.yaml b/beval/dt-coach/cases/method-guidance.yaml new file mode 100644 index 000000000..963ee1cba --- /dev/null +++ b/beval/dt-coach/cases/method-guidance.yaml @@ -0,0 +1,240 @@ +background: + category: method-guidance + given: + domain: design-thinking + +cases: + # ── Method 1: Scope Conversations ────────────────────────────── + + - id: method_1_frozen_vs_fluid + name: "Method 1: Assess whether request is frozen or fluid" + tags: [method-1, problem-space, core] + given: + query: > + Our VP wants us to build an AI chatbot for the help desk. She's + pretty set on it. We're starting Method 1 scope conversations. + How should we approach this? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + helps the user assess whether the VP's request is frozen + (solution already decided) or fluid (open to exploring the + underlying problem), and suggests how to have scope + conversations that uncover the real need behind the chatbot + request + + - id: method_1_identify_stakeholders + name: "Method 1: Guide stakeholder identification" + tags: [method-1, problem-space, core] + given: + query: > + We want to do scope conversations for our supply chain visibility + project but we're not sure who to talk to. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - response should contain: "stakeholder" + - the answer should be: > + guides the user to identify relevant stakeholders by asking + about who is affected by supply chain visibility issues, who + makes decisions, and who has been requesting changes; does + not produce a list for them but helps them think through it + + # ── Method 2: Design Research ─────────────────────────────────── + + - id: method_2_research_planning + name: "Method 2: Help plan systematic research" + tags: [method-2, problem-space] + given: + query: > + We've completed our scope conversations and confirmed the problem + is real. Now we need to do design research. We have access to + 3 plant managers and about 20 floor operators. How do we structure + our research? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 5000] + - the answer should be: > + addresses research planning — who to interview, what to + observe, or how to capture data — and includes at least one + clarifying question or prompt that invites the user to shape + the plan rather than passively receiving it + + # ── Method 3: Input Synthesis ─────────────────────────────────── + + - id: method_3_pattern_recognition + name: "Method 3: Guide pattern recognition from research" + tags: [method-3, problem-space, core] + given: + query: > + We finished 12 interviews across 3 plants. Common things we heard: + operators say they waste time looking for tools, supervisors want + real-time status boards, maintenance crew says preventive schedules + are ignored, and everyone complains about the ERP being too slow. + Help us synthesize this. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 5000] + - the answer should be: > + helps the user identify patterns and themes across the + research findings; may offer some initial observations but + also asks questions that prompt the user to explore + connections between the findings and develop themes + + # ── Method 4: Brainstorming ───────────────────────────────────── + + - id: method_4_divergent_ideation + name: "Method 4: Facilitate divergent ideation" + tags: [method-4, solution-space, core] + given: + query: > + Our synthesis produced three themes: tool accessibility on the floor, + real-time communication gaps, and misaligned maintenance schedules. + We want to brainstorm solutions. There are 6 of us in the room. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + helps set up a brainstorming session with divergent thinking + principles (quantity over quality, build on ideas, defer + judgment); may suggest focusing on one theme at a time; does + NOT generate solutions but helps the team generate their own + + # ── Method 5: User Concepts ───────────────────────────────────── + + - id: method_5_concept_validation + name: "Method 5: Guide concept creation for validation" + tags: [method-5, solution-space] + given: + query: > + From brainstorming we picked our top 3 ideas: a tool-tracking tag + system, a floor status dashboard, and a predictive maintenance + alert. How do we turn these into user concepts? 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 5000] + - the answer should be: > + addresses how to create user-facing concept descriptions + that can be validated with stakeholders; may provide a + framework or starting structure but also asks about + target audience, validation goals, or what feedback the + user wants to get + + # ── Method 6: Low-Fidelity Prototypes ─────────────────────────── + + - id: method_6_scrappy_prototypes + name: "Method 6: Encourage scrappy constraint discovery" + tags: [method-6, solution-space] + given: + query: > + Users loved the floor status dashboard concept. We want to + prototype it. Should we start building it in React? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + steers away from jumping to code and encourages a low-fidelity + approach (paper sketches, wireframes, clickable mockups) to + discover constraints cheaply before investing in development; + asks what assumptions they want to test with the prototype + + # ── Method 7: High-Fidelity Prototypes ────────────────────────── + + - id: method_7_feasibility_testing + name: "Method 7: Guide technical feasibility testing" + tags: [method-7, implementation-space] + given: + query: > + Our paper prototypes validated the dashboard layout. Now we need + to test whether we can actually pull real-time data from the PLCs + on the floor. We're moving to high-fidelity prototyping. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + helps the user think through technical feasibility questions + and what they need to prove with the high-fidelity prototype; + asks about technical constraints, integration points, and + what "good enough" looks like at this stage + + # ── Method 8: User Testing ───────────────────────────────────── + + - id: method_8_systematic_validation + name: "Method 8: Structure user testing for validation" + tags: [method-8, implementation-space] + given: + query: > + I'm a UX lead on a manufacturing ops team. We've been working + through the design thinking methods on our floor-status dashboard + project. We now have a working prototype pulling live PLC data + and we're moving into Method 8 — user testing. We want to test + the prototype with operators at Plant B. How should we set up + the user testing? + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 5000] + - the answer should be: > + helps the user plan systematic user testing by addressing + success criteria, test scenarios, observation methods, or + feedback capture; includes questions or prompts that + encourage the user to think about what they need to learn + + # ── Method 9: Iteration at Scale ──────────────────────────────── + + - id: method_9_continuous_optimization + name: "Method 9: Guide continuous optimization approach" + tags: [method-9, implementation-space] + given: + query: > + User testing went well at Plant B. Leadership wants to roll out + the dashboard across all 5 plants. How do we approach iteration + at scale? 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 5000] + - the answer should be: > + addresses scaling considerations — acknowledges that what + worked at one plant may not transfer directly; covers + differences between sites, feedback loops, or metrics for + ongoing optimization diff --git a/beval/dt-coach/cases/progressive-hints-and-navigation.yaml b/beval/dt-coach/cases/progressive-hints-and-navigation.yaml new file mode 100644 index 000000000..febfcbe85 --- /dev/null +++ b/beval/dt-coach/cases/progressive-hints-and-navigation.yaml @@ -0,0 +1,161 @@ +background: + category: progressive-hints-and-navigation + given: + domain: design-thinking + +cases: + # ── Progressive Hint Engine ───────────────────────────────────── + + - id: hint_broad_direction_first + name: Start with broad hints when user is stuck + tags: [hints, core] + given: + query: > + We're trying to do input synthesis on our interview notes but I + have no idea where to start. I'm totally lost. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + provides a broad directional hint or gentle starting point + rather than jumping straight to a detailed step-by-step + process; acknowledges the feeling of being lost and offers + a manageable first move like looking for recurring words or + surprising moments in the notes + + - id: hint_escalation_on_repeated_confusion + name: Escalate hints when user remains stuck + tags: [hints, escalation] + given: + query: > + You suggested looking for recurring themes but I'm still stuck. + I read through all the notes and I don't see any patterns. + Everything feels unique to each person. I really don't know + what to look for. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + escalates to a more specific level of guidance — perhaps + suggesting a concrete technique like looking for emotional + reactions, workarounds people mentioned, or grouping by job + role — while still letting the user do the actual synthesis + work + + # ── Non-linear method navigation ──────────────────────────────── + + - id: backward_transition_accepted + name: Accept backward transitions between methods + tags: [navigation, non-linear, core] + given: + query: > + We started prototyping (Method 6) but realized we missed a key + stakeholder group — the night shift operators. Their workflow is + completely different. I think we need to go back to research. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + validates the decision to go backward, frames it as a + normal and healthy part of the design thinking process, + suggests returning to Method 2 (Design Research) to + understand the night shift context, and helps identify + what specific gaps to fill + + - id: transparent_method_shift + name: Announce method shifts transparently + tags: [navigation, transparency, core] + given: + query: > + We've been talking about our interview findings and I just had + an idea for a solution — what if we put QR codes on every tool + so operators can scan them to check availability? 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + acknowledges the idea positively but is transparent about + the method shift — notes they are currently in problem space + (synthesis) and the idea jumps to solution space; asks whether + they want to capture the idea and continue synthesis or + deliberately shift to brainstorming + + # ── Anti-patterns ─────────────────────────────────────────────── + + - id: no_multiple_choice_quizzes + name: Avoid multiple-choice question lists + tags: [anti-pattern, conversation-style] + given: + query: > + We need help figuring out our next step. We've completed scope + conversations and have notes from 5 interviews. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + does NOT respond with a numbered list of options like a quiz + (e.g. "1. Move to synthesis 2. Do more interviews 3. Revisit + scope"); instead offers a conversational observation about + what seems ready and asks one focused question + + - id: no_unsolicited_method_change + name: Do not change method focus without announcing it + tags: [anti-pattern, navigation] + given: + query: > + We're working on Method 3 synthesis. I noticed that two + interviewees mentioned a workaround where they text photos to + their supervisor. Is that significant? 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + stays within Method 3 (Input Synthesis) and helps the user + evaluate the significance of this finding as a synthesis + pattern; does NOT silently jump to brainstorming solutions + for the texting workaround + + # ── Session resumption ────────────────────────────────────────── + + - id: session_resumption + name: Resume session with state context + tags: [session-management, resumption] + given: + query: > + I'm back to continue our customer-portal-redesign project. We + left off in the middle of Method 2 design research last week. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + acknowledges the returning user, references Method 2 + (Design Research), and asks about or summarizes where they + left off to re-establish context before continuing coaching diff --git a/beval/dt-coach/cases/session-phases.yaml b/beval/dt-coach/cases/session-phases.yaml new file mode 100644 index 000000000..f0edc3730 --- /dev/null +++ b/beval/dt-coach/cases/session-phases.yaml @@ -0,0 +1,160 @@ +background: + category: session-phases + given: + domain: design-thinking + +cases: + # ── Phase 1: Session Initialization ───────────────────────────── + + - id: init_asks_for_project_slug + name: Ask for project slug during initialization + tags: [phase-1, initialization, core] + given: + query: > + Hi! I want to start a new design thinking project for improving + our warehouse picking process. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + asks the user for a project slug (a kebab-case identifier) + or proposes one, and begins gathering context about the user's + role, team, and which method they want to start with + + - id: init_clarifies_context + name: Gather role, team, and method focus during init + tags: [phase-1, initialization, core] + given: + query: > + I'd like coaching on our customer portal redesign. Project slug + can be "customer-portal-redesign". + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + acknowledges the user's project context, then asks about + the user's role, team composition, which design thinking + method to focus on, session goals, or time constraints — + covering at least one of these initialization items + + - id: init_defaults_to_method_1 + name: Default to Method 1 for new projects + tags: [phase-1, initialization] + given: + query: > + We have a brand new project to rethink how field technicians report + equipment failures. We haven't done any design thinking on this + yet. Project slug is "field-failure-reporting". I'm the product + manager and my team is 4 engineers plus a UX designer. We have + about an hour today. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + for a brand-new project with no prior design thinking work, + suggests starting at the beginning of the process (problem + space / early methods); acknowledges the team composition + and time constraints and begins transitioning to coaching + + # ── Phase 2: Active Coaching ──────────────────────────────────── + + - id: active_coaching_open_ended_questions + name: Ask targeted, open-ended questions during coaching + tags: [phase-2, active-coaching, core] + given: + query: > + We're in Method 1 for our field-failure-reporting project. The + original request from management was "build a mobile app for + failure reports." + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [50, 3000] + - the answer should be: > + asks open-ended questions to help the user discover the real + problem behind the solution request (e.g. "what happens today + when a technician finds a failure?"), rather than accepting + "build a mobile app" at face value + + - id: active_coaching_periodic_summary + name: Summarize progress and check direction + tags: [phase-2, active-coaching] + given: + query: > + So far we've identified that technicians currently use paper forms, + the forms get lost about 30% of the time, supervisors don't see + reports until end of shift, and there's no way to attach photos. + We also learned that technicians hate the current form because it + asks for irrelevant fields. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + summarizes or reflects back the key findings, then asks whether + the user wants to go deeper into any of these areas, broaden + scope, or move on to the next step + + # ── Phase 3: Method Transition ────────────────────────────────── + + - id: method_transition_recap_and_confirm + name: Recap accomplishments and confirm method change + tags: [phase-3, transition, core] + given: + query: > + I think we've done enough scope conversations for the + field-failure-reporting project. We talked to 6 stakeholders and + identified that the core problem is delayed visibility into + equipment health, not the reporting form itself. Let's move on. + stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + explicitly confirms the method transition, briefly recaps + key accomplishments from Method 1 (scope conversations), + and suggests the next method (Method 2: Design Research) + with a clear connection to the previous work + + # ── Phase 4: Session Closure ──────────────────────────────────── + + - id: session_closure_summary + name: Summarize session and suggest next steps on closure + tags: [phase-4, closure, core] + given: + query: > + I think that's enough for today. Let's wrap up our session on + the customer-portal-redesign project. 
+ stages: + - when: the agent processes the request + then: + - completion time should be under: 120 + - when: the agent responds + then: + - response length should be: [80, 3000] + - the answer should be: > + summarizes the session journey, highlights key decisions or + artifacts, mentions open questions or follow-up work, and + suggests how to pick up in a future session including which + method to revisit diff --git a/beval/dt-coach/eval.config.yaml b/beval/dt-coach/eval.config.yaml new file mode 100644 index 000000000..61a1299d7 --- /dev/null +++ b/beval/dt-coach/eval.config.yaml @@ -0,0 +1,20 @@ +eval: + mode: validation + thresholds: + grade_pass: 0.5 + case_pass: 0.5 + agents: + default: dt-coach + definitions: + - name: dt-coach + output: + dir: beval/dt-coach/results + format: json + judge: + protocol: acp + connection: + transport: tcp + host: ${JUDGE_HOST:-127.0.0.1} + port: ${JUDGE_PORT:-3001} + model: ${JUDGE_MODEL:-claude-opus-4.6-1m} + timeout: 60 diff --git a/beval/package-lock.json b/beval/package-lock.json new file mode 100644 index 000000000..7568fb18b --- /dev/null +++ b/beval/package-lock.json @@ -0,0 +1,128 @@ +{ + "name": "beval-deps", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "beval-deps", + "version": "1.0.0", + "dependencies": { + "@github/copilot": "1.0.9" + } + }, + "node_modules/@github/copilot": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot/-/copilot-1.0.9.tgz", + "integrity": "sha512-Kf9okaiXF7C4R38wNf4wLMzq0pCjXYvT6UL5thfA0Ttre1L3oZrPyRUzpqUp0cPnNWGU3oTz3bew0eur7IoPmg==", + "license": "SEE LICENSE IN LICENSE.md", + "bin": { + "copilot": "npm-loader.js" + }, + "optionalDependencies": { + "@github/copilot-darwin-arm64": "1.0.9", + "@github/copilot-darwin-x64": "1.0.9", + "@github/copilot-linux-arm64": "1.0.9", + "@github/copilot-linux-x64": "1.0.9", + "@github/copilot-win32-arm64": "1.0.9", + "@github/copilot-win32-x64": "1.0.9" + } + }, + 
"node_modules/@github/copilot-darwin-arm64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-darwin-arm64/-/copilot-darwin-arm64-1.0.9.tgz", + "integrity": "sha512-bqaiE9JkXXG979fmy8uK0cbDjk0gQyUkkdpWDIawf6KwVfoFxpk8dx0Xgl2Bt2vST0FPdT2PlqEYdnDz/6ZuaA==", + "cpu": [ + "arm64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "darwin" + ], + "bin": { + "copilot-darwin-arm64": "copilot" + } + }, + "node_modules/@github/copilot-darwin-x64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-darwin-x64/-/copilot-darwin-x64-1.0.9.tgz", + "integrity": "sha512-m1d8TwgbZuviKtZEoKJdgcgFDAKunXzJyAFulIt10WVtkFB32tKbzKj10gZr+C+XdkuNnWjI5RgVPjvcn8zlCw==", + "cpu": [ + "x64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "darwin" + ], + "bin": { + "copilot-darwin-x64": "copilot" + } + }, + "node_modules/@github/copilot-linux-arm64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-linux-arm64/-/copilot-linux-arm64-1.0.9.tgz", + "integrity": "sha512-3k/pIzpaCIGTr1uGXiBadW8AYWmlfkstDMYokkYYON0ZZ7dTAQRDLQTe3AD4kd0fFjtTdS6Cr56kKVIO1AHWkw==", + "cpu": [ + "arm64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "linux" + ], + "bin": { + "copilot-linux-arm64": "copilot" + } + }, + "node_modules/@github/copilot-linux-x64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-linux-x64/-/copilot-linux-x64-1.0.9.tgz", + "integrity": "sha512-tMd4Md69Jz7Z3jPEpkcGK6+4tx6UlMUOz405FqfItGmNXMw3JXQehZi3DaigYWotWU5TgUwVavRxiADup5AtsQ==", + "cpu": [ + "x64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "linux" + ], + "bin": { + "copilot-linux-x64": "copilot" + } + }, + "node_modules/@github/copilot-win32-arm64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-win32-arm64/-/copilot-win32-arm64-1.0.9.tgz", + 
"integrity": "sha512-mSkjT9A78GgyHTAX0I69yo2cUG86mG4sbldCqqXm/ZbPoHq/+1+6KxIGYeDFQU9BowT4W/fboSCFY/2OtVSY5Q==", + "cpu": [ + "arm64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "win32" + ], + "bin": { + "copilot-win32-arm64": "copilot.exe" + } + }, + "node_modules/@github/copilot-win32-x64": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@github/copilot-win32-x64/-/copilot-win32-x64-1.0.9.tgz", + "integrity": "sha512-0uaSe0sgFANXU6S9OMSj7/7swiUro61+/N/3GEUwgRJer7dfvBEFgpDC8F//pkBT9fawQS6sGCnlHk7gVCqC2g==", + "cpu": [ + "x64" + ], + "license": "SEE LICENSE IN LICENSE.md", + "optional": true, + "os": [ + "win32" + ], + "bin": { + "copilot-win32-x64": "copilot.exe" + } + } + } +} diff --git a/beval/package.json b/beval/package.json new file mode 100644 index 000000000..0be5f2249 --- /dev/null +++ b/beval/package.json @@ -0,0 +1,8 @@ +{ + "name": "beval-deps", + "version": "1.0.0", + "private": true, + "dependencies": { + "@github/copilot": "1.0.9" + } +}