diff --git a/.cspell.json b/.cspell.json
index 9c2501fa5..a05105c0b 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -24,7 +24,8 @@
     "**/Cargo.lock",
     "CHANGELOG.md",
     "logs/**",
-    "docs/docusaurus/build/**"
+    "docs/docusaurus/build/**",
+    "beval/**/results/**"
   ],
   "ignoreRegExpList": [
     "/#.*/g",
@@ -62,9 +63,11 @@
     "general-technical"
   ],
   "words": [
+    "agentic",
     "atheris",
     "behaviour",
     "behavioural",
+    "beval",
     "brainwriting",
     "clusterfuzzlite",
     "collab",
@@ -72,12 +75,13 @@
     "figjam",
     "hideable",
     "learning",
+    "parseable",
     "smol",
     "subcat",
     "whiteboarding",
+    "wireframes",
     "ˈpræksɪs",
-    "πρᾶξις",
-    "agentic"
+    "πρᾶξις"
   ],
   "reporters": [
     "default",
diff --git a/.github/workflows/beval.yml b/.github/workflows/beval.yml
new file mode 100644
index 000000000..f3be9a56f
--- /dev/null
+++ b/.github/workflows/beval.yml
@@ -0,0 +1,87 @@
+name: Behavioral Evaluation (beval)
+
+on:
+  workflow_call:  # reusable entry point; callers must supply COPILOT_TOKEN
+    secrets:
+      COPILOT_TOKEN:
+        required: true
+  workflow_dispatch:  # can also be started manually
+
+permissions:
+  contents: read  # GITHUB_TOKEN is limited to reading repository contents
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # NOTE(review): under workflow_call, github.workflow is the caller's workflow name - confirm this cannot clash with the caller's own concurrency group
+  cancel-in-progress: false  # a newer run waits for the in-flight run instead of cancelling it
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    env:
+      AGENT_REPO_ROOT: ${{ github.workspace }}  # read by beval/dt-coach/agent.yaml as the agent's cwd
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false  # do not leave the checkout token in the local git config
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.12"
+
+      - name: Install GitHub Copilot CLI
+        run: |
+          npm ci --prefix beval
+          echo "${{ github.workspace }}/beval/node_modules/.bin" >> "$GITHUB_PATH"
+
+      - name: Install beval
+        # beval is hosted under a personal account (vyta) while an org-owned
+        # home is evaluated. The install is pinned to a specific commit SHA to
+        # mitigate supply-chain risk in the interim.
+        run: pip install --no-cache-dir "beval[all] @ git+https://github.com/vyta/beval.git@a2effa10cec1b06c394811587fede0070174d589#subdirectory=python"
+
+      - name: Start agent (TCP)
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }}
+        run: |
+          copilot --acp --port 3000 &
+          for i in $(seq 1 30); do
+            nc -z 127.0.0.1 3000 && break
+            echo "Waiting for agent to start ($i)..."
+            sleep 2
+          done
+          nc -z 127.0.0.1 3000 || { echo "Agent failed to start"; exit 1; }
+
+      - name: Start judge (TCP)
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }}
+        run: |
+          copilot --acp --port 3001 &
+          for i in $(seq 1 30); do
+            nc -z 127.0.0.1 3001 && break
+            echo "Waiting for judge to start ($i)..."
+            sleep 2
+          done
+          nc -z 127.0.0.1 3001 || { echo "Judge failed to start"; exit 1; }
+
+      - name: Run evaluations
+        run: |
+          beval \
+            -c beval/dt-coach/eval.config.yaml \
+            run \
+            --cases beval/dt-coach/cases/ \
+            --agent beval/dt-coach/agent.yaml \
+            -m validation \
+            -o beval/dt-coach/results/results.json
+
+      - name: Upload results
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        if: always()  # publish whatever results exist even when an earlier step failed
+        with:
+          name: beval-results-${{ github.run_id }}
+          path: beval/dt-coach/results/
+          retention-days: 30
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index a6cb0d9d8..31aaff26a 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -55,12 +55,22 @@ jobs:
             WTFPL, LicenseRef-scancode-unicode
           # Packages with compound SPDX expressions containing GPL or MPL
           # from bundled code; distributed licenses are permissive.
+          # @github/copilot uses a non-SPDX proprietary license
+          # (LicenseRef-bad-see-license-in-license.md); it is GitHub's own
+          # CLI toolchain, deliberately used in beval.yml.
           # pkg:npm/hve-core is the private root package (never published to npm).
           allow-dependencies-licenses: >-
             pkg:pypi/lxml,
             pkg:pypi/typing-extensions,
             pkg:npm/dompurify,
             pkg:npm/lunr-languages,
+            pkg:npm/%40github/copilot,
+            pkg:npm/%40github/copilot-darwin-arm64,
+            pkg:npm/%40github/copilot-darwin-x64,
+            pkg:npm/%40github/copilot-linux-arm64,
+            pkg:npm/%40github/copilot-linux-x64,
+            pkg:npm/%40github/copilot-win32-arm64,
+            pkg:npm/%40github/copilot-win32-x64,
             pkg:npm/hve-core
           show-openssf-scorecard: true
           warn-on-openssf-scorecard-level: 3
diff --git a/beval/dt-coach/agent.yaml b/beval/dt-coach/agent.yaml
new file mode 100644
index 000000000..a82398922
--- /dev/null
+++ b/beval/dt-coach/agent.yaml
@@ -0,0 +1,20 @@
+name: dt-coach
+description: >
+  Design Thinking Coach — a conversational coaching agent that guides teams
+  through the 9 Design Thinking for HVE methods using a Think/Speak/Empower
+  philosophy.
+protocol: acp  # the CI workflow launches the agent with "copilot --acp"
+connection:
+  transport: tcp
+  host: ${AGENT_HOST:-127.0.0.1}  # VAR:-default placeholder; presumably expanded by beval when the file is loaded - TODO confirm
+  port: ${AGENT_PORT:-3000}  # default matches the port beval.yml starts the agent on
+  cwd: ${AGENT_REPO_ROOT:-.}  # beval.yml sets AGENT_REPO_ROOT to the checked-out workspace
+  model: ${AGENT_MODEL:-claude-opus-4.6-1m}
+init_prompt: "Launch .github/agents/design-thinking/dt-coach.agent.md"
+timeout: 120  # presumably seconds (the cases assert completion time under 120) - TODO confirm units
+retry:
+  max_attempts: 2
+  backoff: 5.0  # NOTE(review): unit and backoff strategy are not visible here - confirm against beval docs
+metadata:
+  domain: design-thinking
+  version: "0.1"
diff --git a/beval/dt-coach/cases/coaching-behaviors.yaml b/beval/dt-coach/cases/coaching-behaviors.yaml
new file mode 100644
index 000000000..cdec72b98
--- /dev/null
+++ b/beval/dt-coach/cases/coaching-behaviors.yaml
@@ -0,0 +1,130 @@
+background:
+  category: coaching-behaviors
+  given:
+    domain: design-thinking
+
+cases:
+  # ── Think / Speak / Empower philosophy ──────────────────────────
+
+  - id: think_speak_empower_pattern
+    name: Response follows Think/Speak/Empower structure
+    tags: [philosophy, core]
+    given:
+      query: >
+        Our team has been struggling with a legacy inventory system.  Users
+        keep asking for a dashboard, but we're not sure that's the real
+        problem.  Can you help us figure out what to do?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              shares an observation or insight conversationally (e.g. "I'm
+              noticing..." or "This makes me think...") and ends with a
+              choice or open question that empowers the user to decide what
+              to explore next, rather than giving a directive or action plan
+
+  - id: short_conversational_responses
+    name: Keep responses concise — no methodology lectures
+    tags: [conversation-style, core]
+    given:
+      query: >
+        What is design thinking and how does it work?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 2000]
+          - the answer should be: >
+              gives a brief, conversational explanation without delivering a
+              long methodology lecture or listing all 9 methods in detail;
+              keeps the response focused and asks what the user wants to
+              work on rather than comprehensively explaining the framework
+
+  - id: empowers_with_choices
+    name: End with choices not directives
+    tags: [philosophy, core]
+    given:
+      query: >
+        We just finished interviewing 8 stakeholders.  What do we do now?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              presents options or asks a question that lets the user choose
+              the next step rather than issuing a single directive like
+              "you should do X"; the response ends with something like
+              "does that resonate?" or "want to explore that or move forward?"
+
+  # ── Coaching boundaries ─────────────────────────────────────────
+
+  - id: collaborate_not_execute
+    name: Work WITH users, not FOR them
+    tags: [boundaries, core]
+    given:
+      query: >
+        Can you create a stakeholder map for our project?  The key people
+        are the VP of Operations, two plant managers, a shift supervisor,
+        and the IT director.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              does NOT simply produce a finished stakeholder map; instead
+              guides the user to co-create it by asking about relationships,
+              influence levels, or perspectives that would make the map
+              more useful
+
+  - id: no_prescriptive_solutions
+    name: Do not prescribe specific solutions to user problems
+    tags: [boundaries, core]
+    given:
+      query: >
+        Our factory floor workers are ignoring the new safety checklist app.
+        Adoption is at 15%.  How do we fix this?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              does NOT jump to prescribing a specific fix like "add
+              gamification" or "simplify the UI"; instead helps the user
+              explore WHY adoption is low by asking questions about user
+              context, pain points, or assumptions that haven't been tested
+
+  - id: never_make_users_feel_foolish
+    name: Stay curious and supportive when users are confused
+    tags: [boundaries, tone]
+    given:
+      query: >
+        I don't really understand what input synthesis means.  We just have
+        a bunch of interview notes and I'm not sure what to do with them.
+        This feels overwhelming.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              responds with empathy and curiosity, normalizing the feeling
+              of being overwhelmed; does NOT lecture about synthesis
+              methodology but instead offers a small, manageable starting
+              point and reassures the user
diff --git a/beval/dt-coach/cases/method-guidance.yaml b/beval/dt-coach/cases/method-guidance.yaml
new file mode 100644
index 000000000..963ee1cba
--- /dev/null
+++ b/beval/dt-coach/cases/method-guidance.yaml
@@ -0,0 +1,240 @@
+background:
+  category: method-guidance
+  given:
+    domain: design-thinking
+
+cases:
+  # ── Method 1: Scope Conversations ──────────────────────────────
+
+  - id: method_1_frozen_vs_fluid
+    name: "Method 1: Assess whether request is frozen or fluid"
+    tags: [method-1, problem-space, core]
+    given:
+      query: >
+        Our VP wants us to build an AI chatbot for the help desk.  She's
+        pretty set on it.  We're starting Method 1 scope conversations.
+        How should we approach this?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              helps the user assess whether the VP's request is frozen
+              (solution already decided) or fluid (open to exploring the
+              underlying problem), and suggests how to have scope
+              conversations that uncover the real need behind the chatbot
+              request
+
+  - id: method_1_identify_stakeholders
+    name: "Method 1: Guide stakeholder identification"
+    tags: [method-1, problem-space, core]
+    given:
+      query: >
+        We want to do scope conversations for our supply chain visibility
+        project but we're not sure who to talk to.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - response should contain: "stakeholder"
+          - the answer should be: >
+              guides the user to identify relevant stakeholders by asking
+              about who is affected by supply chain visibility issues, who
+              makes decisions, and who has been requesting changes; does
+              not produce a list for them but helps them think through it
+
+  # ── Method 2: Design Research ───────────────────────────────────
+
+  - id: method_2_research_planning
+    name: "Method 2: Help plan systematic research"
+    tags: [method-2, problem-space]
+    given:
+      query: >
+        We've completed our scope conversations and confirmed the problem
+        is real.  Now we need to do design research.  We have access to
+        3 plant managers and about 20 floor operators.  How do we structure
+        our research?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 5000]
+          - the answer should be: >
+              addresses research planning — who to interview, what to
+              observe, or how to capture data — and includes at least one
+              clarifying question or prompt that invites the user to shape
+              the plan rather than passively receiving it
+
+  # ── Method 3: Input Synthesis ───────────────────────────────────
+
+  - id: method_3_pattern_recognition
+    name: "Method 3: Guide pattern recognition from research"
+    tags: [method-3, problem-space, core]
+    given:
+      query: >
+        We finished 12 interviews across 3 plants.  Common things we heard:
+        operators say they waste time looking for tools, supervisors want
+        real-time status boards, maintenance crew says preventive schedules
+        are ignored, and everyone complains about the ERP being too slow.
+        Help us synthesize this.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 5000]
+          - the answer should be: >
+              helps the user identify patterns and themes across the
+              research findings; may offer some initial observations but
+              also asks questions that prompt the user to explore
+              connections between the findings and develop themes
+
+  # ── Method 4: Brainstorming ─────────────────────────────────────
+
+  - id: method_4_divergent_ideation
+    name: "Method 4: Facilitate divergent ideation"
+    tags: [method-4, solution-space, core]
+    given:
+      query: >
+        Our synthesis produced three themes: tool accessibility on the floor,
+        real-time communication gaps, and misaligned maintenance schedules.
+        We want to brainstorm solutions.  There are 6 of us in the room.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              helps set up a brainstorming session with divergent thinking
+              principles (quantity over quality, build on ideas, defer
+              judgment); may suggest focusing on one theme at a time; does
+              NOT generate solutions but helps the team generate their own
+
+  # ── Method 5: User Concepts ─────────────────────────────────────
+
+  - id: method_5_concept_validation
+    name: "Method 5: Guide concept creation for validation"
+    tags: [method-5, solution-space]
+    given:
+      query: >
+        From brainstorming we picked our top 3 ideas: a tool-tracking tag
+        system, a floor status dashboard, and a predictive maintenance
+        alert.  How do we turn these into user concepts?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 5000]
+          - the answer should be: >
+              addresses how to create user-facing concept descriptions
+              that can be validated with stakeholders; may provide a
+              framework or starting structure but also asks about
+              target audience, validation goals, or what feedback the
+              user wants to get
+
+  # ── Method 6: Low-Fidelity Prototypes ───────────────────────────
+
+  - id: method_6_scrappy_prototypes
+    name: "Method 6: Encourage scrappy constraint discovery"
+    tags: [method-6, solution-space]
+    given:
+      query: >
+        Users loved the floor status dashboard concept.  We want to
+        prototype it.  Should we start building it in React?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              steers away from jumping to code and encourages a low-fidelity
+              approach (paper sketches, wireframes, clickable mockups) to
+              discover constraints cheaply before investing in development;
+              asks what assumptions they want to test with the prototype
+
+  # ── Method 7: High-Fidelity Prototypes ──────────────────────────
+
+  - id: method_7_feasibility_testing
+    name: "Method 7: Guide technical feasibility testing"
+    tags: [method-7, implementation-space]
+    given:
+      query: >
+        Our paper prototypes validated the dashboard layout.  Now we need
+        to test whether we can actually pull real-time data from the PLCs
+        on the floor.  We're moving to high-fidelity prototyping.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              helps the user think through technical feasibility questions
+              and what they need to prove with the high-fidelity prototype;
+              asks about technical constraints, integration points, and
+              what "good enough" looks like at this stage
+
+  # ── Method 8: User Testing ─────────────────────────────────────
+
+  - id: method_8_systematic_validation
+    name: "Method 8: Structure user testing for validation"
+    tags: [method-8, implementation-space]
+    given:
+      query: >
+        I'm a UX lead on a manufacturing ops team.  We've been working
+        through the design thinking methods on our floor-status dashboard
+        project.  We now have a working prototype pulling live PLC data
+        and we're moving into Method 8 — user testing.  We want to test
+        the prototype with operators at Plant B.  How should we set up
+        the user testing?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 5000]
+          - the answer should be: >
+              helps the user plan systematic user testing by addressing
+              success criteria, test scenarios, observation methods, or
+              feedback capture; includes questions or prompts that
+              encourage the user to think about what they need to learn
+
+  # ── Method 9: Iteration at Scale ────────────────────────────────
+
+  - id: method_9_continuous_optimization
+    name: "Method 9: Guide continuous optimization approach"
+    tags: [method-9, implementation-space]
+    given:
+      query: >
+        User testing went well at Plant B.  Leadership wants to roll out
+        the dashboard across all 5 plants.  How do we approach iteration
+        at scale?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 5000]
+          - the answer should be: >
+              addresses scaling considerations — acknowledges that what
+              worked at one plant may not transfer directly; covers
+              differences between sites, feedback loops, or metrics for
+              ongoing optimization
diff --git a/beval/dt-coach/cases/progressive-hints-and-navigation.yaml b/beval/dt-coach/cases/progressive-hints-and-navigation.yaml
new file mode 100644
index 000000000..febfcbe85
--- /dev/null
+++ b/beval/dt-coach/cases/progressive-hints-and-navigation.yaml
@@ -0,0 +1,161 @@
+background:
+  category: progressive-hints-and-navigation
+  given:
+    domain: design-thinking
+
+cases:
+  # ── Progressive Hint Engine ─────────────────────────────────────
+
+  - id: hint_broad_direction_first
+    name: Start with broad hints when user is stuck
+    tags: [hints, core]
+    given:
+      query: >
+        We're trying to do input synthesis on our interview notes but I
+        have no idea where to start.  I'm totally lost.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              provides a broad directional hint or gentle starting point
+              rather than jumping straight to a detailed step-by-step
+              process; acknowledges the feeling of being lost and offers
+              a manageable first move like looking for recurring words or
+              surprising moments in the notes
+
+  - id: hint_escalation_on_repeated_confusion
+    name: Escalate hints when user remains stuck
+    tags: [hints, escalation]
+    given:
+      query: >
+        You suggested looking for recurring themes but I'm still stuck.
+        I read through all the notes and I don't see any patterns.
+        Everything feels unique to each person.  I really don't know
+        what to look for.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              escalates to a more specific level of guidance — perhaps
+              suggesting a concrete technique like looking for emotional
+              reactions, workarounds people mentioned, or grouping by job
+              role — while still letting the user do the actual synthesis
+              work
+
+  # ── Non-linear method navigation ────────────────────────────────
+
+  - id: backward_transition_accepted
+    name: Accept backward transitions between methods
+    tags: [navigation, non-linear, core]
+    given:
+      query: >
+        We started prototyping (Method 6) but realized we missed a key
+        stakeholder group — the night shift operators.  Their workflow is
+        completely different.  I think we need to go back to research.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              validates the decision to go backward, frames it as a
+              normal and healthy part of the design thinking process,
+              suggests returning to Method 2 (Design Research) to
+              understand the night shift context, and helps identify
+              what specific gaps to fill
+
+  - id: transparent_method_shift
+    name: Announce method shifts transparently
+    tags: [navigation, transparency, core]
+    given:
+      query: >
+        We've been talking about our interview findings and I just had
+        an idea for a solution — what if we put QR codes on every tool
+        so operators can scan them to check availability?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              acknowledges the idea positively but is transparent about
+              the method shift — notes they are currently in problem space
+              (synthesis) and the idea jumps to solution space; asks whether
+              they want to capture the idea and continue synthesis or
+              deliberately shift to brainstorming
+
+  # ── Anti-patterns ───────────────────────────────────────────────
+
+  - id: no_multiple_choice_quizzes
+    name: Avoid multiple-choice question lists
+    tags: [anti-pattern, conversation-style]
+    given:
+      query: >
+        We need help figuring out our next step.  We've completed scope
+        conversations and have notes from 5 interviews.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              does NOT respond with a numbered list of options like a quiz
+              (e.g. "1. Move to synthesis 2. Do more interviews 3. Revisit
+              scope"); instead offers a conversational observation about
+              what seems ready and asks one focused question
+
+  - id: no_unsolicited_method_change
+    name: Do not change method focus without announcing it
+    tags: [anti-pattern, navigation]
+    given:
+      query: >
+        We're working on Method 3 synthesis.  I noticed that two
+        interviewees mentioned a workaround where they text photos to
+        their supervisor.  Is that significant?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              stays within Method 3 (Input Synthesis) and helps the user
+              evaluate the significance of this finding as a synthesis
+              pattern; does NOT silently jump to brainstorming solutions
+              for the texting workaround
+
+  # ── Session resumption ──────────────────────────────────────────
+
+  - id: session_resumption
+    name: Resume session with state context
+    tags: [session-management, resumption]
+    given:
+      query: >
+        I'm back to continue our customer-portal-redesign project.  We
+        left off in the middle of Method 2 design research last week.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              acknowledges the returning user, references Method 2
+              (Design Research), and asks about or summarizes where they
+              left off to re-establish context before continuing coaching
diff --git a/beval/dt-coach/cases/session-phases.yaml b/beval/dt-coach/cases/session-phases.yaml
new file mode 100644
index 000000000..f0edc3730
--- /dev/null
+++ b/beval/dt-coach/cases/session-phases.yaml
@@ -0,0 +1,160 @@
+background:
+  category: session-phases
+  given:
+    domain: design-thinking
+
+cases:
+  # ── Phase 1: Session Initialization ─────────────────────────────
+
+  - id: init_asks_for_project_slug
+    name: Ask for project slug during initialization
+    tags: [phase-1, initialization, core]
+    given:
+      query: >
+        Hi!  I want to start a new design thinking project for improving
+        our warehouse picking process.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              asks the user for a project slug (a kebab-case identifier)
+              or proposes one, and begins gathering context about the user's
+              role, team, and which method they want to start with
+
+  - id: init_clarifies_context
+    name: Gather role, team, and method focus during init
+    tags: [phase-1, initialization, core]
+    given:
+      query: >
+        I'd like coaching on our customer portal redesign.  Project slug
+        can be "customer-portal-redesign".
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              acknowledges the user's project context, then asks about
+              the user's role, team composition, which design thinking
+              method to focus on, session goals, or time constraints —
+              covering at least one of these initialization items
+
+  - id: init_defaults_to_method_1
+    name: Default to Method 1 for new projects
+    tags: [phase-1, initialization]
+    given:
+      query: >
+        We have a brand new project to rethink how field technicians report
+        equipment failures.  We haven't done any design thinking on this
+        yet.  Project slug is "field-failure-reporting".  I'm the product
+        manager and my team is 4 engineers plus a UX designer.  We have
+        about an hour today.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              for a brand-new project with no prior design thinking work,
+              suggests starting at the beginning of the process (Method 1 /
+              problem space / early methods); acknowledges the team composition
+              and time constraints and begins transitioning to coaching
+
+  # ── Phase 2: Active Coaching ────────────────────────────────────
+
+  - id: active_coaching_open_ended_questions
+    name: Ask targeted, open-ended questions during coaching
+    tags: [phase-2, active-coaching, core]
+    given:
+      query: >
+        We're in Method 1 for our field-failure-reporting project.  The
+        original request from management was "build a mobile app for
+        failure reports."
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              asks open-ended questions to help the user discover the real
+              problem behind the solution request (e.g. "what happens today
+              when a technician finds a failure?"), rather than accepting
+              "build a mobile app" at face value
+
+  - id: active_coaching_periodic_summary
+    name: Summarize progress and check direction
+    tags: [phase-2, active-coaching]
+    given:
+      query: >
+        So far we've identified that technicians currently use paper forms,
+        the forms get lost about 30% of the time, supervisors don't see
+        reports until end of shift, and there's no way to attach photos.
+        We also learned that technicians hate the current form because it
+        asks for irrelevant fields.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              summarizes or reflects back the key findings, then asks whether
+              the user wants to go deeper into any of these areas, broaden
+              scope, or move on to the next step
+
+  # ── Phase 3: Method Transition ──────────────────────────────────
+
+  - id: method_transition_recap_and_confirm
+    name: Recap accomplishments and confirm method change
+    tags: [phase-3, transition, core]
+    given:
+      query: >
+        I think we've done enough scope conversations for the
+        field-failure-reporting project.  We talked to 6 stakeholders and
+        identified that the core problem is delayed visibility into
+        equipment health, not the reporting form itself.  Let's move on.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              explicitly confirms the method transition, briefly recaps
+              key accomplishments from Method 1 (scope conversations),
+              and suggests the next method (Method 2: Design Research)
+              with a clear connection to the previous work
+
+  # ── Phase 4: Session Closure ────────────────────────────────────
+
+  - id: session_closure_summary
+    name: Summarize session and suggest next steps on closure
+    tags: [phase-4, closure, core]
+    given:
+      query: >
+        I think that's enough for today.  Let's wrap up our session on
+        the customer-portal-redesign project.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [80, 3000]
+          - the answer should be: >
+              summarizes the session journey, highlights key decisions or
+              artifacts, mentions open questions or follow-up work, and
+              suggests how to pick up in a future session including which
+              method to revisit
diff --git a/beval/dt-coach/eval.config.yaml b/beval/dt-coach/eval.config.yaml
new file mode 100644
index 000000000..61a1299d7
--- /dev/null
+++ b/beval/dt-coach/eval.config.yaml
@@ -0,0 +1,20 @@
+eval:
+  mode: validation
+  thresholds:
+    grade_pass: 0.5
+    case_pass: 0.5
+  agents:
+    default: dt-coach
+    definitions:
+      - name: dt-coach
+  output:
+    dir: beval/dt-coach/results
+    format: json
+  judge:
+    protocol: acp
+    connection:
+      transport: tcp
+      host: ${JUDGE_HOST:-127.0.0.1}
+      port: ${JUDGE_PORT:-3001}
+      model: ${JUDGE_MODEL:-claude-opus-4.6-1m}
+    timeout: 60
diff --git a/beval/package-lock.json b/beval/package-lock.json
new file mode 100644
index 000000000..7568fb18b
--- /dev/null
+++ b/beval/package-lock.json
@@ -0,0 +1,128 @@
+{
+  "name": "beval-deps",
+  "version": "1.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "beval-deps",
+      "version": "1.0.0",
+      "dependencies": {
+        "@github/copilot": "1.0.9"
+      }
+    },
+    "node_modules/@github/copilot": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot/-/copilot-1.0.9.tgz",
+      "integrity": "sha512-Kf9okaiXF7C4R38wNf4wLMzq0pCjXYvT6UL5thfA0Ttre1L3oZrPyRUzpqUp0cPnNWGU3oTz3bew0eur7IoPmg==",
+      "license": "SEE LICENSE IN LICENSE.md",
+      "bin": {
+        "copilot": "npm-loader.js"
+      },
+      "optionalDependencies": {
+        "@github/copilot-darwin-arm64": "1.0.9",
+        "@github/copilot-darwin-x64": "1.0.9",
+        "@github/copilot-linux-arm64": "1.0.9",
+        "@github/copilot-linux-x64": "1.0.9",
+        "@github/copilot-win32-arm64": "1.0.9",
+        "@github/copilot-win32-x64": "1.0.9"
+      }
+    },
+    "node_modules/@github/copilot-darwin-arm64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-darwin-arm64/-/copilot-darwin-arm64-1.0.9.tgz",
+      "integrity": "sha512-bqaiE9JkXXG979fmy8uK0cbDjk0gQyUkkdpWDIawf6KwVfoFxpk8dx0Xgl2Bt2vST0FPdT2PlqEYdnDz/6ZuaA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "bin": {
+        "copilot-darwin-arm64": "copilot"
+      }
+    },
+    "node_modules/@github/copilot-darwin-x64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-darwin-x64/-/copilot-darwin-x64-1.0.9.tgz",
+      "integrity": "sha512-m1d8TwgbZuviKtZEoKJdgcgFDAKunXzJyAFulIt10WVtkFB32tKbzKj10gZr+C+XdkuNnWjI5RgVPjvcn8zlCw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "bin": {
+        "copilot-darwin-x64": "copilot"
+      }
+    },
+    "node_modules/@github/copilot-linux-arm64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-linux-arm64/-/copilot-linux-arm64-1.0.9.tgz",
+      "integrity": "sha512-3k/pIzpaCIGTr1uGXiBadW8AYWmlfkstDMYokkYYON0ZZ7dTAQRDLQTe3AD4kd0fFjtTdS6Cr56kKVIO1AHWkw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "bin": {
+        "copilot-linux-arm64": "copilot"
+      }
+    },
+    "node_modules/@github/copilot-linux-x64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-linux-x64/-/copilot-linux-x64-1.0.9.tgz",
+      "integrity": "sha512-tMd4Md69Jz7Z3jPEpkcGK6+4tx6UlMUOz405FqfItGmNXMw3JXQehZi3DaigYWotWU5TgUwVavRxiADup5AtsQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "bin": {
+        "copilot-linux-x64": "copilot"
+      }
+    },
+    "node_modules/@github/copilot-win32-arm64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-win32-arm64/-/copilot-win32-arm64-1.0.9.tgz",
+      "integrity": "sha512-mSkjT9A78GgyHTAX0I69yo2cUG86mG4sbldCqqXm/ZbPoHq/+1+6KxIGYeDFQU9BowT4W/fboSCFY/2OtVSY5Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "bin": {
+        "copilot-win32-arm64": "copilot.exe"
+      }
+    },
+    "node_modules/@github/copilot-win32-x64": {
+      "version": "1.0.9",
+      "resolved": "https://registry.npmjs.org/@github/copilot-win32-x64/-/copilot-win32-x64-1.0.9.tgz",
+      "integrity": "sha512-0uaSe0sgFANXU6S9OMSj7/7swiUro61+/N/3GEUwgRJer7dfvBEFgpDC8F//pkBT9fawQS6sGCnlHk7gVCqC2g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "SEE LICENSE IN LICENSE.md",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "bin": {
+        "copilot-win32-x64": "copilot.exe"
+      }
+    }
+  }
+}
diff --git a/beval/package.json b/beval/package.json
new file mode 100644
index 000000000..0be5f2249
--- /dev/null
+++ b/beval/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "beval-deps",
+  "version": "1.0.0",
+  "private": true,
+  "dependencies": {
+    "@github/copilot": "1.0.9"
+  }
+}