microsoft · eedorenko · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
@@ -24,7 +24,8 @@
     "**/Cargo.lock",
     "CHANGELOG.md",
     "logs/**",
-    "docs/docusaurus/build/**"
+    "docs/docusaurus/build/**",
+    "beval/**/results/**"
   ],
   "ignoreRegExpList": [
     "/#.*/g",
@@ -62,22 +63,25 @@
     "general-technical"
   ],
   "words": [
+    "agentic",
     "atheris",
     "behaviour",
     "behavioural",
+    "beval",
     "brainwriting",
     "clusterfuzzlite",
     "collab",
     "easyops",
     "figjam",
     "hideable",
     "learning",
+    "parseable",
     "smol",
     "subcat",
     "whiteboarding",
+    "wireframes",
     "ˈpræksɪs",
-    "πρᾶξις",
-    "agentic"
+    "πρᾶξις"
   ],
   "reporters": [
     "default",

@@ -0,0 +1,87 @@
+# Behavioral evaluation of the dt-coach agent: installs the Copilot CLI and
+# beval, starts two local ACP servers (agent and judge), runs the dt-coach
+# cases, and uploads the results directory as an artifact.
+name: Behavioral Evaluation (beval)
+
+on:
+  # Reusable entry point; the calling workflow must supply COPILOT_TOKEN.
+  workflow_call:
+    secrets:
+      COPILOT_TOKEN:
+        required: true
+  # Manual runs.
+  workflow_dispatch:
+
+# Read-only repository access for the workflow token.
+permissions:
+  contents: read
+
+# One concurrency group per workflow and ref; a run that is already in
+# progress is not cancelled when another run for the same group starts.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    env:
+      # Referenced as ${AGENT_REPO_ROOT:-.} by the dt-coach agent definition
+      # (connection.cwd) — presumably the directory the agent works in;
+      # confirm against beval's documentation.
+      AGENT_REPO_ROOT: ${{ github.workspace }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          # Do not leave the token in the local git config; none of the
+          # following steps pushes to the repository.
+          persist-credentials: false
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          # Quoted so YAML reads the version as a string, not a float.
+          python-version: "3.12"
+
+      - name: Install GitHub Copilot CLI
+        # Installs the npm dependencies declared under beval/ and puts their
+        # executables on PATH for all later steps (the `copilot` command used
+        # below is expected to come from there). The runner-provided
+        # GITHUB_WORKSPACE variable is used instead of expanding
+        # ${{ github.workspace }} into the script text.
+        run: |
+          npm ci --prefix beval
+          echo "$GITHUB_WORKSPACE/beval/node_modules/.bin" >> "$GITHUB_PATH"
+
+      - name: Install beval
+        # beval is hosted under a personal account (vyta) while an org-owned
+        # home is evaluated. The install is pinned to a specific commit SHA to
+        # mitigate supply-chain risk in the interim.
+        # The requirement is a direct VCS reference: `[all]` requests the
+        # package's "all" extra and `#subdirectory=python` tells pip to build
+        # from the python/ directory of that repository.
+        run: pip install --no-cache-dir "beval[all] @ git+https://github.com/vyta/beval.git@a2effa10cec1b06c394811587fede0070174d589#subdirectory=python"
+
+      - name: Start agent (TCP)
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }}
+        # Starts the Copilot CLI in the background as an ACP server on port
+        # 3000, then probes the port up to 30 times, 2 s apart. A final probe
+        # after the loop decides whether the step fails.
+        run: |
+          copilot --acp --port 3000 &
+          attempt=1
+          while [ "$attempt" -le 30 ]; do
+            if nc -z 127.0.0.1 3000; then
+              break
+            fi
+            echo "Waiting for agent to start ($attempt)..."
+            sleep 2
+            attempt=$((attempt + 1))
+          done
+          if ! nc -z 127.0.0.1 3000; then
+            echo "Agent failed to start"
+            exit 1
+          fi
+
+      - name: Start judge (TCP)
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }}
+        # Same start-and-probe sequence as the agent step, for a second
+        # Copilot CLI instance listening on port 3001.
+        run: |
+          copilot --acp --port 3001 &
+          attempt=1
+          while [ "$attempt" -le 30 ]; do
+            if nc -z 127.0.0.1 3001; then
+              break
+            fi
+            echo "Waiting for judge to start ($attempt)..."
+            sleep 2
+            attempt=$((attempt + 1))
+          done
+          if ! nc -z 127.0.0.1 3001; then
+            echo "Judge failed to start"
+            exit 1
+          fi
+
+      - name: Run evaluations
+        # Runs the dt-coach cases against the dt-coach agent definition.
+        # `-c` is passed before the `run` subcommand; `-o` points at
+        # results/results.json. The results directory is created first so the
+        # run does not depend on beval creating missing parent directories.
+        # NOTE(review): `-m validation` presumably selects an evaluation
+        # mode — confirm against the beval CLI help.
+        run: |
+          mkdir -p beval/dt-coach/results
+          beval \
+            -c beval/dt-coach/eval.config.yaml \
+            run \
+            --cases beval/dt-coach/cases/ \
+            --agent beval/dt-coach/agent.yaml \
+            -m validation \
+            -o beval/dt-coach/results/results.json
+
+      - name: Upload results
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        # Runs even when an earlier step failed, so whatever was written to the
+        # results directory is still uploaded.
+        if: always()
+        with:
+          # The run id keeps the artifact name distinct per workflow run.
+          name: beval-results-${{ github.run_id }}
+          path: beval/dt-coach/results/
+          retention-days: 30
@@ -55,12 +55,22 @@ jobs:
             WTFPL, LicenseRef-scancode-unicode
           # Packages with compound SPDX expressions containing GPL or MPL
           # from bundled code; distributed licenses are permissive.
+          # @github/copilot uses a non-SPDX proprietary license
+          # (LicenseRef-bad-see-license-in-license.md); it is GitHub's own
+          # CLI toolchain, deliberately used in beval.yml.
           # pkg:npm/hve-core is the private root package (never published to npm).
           allow-dependencies-licenses: >-
             pkg:pypi/lxml,
             pkg:pypi/typing-extensions,
             pkg:npm/dompurify,
             pkg:npm/lunr-languages,
+            pkg:npm/%40github/copilot,
+            pkg:npm/%40github/copilot-darwin-arm64,
+            pkg:npm/%40github/copilot-darwin-x64,
+            pkg:npm/%40github/copilot-linux-arm64,
+            pkg:npm/%40github/copilot-linux-x64,
+            pkg:npm/%40github/copilot-win32-arm64,
+            pkg:npm/%40github/copilot-win32-x64,
             pkg:npm/hve-core
           show-openssf-scorecard: true
           warn-on-openssf-scorecard-level: 3

@@ -0,0 +1,20 @@
+# beval agent definition for the Design Thinking Coach, reached over ACP on a
+# TCP socket.
+name: dt-coach
+description: >
+  Design Thinking Coach — a conversational coaching agent that guides teams
+  through the 9 Design Thinking for HVE methods using a Think/Speak/Empower
+  philosophy.
+protocol: acp
+connection:
+  transport: tcp
+  # ${VAR:-default} placeholders: presumably expanded by beval from the
+  # environment, falling back to the text after `:-` when the variable is
+  # unset — confirm against beval's documentation.
+  # The default port 3000 matches the port the evaluation workflow starts the
+  # agent on; that workflow also sets AGENT_REPO_ROOT.
+  host: ${AGENT_HOST:-127.0.0.1}
+  port: ${AGENT_PORT:-3000}
+  cwd: ${AGENT_REPO_ROOT:-.}
+  model: ${AGENT_MODEL:-claude-opus-4.6-1m}
+# First message sent to the agent — presumably once per session; confirm.
+init_prompt: "Launch .github/agents/design-thinking/dt-coach.agent.md"
+# NOTE(review): units of timeout and backoff are not stated here (seconds
+# assumed) — confirm against beval's agent schema.
+timeout: 120
+retry:
+  max_attempts: 2
+  backoff: 5.0
+metadata:
+  domain: design-thinking
+  # Quoted so the version stays the string "0.1" rather than a float.
+  version: "0.1"
@@ -0,0 +1,130 @@
+# Settings shared by every case in this file — presumably merged into each
+# case's own `given`; confirm against the beval case schema.
+background:
+  category: coaching-behaviors
+  given:
+    domain: design-thinking
+
+cases:
+  # ── Think / Speak / Empower philosophy ──────────────────────────
+
+  # Ambiguous problem statement (users ask for a dashboard, the team doubts
+  # that is the real problem). Stage 1 bounds completion time; stage 2 bounds
+  # response length and expects an observation followed by a choice or open
+  # question instead of a directive or action plan.
+  - id: think_speak_empower_pattern
+    name: Response follows Think/Speak/Empower structure
+    tags: [philosophy, core]
+    given:
+      query: >
+        Our team has been struggling with a legacy inventory system.  Users
+        keep asking for a dashboard, but we're not sure that's the real
+        problem.  Can you help us figure out what to do?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              shares an observation or insight conversationally (e.g. "I'm
+              noticing..." or "This makes me think...") and ends with a
+              choice or open question that empowers the user to decide what
+              to explore next, rather than giving a directive or action plan
+
+  # Broad "what is design thinking" question that invites a lecture. This is
+  # the only case in the file with the tighter upper length bound of 2000;
+  # the rubric expects a brief explanation that turns back to what the user
+  # wants to work on.
+  - id: short_conversational_responses
+    name: Keep responses concise — no methodology lectures
+    tags: [conversation-style, core]
+    given:
+      query: >
+        What is design thinking and how does it work?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 2000]
+          - the answer should be: >
+              gives a brief, conversational explanation without delivering a
+              long methodology lecture or listing all 9 methods in detail;
+              keeps the response focused and asks what the user wants to
+              work on rather than comprehensively explaining the framework
+
+  # Direct "what do we do now?" question after stakeholder interviews. The
+  # rubric expects options or a question that leaves the next step to the
+  # user, not a single "you should do X" instruction.
+  - id: empowers_with_choices
+    name: End with choices not directives
+    tags: [philosophy, core]
+    given:
+      query: >
+        We just finished interviewing 8 stakeholders.  What do we do now?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              presents options or asks a question that lets the user choose
+              the next step rather than issuing a single directive like
+              "you should do X"; the response ends with something like
+              "does that resonate?" or "want to explore that or move forward?"
+
+  # ── Coaching boundaries ─────────────────────────────────────────
+
+  # The user asks for a finished artifact (a stakeholder map) and supplies the
+  # names. The rubric expects the coach to co-create it through questions
+  # rather than hand over a completed map.
+  - id: collaborate_not_execute
+    name: Work WITH users, not FOR them
+    tags: [boundaries, core]
+    given:
+      query: >
+        Can you create a stakeholder map for our project?  The key people
+        are the VP of Operations, two plant managers, a shift supervisor,
+        and the IT director.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              does NOT simply produce a finished stakeholder map; instead
+              guides the user to co-create it by asking about relationships,
+              influence levels, or perspectives that would make the map
+              more useful
+
+  # "How do we fix this?" question with a concrete metric (15% adoption). The
+  # rubric expects exploration of why adoption is low instead of a
+  # prescribed fix.
+  - id: no_prescriptive_solutions
+    name: Do not prescribe specific solutions to user problems
+    tags: [boundaries, core]
+    given:
+      query: >
+        Our factory floor workers are ignoring the new safety checklist app.
+        Adoption is at 15%.  How do we fix this?
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              does NOT jump to prescribing a specific fix like "add
+              gamification" or "simplify the UI"; instead helps the user
+              explore WHY adoption is low by asking questions about user
+              context, pain points, or assumptions that haven't been tested
+
+  # The user admits confusion and feeling overwhelmed. The rubric expects
+  # empathy and a small starting point instead of a methodology lecture.
+  # Tagged `tone` rather than `core`, unlike the other cases in this file.
+  - id: never_make_users_feel_foolish
+    name: Stay curious and supportive when users are confused
+    tags: [boundaries, tone]
+    given:
+      query: >
+        I don't really understand what input synthesis means.  We just have
+        a bunch of interview notes and I'm not sure what to do with them.
+        This feels overwhelming.
+    stages:
+      - when: the agent processes the request
+        then:
+          - completion time should be under: 120
+      - when: the agent responds
+        then:
+          - response length should be: [50, 3000]
+          - the answer should be: >
+              responds with empathy and curiosity, normalizing the feeling
+              of being overwhelmed; does NOT lecture about synthesis
+              methodology but instead offers a small, manageable starting
+              point and reassures the user