From 3d337936fe770bce29778daec9688660551175f8 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Tue, 7 Apr 2026 13:48:12 +0200 Subject: [PATCH 01/14] ci: restructure CI to auto-run tests on staging/main pushes and PRs to main Push to staging/main runs lightweight tests (marker "not llm_call", excluding actor/conversation_manager/async_tool_loop dirs). PRs to main run the full suite. Removes the opt-in [run-tests]/[parallel_run.sh] tag mechanism. Also fixes discover_test_paths.py which had a broken startswith("test") filter that didn't match any actual test directory names. --- .github/scripts/discover_test_paths.py | 87 +++++++-- .github/workflows/tests.yml | 249 ++++++------------------- 2 files changed, 126 insertions(+), 210 deletions(-) diff --git a/.github/scripts/discover_test_paths.py b/.github/scripts/discover_test_paths.py index 49e37e701..db203cab3 100644 --- a/.github/scripts/discover_test_paths.py +++ b/.github/scripts/discover_test_paths.py @@ -9,6 +9,9 @@ # Expand specific paths to their leaf directories python discover_test_paths.py tests/function_manager tests/actor + # Exclude directories from discovery + python discover_test_paths.py --exclude tests/actor --exclude tests/conversation_manager + When explicit paths are provided: - Files are kept as-is (no expansion) - Directories are expanded to their leaf sub-directories using Option A algorithm @@ -19,8 +22,8 @@ direct test files (space-separated), plus recursive jobs for subdirs """ +import argparse import os -import sys from pathlib import Path EXCLUDE_DIRS = { @@ -33,6 +36,14 @@ "venv", } +_exclude_prefixes: list[str] = [] + + +def _is_excluded(path: str | Path) -> bool: + """Return True if path starts with any --exclude prefix.""" + s = str(path) + return any(s == p or s.startswith(p + "/") for p in _exclude_prefixes) + def has_test_files(directory): """Check if directory has test_*.py files directly in it.""" @@ -46,9 +57,17 @@ def has_test_files(directory): def has_test_subdirs(directory): 
"""Check if directory has subdirectories that contain test files (recursively).""" for subdir in directory.iterdir(): - if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: + if ( + subdir.is_dir() + and subdir.name not in EXCLUDE_DIRS + and not _is_excluded(subdir) + ): for root, dirs, files in os.walk(subdir): - dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] + dirs[:] = [ + d + for d in dirs + if d not in EXCLUDE_DIRS and not _is_excluded(os.path.join(root, d)) + ] if any(f.startswith("test_") and f.endswith(".py") for f in files): return True return False @@ -67,7 +86,7 @@ def get_direct_test_files(directory): def collect_paths(directory, paths): """Recursively collect test paths using Option A algorithm.""" - if not directory.is_dir(): + if not directory.is_dir() or _is_excluded(directory): return has_files = has_test_files(directory) @@ -81,12 +100,20 @@ def collect_paths(directory, paths): direct_files = get_direct_test_files(directory) paths.append(" ".join(str(f) for f in direct_files)) for subdir in sorted(directory.iterdir()): - if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: + if ( + subdir.is_dir() + and subdir.name not in EXCLUDE_DIRS + and not _is_excluded(subdir) + ): collect_paths(subdir, paths) elif has_subdirs: # No direct test files, but has subdirs with tests: just recurse for subdir in sorted(directory.iterdir()): - if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: + if ( + subdir.is_dir() + and subdir.name not in EXCLUDE_DIRS + and not _is_excluded(subdir) + ): collect_paths(subdir, paths) @@ -100,6 +127,9 @@ def expand_path(path_str): path = Path(path_str) paths = [] + if _is_excluded(path): + return [] + if not path.exists(): # Path doesn't exist - return as-is and let pytest handle the error return [path_str] @@ -122,12 +152,20 @@ def expand_path(path_str): direct_files = get_direct_test_files(path) paths.append(" ".join(str(f) for f in direct_files)) for subdir in sorted(path.iterdir()): - if subdir.is_dir() and 
subdir.name not in EXCLUDE_DIRS: + if ( + subdir.is_dir() + and subdir.name not in EXCLUDE_DIRS + and not _is_excluded(subdir) + ): collect_paths(subdir, paths) elif has_subdirs: # No direct test files, but has subdirs with tests: recurse for subdir in sorted(path.iterdir()): - if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: + if ( + subdir.is_dir() + and subdir.name not in EXCLUDE_DIRS + and not _is_excluded(subdir) + ): collect_paths(subdir, paths) else: # No test files at all - return as-is and let pytest handle it @@ -141,33 +179,50 @@ def discover_all(): test_root = Path("tests") paths = [] - # Handle test files directly in tests/ root (e.g., test_settings.py) for item in sorted(test_root.iterdir()): + if _is_excluded(item): + continue if ( item.is_file() and item.name.startswith("test_") and item.name.endswith(".py") ): paths.append(str(item)) - elif item.is_dir() and item.name.startswith("test"): + elif item.is_dir() and item.name not in EXCLUDE_DIRS: collect_paths(item, paths) return paths def main(): - if len(sys.argv) > 1: - # Explicit paths provided - expand each one + parser = argparse.ArgumentParser( + description="Discover test paths for CI parallelism", + ) + parser.add_argument( + "paths", + nargs="*", + help="Paths to expand (default: discover all)", + ) + parser.add_argument( + "--exclude", + action="append", + default=[], + help="Directory prefix to exclude (repeatable)", + ) + args = parser.parse_args() + + global _exclude_prefixes + _exclude_prefixes = [p.rstrip("/") for p in args.exclude] + + if args.paths: all_paths = [] - for arg in sys.argv[1:]: - expanded = expand_path(arg) + for p in args.paths: + expanded = expand_path(p) all_paths.extend(expanded) paths = all_paths else: - # No arguments - discover all from tests/ paths = discover_all() - # Output unique paths, sorted for p in sorted(set(paths)): print(p) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 47116f1f7..9d0a70a57 100644 --- 
a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,137 +36,48 @@ on: type: number default: 120 - # Push events - tests only run if commit message contains [run-tests] + # Push to staging/main → lightweight tests (not llm_call, excluded heavy dirs) push: branches: - - '**' # All branches + - staging + - main - # Pull request events - tests only run if PR title contains [run-tests] + # PR to main → full test suite pull_request: branches: - - '**' # All branches + - main jobs: - # Determine if tests should run based on trigger and commit/PR message + # Determine test mode based on trigger type. + # Branch filtering is handled by the on: triggers above, so all events here + # are guaranteed to be: push to staging/main, PR to main, or workflow_dispatch. should-run-tests: runs-on: ubuntu-latest outputs: - run_tests: ${{ steps.check.outputs.run_tests }} - parallel_run_cmd: ${{ steps.check.outputs.parallel_run_cmd }} + test_mode: ${{ steps.check.outputs.test_mode }} steps: - - name: Check if tests should run + - name: Determine test mode id: check env: - # Pass these via env to avoid shell interpretation of special chars - # (backticks, quotes, $() in commit messages would otherwise be executed) EVENT_NAME: ${{ github.event_name }} - COMMIT_MSG: ${{ github.event.head_commit.message }} - PR_TITLE: ${{ github.event.pull_request.title }} - PR_BASE_REF: ${{ github.event.pull_request.base.ref }} - PR_HEAD_REF: ${{ github.event.pull_request.head.ref }} run: | - # Helper function to extract [parallel_run.sh ...] content - # Matches ALL occurrences and combines their arguments - extract_parallel_cmd() { - local msg="$1" - local result="" - local remaining="$msg" - - # Loop through all matches of [parallel_run.sh ...] - while [[ "$remaining" =~ \[parallel_run\.sh[[:space:]]+([^\]]+)\] ]]; do - local match="${BASH_REMATCH[1]}" - # Skip literal "..." which is just documentation/example text - if [[ "$match" != "..." 
]]; then - if [[ -n "$result" ]]; then - result="$result $match" - else - result="$match" - fi - fi - # Remove the matched portion and continue searching - remaining="${remaining#*"${BASH_REMATCH[0]}"}" - done - - if [[ -n "$result" ]]; then - echo "$result" - fi - } - - # workflow_dispatch always runs tests if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Tests triggered by workflow_dispatch" - exit 0 - fi - - # For push events, check commit message - if [[ "$EVENT_NAME" == "push" ]]; then - # Check for [parallel_run.sh ...] first - PARALLEL_CMD=$(extract_parallel_cmd "$COMMIT_MSG") - if [[ -n "$PARALLEL_CMD" ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=$PARALLEL_CMD" >> "$GITHUB_OUTPUT" - echo "Tests triggered by commit message: [parallel_run.sh $PARALLEL_CMD]" - exit 0 - fi - - # Fall back to [run-tests] - if [[ "$COMMIT_MSG" == *"[run-tests]"* ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Tests triggered by commit message containing [run-tests]" - else - echo "run_tests=false" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Skipping tests - commit message does not contain [run-tests] or [parallel_run.sh ...]" - fi - exit 0 - fi - - # For pull_request events, check PR title - if [[ "$EVENT_NAME" == "pull_request" ]]; then - # Auto-run full suite for staging → main PRs (no tags required) - if [[ "$PR_BASE_REF" == "main" && "$PR_HEAD_REF" == "staging" ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Tests auto-triggered: staging → main PR" - exit 0 - fi - - # Check for [parallel_run.sh ...] 
first - PARALLEL_CMD=$(extract_parallel_cmd "$PR_TITLE") - if [[ -n "$PARALLEL_CMD" ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=$PARALLEL_CMD" >> "$GITHUB_OUTPUT" - echo "Tests triggered by PR title: [parallel_run.sh $PARALLEL_CMD]" - exit 0 - fi - - # Fall back to [run-tests] - if [[ "$PR_TITLE" == *"[run-tests]"* ]]; then - echo "run_tests=true" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Tests triggered by PR title containing [run-tests]" - else - echo "run_tests=false" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Skipping tests - PR title does not contain [run-tests] or [parallel_run.sh ...]" - fi - exit 0 + echo "test_mode=manual" >> "$GITHUB_OUTPUT" + echo "Mode: manual (workflow_dispatch)" + elif [[ "$EVENT_NAME" == "push" ]]; then + echo "test_mode=lightweight" >> "$GITHUB_OUTPUT" + echo "Mode: lightweight (push to ${GITHUB_REF#refs/heads/})" + elif [[ "$EVENT_NAME" == "pull_request" ]]; then + echo "test_mode=full" >> "$GITHUB_OUTPUT" + echo "Mode: full (PR to main)" fi - # Default: don't run tests - echo "run_tests=false" >> "$GITHUB_OUTPUT" - echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" - echo "Skipping tests - unknown event type" - discover: runs-on: ubuntu-latest needs: should-run-tests - if: needs.should-run-tests.outputs.run_tests == 'true' outputs: test_dirs: ${{ steps.set-matrix.outputs.test_dirs }} + test_mode: ${{ needs.should-run-tests.outputs.test_mode }} parallel_run_args: ${{ steps.set-matrix.outputs.parallel_run_args }} job_timeout: ${{ steps.set-matrix.outputs.job_timeout }} # Project deletion flags (handled at workflow level to avoid race conditions) @@ -181,25 +92,15 @@ jobs: - id: set-matrix shell: bash run: | - # Parse test paths and flags, then create matrix entries for each path. - # Each unique path gets its own parallel GitHub runner. - # - # Sources (priority order): - # 1. [parallel_run.sh ...] 
from commit/PR → parse paths and flags - # 2. workflow_dispatch inputs → test_path + parallel_run_args - # 3. Default (no paths) → discover all leaf test folders + # Discover test paths and create matrix entries for parallel runners. # - # ALL paths (explicit or discovered) are expanded to leaf directories: - # - Files are kept as-is - # - Directories are expanded to their leaf sub-folders + # Sources: + # 1. workflow_dispatch inputs → test_path + parallel_run_args + # 2. Default → discover all leaf test folders (with mode-based exclusions) # - # Leaf discovery (Option A algorithm): - # - Leaf directories (test files, no test subdirs) → one job per directory - # - Mixed directories (test files AND test subdirs) → one bundled job for - # all direct test files (space-separated), plus recursive jobs for subdirs - # This maximizes parallelism regardless of how paths are specified. + # Paths are expanded to leaf directories for maximum parallelism. - PARALLEL_CMD="${{ needs.should-run-tests.outputs.parallel_run_cmd }}" + TEST_MODE="${{ needs.should-run-tests.outputs.test_mode }}" INPUT_PATH="${{ inputs.test_path }}" INPUT_ARGS="${{ inputs.parallel_run_args }}" @@ -208,31 +109,15 @@ jobs: declare -a PATHS=() declare -a FLAGS=() - if [[ -n "$PARALLEL_CMD" ]]; then - # Parse [parallel_run.sh ...] from commit/PR message - # Separate flags (--env, --eval-only, etc.) 
from paths - echo "Parsing [parallel_run.sh $PARALLEL_CMD]" >&2 - LAST_FLAG="" - for token in $PARALLEL_CMD; do - if [[ "$token" == --* ]]; then - # It's a flag - collect it and its value if needed - FLAGS+=("$token") - LAST_FLAG="$token" - elif [[ "$LAST_FLAG" == "--env" || "$LAST_FLAG" == "--tags" || "$LAST_FLAG" == "-j" || "$LAST_FLAG" == "--jobs" || "$LAST_FLAG" == "-t" || "$LAST_FLAG" == "--timeout" ]]; then - # Previous token was a flag that takes a value - FLAGS+=("$token") - LAST_FLAG="" - else - # It's a path - collect for expansion - RAW_PATHS+=("$token") - LAST_FLAG="" - fi - done - elif [[ -n "$INPUT_PATH" && "$INPUT_PATH" != "." ]]; then - # workflow_dispatch with specific path(s) - # Split space-separated paths into array + # Build --exclude args for lightweight mode + EXCLUDE_ARGS="" + if [[ "$TEST_MODE" == "lightweight" ]]; then + EXCLUDE_ARGS="--exclude tests/actor --exclude tests/conversation_manager --exclude tests/async_tool_loop" + echo "Lightweight mode: excluding tests/actor, tests/conversation_manager, tests/async_tool_loop" >&2 + fi + + if [[ "$TEST_MODE" == "manual" && -n "$INPUT_PATH" && "$INPUT_PATH" != "." ]]; then read -ra RAW_PATHS <<< "$INPUT_PATH" - # Flags come from parallel_run_args input if [[ -n "$INPUT_ARGS" ]]; then read -ra FLAGS <<< "$INPUT_ARGS" fi @@ -240,28 +125,21 @@ jobs: fi # Expand paths to leaf directories using discover_test_paths.py - # - If RAW_PATHS is empty, discover all test paths from tests/ - # - If RAW_PATHS has entries, expand each directory to its leaf sub-folders - # This ensures maximum parallelism regardless of how paths are specified. 
if (( ${#RAW_PATHS[@]} == 0 )); then - # No explicit paths - discover all while IFS= read -r path; do PATHS+=("$path") - done < <(python3 .github/scripts/discover_test_paths.py) - # Flags come from parallel_run_args input (if any) - if [[ -n "$INPUT_ARGS" ]]; then + done < <(python3 .github/scripts/discover_test_paths.py $EXCLUDE_ARGS) + if [[ "$TEST_MODE" == "manual" && -n "$INPUT_ARGS" ]]; then read -ra FLAGS <<< "$INPUT_ARGS" fi echo "Discovered ${#PATHS[@]} test paths (leaf dirs + individual files)" >&2 else - # Expand explicit paths to their leaf directories while IFS= read -r path; do PATHS+=("$path") - done < <(python3 .github/scripts/discover_test_paths.py "${RAW_PATHS[@]}") + done < <(python3 .github/scripts/discover_test_paths.py $EXCLUDE_ARGS "${RAW_PATHS[@]}") echo "Expanded ${#RAW_PATHS[@]} path(s) to ${#PATHS[@]} leaf test paths" >&2 fi - # Build JSON array of paths for matrix if (( ${#PATHS[@]} == 0 )); then echo "Error: No test paths found" >&2 exit 1 @@ -281,12 +159,9 @@ jobs: # to .env in the pytest job. We only parse it here for project mgmt flags. # ===================================================================== - # CI defaults for project management (can be overridden) - # With local orchestra (default), these are ignored - each job has fresh DB - # Only relevant when ORCHESTRA_URL is set to a non-local URL (staging/production) - DELETE_ON_START="false" # Default: no deletion (local orchestra has fresh DB) - DELETE_ON_EXIT="false" # Default: keep project after tests - RANDOM_PROJECTS="false" # Default: shared project mode + DELETE_ON_START="false" + DELETE_ON_EXIT="false" + RANDOM_PROJECTS="false" # Orchestra branch precedence (highest to lowest): # 1. 
--env LOCAL_ORCHESTRA_BRANCH=xxx (handled below in FLAGS parsing) @@ -308,11 +183,9 @@ jobs: fi # Parse env_file_content for project management flags only - # (actual env vars are written to .env in pytest job, not passed as --env args) ENV_FILE_CONTENT_RAW="${{ inputs.env_file_content }}" if [[ -n "$ENV_FILE_CONTENT_RAW" ]]; then echo "Parsing env_file_content for project management flags..." >&2 - # Try to decode as base64; if it fails or produces garbage, assume raw content if DECODED=$(echo "$ENV_FILE_CONTENT_RAW" | base64 -d 2>/dev/null) && [[ "$DECODED" == *$'\n'* || "$DECODED" == *"="* ]]; then ENV_FILE_CONTENT="$DECODED" else @@ -320,20 +193,13 @@ jobs: fi if [[ -n "$ENV_FILE_CONTENT" ]]; then while IFS= read -r line || [[ -n "$line" ]]; do - # Skip empty lines and comments [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue - - # Extract KEY=VALUE (handles quotes) if [[ "$line" =~ ^[[:space:]]*([A-Za-z_][A-Za-z0-9_]*)[[:space:]]*=[[:space:]]*(.*)[[:space:]]*$ ]]; then key="${BASH_REMATCH[1]}" value="${BASH_REMATCH[2]}" - - # Strip surrounding quotes if present if [[ "$value" =~ ^\"(.*)\"$ ]] || [[ "$value" =~ ^\'(.*)\'$ ]]; then value="${BASH_REMATCH[1]}" fi - - # Only check for project management flags (other vars handled via .env file) case "$key=$value" in UNIFY_TESTS_DELETE_PROJ_ON_START=true|UNIFY_TESTS_DELETE_PROJ_ON_START=True|UNIFY_TESTS_DELETE_PROJ_ON_START=1) DELETE_ON_START="true" @@ -370,7 +236,6 @@ jobs: fi # Process FLAGS (from parallel_run_args) - highest priority - # These are explicit --env args passed via commit message or workflow dispatch declare -a CLEANED_FLAGS=() skip_next=false @@ -383,7 +248,6 @@ jobs: flag="${FLAGS[$i]}" next_val="${FLAGS[$((i+1))]:-}" - # Check for --env with project management flags (strip them, handle at workflow level) if [[ "$flag" == "--env" ]]; then case "$next_val" in UNIFY_TESTS_DELETE_PROJ_ON_START=true|UNIFY_TESTS_DELETE_PROJ_ON_START=True|UNIFY_TESTS_DELETE_PROJ_ON_START=1) @@ -412,7 +276,7 @@ 
jobs: ;; UNIFY_TESTS_RAND_PROJ=true|UNIFY_TESTS_RAND_PROJ=True|UNIFY_TESTS_RAND_PROJ=1) RANDOM_PROJECTS="true" - CLEANED_FLAGS+=("$flag" "$next_val") # Keep for runners + CLEANED_FLAGS+=("$flag" "$next_val") skip_next=true echo " args: RANDOM_PROJECTS=true (override)" >&2 continue @@ -446,8 +310,6 @@ jobs: echo "random_projects=$RANDOM_PROJECTS" >> "$GITHUB_OUTPUT" echo "orchestra_branch=$ORCHESTRA_BRANCH" >> "$GITHUB_OUTPUT" - # Build parallel_run_args from explicit flags only (not env_file content) - # env_file content is handled separately by writing to .env on the runner if (( ${#CLEANED_FLAGS[@]} > 0 )); then flags_str="${CLEANED_FLAGS[*]}" echo "parallel_run_args=$flags_str" >> "$GITHUB_OUTPUT" @@ -472,7 +334,6 @@ jobs: needs: [should-run-tests, discover] # Only run if DELETE_ON_START is requested AND we're in shared project mode if: | - needs.should-run-tests.outputs.run_tests == 'true' && needs.discover.outputs.delete_on_start == 'true' && needs.discover.outputs.random_projects != 'true' env: @@ -516,7 +377,6 @@ jobs: needs: [should-run-tests, discover, setup] if: | always() && - needs.should-run-tests.outputs.run_tests == 'true' && (needs.setup.result == 'success' || needs.setup.result == 'skipped') strategy: fail-fast: false @@ -838,28 +698,30 @@ jobs: run: | set +e - # parallel_run_args comes from discover job (parsed from commit message or inputs) + TEST_MODE="${{ needs.discover.outputs.test_mode }}" EXTRA_ARGS="${{ needs.discover.outputs.parallel_run_args }}" TEST_PATH="${{ steps.normalize-path.outputs.test_args }}" TIMEOUT="${{ steps.normalize-path.outputs.timeout_seconds }}" - # Start Xvfb manually instead of using xvfb-run. - # xvfb-run doesn't propagate signals to child processes, making - # workflow cancellation ineffective. This approach ensures SIGTERM - # reaches parallel_run.sh directly. 
+ # Determine pytest marker args based on test mode + MARKER_ARGS=() + if [[ "$TEST_MODE" == "lightweight" ]]; then + MARKER_ARGS=(-- -m "not llm_call") + fi + Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! export DISPLAY=:99 - - # Ensure Xvfb is cleaned up on exit trap "kill $XVFB_PID 2>/dev/null" EXIT - # parallel_run.sh handles starting local orchestra automatically - # Note: TEST_PATH must come before EXTRA_ARGS because EXTRA_ARGS may contain - # "-- pytest-args" and everything after -- is treated as pytest arguments # TEST_PATH is unquoted to allow word splitting for bundled multi-file entries - echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH $EXTRA_ARGS" - tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH $EXTRA_ARGS + if (( ${#MARKER_ARGS[@]} )); then + echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH ${MARKER_ARGS[*]}" + tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH "${MARKER_ARGS[@]}" + else + echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH $EXTRA_ARGS" + tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH $EXTRA_ARGS + fi TEST_EXIT_CODE=$? 
echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE @@ -1114,7 +976,6 @@ jobs: # Cancelled runs likely left partial test data that should be cleaned up if: | always() && - needs.should-run-tests.outputs.run_tests == 'true' && needs.discover.outputs.random_projects != 'true' && (needs.discover.outputs.delete_on_exit == 'true' || needs.pytest.result == 'cancelled') env: @@ -1151,7 +1012,7 @@ jobs: name: Consolidate and store cache runs-on: ubuntu-latest needs: [should-run-tests, pytest, cleanup] - if: always() && needs.should-run-tests.outputs.run_tests == 'true' + if: always() steps: - uses: actions/checkout@v4 From b36d378e6eff41ed078ab930fe8de6cc71bccd24 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 14:51:36 +0200 Subject: [PATCH 02/14] ci: enable lightweight tests when merging to staging --- .github/workflows/tests.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d0a70a57..06586ce86 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -42,15 +42,16 @@ on: - staging - main - # PR to main → full test suite + # PR to main → full test suite; PR to staging → lightweight tests pull_request: branches: - main + - staging jobs: # Determine test mode based on trigger type. # Branch filtering is handled by the on: triggers above, so all events here - # are guaranteed to be: push to staging/main, PR to main, or workflow_dispatch. + # are guaranteed to be: push to staging/main, PR to main/staging, or workflow_dispatch. 
should-run-tests: runs-on: ubuntu-latest outputs: @@ -60,6 +61,7 @@ jobs: id: check env: EVENT_NAME: ${{ github.event_name }} + PR_BASE_REF: ${{ github.event.pull_request.base.ref }} run: | if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then echo "test_mode=manual" >> "$GITHUB_OUTPUT" @@ -68,8 +70,13 @@ jobs: echo "test_mode=lightweight" >> "$GITHUB_OUTPUT" echo "Mode: lightweight (push to ${GITHUB_REF#refs/heads/})" elif [[ "$EVENT_NAME" == "pull_request" ]]; then - echo "test_mode=full" >> "$GITHUB_OUTPUT" - echo "Mode: full (PR to main)" + if [[ "$PR_BASE_REF" == "main" ]]; then + echo "test_mode=full" >> "$GITHUB_OUTPUT" + echo "Mode: full (PR to main)" + else + echo "test_mode=lightweight" >> "$GITHUB_OUTPUT" + echo "Mode: lightweight (PR to $PR_BASE_REF)" + fi fi discover: From bb4d75b220dbe807e8ad014d61afbbc4f142f229 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 15:13:53 +0200 Subject: [PATCH 03/14] ci: disable tests.yml temporarily --- .github/workflows/tests.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 06586ce86..2ce2d9efc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,17 +36,6 @@ on: type: number default: 120 - # Push to staging/main → lightweight tests (not llm_call, excluded heavy dirs) - push: - branches: - - staging - - main - - # PR to main → full test suite; PR to staging → lightweight tests - pull_request: - branches: - - main - - staging jobs: # Determine test mode based on trigger type. 
From 4b82e7d779f7349666c7d8c90b667f420a6fdf4d Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 15:36:59 +0200 Subject: [PATCH 04/14] tests(parallel_run): add support for --ignore --- tests/_parse_args.sh | 28 ++++++++++++++++++++++++++-- tests/parallel_run.sh | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/tests/_parse_args.sh b/tests/_parse_args.sh index a4e5be6d9..5730f772f 100644 --- a/tests/_parse_args.sh +++ b/tests/_parse_args.sh @@ -47,6 +47,7 @@ parse_test_args() { TAGS=() PYTEST_EXTRA_ARGS=() PYTEST_COLLECTION_ARGS=() + PYTEST_IGNORE_PATHS=() POSITIONAL_ARGS=() while (( "$#" )); do @@ -141,8 +142,10 @@ parse_test_args() { --) shift PYTEST_EXTRA_ARGS=("$@") - # Extract collection-relevant args (-k, -m) for use during test discovery - # These filters affect which tests are collected, not just how they run + # Extract collection-relevant args for use during test discovery. + # These affect which tests are collected, not just how they run: + # -k/-m: filter by keyword/marker expression + # --ignore/--ignore-glob: exclude paths from collection local _coll_i=0 while (( _coll_i < ${#PYTEST_EXTRA_ARGS[@]} )); do local _coll_arg="${PYTEST_EXTRA_ARGS[_coll_i]}" @@ -156,11 +159,32 @@ parse_test_args() { ((_coll_i++)) fi ;; + --ignore) + # Next arg is the path (e.g., --ignore tests/actor) + if (( _coll_i + 1 < ${#PYTEST_EXTRA_ARGS[@]} )); then + PYTEST_COLLECTION_ARGS+=( "$_coll_arg" "${PYTEST_EXTRA_ARGS[_coll_i+1]}" ) + PYTEST_IGNORE_PATHS+=( "${PYTEST_EXTRA_ARGS[_coll_i+1]%/}" ) + ((_coll_i+=2)) + else + ((_coll_i++)) + fi + ;; -k=*|-m=*) # Value is attached (e.g., -k="pattern") PYTEST_COLLECTION_ARGS+=( "$_coll_arg" ) ((_coll_i++)) ;; + --ignore=*) + # Value is attached (e.g., --ignore=tests/actor) + PYTEST_COLLECTION_ARGS+=( "$_coll_arg" ) + local _ival="${_coll_arg#--ignore=}" + PYTEST_IGNORE_PATHS+=( "${_ival%/}" ) + ((_coll_i++)) + ;; + --ignore-glob=*) + PYTEST_COLLECTION_ARGS+=( "$_coll_arg" ) 
+ ((_coll_i++)) + ;; --keyword=*|--markers=*) # Long form with attached value PYTEST_COLLECTION_ARGS+=( "$_coll_arg" ) diff --git a/tests/parallel_run.sh b/tests/parallel_run.sh index a651cce60..0fed212d6 100755 --- a/tests/parallel_run.sh +++ b/tests/parallel_run.sh @@ -1174,6 +1174,41 @@ if (( ${#roots[@]} )); then done < <(eval "$(build_find_cmd)") fi +# Filter out files under --ignore paths (pytest --ignore only affects directory +# traversal, not explicitly listed targets, so we must filter before collection) +if (( ${#PYTEST_IGNORE_PATHS[@]} > 0 && (${#found_files[@]} + ${#direct_files[@]}) > 0 )); then + _filter_ignored() { + local -a input=("$@") + local -a output=() + for f in "${input[@]}"; do + local _skip=0 + for pfx in "${PYTEST_IGNORE_PATHS[@]}"; do + if [[ "$f" == "$pfx"/* || "$f" == "./$pfx"/* || "$f" == "$pfx" ]]; then + _skip=1 + break + fi + done + (( _skip )) || output+=( "$f" ) + done + printf '%s\0' "${output[@]}" + } + if (( ${#found_files[@]} )); then + tmp_filtered=() + while IFS= read -r -d '' f; do + tmp_filtered+=( "$f" ) + done < <(_filter_ignored "${found_files[@]}") + found_files=( "${tmp_filtered[@]}" ) + fi + if (( ${#direct_files[@]} )); then + tmp_filtered=() + while IFS= read -r -d '' f; do + tmp_filtered+=( "$f" ) + done < <(_filter_ignored "${direct_files[@]}") + direct_files=( "${tmp_filtered[@]}" ) + fi + unset -f _filter_ignored +fi + # Apply filename pattern filter (matches on basename) if provided if [[ -n "$NAME_PATTERN" ]]; then if (( ${#direct_files[@]} )); then From 8d95b2819b22a07cffcfc19556ac50a899d0cb81 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 15:38:17 +0200 Subject: [PATCH 05/14] add tests_lightweight.yml --- .github/workflows/tests_lightweight.yml | 406 ++++++++++++++++++++++++ 1 file changed, 406 insertions(+) create mode 100644 .github/workflows/tests_lightweight.yml diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml new file mode 100644 index 
000000000..fa934ecdc --- /dev/null +++ b/.github/workflows/tests_lightweight.yml @@ -0,0 +1,406 @@ +# Lightweight test suite: runs all tests with -m "not llm_call" as a single job. +# For the full test suite (with LLM calls, matrix parallelism, cache consolidation), +# use tests.yml via workflow_dispatch. +name: Tests (lightweight) +run-name: >- + ${{ + github.event.head_commit.message || + github.event.pull_request.title + }} + +on: + push: + branches: + - staging + - main + + pull_request: + branches: + - main + - staging + +jobs: + pytest: + name: pytest (lightweight) + runs-on: ubuntu-latest-8-cores + environment: unity-testing + timeout-minutes: 130 + env: + UNIFY_TESTS_RAND_PROJ: "false" + UNIFY_TESTS_DELETE_PROJ_ON_START: "false" + UNIFY_TESTS_DELETE_PROJ_ON_EXIT: "false" + UNILLM_CACHE: "true" + ORCHESTRA_REPO_PATH: ${{ github.workspace }}/orchestra + ORCHESTRA_URL: ${{ vars.ORCHESTRA_URL }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + UNITY_COMMS_URL: ${{ vars.UNITY_COMMS_URL }} + GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }} + GCP_LOCATION: ${{ vars.GCP_LOCATION }} + ORCHESTRA_GCP_BUCKET_NAME: ${{ vars.GCP_BUCKET_LOGS }} + ORCHESTRA_GCP_ASSISTANT_MEDIA_BUCKET_NAME: ${{ vars.GCP_BUCKET_ASSISTANT_IMAGES }} + ORCHESTRA_GCP_ASSISTANT_CALL_RECORDINGS_BUCKET_NAME: ${{ vars.GCP_BUCKET_RECORDINGS }} + TAVILY_API_KEY: ${{ secrets.TAVILY_API_KEY }} + UNIFY_KEY: ${{ secrets.UNIFY_KEY }} + PYTHONPATH: ${{ github.workspace }}/unillm:${{ github.workspace }}/unify + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 1 + + - name: Clone orchestra repo for local deployment + uses: actions/checkout@v6 + with: + repository: unifyai/orchestra + ref: ${{ vars.LOCAL_ORCHESTRA_BRANCH || (github.ref_name == 'main' && 'main' || 'staging') }} + path: orchestra + token: ${{ secrets.CLONE_TOKEN }} + fetch-depth: 1 + + - name: Clone unify repo + uses: actions/checkout@v6 + with: + repository: unifyai/unify + ref: ${{ 
github.ref_name == 'main' && 'main' || 'staging' }} + path: unify + token: ${{ secrets.CLONE_TOKEN }} + fetch-depth: 1 + + - name: Clone unillm repo + uses: actions/checkout@v6 + with: + repository: unifyai/unillm + ref: ${{ github.ref_name == 'main' && 'main' || 'staging' }} + path: unillm + token: ${{ secrets.CLONE_TOKEN }} + fetch-depth: 1 + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_JSON }} + + - name: Get orchestra commit SHA + id: orchestra-sha + run: echo "sha=$(git -C orchestra rev-parse HEAD)" >> $GITHUB_OUTPUT + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Cache apt packages + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: >- + tmux zsh portaudio19-dev libnss3-dev libatk-bridge2.0-dev libdrm2 + libxkbcommon0 libgtk-3-dev libgbm-dev libgirepository1.0-dev + libasound2t64 libatspi2.0-0 libgtk-3-0 libnspr4 libwayland-client0 + libxcomposite1 libxdamage1 libxrandr2 xvfb + version: 1.1 + + - name: Install uv and poetry + run: | + pip install uv + pip install poetry + + - name: Cache uv dependencies + uses: actions/cache@v5 + with: + path: | + ~/.cache/uv + .venv + key: uv-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'uv.lock', 'unillm/pyproject.toml', 'unify/pyproject.toml') }} + restore-keys: | + uv-${{ runner.os }}- + + - name: Configure unify/unillm paths for CI + run: | + sed -i 's|path = "../unify"|path = "./unify"|g' pyproject.toml + sed -i 's|path = "../unillm"|path = "./unillm"|g' pyproject.toml + + - name: Sync dependencies with uv + run: | + uv sync --all-groups + + - name: Verify local packages installed correctly + run: | + uv run python -c " + import sys + print('Python path (first 5):') + for p in sys.path[:5]: + print(f' {p}') + + from unillm import AsyncUnify, Unify + import unify + + print(f'✓ unillm: {AsyncUnify.__module__}') + print(f'✓ unify: {unify.__file__}') + assert 
hasattr(unify, 'create_project'), f'Wrong unify package!' + print('✓ Local packages verified successfully') + " + + - name: Cache Playwright assets + uses: actions/cache@v5 + id: playwright-cache + with: + path: ~/.cache/ms-playwright + key: playwright-${{ runner.os }}-${{ hashFiles('pyproject.toml') }} + restore-keys: | + playwright-${{ runner.os }}- + + - name: Install Playwright assets + if: steps.playwright-cache.outputs.cache-hit != 'true' + run: | + uv run playwright install --with-deps + + - name: Setup Node.js (agent-service tests) + uses: actions/setup-node@v4 + with: + node-version: '22' + + - name: Clone magnitude repo (agent-service tests) + uses: actions/checkout@v6 + with: + repository: unifyai/magnitude + ref: unity-modifications + path: magnitude + token: ${{ secrets.CLONE_TOKEN }} + fetch-depth: 1 + + - name: Build magnitude packages and install agent-service (agent-service tests) + run: | + cd magnitude/packages/magnitude-core + npm install + npm run build + echo "✓ magnitude-core built" + + cd ../magnitude-extract + npm install + npm run build + echo "✓ magnitude-extract built" + + cd ${{ github.workspace }}/agent-service + npm install + echo "✓ agent-service dependencies installed" + + npx patchright install chromium + echo "✓ Patchright Chromium installed" + + - name: Cache orchestra poetry dependencies + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry + key: poetry-orchestra-${{ runner.os }}-${{ steps.orchestra-sha.outputs.sha }} + restore-keys: | + poetry-orchestra-${{ runner.os }}- + + - name: Install orchestra dependencies + run: | + cd "$ORCHESTRA_REPO_PATH" + poetry install --no-interaction + + - name: Run tests + id: run-tests + run: | + set +e + + Xvfb :99 -screen 0 1920x1080x24 & + XVFB_PID=$! 
+ export DISPLAY=:99 + trap "kill $XVFB_PID 2>/dev/null" EXIT + + echo "Running: parallel_run.sh --timeout 7200 tests/ -- -m \"not llm_call\" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop" + tests/parallel_run.sh --timeout 7200 tests/ -- -m "not llm_call" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop + TEST_EXIT_CODE=$? + echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" + exit $TEST_EXIT_CODE + + - name: Dump orchestra logs on failure + if: failure() + run: | + echo "=== Orchestra Server Logs ===" + if [ -f /tmp/orchestra-local-server.log ]; then + cat /tmp/orchestra-local-server.log + mkdir -p logs/orchestra + cp /tmp/orchestra-local-server.log logs/orchestra/server.log + echo "(Copied to logs/orchestra/server.log for artifact upload)" + else + echo "No orchestra log file found at /tmp/orchestra-local-server.log" + fi + echo "" + echo "=== Orchestra Server Process Status ===" + if [ -f /tmp/orchestra-local-server.pid ]; then + PID=$(cat /tmp/orchestra-local-server.pid) + echo "Expected PID: $PID" + if ps -p "$PID" > /dev/null 2>&1; then + echo "Process is still running" + else + echo "Process is NOT running" + fi + else + echo "No PID file found" + fi + echo "" + echo "=== Any orchestra-related processes ===" + ps aux | grep -E "(orchestra|uvicorn)" | grep -v grep || echo "None found" + + - name: Generate failure summary + if: failure() + run: | + ARTIFACTS_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts" + + TOTAL_COUNT=0 + if [ -d "logs/pytest" ]; then + TOTAL_COUNT=$(find logs/pytest -name "*.txt" -type f 2>/dev/null | wc -l | tr -d ' ') + fi + + FAILED_SESSIONS="" + FAIL_COUNT=0 + for sock in /tmp/tmux-$(id -u)/unity*; do + if [ -S "$sock" ]; then + socket_name=$(basename "$sock") + FAILED_SESSIONS=$(tmux -L "$socket_name" ls 2>/dev/null | sed 's/: .*//' | grep '^f ' || true) + FAIL_COUNT=$(echo "$FAILED_SESSIONS" | grep -c '^f ' 
|| echo 0) + break + fi + done + + PASS_COUNT=$((TOTAL_COUNT - FAIL_COUNT)) + + echo "=== Test Results ===" + echo "✅ $PASS_COUNT passed, ❌ $FAIL_COUNT failed" + echo "" + echo "Failed tests:" + echo "$FAILED_SESSIONS" + echo "" + echo "Download logs: $ARTIFACTS_URL" + + echo "## ❌ Test Failures (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**✅ $PASS_COUNT** passed, **❌ $FAIL_COUNT** failed — [view logs]($ARTIFACTS_URL)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Failed tests:" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + echo "$FAILED_SESSIONS" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + + DURATION_FILE=$(find logs/pytest -name "duration_summary.txt" 2>/dev/null | head -1) + if [ -n "$DURATION_FILE" ] && [ -f "$DURATION_FILE" ]; then + echo "" >> $GITHUB_STEP_SUMMARY + echo "
" >> $GITHUB_STEP_SUMMARY + echo "📊 Test Stats: Duration & Cache (fastest → slowest)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + cat "$DURATION_FILE" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + echo "
" >> $GITHUB_STEP_SUMMARY + fi + + - name: Generate success summary + if: success() + run: | + ARTIFACTS_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts" + + log_count=0 + if [ -d "logs/pytest" ]; then + log_count=$(find logs/pytest -name "*.txt" -type f 2>/dev/null | wc -l | tr -d ' ') + fi + + if [ "$log_count" -eq 0 ]; then + echo "## ⚠️ No Tests Ran (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**0 tests executed** — this likely indicates a problem with test discovery." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Check the 'Run tests' step logs for details." >> $GITHUB_STEP_SUMMARY + echo "" + echo "Error: No tests were executed. This indicates a problem with test discovery." >&2 + echo "Check the 'Run tests' step output for pytest collection errors." >&2 + exit 1 + fi + + echo "=== All tests passed ===" + echo "✅ $log_count passed" + echo "" + echo "View logs: $ARTIFACTS_URL" + + echo "## ✅ All Tests Passed (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**✅ $log_count** passed — [view logs]($ARTIFACTS_URL)" >> $GITHUB_STEP_SUMMARY + + DURATION_FILE=$(find logs/pytest -name "duration_summary.txt" 2>/dev/null | head -1) + if [ -n "$DURATION_FILE" ] && [ -f "$DURATION_FILE" ]; then + echo "" >> $GITHUB_STEP_SUMMARY + echo "
" >> $GITHUB_STEP_SUMMARY + echo "📊 Test Stats: Duration & Cache (fastest → slowest)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + cat "$DURATION_FILE" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + echo "
" >> $GITHUB_STEP_SUMMARY + fi + + - name: Stop local orchestra (before log upload) + if: always() + continue-on-error: true + run: | + trap 'echo "Received SIGTERM, exiting gracefully"; exit 0' TERM + orchestra/scripts/local.sh stop || true + + - name: Prepare logs for upload + if: always() + run: | + echo "=== Preparing logs for upload ===" + mkdir -p logs + + if [ -d logs/pytest ]; then + file_count=$(find logs/pytest -type f 2>/dev/null | wc -l | tr -d ' ') + echo "✓ pytest logs: $file_count files" + else + echo "○ pytest logs: directory not found (skipping)" + fi + + if [ -d logs/unillm ]; then + file_count=$(find logs/unillm -type f 2>/dev/null | wc -l | tr -d ' ') + echo "✓ unillm logs: $file_count files" + else + echo "○ unillm logs: directory not found (skipping)" + fi + + if [ -d logs/orchestra ]; then + sanitized=0 + while IFS= read -r -d '' file; do + newname=$(echo "$file" | tr ':' '_') + if mv "$file" "$newname" 2>/dev/null; then + ((sanitized++)) || true + fi + done < <(find logs/orchestra -type f -name '*:*' -print0 2>/dev/null || true) + + file_count=$(find logs/orchestra -type f 2>/dev/null | wc -l | tr -d ' ') + if [ "$sanitized" -gt 0 ]; then + echo "✓ orchestra logs: $file_count files ($sanitized filenames sanitized)" + else + echo "✓ orchestra logs: $file_count files" + fi + else + echo "○ orchestra logs: directory not found (skipping)" + fi + + echo "=== Log preparation complete ===" + + - name: Upload logs + if: always() + uses: actions/upload-artifact@v7 + with: + name: logs-lightweight + path: logs/ + if-no-files-found: ignore + retention-days: 90 + + - name: Cleanup on cancellation + if: cancelled() + run: | + echo "Workflow cancelled - cleaning up tmux sessions and orphaned processes..." + tests/kill_server.sh --all || true + orchestra/scripts/local.sh stop || true + echo "Cleanup complete." 
From 109f3d48ed63e7a4dece3e47a3e9138a67d9e624 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 15:51:52 +0200 Subject: [PATCH 06/14] ci: fix magnitude package build --- .github/workflows/tests_lightweight.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index fa934ecdc..2c772c9fb 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -170,13 +170,15 @@ jobs: - name: Build magnitude packages and install agent-service (agent-service tests) run: | - cd magnitude/packages/magnitude-core + cd magnitude npm install + echo "✓ magnitude root deps installed (turbo, etc.)" + + cd packages/magnitude-core npm run build echo "✓ magnitude-core built" cd ../magnitude-extract - npm install npm run build echo "✓ magnitude-extract built" From 97e499c760571fd6baff672ec2b0dc37cae275fa Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 15:59:51 +0200 Subject: [PATCH 07/14] ci(lightweight): exclude agent_service tests --- .github/workflows/tests_lightweight.yml | 39 ++----------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index 2c772c9fb..449a3a616 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -154,41 +154,6 @@ jobs: run: | uv run playwright install --with-deps - - name: Setup Node.js (agent-service tests) - uses: actions/setup-node@v4 - with: - node-version: '22' - - - name: Clone magnitude repo (agent-service tests) - uses: actions/checkout@v6 - with: - repository: unifyai/magnitude - ref: unity-modifications - path: magnitude - token: ${{ secrets.CLONE_TOKEN }} - fetch-depth: 1 - - - name: Build magnitude packages and install agent-service (agent-service tests) - run: | - cd magnitude - npm install - echo "✓ magnitude root deps installed (turbo, 
etc.)" - - cd packages/magnitude-core - npm run build - echo "✓ magnitude-core built" - - cd ../magnitude-extract - npm run build - echo "✓ magnitude-extract built" - - cd ${{ github.workspace }}/agent-service - npm install - echo "✓ agent-service dependencies installed" - - npx patchright install chromium - echo "✓ Patchright Chromium installed" - - name: Cache orchestra poetry dependencies uses: actions/cache@v4 with: @@ -212,8 +177,8 @@ jobs: export DISPLAY=:99 trap "kill $XVFB_PID 2>/dev/null" EXIT - echo "Running: parallel_run.sh --timeout 7200 tests/ -- -m \"not llm_call\" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop" - tests/parallel_run.sh --timeout 7200 tests/ -- -m "not llm_call" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop + echo "Running: parallel_run.sh --timeout 7200 tests/ -- -m \"not llm_call\" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop --ignore=tests/agent_service --ignore=tests/demo_url_mapping" + tests/parallel_run.sh --timeout 7200 tests/ -- -m "not llm_call" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop --ignore=tests/agent_service --ignore=tests/demo_url_mapping TEST_EXIT_CODE=$? echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE From 49890686998ff20244fdc0bb403e33b67f83330e Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 19:31:23 +0200 Subject: [PATCH 08/14] ci(lightweight): split testing to 8 jobs --- .github/workflows/tests_lightweight.yml | 71 ++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index 449a3a616..c85a10d78 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -1,4 +1,4 @@ -# Lightweight test suite: runs all tests with -m "not llm_call" as a single job. 
+# Lightweight test suite: runs all tests with -m "not llm_call" across 8 fixed per-test shards. # For the full test suite (with LLM calls, matrix parallelism, cache consolidation), # use tests.yml via workflow_dispatch. name: Tests (lightweight) @@ -21,11 +21,16 @@ on: jobs: pytest: - name: pytest (lightweight) + name: pytest (lightweight shard ${{ matrix.shard }}/8) runs-on: ubuntu-latest-8-cores environment: unity-testing timeout-minutes: 130 + strategy: + fail-fast: false + matrix: + shard: [1, 2, 3, 4, 5, 6, 7, 8] env: + LIGHTWEIGHT_SHARD_COUNT: "8" UNIFY_TESTS_RAND_PROJ: "false" UNIFY_TESTS_DELETE_PROJ_ON_START: "false" UNIFY_TESTS_DELETE_PROJ_ON_EXIT: "false" @@ -167,18 +172,68 @@ jobs: cd "$ORCHESTRA_REPO_PATH" poetry install --no-interaction + - name: Collect test nodes for this shard + id: collect-nodes + env: + LIGHTWEIGHT_SHARD: ${{ matrix.shard }} + run: | + SHARD_NODES_FILE="$(mktemp)" + echo "shard_nodes_file=$SHARD_NODES_FILE" >> "$GITHUB_OUTPUT" + export SHARD_NODES_FILE + + uv run python - <<'PY' + import os + import subprocess + import sys + from pathlib import Path + + cmd = [ + sys.executable, + "-m", + "pytest", + "--collect-only", + "-q", + "-m", + "not llm_call", + "tests/", + "--ignore=tests/actor", + "--ignore=tests/conversation_manager", + "--ignore=tests/async_tool_loop", + "--ignore=tests/agent_service", + "--ignore=tests/demo_url_mapping", + ] + result = subprocess.run(cmd, text=True, capture_output=True) + if result.returncode != 0: + print(result.stdout) + print(result.stderr, file=sys.stderr) + raise SystemExit(result.returncode) + + nodes = [line.strip() for line in result.stdout.splitlines() if "::" in line] + shard = int(os.environ["LIGHTWEIGHT_SHARD"]) + shard_count = int(os.environ["LIGHTWEIGHT_SHARD_COUNT"]) + shard_nodes = [node for index, node in enumerate(nodes) if index % shard_count == shard - 1] + Path(os.environ["SHARD_NODES_FILE"]).write_text("".join(f"{node}\n" for node in shard_nodes)) + + print(f"Collected 
{len(nodes)} lightweight nodes") + print(f"Shard {shard}/{shard_count}: {len(shard_nodes)} nodes") + + if not shard_nodes: + raise SystemExit(f"Shard {shard}/{shard_count} has no collected tests") + PY + - name: Run tests id: run-tests run: | set +e + mapfile -t SHARD_NODES < '${{ steps.collect-nodes.outputs.shard_nodes_file }}' Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! export DISPLAY=:99 trap "kill $XVFB_PID 2>/dev/null" EXIT - echo "Running: parallel_run.sh --timeout 7200 tests/ -- -m \"not llm_call\" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop --ignore=tests/agent_service --ignore=tests/demo_url_mapping" - tests/parallel_run.sh --timeout 7200 tests/ -- -m "not llm_call" --ignore=tests/actor --ignore=tests/conversation_manager --ignore=tests/async_tool_loop --ignore=tests/agent_service --ignore=tests/demo_url_mapping + echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_NODES[@]} explicit test nodes" + tests/parallel_run.sh --timeout 7200 "${SHARD_NODES[@]}" TEST_EXIT_CODE=$? echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE @@ -243,7 +298,7 @@ jobs: echo "" echo "Download logs: $ARTIFACTS_URL" - echo "## ❌ Test Failures (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "## ❌ Test Failures (lightweight shard ${{ matrix.shard }}/8)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "**✅ $PASS_COUNT** passed, **❌ $FAIL_COUNT** failed — [view logs]($ARTIFACTS_URL)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY @@ -275,7 +330,7 @@ jobs: fi if [ "$log_count" -eq 0 ]; then - echo "## ⚠️ No Tests Ran (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "## ⚠️ No Tests Ran (lightweight shard ${{ matrix.shard }}/8)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "**0 tests executed** — this likely indicates a problem with test discovery." 
>> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY @@ -291,7 +346,7 @@ jobs: echo "" echo "View logs: $ARTIFACTS_URL" - echo "## ✅ All Tests Passed (lightweight)" >> $GITHUB_STEP_SUMMARY + echo "## ✅ All Tests Passed (lightweight shard ${{ matrix.shard }}/8)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "**✅ $log_count** passed — [view logs]($ARTIFACTS_URL)" >> $GITHUB_STEP_SUMMARY @@ -359,7 +414,7 @@ jobs: if: always() uses: actions/upload-artifact@v7 with: - name: logs-lightweight + name: logs-lightweight-shard-${{ matrix.shard }} path: logs/ if-no-files-found: ignore retention-days: 90 From cfd350d35bf1cae79a98a4e61edd99e95f3c8726 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 20:14:57 +0200 Subject: [PATCH 09/14] ci(lightweight): skip unity init on collect only --- .github/workflows/tests_lightweight.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index c85a10d78..a99d3433b 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -176,6 +176,7 @@ jobs: id: collect-nodes env: LIGHTWEIGHT_SHARD: ${{ matrix.shard }} + SKIP_UNITY_TEST_INIT: "1" run: | SHARD_NODES_FILE="$(mktemp)" echo "shard_nodes_file=$SHARD_NODES_FILE" >> "$GITHUB_OUTPUT" From 30eb516ed170845225695341b3bcd59a86bf7fb3 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Wed, 8 Apr 2026 20:48:43 +0200 Subject: [PATCH 10/14] ci(lightweight): shard by test file counts instead of test nodes --- .github/workflows/tests_lightweight.yml | 80 +++++++++++++++++++------ 1 file changed, 62 insertions(+), 18 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index a99d3433b..90f66aebf 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -172,17 +172,18 @@ jobs: cd "$ORCHESTRA_REPO_PATH" poetry install --no-interaction - - name: Collect test nodes for 
this shard - id: collect-nodes + - name: Collect test files for this shard + id: collect-files env: LIGHTWEIGHT_SHARD: ${{ matrix.shard }} SKIP_UNITY_TEST_INIT: "1" run: | - SHARD_NODES_FILE="$(mktemp)" - echo "shard_nodes_file=$SHARD_NODES_FILE" >> "$GITHUB_OUTPUT" - export SHARD_NODES_FILE + SHARD_FILES_FILE="$(mktemp)" + echo "shard_files_file=$SHARD_FILES_FILE" >> "$GITHUB_OUTPUT" + export SHARD_FILES_FILE uv run python - <<'PY' + import math import os import subprocess import sys @@ -193,7 +194,7 @@ jobs: "-m", "pytest", "--collect-only", - "-q", + "-qq", "-m", "not llm_call", "tests/", @@ -209,32 +210,75 @@ jobs: print(result.stderr, file=sys.stderr) raise SystemExit(result.returncode) - nodes = [line.strip() for line in result.stdout.splitlines() if "::" in line] shard = int(os.environ["LIGHTWEIGHT_SHARD"]) shard_count = int(os.environ["LIGHTWEIGHT_SHARD_COUNT"]) - shard_nodes = [node for index, node in enumerate(nodes) if index % shard_count == shard - 1] - Path(os.environ["SHARD_NODES_FILE"]).write_text("".join(f"{node}\n" for node in shard_nodes)) - - print(f"Collected {len(nodes)} lightweight nodes") - print(f"Shard {shard}/{shard_count}: {len(shard_nodes)} nodes") - - if not shard_nodes: - raise SystemExit(f"Shard {shard}/{shard_count} has no collected tests") + file_counts = [] + for raw_line in result.stdout.splitlines(): + line = raw_line.strip() + if not line or line.startswith("="): + continue + file_path, separator, count_text = line.rpartition(":") + if separator != ":" or not file_path.startswith("tests/"): + continue + test_count = int(count_text.strip()) + file_counts.append((file_path, test_count)) + + if not file_counts: + raise SystemExit("Pytest did not report any lightweight test files") + + total_tests = sum(test_count for _, test_count in file_counts) + target_tests_per_shard = math.ceil(total_tests / shard_count) + shards = [[] for _ in range(shard_count)] + shard_test_counts = [0] * shard_count + current_shard = 0 + + for index, 
(file_path, test_count) in enumerate(file_counts): + shards[current_shard].append(file_path) + shard_test_counts[current_shard] += test_count + + remaining_files = len(file_counts) - index - 1 + remaining_shards = shard_count - current_shard - 1 + if ( + current_shard < shard_count - 1 + and shard_test_counts[current_shard] >= target_tests_per_shard + and remaining_files >= remaining_shards + ): + current_shard += 1 + + shard_files = shards[shard - 1] + shard_test_count = shard_test_counts[shard - 1] + Path(os.environ["SHARD_FILES_FILE"]).write_text("".join(f"{file_path}\n" for file_path in shard_files)) + + print(f"Collected {len(file_counts)} lightweight test files") + print(f"Collected {total_tests} lightweight tests") + print( + f"Target per shard: {target_tests_per_shard} tests " + f"across {shard_count} shards" + ) + print( + f"Shard {shard}/{shard_count}: " + f"{len(shard_files)} files, {shard_test_count} tests" + ) + for file_path in shard_files: + print(f" - {file_path}") + + if not shard_files: + raise SystemExit(f"Shard {shard}/{shard_count} has no collected test files") PY - name: Run tests id: run-tests run: | set +e - mapfile -t SHARD_NODES < '${{ steps.collect-nodes.outputs.shard_nodes_file }}' + mapfile -t SHARD_FILES < '${{ steps.collect-files.outputs.shard_files_file }}' Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! export DISPLAY=:99 trap "kill $XVFB_PID 2>/dev/null" EXIT - echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_NODES[@]} explicit test nodes" - tests/parallel_run.sh --timeout 7200 "${SHARD_NODES[@]}" + echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_FILES[@]} test files" + tests/parallel_run.sh --timeout 7200 "${SHARD_FILES[@]}" TEST_EXIT_CODE=$? 
echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE From 94067e5b2bc772e28e955af7456d8038165eb85a Mon Sep 17 00:00:00 2001 From: CatB1t Date: Thu, 9 Apr 2026 11:16:21 +0200 Subject: [PATCH 11/14] Revert "ci(lightweight): shard by test file counts instead of test nodes" This reverts commit 30eb516ed170845225695341b3bcd59a86bf7fb3. --- .github/workflows/tests_lightweight.yml | 80 ++++++------------------- 1 file changed, 18 insertions(+), 62 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index 90f66aebf..a99d3433b 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -172,18 +172,17 @@ jobs: cd "$ORCHESTRA_REPO_PATH" poetry install --no-interaction - - name: Collect test files for this shard - id: collect-files + - name: Collect test nodes for this shard + id: collect-nodes env: LIGHTWEIGHT_SHARD: ${{ matrix.shard }} SKIP_UNITY_TEST_INIT: "1" run: | - SHARD_FILES_FILE="$(mktemp)" - echo "shard_files_file=$SHARD_FILES_FILE" >> "$GITHUB_OUTPUT" - export SHARD_FILES_FILE + SHARD_NODES_FILE="$(mktemp)" + echo "shard_nodes_file=$SHARD_NODES_FILE" >> "$GITHUB_OUTPUT" + export SHARD_NODES_FILE uv run python - <<'PY' - import math import os import subprocess import sys @@ -194,7 +193,7 @@ jobs: "-m", "pytest", "--collect-only", - "-qq", + "-q", "-m", "not llm_call", "tests/", @@ -210,75 +209,32 @@ jobs: print(result.stderr, file=sys.stderr) raise SystemExit(result.returncode) + nodes = [line.strip() for line in result.stdout.splitlines() if "::" in line] shard = int(os.environ["LIGHTWEIGHT_SHARD"]) shard_count = int(os.environ["LIGHTWEIGHT_SHARD_COUNT"]) - file_counts = [] - for raw_line in result.stdout.splitlines(): - line = raw_line.strip() - if not line or line.startswith("="): - continue - file_path, separator, count_text = line.rpartition(":") - if separator != ":" or not file_path.startswith("tests/"): - continue - test_count = 
int(count_text.strip()) - file_counts.append((file_path, test_count)) - - if not file_counts: - raise SystemExit("Pytest did not report any lightweight test files") - - total_tests = sum(test_count for _, test_count in file_counts) - target_tests_per_shard = math.ceil(total_tests / shard_count) - shards = [[] for _ in range(shard_count)] - shard_test_counts = [0] * shard_count - current_shard = 0 - - for index, (file_path, test_count) in enumerate(file_counts): - shards[current_shard].append(file_path) - shard_test_counts[current_shard] += test_count - - remaining_files = len(file_counts) - index - 1 - remaining_shards = shard_count - current_shard - 1 - if ( - current_shard < shard_count - 1 - and shard_test_counts[current_shard] >= target_tests_per_shard - and remaining_files >= remaining_shards - ): - current_shard += 1 - - shard_files = shards[shard - 1] - shard_test_count = shard_test_counts[shard - 1] - Path(os.environ["SHARD_FILES_FILE"]).write_text("".join(f"{file_path}\n" for file_path in shard_files)) - - print(f"Collected {len(file_counts)} lightweight test files") - print(f"Collected {total_tests} lightweight tests") - print( - f"Target per shard: {target_tests_per_shard} tests " - f"across {shard_count} shards" - ) - print( - f"Shard {shard}/{shard_count}: " - f"{len(shard_files)} files, {shard_test_count} tests" - ) - for file_path in shard_files: - print(f" - {file_path}") - - if not shard_files: - raise SystemExit(f"Shard {shard}/{shard_count} has no collected test files") + shard_nodes = [node for index, node in enumerate(nodes) if index % shard_count == shard - 1] + Path(os.environ["SHARD_NODES_FILE"]).write_text("".join(f"{node}\n" for node in shard_nodes)) + + print(f"Collected {len(nodes)} lightweight nodes") + print(f"Shard {shard}/{shard_count}: {len(shard_nodes)} nodes") + + if not shard_nodes: + raise SystemExit(f"Shard {shard}/{shard_count} has no collected tests") PY - name: Run tests id: run-tests run: | set +e - mapfile -t SHARD_FILES < 
'${{ steps.collect-files.outputs.shard_files_file }}' + mapfile -t SHARD_NODES < '${{ steps.collect-nodes.outputs.shard_nodes_file }}' Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! export DISPLAY=:99 trap "kill $XVFB_PID 2>/dev/null" EXIT - echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_FILES[@]} test files" - tests/parallel_run.sh --timeout 7200 "${SHARD_FILES[@]}" + echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_NODES[@]} explicit test nodes" + tests/parallel_run.sh --timeout 7200 "${SHARD_NODES[@]}" TEST_EXIT_CODE=$? echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE From 7843fc82366124a9ac041b471a85c44cbd0be453 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Thu, 9 Apr 2026 11:34:47 +0200 Subject: [PATCH 12/14] ci(lighweight): add --from-file and --skip-collection to parallel_run, update lightweight tests to use it --- .github/workflows/tests_lightweight.yml | 7 ++- tests/_parse_args.sh | 31 +++++++++- tests/parallel_run.sh | 76 +++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index a99d3433b..2c33e5c33 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -226,15 +226,16 @@ jobs: id: run-tests run: | set +e - mapfile -t SHARD_NODES < '${{ steps.collect-nodes.outputs.shard_nodes_file }}' + SHARD_NODES_FILE='${{ steps.collect-nodes.outputs.shard_nodes_file }}' + SHARD_NODE_COUNT=$(wc -l < "$SHARD_NODES_FILE" | tr -d ' ') Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! 
export DISPLAY=:99 trap "kill $XVFB_PID 2>/dev/null" EXIT - echo "Running shard ${{ matrix.shard }}/8 with ${#SHARD_NODES[@]} explicit test nodes" - tests/parallel_run.sh --timeout 7200 "${SHARD_NODES[@]}" + echo "Running shard ${{ matrix.shard }}/8 with $SHARD_NODE_COUNT explicit test nodes from $SHARD_NODES_FILE" + tests/parallel_run.sh --timeout 7200 --skip-collection --from-file "$SHARD_NODES_FILE" TEST_EXIT_CODE=$? echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE diff --git a/tests/_parse_args.sh b/tests/_parse_args.sh index 5730f772f..d1f7f8006 100644 --- a/tests/_parse_args.sh +++ b/tests/_parse_args.sh @@ -11,9 +11,10 @@ # # After calling parse_test_args, these variables are populated: # SERIAL, TIMEOUT, NAME_PATTERN, EVAL_ONLY, SYMBOLIC_ONLY, -# REPEAT_COUNT, OVERWRITE_SCENARIOS, MAX_JOBS, ENV_OVERRIDES[], -# TAGS[], PYTEST_EXTRA_ARGS[], PYTEST_COLLECTION_ARGS[], -# POSITIONAL_ARGS[] +# REPEAT_COUNT, OVERWRITE_SCENARIOS, SKIP_COLLECTION, +# MAX_JOBS, ENV_OVERRIDES[], +# TAGS[], FROM_FILE_PATHS[], PYTEST_EXTRA_ARGS[], +# PYTEST_COLLECTION_ARGS[], POSITIONAL_ARGS[] # # Additional functions: # resolve_test_paths REPO_ROOT - Validates paths in POSITIONAL_ARGS, sets RESOLVED_TEST_PATHS[] @@ -42,9 +43,11 @@ parse_test_args() { SYMBOLIC_ONLY=0 REPEAT_COUNT=1 OVERWRITE_SCENARIOS=0 + SKIP_COLLECTION=0 MAX_JOBS=$_PARSE_ARGS_NUM_CORES ENV_OVERRIDES=() TAGS=() + FROM_FILE_PATHS=() PYTEST_EXTRA_ARGS=() PYTEST_COLLECTION_ARGS=() PYTEST_IGNORE_PATHS=() @@ -104,6 +107,10 @@ parse_test_args() { OVERWRITE_SCENARIOS=1 shift ;; + --skip-collection) + SKIP_COLLECTION=1 + shift + ;; --tags) if [[ -n "${2-}" ]]; then # Split on comma and add each tag to TAGS array @@ -117,6 +124,15 @@ parse_test_args() { return 2 fi ;; + --from-file) + if [[ -n "${2-}" ]]; then + FROM_FILE_PATHS+=( "$2" ) + shift 2 + else + echo "Error: --from-file requires a file path." 
>&2 + return 2 + fi + ;; -j|--jobs) if [[ -z "${2-}" ]]; then echo "Error: -j|--jobs requires an argument (e.g., --jobs 8, --jobs 0, --jobs none)." >&2 @@ -260,12 +276,17 @@ reconstruct_parallel_run_args() { (( SYMBOLIC_ONLY )) && args="$args --symbolic-only" (( REPEAT_COUNT > 1 )) && args="$args --repeat $REPEAT_COUNT" (( OVERWRITE_SCENARIOS )) && args="$args --overwrite-scenarios" + (( SKIP_COLLECTION )) && args="$args --skip-collection" # Note: MAX_JOBS is not passed to CI (CI has its own resource limits) for tag in "${TAGS[@]}"; do args="$args --tags $(printf '%q' "$tag")" done + for path in "${FROM_FILE_PATHS[@]}"; do + args="$args --from-file $(printf '%q' "$path")" + done + # Include --env flags if requested if (( include_env )); then for kv in "${ENV_OVERRIDES[@]}"; do @@ -304,7 +325,9 @@ Options: --eval-only Run only @pytest.mark.eval tests --symbolic-only Run only non-eval tests --repeat N Run each test N times + --skip-collection Trust explicit node ids instead of validating via collection --tags TAG Tag runs for filtering (repeatable) + --from-file PATH Read test targets from a newline-delimited file --overwrite-scenarios Delete and recreate test scenarios -h, --help Show this help -- Pass remaining args directly to pytest @@ -316,6 +339,8 @@ Examples: $script_name -s tests/ # Serial mode (per-file) $script_name -j 8 tests/ # Limit to 8 concurrent $script_name --eval-only tests/ # Only eval tests + $script_name --skip-collection tests/foo.py::test_bar + $script_name --from-file targets.txt # Read targets from file $script_name -e UNILLM_CACHE=false tests/ $script_name tests/ -- -v --tb=short # Pass args to pytest $script_name tests/ -- -k 'gpt-5' # Filter by test name pattern diff --git a/tests/parallel_run.sh b/tests/parallel_run.sh index 0fed212d6..cf3dc1a9e 100755 --- a/tests/parallel_run.sh +++ b/tests/parallel_run.sh @@ -188,8 +188,8 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd -P)" # Parse arguments using shared helper # Returns: 0=success, 1=help requested, 2=error -parse_test_args "$@" -_parse_result=$? +_parse_result=0 +parse_test_args "$@" || _parse_result=$? if (( _parse_result == 1 )); then # Help requested HELP_SCRIPT_NAME="parallel_run.sh" @@ -575,8 +575,51 @@ build_env_exports() { echo "$exports" } -# Reset positional parameters safely under nounset (only expand if set) -set -- ${POSITIONAL_ARGS[@]+"${POSITIONAL_ARGS[@]}"} +# Resolve an auxiliary input file path relative to the caller, tests/, or repo root. +resolve_input_file_path() { + local path="$1" + if [[ -f "$path" ]]; then + printf "%s" "$path" + elif [[ -f "$SCRIPT_DIR/$path" ]]; then + printf "%s" "$SCRIPT_DIR/$path" + elif [[ -f "$REPO_ROOT/$path" ]]; then + printf "%s" "$REPO_ROOT/$path" + else + return 1 + fi +} + +expanded_targets=() +if (( ${#POSITIONAL_ARGS[@]} > 0 )); then + expanded_targets=( "${POSITIONAL_ARGS[@]}" ) +fi + +had_explicit_target_source=0 +if (( ${#POSITIONAL_ARGS[@]} > 0 || ${#FROM_FILE_PATHS[@]} > 0 )); then + had_explicit_target_source=1 +fi + +if (( ${#FROM_FILE_PATHS[@]} > 0 )); then + for list_path in "${FROM_FILE_PATHS[@]}"; do + resolved_list_path=$(resolve_input_file_path "$list_path") || { + echo "Error: --from-file not found: $list_path" >&2 + exit 2 + } + while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do + line="${raw_line%$'\r'}" + line="${line#"${line%%[![:space:]]*}"}" + line="${line%"${line##*[![:space:]]}"}" + [[ -z "$line" || "$line" == \#* ]] && continue + expanded_targets+=( "$line" ) + done < "$resolved_list_path" + done +fi + +if (( ${#expanded_targets[@]} > 0 )); then + set -- "${expanded_targets[@]}" +else + set -- +fi # Always operate from the repo root for discovery, regardless of where the script was invoked cd "$REPO_ROOT" @@ -979,6 +1022,10 @@ declare -a direct_files=() declare -a direct_nodes=() if (( $# == 0 )); then + if (( had_explicit_target_source )); then + echo "No valid directories, files, or 
tests provided." >&2 + exit 1 + fi roots=( "." ) else for arg in "$@"; do @@ -1140,6 +1187,11 @@ validate_and_add_direct_nodes() { return 0 fi + if (( SKIP_COLLECTION )); then + printf '%s\0' "${direct_nodes[@]}" >> "$tmp" + return 0 + fi + # Extract unique base files from direct_nodes local -a base_files=() local seen_files="" @@ -1154,15 +1206,19 @@ validate_and_add_direct_nodes() { # Collect all valid nodes from those files (no marker filter — just checking existence) local collected collected=$(collect_nodes_batch "" "${base_files[@]}") + local collected_file + collected_file=$(mktemp) + printf '%s\n' "$collected" > "$collected_file" # Validate each direct_node against collected output for node in "${direct_nodes[@]}"; do - if echo "$collected" | grep -qxF "$node"; then + if grep -qxF -- "$node" "$collected_file"; then printf '%s\0' "$node" >> "$tmp" else echo "Error: Test node not found (skipping): $node" >&2 fi done + rm -f "$collected_file" } # Gather recursive .py files from roots (NUL-delimited, sorted) @@ -1345,6 +1401,7 @@ fi # Print header before drip-feeding session creation echo "Creating ${#files[@]} tmux sessions..." 
+WALL_START=$(date +%s) for target in "${files[@]}"; do # Report any completions before creating new sessions @@ -1561,8 +1618,17 @@ if (( total_tests > 0 )); then else duration_str="${total_duration}s" fi + wall_duration=$(( $(date +%s) - WALL_START )) + if (( wall_duration >= 60 )); then + wall_mins=$((wall_duration / 60)) + wall_secs=$((wall_duration % 60)) + wall_str="${wall_mins}m ${wall_secs}s" + else + wall_str="${wall_duration}s" + fi total_cache_rate=$(format_cache_rate "$total_hits" "$total_misses") total_calls=$((total_hits + total_misses)) + print_duration_line " Wall time: $wall_str" print_duration_line " Serial duration: $duration_str" print_duration_line " LLM cache: $total_cache_rate ($total_hits hits, $total_misses misses, $total_calls total)" print_duration_line " LLM cost: \$$total_cost" From c543d245cf05df0307d49a4c0aed044fee0b8126 Mon Sep 17 00:00:00 2001 From: CatB1t Date: Fri, 10 Apr 2026 12:11:52 +0200 Subject: [PATCH 13/14] revert changes to 'test.yml' --- .github/scripts/discover_test_paths.py | 87 ++------- .github/workflows/tests.yml | 249 +++++++++++++++++++------ 2 files changed, 212 insertions(+), 124 deletions(-) diff --git a/.github/scripts/discover_test_paths.py b/.github/scripts/discover_test_paths.py index db203cab3..49e37e701 100644 --- a/.github/scripts/discover_test_paths.py +++ b/.github/scripts/discover_test_paths.py @@ -9,9 +9,6 @@ # Expand specific paths to their leaf directories python discover_test_paths.py tests/function_manager tests/actor - # Exclude directories from discovery - python discover_test_paths.py --exclude tests/actor --exclude tests/conversation_manager - When explicit paths are provided: - Files are kept as-is (no expansion) - Directories are expanded to their leaf sub-directories using Option A algorithm @@ -22,8 +19,8 @@ direct test files (space-separated), plus recursive jobs for subdirs """ -import argparse import os +import sys from pathlib import Path EXCLUDE_DIRS = { @@ -36,14 +33,6 @@ "venv", } 
-_exclude_prefixes: list[str] = [] - - -def _is_excluded(path: str | Path) -> bool: - """Return True if path starts with any --exclude prefix.""" - s = str(path) - return any(s == p or s.startswith(p + "/") for p in _exclude_prefixes) - def has_test_files(directory): """Check if directory has test_*.py files directly in it.""" @@ -57,17 +46,9 @@ def has_test_files(directory): def has_test_subdirs(directory): """Check if directory has subdirectories that contain test files (recursively).""" for subdir in directory.iterdir(): - if ( - subdir.is_dir() - and subdir.name not in EXCLUDE_DIRS - and not _is_excluded(subdir) - ): + if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: for root, dirs, files in os.walk(subdir): - dirs[:] = [ - d - for d in dirs - if d not in EXCLUDE_DIRS and not _is_excluded(os.path.join(root, d)) - ] + dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] if any(f.startswith("test_") and f.endswith(".py") for f in files): return True return False @@ -86,7 +67,7 @@ def get_direct_test_files(directory): def collect_paths(directory, paths): """Recursively collect test paths using Option A algorithm.""" - if not directory.is_dir() or _is_excluded(directory): + if not directory.is_dir(): return has_files = has_test_files(directory) @@ -100,20 +81,12 @@ def collect_paths(directory, paths): direct_files = get_direct_test_files(directory) paths.append(" ".join(str(f) for f in direct_files)) for subdir in sorted(directory.iterdir()): - if ( - subdir.is_dir() - and subdir.name not in EXCLUDE_DIRS - and not _is_excluded(subdir) - ): + if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: collect_paths(subdir, paths) elif has_subdirs: # No direct test files, but has subdirs with tests: just recurse for subdir in sorted(directory.iterdir()): - if ( - subdir.is_dir() - and subdir.name not in EXCLUDE_DIRS - and not _is_excluded(subdir) - ): + if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: collect_paths(subdir, paths) @@ -127,9 +100,6 @@ def 
expand_path(path_str): path = Path(path_str) paths = [] - if _is_excluded(path): - return [] - if not path.exists(): # Path doesn't exist - return as-is and let pytest handle the error return [path_str] @@ -152,20 +122,12 @@ def expand_path(path_str): direct_files = get_direct_test_files(path) paths.append(" ".join(str(f) for f in direct_files)) for subdir in sorted(path.iterdir()): - if ( - subdir.is_dir() - and subdir.name not in EXCLUDE_DIRS - and not _is_excluded(subdir) - ): + if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: collect_paths(subdir, paths) elif has_subdirs: # No direct test files, but has subdirs with tests: recurse for subdir in sorted(path.iterdir()): - if ( - subdir.is_dir() - and subdir.name not in EXCLUDE_DIRS - and not _is_excluded(subdir) - ): + if subdir.is_dir() and subdir.name not in EXCLUDE_DIRS: collect_paths(subdir, paths) else: # No test files at all - return as-is and let pytest handle it @@ -179,50 +141,33 @@ def discover_all(): test_root = Path("tests") paths = [] + # Handle test files directly in tests/ root (e.g., test_settings.py) for item in sorted(test_root.iterdir()): - if _is_excluded(item): - continue if ( item.is_file() and item.name.startswith("test_") and item.name.endswith(".py") ): paths.append(str(item)) - elif item.is_dir() and item.name not in EXCLUDE_DIRS: + elif item.is_dir() and item.name.startswith("test"): collect_paths(item, paths) return paths def main(): - parser = argparse.ArgumentParser( - description="Discover test paths for CI parallelism", - ) - parser.add_argument( - "paths", - nargs="*", - help="Paths to expand (default: discover all)", - ) - parser.add_argument( - "--exclude", - action="append", - default=[], - help="Directory prefix to exclude (repeatable)", - ) - args = parser.parse_args() - - global _exclude_prefixes - _exclude_prefixes = [p.rstrip("/") for p in args.exclude] - - if args.paths: + if len(sys.argv) > 1: + # Explicit paths provided - expand each one all_paths = [] - for p in 
args.paths: - expanded = expand_path(p) + for arg in sys.argv[1:]: + expanded = expand_path(arg) all_paths.extend(expanded) paths = all_paths else: + # No arguments - discover all from tests/ paths = discover_all() + # Output unique paths, sorted for p in sorted(set(paths)): print(p) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2ce2d9efc..47116f1f7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,44 +36,137 @@ on: type: number default: 120 + # Push events - tests only run if commit message contains [run-tests] + push: + branches: + - '**' # All branches + + # Pull request events - tests only run if PR title contains [run-tests] + pull_request: + branches: + - '**' # All branches jobs: - # Determine test mode based on trigger type. - # Branch filtering is handled by the on: triggers above, so all events here - # are guaranteed to be: push to staging/main, PR to main/staging, or workflow_dispatch. + # Determine if tests should run based on trigger and commit/PR message should-run-tests: runs-on: ubuntu-latest outputs: - test_mode: ${{ steps.check.outputs.test_mode }} + run_tests: ${{ steps.check.outputs.run_tests }} + parallel_run_cmd: ${{ steps.check.outputs.parallel_run_cmd }} steps: - - name: Determine test mode + - name: Check if tests should run id: check env: + # Pass these via env to avoid shell interpretation of special chars + # (backticks, quotes, $() in commit messages would otherwise be executed) EVENT_NAME: ${{ github.event_name }} + COMMIT_MSG: ${{ github.event.head_commit.message }} + PR_TITLE: ${{ github.event.pull_request.title }} PR_BASE_REF: ${{ github.event.pull_request.base.ref }} + PR_HEAD_REF: ${{ github.event.pull_request.head.ref }} run: | + # Helper function to extract [parallel_run.sh ...] 
content + # Matches ALL occurrences and combines their arguments + extract_parallel_cmd() { + local msg="$1" + local result="" + local remaining="$msg" + + # Loop through all matches of [parallel_run.sh ...] + while [[ "$remaining" =~ \[parallel_run\.sh[[:space:]]+([^\]]+)\] ]]; do + local match="${BASH_REMATCH[1]}" + # Skip literal "..." which is just documentation/example text + if [[ "$match" != "..." ]]; then + if [[ -n "$result" ]]; then + result="$result $match" + else + result="$match" + fi + fi + # Remove the matched portion and continue searching + remaining="${remaining#*"${BASH_REMATCH[0]}"}" + done + + if [[ -n "$result" ]]; then + echo "$result" + fi + } + + # workflow_dispatch always runs tests if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then - echo "test_mode=manual" >> "$GITHUB_OUTPUT" - echo "Mode: manual (workflow_dispatch)" - elif [[ "$EVENT_NAME" == "push" ]]; then - echo "test_mode=lightweight" >> "$GITHUB_OUTPUT" - echo "Mode: lightweight (push to ${GITHUB_REF#refs/heads/})" - elif [[ "$EVENT_NAME" == "pull_request" ]]; then - if [[ "$PR_BASE_REF" == "main" ]]; then - echo "test_mode=full" >> "$GITHUB_OUTPUT" - echo "Mode: full (PR to main)" + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Tests triggered by workflow_dispatch" + exit 0 + fi + + # For push events, check commit message + if [[ "$EVENT_NAME" == "push" ]]; then + # Check for [parallel_run.sh ...] 
first + PARALLEL_CMD=$(extract_parallel_cmd "$COMMIT_MSG") + if [[ -n "$PARALLEL_CMD" ]]; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=$PARALLEL_CMD" >> "$GITHUB_OUTPUT" + echo "Tests triggered by commit message: [parallel_run.sh $PARALLEL_CMD]" + exit 0 + fi + + # Fall back to [run-tests] + if [[ "$COMMIT_MSG" == *"[run-tests]"* ]]; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Tests triggered by commit message containing [run-tests]" else - echo "test_mode=lightweight" >> "$GITHUB_OUTPUT" - echo "Mode: lightweight (PR to $PR_BASE_REF)" + echo "run_tests=false" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Skipping tests - commit message does not contain [run-tests] or [parallel_run.sh ...]" fi + exit 0 fi + # For pull_request events, check PR title + if [[ "$EVENT_NAME" == "pull_request" ]]; then + # Auto-run full suite for staging → main PRs (no tags required) + if [[ "$PR_BASE_REF" == "main" && "$PR_HEAD_REF" == "staging" ]]; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Tests auto-triggered: staging → main PR" + exit 0 + fi + + # Check for [parallel_run.sh ...] 
first + PARALLEL_CMD=$(extract_parallel_cmd "$PR_TITLE") + if [[ -n "$PARALLEL_CMD" ]]; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=$PARALLEL_CMD" >> "$GITHUB_OUTPUT" + echo "Tests triggered by PR title: [parallel_run.sh $PARALLEL_CMD]" + exit 0 + fi + + # Fall back to [run-tests] + if [[ "$PR_TITLE" == *"[run-tests]"* ]]; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Tests triggered by PR title containing [run-tests]" + else + echo "run_tests=false" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Skipping tests - PR title does not contain [run-tests] or [parallel_run.sh ...]" + fi + exit 0 + fi + + # Default: don't run tests + echo "run_tests=false" >> "$GITHUB_OUTPUT" + echo "parallel_run_cmd=" >> "$GITHUB_OUTPUT" + echo "Skipping tests - unknown event type" + discover: runs-on: ubuntu-latest needs: should-run-tests + if: needs.should-run-tests.outputs.run_tests == 'true' outputs: test_dirs: ${{ steps.set-matrix.outputs.test_dirs }} - test_mode: ${{ needs.should-run-tests.outputs.test_mode }} parallel_run_args: ${{ steps.set-matrix.outputs.parallel_run_args }} job_timeout: ${{ steps.set-matrix.outputs.job_timeout }} # Project deletion flags (handled at workflow level to avoid race conditions) @@ -88,15 +181,25 @@ jobs: - id: set-matrix shell: bash run: | - # Discover test paths and create matrix entries for parallel runners. + # Parse test paths and flags, then create matrix entries for each path. + # Each unique path gets its own parallel GitHub runner. + # + # Sources (priority order): + # 1. [parallel_run.sh ...] from commit/PR → parse paths and flags + # 2. workflow_dispatch inputs → test_path + parallel_run_args + # 3. Default (no paths) → discover all leaf test folders # - # Sources: - # 1. workflow_dispatch inputs → test_path + parallel_run_args - # 2. 
Default → discover all leaf test folders (with mode-based exclusions) + # ALL paths (explicit or discovered) are expanded to leaf directories: + # - Files are kept as-is + # - Directories are expanded to their leaf sub-folders # - # Paths are expanded to leaf directories for maximum parallelism. + # Leaf discovery (Option A algorithm): + # - Leaf directories (test files, no test subdirs) → one job per directory + # - Mixed directories (test files AND test subdirs) → one bundled job for + # all direct test files (space-separated), plus recursive jobs for subdirs + # This maximizes parallelism regardless of how paths are specified. - TEST_MODE="${{ needs.should-run-tests.outputs.test_mode }}" + PARALLEL_CMD="${{ needs.should-run-tests.outputs.parallel_run_cmd }}" INPUT_PATH="${{ inputs.test_path }}" INPUT_ARGS="${{ inputs.parallel_run_args }}" @@ -105,15 +208,31 @@ jobs: declare -a PATHS=() declare -a FLAGS=() - # Build --exclude args for lightweight mode - EXCLUDE_ARGS="" - if [[ "$TEST_MODE" == "lightweight" ]]; then - EXCLUDE_ARGS="--exclude tests/actor --exclude tests/conversation_manager --exclude tests/async_tool_loop" - echo "Lightweight mode: excluding tests/actor, tests/conversation_manager, tests/async_tool_loop" >&2 - fi - - if [[ "$TEST_MODE" == "manual" && -n "$INPUT_PATH" && "$INPUT_PATH" != "." ]]; then + if [[ -n "$PARALLEL_CMD" ]]; then + # Parse [parallel_run.sh ...] from commit/PR message + # Separate flags (--env, --eval-only, etc.) 
from paths + echo "Parsing [parallel_run.sh $PARALLEL_CMD]" >&2 + LAST_FLAG="" + for token in $PARALLEL_CMD; do + if [[ "$token" == --* ]]; then + # It's a flag - collect it and its value if needed + FLAGS+=("$token") + LAST_FLAG="$token" + elif [[ "$LAST_FLAG" == "--env" || "$LAST_FLAG" == "--tags" || "$LAST_FLAG" == "-j" || "$LAST_FLAG" == "--jobs" || "$LAST_FLAG" == "-t" || "$LAST_FLAG" == "--timeout" ]]; then + # Previous token was a flag that takes a value + FLAGS+=("$token") + LAST_FLAG="" + else + # It's a path - collect for expansion + RAW_PATHS+=("$token") + LAST_FLAG="" + fi + done + elif [[ -n "$INPUT_PATH" && "$INPUT_PATH" != "." ]]; then + # workflow_dispatch with specific path(s) + # Split space-separated paths into array read -ra RAW_PATHS <<< "$INPUT_PATH" + # Flags come from parallel_run_args input if [[ -n "$INPUT_ARGS" ]]; then read -ra FLAGS <<< "$INPUT_ARGS" fi @@ -121,21 +240,28 @@ jobs: fi # Expand paths to leaf directories using discover_test_paths.py + # - If RAW_PATHS is empty, discover all test paths from tests/ + # - If RAW_PATHS has entries, expand each directory to its leaf sub-folders + # This ensures maximum parallelism regardless of how paths are specified. 
if (( ${#RAW_PATHS[@]} == 0 )); then + # No explicit paths - discover all while IFS= read -r path; do PATHS+=("$path") - done < <(python3 .github/scripts/discover_test_paths.py $EXCLUDE_ARGS) - if [[ "$TEST_MODE" == "manual" && -n "$INPUT_ARGS" ]]; then + done < <(python3 .github/scripts/discover_test_paths.py) + # Flags come from parallel_run_args input (if any) + if [[ -n "$INPUT_ARGS" ]]; then read -ra FLAGS <<< "$INPUT_ARGS" fi echo "Discovered ${#PATHS[@]} test paths (leaf dirs + individual files)" >&2 else + # Expand explicit paths to their leaf directories while IFS= read -r path; do PATHS+=("$path") - done < <(python3 .github/scripts/discover_test_paths.py $EXCLUDE_ARGS "${RAW_PATHS[@]}") + done < <(python3 .github/scripts/discover_test_paths.py "${RAW_PATHS[@]}") echo "Expanded ${#RAW_PATHS[@]} path(s) to ${#PATHS[@]} leaf test paths" >&2 fi + # Build JSON array of paths for matrix if (( ${#PATHS[@]} == 0 )); then echo "Error: No test paths found" >&2 exit 1 @@ -155,9 +281,12 @@ jobs: # to .env in the pytest job. We only parse it here for project mgmt flags. # ===================================================================== - DELETE_ON_START="false" - DELETE_ON_EXIT="false" - RANDOM_PROJECTS="false" + # CI defaults for project management (can be overridden) + # With local orchestra (default), these are ignored - each job has fresh DB + # Only relevant when ORCHESTRA_URL is set to a non-local URL (staging/production) + DELETE_ON_START="false" # Default: no deletion (local orchestra has fresh DB) + DELETE_ON_EXIT="false" # Default: keep project after tests + RANDOM_PROJECTS="false" # Default: shared project mode # Orchestra branch precedence (highest to lowest): # 1. 
--env LOCAL_ORCHESTRA_BRANCH=xxx (handled below in FLAGS parsing) @@ -179,9 +308,11 @@ jobs: fi # Parse env_file_content for project management flags only + # (actual env vars are written to .env in pytest job, not passed as --env args) ENV_FILE_CONTENT_RAW="${{ inputs.env_file_content }}" if [[ -n "$ENV_FILE_CONTENT_RAW" ]]; then echo "Parsing env_file_content for project management flags..." >&2 + # Try to decode as base64; if it fails or produces garbage, assume raw content if DECODED=$(echo "$ENV_FILE_CONTENT_RAW" | base64 -d 2>/dev/null) && [[ "$DECODED" == *$'\n'* || "$DECODED" == *"="* ]]; then ENV_FILE_CONTENT="$DECODED" else @@ -189,13 +320,20 @@ jobs: fi if [[ -n "$ENV_FILE_CONTENT" ]]; then while IFS= read -r line || [[ -n "$line" ]]; do + # Skip empty lines and comments [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + + # Extract KEY=VALUE (handles quotes) if [[ "$line" =~ ^[[:space:]]*([A-Za-z_][A-Za-z0-9_]*)[[:space:]]*=[[:space:]]*(.*)[[:space:]]*$ ]]; then key="${BASH_REMATCH[1]}" value="${BASH_REMATCH[2]}" + + # Strip surrounding quotes if present if [[ "$value" =~ ^\"(.*)\"$ ]] || [[ "$value" =~ ^\'(.*)\'$ ]]; then value="${BASH_REMATCH[1]}" fi + + # Only check for project management flags (other vars handled via .env file) case "$key=$value" in UNIFY_TESTS_DELETE_PROJ_ON_START=true|UNIFY_TESTS_DELETE_PROJ_ON_START=True|UNIFY_TESTS_DELETE_PROJ_ON_START=1) DELETE_ON_START="true" @@ -232,6 +370,7 @@ jobs: fi # Process FLAGS (from parallel_run_args) - highest priority + # These are explicit --env args passed via commit message or workflow dispatch declare -a CLEANED_FLAGS=() skip_next=false @@ -244,6 +383,7 @@ jobs: flag="${FLAGS[$i]}" next_val="${FLAGS[$((i+1))]:-}" + # Check for --env with project management flags (strip them, handle at workflow level) if [[ "$flag" == "--env" ]]; then case "$next_val" in UNIFY_TESTS_DELETE_PROJ_ON_START=true|UNIFY_TESTS_DELETE_PROJ_ON_START=True|UNIFY_TESTS_DELETE_PROJ_ON_START=1) @@ -272,7 +412,7 @@ 
jobs: ;; UNIFY_TESTS_RAND_PROJ=true|UNIFY_TESTS_RAND_PROJ=True|UNIFY_TESTS_RAND_PROJ=1) RANDOM_PROJECTS="true" - CLEANED_FLAGS+=("$flag" "$next_val") + CLEANED_FLAGS+=("$flag" "$next_val") # Keep for runners skip_next=true echo " args: RANDOM_PROJECTS=true (override)" >&2 continue @@ -306,6 +446,8 @@ jobs: echo "random_projects=$RANDOM_PROJECTS" >> "$GITHUB_OUTPUT" echo "orchestra_branch=$ORCHESTRA_BRANCH" >> "$GITHUB_OUTPUT" + # Build parallel_run_args from explicit flags only (not env_file content) + # env_file content is handled separately by writing to .env on the runner if (( ${#CLEANED_FLAGS[@]} > 0 )); then flags_str="${CLEANED_FLAGS[*]}" echo "parallel_run_args=$flags_str" >> "$GITHUB_OUTPUT" @@ -330,6 +472,7 @@ jobs: needs: [should-run-tests, discover] # Only run if DELETE_ON_START is requested AND we're in shared project mode if: | + needs.should-run-tests.outputs.run_tests == 'true' && needs.discover.outputs.delete_on_start == 'true' && needs.discover.outputs.random_projects != 'true' env: @@ -373,6 +516,7 @@ jobs: needs: [should-run-tests, discover, setup] if: | always() && + needs.should-run-tests.outputs.run_tests == 'true' && (needs.setup.result == 'success' || needs.setup.result == 'skipped') strategy: fail-fast: false @@ -694,30 +838,28 @@ jobs: run: | set +e - TEST_MODE="${{ needs.discover.outputs.test_mode }}" + # parallel_run_args comes from discover job (parsed from commit message or inputs) EXTRA_ARGS="${{ needs.discover.outputs.parallel_run_args }}" TEST_PATH="${{ steps.normalize-path.outputs.test_args }}" TIMEOUT="${{ steps.normalize-path.outputs.timeout_seconds }}" - # Determine pytest marker args based on test mode - MARKER_ARGS="" - if [[ "$TEST_MODE" == "lightweight" ]]; then - MARKER_ARGS='-- -m "not llm_call"' - fi - + # Start Xvfb manually instead of using xvfb-run. + # xvfb-run doesn't propagate signals to child processes, making + # workflow cancellation ineffective. 
This approach ensures SIGTERM + # reaches parallel_run.sh directly. Xvfb :99 -screen 0 1920x1080x24 & XVFB_PID=$! export DISPLAY=:99 + + # Ensure Xvfb is cleaned up on exit trap "kill $XVFB_PID 2>/dev/null" EXIT + # parallel_run.sh handles starting local orchestra automatically + # Note: TEST_PATH must come before EXTRA_ARGS because EXTRA_ARGS may contain + # "-- pytest-args" and everything after -- is treated as pytest arguments # TEST_PATH is unquoted to allow word splitting for bundled multi-file entries - if [[ -n "$MARKER_ARGS" ]]; then - echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH $MARKER_ARGS" - tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH $MARKER_ARGS - else - echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH $EXTRA_ARGS" - tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH $EXTRA_ARGS - fi + echo "Running: parallel_run.sh --timeout $TIMEOUT $TEST_PATH $EXTRA_ARGS" + tests/parallel_run.sh --timeout "$TIMEOUT" $TEST_PATH $EXTRA_ARGS TEST_EXIT_CODE=$? 
echo "exit_code=$TEST_EXIT_CODE" >> "$GITHUB_OUTPUT" exit $TEST_EXIT_CODE @@ -972,6 +1114,7 @@ jobs: # Cancelled runs likely left partial test data that should be cleaned up if: | always() && + needs.should-run-tests.outputs.run_tests == 'true' && needs.discover.outputs.random_projects != 'true' && (needs.discover.outputs.delete_on_exit == 'true' || needs.pytest.result == 'cancelled') env: @@ -1008,7 +1151,7 @@ jobs: name: Consolidate and store cache runs-on: ubuntu-latest needs: [should-run-tests, pytest, cleanup] - if: always() + if: always() && needs.should-run-tests.outputs.run_tests == 'true' steps: - uses: actions/checkout@v4 From f6713d380b1c01d79e8bca5b05a2d72769cc0f2c Mon Sep 17 00:00:00 2001 From: CatB1t Date: Fri, 10 Apr 2026 12:12:50 +0200 Subject: [PATCH 14/14] ci(lightweight): disable checks on push, keep for PRs to staging & main --- .github/workflows/tests_lightweight.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/tests_lightweight.yml b/.github/workflows/tests_lightweight.yml index 2c33e5c33..666330d52 100644 --- a/.github/workflows/tests_lightweight.yml +++ b/.github/workflows/tests_lightweight.yml @@ -9,11 +9,6 @@ run-name: >- }} on: - push: - branches: - - staging - - main - pull_request: branches: - main