diff --git a/agent/llm_withtools.py b/agent/llm_withtools.py
index b25741e..72aaf25 100644
--- a/agent/llm_withtools.py
+++ b/agent/llm_withtools.py
@@ -95,7 +95,7 @@ def chat_with_agent(
logging=print,
tools_available=[], # Empty list means no tools, 'all' means all tools
multiple_tool_calls=False, # Whether to allow multiple tool calls in a single response
- max_tool_calls=40, # Maximum number of tool calls allowed in a single response, -1 for unlimited
+ max_tool_calls=40, # Max tool calls per response, -1=unlimited
):
get_response_fn = get_response_from_llm
# Construct message
@@ -107,15 +107,17 @@ def chat_with_agent(
# Load all tools
all_tools = load_tools(logging=logging, names=tools_available)
tools_dict = {tool['info']['name']: tool for tool in all_tools}
- system_msg = f"{get_tooluse_prompt([tool['info'] for tool in all_tools])}\n\n"
+ tool_infos = [t['info'] for t in all_tools]
+ tool_system_msg = get_tooluse_prompt(tool_infos)
num_tool_calls = 0
- # Call API
+ # Call API — tool descriptions sent as system message
logging(f"Input: {repr(msg)}")
response, new_msg_history, info = get_response_fn(
- msg=system_msg + msg,
+ msg=msg,
model=model,
msg_history=new_msg_history,
+ system_msg=tool_system_msg,
)
logging(f"Output: {repr(response)}")
# logging(f"Info: {repr(info)}")
@@ -139,13 +141,17 @@ def chat_with_agent(
tool_input = tool_use['tool_input']
tool_output = process_tool_call(tools_dict, tool_name, tool_input)
num_tool_calls += 1
- tool_msg = f'''
- {{
- "tool_name": "{tool_name}",
- "tool_input": {tool_input},
- "tool_output": "{tool_output}"
- }}
- '''.strip()
+ tool_msg_data = {
+ "tool_name": tool_name,
+ "tool_input": tool_input,
+ "tool_output": str(tool_output),
+ }
+ tool_json = json.dumps(
+ tool_msg_data, indent=2,
+ )
+ tool_msg = (
+ f"\n{tool_json}\n"
+ )
logging(f"Tool output: {repr(tool_msg)}")
tool_msgs.append(tool_msg)
@@ -154,9 +160,10 @@ def chat_with_agent(
logging("Error: Output context exceeded. Please try again.")
tool_msgs.append("Error: Output context exceeded. Please try again.")
- # Get tool response
+ # Get tool response — no system_msg on
+ # subsequent calls (already in context)
response, new_msg_history, info = get_response_fn(
- msg=system_msg + '\n\n'.join(tool_msgs),
+ msg='\n\n'.join(tool_msgs),
model=model,
msg_history=new_msg_history,
)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000..fe6a863
Binary files /dev/null and b/tests/__pycache__/__init__.cpython-314.pyc differ
diff --git a/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..bbd9fbe
Binary files /dev/null and b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..80ec9e0
Binary files /dev/null and b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..1221cce
Binary files /dev/null and b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..5ee0b44
Binary files /dev/null and b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..730bc85
Binary files /dev/null and b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..45877a0
Binary files /dev/null and b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..39d1bd2
Binary files /dev/null and b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..d53ae9a
Binary files /dev/null and b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..db08b30
Binary files /dev/null and b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..8005677
Binary files /dev/null and b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..2708571
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,188 @@
+"""Shared fixtures for HyperAgents test suite."""
+
+import importlib
+import importlib.util
+import os
+import json
+import sys
+import tempfile
+import shutil
+import types
+
+import pytest
+
+# ---- Project root on sys.path ----
+_PROJ = os.path.normpath(
+ "C:/Users/ryuke/Desktop/Projects/Hyperagents"
+)
+if _PROJ not in sys.path:
+ sys.path.insert(0, _PROJ)
+
+
+def _install_lightweight_mocks():
+ """Install minimal mock modules so that project
+ modules can be imported without heavy deps like
+ docker, litellm, backoff, torch, etc.
+
+ Only installs mocks for modules NOT already
+ present -- safe to call multiple times.
+ """
+ def _ensure(name, factory):
+ if name not in sys.modules:
+ sys.modules[name] = factory()
+
+ # docker
+ _ensure("docker", lambda: types.ModuleType("docker"))
+
+ # utils.docker_utils
+ def _make_docker_utils():
+ m = types.ModuleType("utils.docker_utils")
+ m.copy_to_container = lambda *a, **k: None
+ m.log_container_output = lambda *a, **k: None
+ return m
+ _ensure("utils.docker_utils", _make_docker_utils)
+
+ # utils.git_utils
+ def _make_git_utils():
+ m = types.ModuleType("utils.git_utils")
+ m.commit_repo = lambda *a, **k: "abc123"
+ m.get_git_commit_hash = lambda *a, **k: "abc"
+ return m
+ _ensure("utils.git_utils", _make_git_utils)
+
+ # backoff
+ def _make_backoff():
+ m = types.ModuleType("backoff")
+ m.expo = "expo"
+ m.on_exception = (
+ lambda *a, **kw: (lambda f: f)
+ )
+ return m
+ _ensure("backoff", _make_backoff)
+
+ # requests / requests.exceptions
+ def _make_requests():
+ m = types.ModuleType("requests")
+ exc = types.ModuleType("requests.exceptions")
+ exc.RequestException = Exception
+ m.exceptions = exc
+ sys.modules["requests.exceptions"] = exc
+ return m
+ _ensure("requests", _make_requests)
+
+ # litellm
+ def _make_litellm():
+ m = types.ModuleType("litellm")
+ m.drop_params = True
+ m.completion = lambda **kw: None
+ return m
+ _ensure("litellm", _make_litellm)
+
+ # dotenv
+ def _make_dotenv():
+ m = types.ModuleType("dotenv")
+ m.load_dotenv = lambda *a, **kw: None
+ return m
+ _ensure("dotenv", _make_dotenv)
+
+ # utils.thread_logger
+ def _make_thread_logger():
+ m = types.ModuleType("utils.thread_logger")
+ class FakeLM:
+ def __init__(self, **kw):
+ self.log = print
+ m.ThreadLoggerManager = FakeLM
+ return m
+ _ensure(
+ "utils.thread_logger", _make_thread_logger
+ )
+
+ # tqdm (used by genesis evaluator)
+ def _make_tqdm():
+ m = types.ModuleType("tqdm")
+ m.tqdm = lambda *a, **kw: iter([])
+ return m
+ _ensure("tqdm", _make_tqdm)
+
+ # pandas (used by ensemble.py)
+ def _make_pandas():
+ m = types.ModuleType("pandas")
+ m.read_csv = lambda *a, **kw: None
+ return m
+ _ensure("pandas", _make_pandas)
+ _ensure("pd", _make_pandas)
+
+
+# Install mocks at import time so all test modules
+# benefit.
+_install_lightweight_mocks()
+
+
+def load_module_from_file(module_name, file_path):
+ """Load a Python module directly from a file path,
+ bypassing package __init__.py files.
+
+ Useful for modules whose package __init__ imports
+ heavy deps (e.g., torch).
+ """
+ abs_path = os.path.join(_PROJ, file_path)
+ spec = importlib.util.spec_from_file_location(
+ module_name, abs_path
+ )
+ mod = importlib.util.module_from_spec(spec)
+ sys.modules[module_name] = mod
+ spec.loader.exec_module(mod)
+ return mod
+
+
+@pytest.fixture
+def tmp_dir():
+ """Provide a temporary directory, cleaned up after test."""
+ d = tempfile.mkdtemp()
+ yield d
+ shutil.rmtree(d, ignore_errors=True)
+
+
+@pytest.fixture
+def sample_archive_jsonl(tmp_dir):
+ """Create a sample archive.jsonl file with valid data."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ entries = [
+ {
+ "current_genid": 0,
+ "archive": [0],
+ },
+ {
+ "current_genid": 1,
+ "archive": [0, 1],
+ },
+ {
+ "current_genid": 2,
+ "archive": [0, 1, 2],
+ },
+ ]
+ with open(path, "w") as f:
+ for entry in entries:
+ f.write(json.dumps(entry) + "\n")
+ return path
+
+
+@pytest.fixture
+def sample_metadata_dir(tmp_dir):
+ """Create gen_X directories with metadata.json files."""
+ for genid in range(3):
+ gen_dir = os.path.join(
+ tmp_dir, f"gen_{genid}"
+ )
+ os.makedirs(gen_dir, exist_ok=True)
+ metadata = {
+ "parent_genid": genid - 1 if genid > 0 else None,
+ "valid_parent": True,
+ "prev_patch_files": [],
+ "curr_patch_files": [],
+ }
+ with open(
+ os.path.join(gen_dir, "metadata.json"), "w"
+ ) as f:
+ json.dump(metadata, f)
+ return tmp_dir
diff --git a/tests/test_archive_parsing.py b/tests/test_archive_parsing.py
new file mode 100644
index 0000000..f6fb711
--- /dev/null
+++ b/tests/test_archive_parsing.py
@@ -0,0 +1,180 @@
+"""Tests for JSONL archive parsing (F-07 fix).
+
+Validates that load_archive_data() correctly parses
+JSONL format (one JSON object per line) instead of
+treating the whole file as a single JSON array.
+"""
+
+import json
+import os
+import tempfile
+
+import pytest
+
+
+# --------------- helpers to avoid heavy project imports -----
+# We extract the parsing logic directly to test in
+# isolation. If import works, we test the real function
+# too.
+
+def _parse_jsonl(filepath, last_only=True):
+ """Pure-Python reimplementation of the JSONL parsing
+ logic from utils/gl_utils.py::load_archive_data."""
+ if not os.path.exists(filepath):
+ raise FileNotFoundError(
+ f"Metadata file not found at {filepath}"
+ )
+ archive_data = []
+ with open(filepath, "r") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ try:
+ archive_data.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ if last_only:
+ return archive_data[-1]
+ return archive_data
+
+
+class TestLoadArchiveDataParsing:
+ """Tests for JSONL line-by-line parsing."""
+
+ def test_parse_valid_jsonl(self, tmp_dir):
+ """Valid JSONL with multiple lines parses each
+ line independently."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ entries = [
+ {"current_genid": 0, "archive": [0]},
+ {"current_genid": 1, "archive": [0, 1]},
+ ]
+ with open(path, "w") as f:
+ for e in entries:
+ f.write(json.dumps(e) + "\n")
+
+ result = _parse_jsonl(path, last_only=False)
+ assert len(result) == 2
+ assert result[0]["current_genid"] == 0
+ assert result[1]["archive"] == [0, 1]
+
+ def test_parse_last_only(self, tmp_dir):
+ """last_only=True returns only the final entry."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ entries = [
+ {"current_genid": 0, "archive": [0]},
+ {"current_genid": 1, "archive": [0, 1]},
+ {"current_genid": 2, "archive": [0, 1, 2]},
+ ]
+ with open(path, "w") as f:
+ for e in entries:
+ f.write(json.dumps(e) + "\n")
+
+ result = _parse_jsonl(path, last_only=True)
+ assert result["current_genid"] == 2
+ assert len(result["archive"]) == 3
+
+ def test_empty_lines_skipped(self, tmp_dir):
+ """Blank lines between entries are ignored."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ with open(path, "w") as f:
+ f.write(json.dumps({"a": 1}) + "\n")
+ f.write("\n")
+ f.write(" \n")
+ f.write(json.dumps({"a": 2}) + "\n")
+
+ result = _parse_jsonl(path, last_only=False)
+ assert len(result) == 2
+
+ def test_malformed_lines_skipped(self, tmp_dir):
+ """Malformed JSON lines are skipped without
+ crashing."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ with open(path, "w") as f:
+ f.write(json.dumps({"ok": True}) + "\n")
+ f.write("this is not json\n")
+ f.write("{broken json\n")
+ f.write(json.dumps({"ok": True}) + "\n")
+
+ result = _parse_jsonl(path, last_only=False)
+ assert len(result) == 2
+ assert all(e["ok"] for e in result)
+
+ def test_empty_file_raises(self, tmp_dir):
+ """An empty file (no valid entries) raises
+ IndexError when last_only=True."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ with open(path, "w") as f:
+ f.write("")
+
+ with pytest.raises(IndexError):
+ _parse_jsonl(path, last_only=True)
+
+ def test_empty_file_returns_empty_list(self, tmp_dir):
+ """An empty file returns [] when last_only=False."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ with open(path, "w") as f:
+ f.write("")
+
+ result = _parse_jsonl(path, last_only=False)
+ assert result == []
+
+ def test_missing_file_raises(self, tmp_dir):
+ """A nonexistent file raises FileNotFoundError."""
+ path = os.path.join(tmp_dir, "nonexistent.jsonl")
+ with pytest.raises(FileNotFoundError):
+ _parse_jsonl(path)
+
+ def test_single_line_file(self, tmp_dir):
+ """A file with exactly one line works correctly."""
+ path = os.path.join(tmp_dir, "archive.jsonl")
+ entry = {"current_genid": 0, "archive": [0]}
+ with open(path, "w") as f:
+ f.write(json.dumps(entry) + "\n")
+
+ result = _parse_jsonl(path, last_only=True)
+ assert result == entry
+
+ result_all = _parse_jsonl(path, last_only=False)
+ assert len(result_all) == 1
+
+
+class TestLoadArchiveDataReal:
+ """Test the real load_archive_data function.
+
+ conftest.py installs lightweight mocks so the
+ import succeeds without docker/litellm/etc.
+ """
+
+ @pytest.fixture(autouse=True)
+ def _try_import(self):
+ """Import load_archive_data (mocks in
+ conftest handle heavy deps)."""
+ try:
+ from utils.gl_utils import (
+ load_archive_data,
+ )
+ self.load_fn = load_archive_data
+ except Exception as e:
+ pytest.skip(
+ f"Could not import: {e}"
+ )
+
+ def test_real_parse_valid(
+ self, sample_archive_jsonl
+ ):
+ """Real function parses valid JSONL."""
+ result = self.load_fn(
+ sample_archive_jsonl, last_only=False
+ )
+ assert len(result) == 3
+ assert result[-1]["current_genid"] == 2
+
+ def test_real_last_only(
+ self, sample_archive_jsonl
+ ):
+ """Real function returns last entry."""
+ result = self.load_fn(
+ sample_archive_jsonl, last_only=True
+ )
+ assert result["current_genid"] == 2
diff --git a/tests/test_bash_sentinel.py b/tests/test_bash_sentinel.py
new file mode 100644
index 0000000..ee35689
--- /dev/null
+++ b/tests/test_bash_sentinel.py
@@ -0,0 +1,68 @@
+"""Tests for BashSession random sentinel (F-14).
+
+Validates that each BashSession instance gets a
+unique, random sentinel instead of the static
+'<>' string, preventing command output from
+accidentally matching the sentinel.
+"""
+
+import re
+import sys
+
+import pytest
+
+from agent.tools.bash import BashSession
+
+
+class TestBashSentinelUniqueness:
+ """F-14: Each BashSession gets a unique
+ random sentinel."""
+
+ def test_sentinel_is_not_static(self):
+ """Sentinel must not be the old static
+ '<>' string."""
+ session = BashSession()
+ assert session._sentinel != "<>"
+ assert session._sentinel != "<>"
+
+ def test_sentinel_matches_expected_pattern(self):
+ """Sentinel matches <> format."""
+ session = BashSession()
+ pattern = r"^<>$"
+ assert re.match(pattern, session._sentinel), (
+ f"Sentinel {session._sentinel!r} does not "
+ f"match pattern {pattern}"
+ )
+
+ def test_each_instance_gets_unique_sentinel(self):
+ """Two separate BashSession instances must have
+ different sentinels."""
+ s1 = BashSession()
+ s2 = BashSession()
+ assert s1._sentinel != s2._sentinel, (
+ "Two sessions should not share a sentinel"
+ )
+
+ def test_many_instances_all_unique(self):
+ """Creating 50 sessions yields 50 distinct
+ sentinels."""
+ sentinels = {
+ BashSession()._sentinel for _ in range(50)
+ }
+ assert len(sentinels) == 50
+
+ def test_sentinel_hex_length(self):
+ """The hex portion has 32 chars (uuid4.hex)."""
+ session = BashSession()
+ # <>
+ inner = session._sentinel[7:-2] # strip <>
+ assert len(inner) == 32
+ assert all(c in "0123456789abcdef" for c in inner)
+
+ def test_sentinel_used_in_run_command(self):
+ """The run() method references self._sentinel,
+ not a hardcoded string."""
+ import inspect
+ src = inspect.getsource(BashSession.run)
+ assert "self._sentinel" in src
+ assert "<>" not in src.lower()
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
new file mode 100644
index 0000000..3148c0f
--- /dev/null
+++ b/tests/test_ensemble.py
@@ -0,0 +1,174 @@
+"""Tests for ensemble majority voting logic.
+
+Validates weighted majority voting for classification
+domains, single-best fallback, and domain gating.
+"""
+
+import json
+import os
+import sys
+from collections import defaultdict
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+_PROJ = "C:/Users/ryuke/Desktop/Projects/Hyperagents"
+if _PROJ not in sys.path:
+ sys.path.insert(0, _PROJ)
+
+
+# ------ Pure logic tests (no imports needed) ------
+
+class TestMajorityVoteLogic:
+ """Test the weighted majority voting algorithm
+ in isolation."""
+
+ @staticmethod
+ def _weighted_majority(predictions_scores):
+ """Reimplement the core voting logic from
+ ensemble.py for isolated testing.
+
+ Args:
+ predictions_scores: list of (pred, score)
+ Returns:
+ The prediction with highest total weight.
+ """
+ votes = defaultdict(float)
+ for pred, score in predictions_scores:
+ if pred is not None:
+ votes[pred] += score
+ if votes:
+ return max(votes, key=votes.get)
+ return None
+
+ def test_majority_simple(self):
+ """3 agents: A(0.8), B(0.7), A(0.6) -> A
+ wins with 1.4 vs 0.7."""
+ result = self._weighted_majority([
+ ("A", 0.8), ("B", 0.7), ("A", 0.6),
+ ])
+ assert result == "A"
+
+ def test_weights_override_count(self):
+ """2 agents vote B with low scores, 1 votes A
+ with high score. A wins on weight."""
+ result = self._weighted_majority([
+ ("A", 0.95), ("B", 0.1), ("B", 0.1),
+ ])
+ assert result == "A"
+
+ def test_tie_broken_deterministically(self):
+ """When weights tie, max() returns
+ deterministically."""
+ result = self._weighted_majority([
+ ("A", 0.5), ("B", 0.5),
+ ])
+ # max() returns first key encountered with max
+ assert result in ("A", "B")
+
+ def test_all_same_vote(self):
+ """All agents agree -> that answer wins."""
+ result = self._weighted_majority([
+ ("X", 0.9), ("X", 0.8), ("X", 0.7),
+ ])
+ assert result == "X"
+
+ def test_none_predictions_ignored(self):
+ """None predictions don't count toward any
+ vote."""
+ result = self._weighted_majority([
+ ("A", 0.9), (None, 0.8), ("B", 0.7),
+ ])
+ assert result == "A"
+
+ def test_all_none_returns_none(self):
+ """If all predictions are None, returns None."""
+ result = self._weighted_majority([
+ (None, 0.9), (None, 0.8),
+ ])
+ assert result is None
+
+
+class TestEnsembleDomainGating:
+ """Verify domain-based routing: classification
+ domains use voting, others use single-best."""
+
+ def test_classification_domains_known(self):
+ """Classification domains that support
+ ensemble are a known set."""
+ # conftest.py mocks make this import work
+ from ensemble import _CLASSIFICATION_DOMAINS
+ assert (
+ "search_arena" in _CLASSIFICATION_DOMAINS
+ )
+ assert (
+ "paper_review" in _CLASSIFICATION_DOMAINS
+ )
+ assert (
+ "imo_grading" in _CLASSIFICATION_DOMAINS
+ )
+
+ def test_non_classification_not_in_set(self):
+ """Non-classification domains are NOT in the
+ classification set."""
+ from ensemble import _CLASSIFICATION_DOMAINS
+ assert (
+ "genesis_go2walking"
+ not in _CLASSIFICATION_DOMAINS
+ )
+ assert (
+ "balrog_babyai"
+ not in _CLASSIFICATION_DOMAINS
+ )
+ assert (
+ "polyglot"
+ not in _CLASSIFICATION_DOMAINS
+ )
+
+
+class TestEnsembleFallback:
+ """Test fallback behavior when <3 agents
+ available."""
+
+ @staticmethod
+ def _should_use_voting(
+ domain, can_ensemble, n_agents
+ ):
+ """Replicate the gating logic from ensemble()
+ for testing."""
+ classification = {
+ "search_arena",
+ "paper_review",
+ "imo_grading",
+ }
+ return (
+ domain in classification
+ and can_ensemble
+ and n_agents >= 3
+ )
+
+ def test_fewer_than_3_uses_single_best(self):
+ """With 2 agents, should NOT use voting."""
+ assert not self._should_use_voting(
+ "search_arena", True, 2
+ )
+
+ def test_3_or_more_uses_voting(self):
+ """With 3+ agents, should use voting."""
+ assert self._should_use_voting(
+ "search_arena", True, 3
+ )
+
+ def test_non_classification_always_single(self):
+ """Non-classification domain never uses
+ voting regardless of agent count."""
+ assert not self._should_use_voting(
+ "genesis_go2walking", False, 5
+ )
+
+ def test_can_ensemble_false_blocks_voting(self):
+ """Even a classification domain with
+ can_ensemble=False uses single-best."""
+ assert not self._should_use_voting(
+ "imo_grading", False, 3
+ )
diff --git a/tests/test_llm_metadata.py b/tests/test_llm_metadata.py
new file mode 100644
index 0000000..759f9cd
--- /dev/null
+++ b/tests/test_llm_metadata.py
@@ -0,0 +1,93 @@
+"""Tests for LLM response metadata (F-10).
+
+Validates that get_response_from_llm() returns an
+info dict with expected keys: finish_reason, usage,
+model.
+"""
+
+import inspect
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# conftest.py mocks backoff, litellm, dotenv, etc.
+from agent.llm import get_response_from_llm
+
+
+class TestLlmMetadataKeys:
+ """F-10: info dict contains expected keys."""
+
+ def test_return_annotation_is_tuple(self):
+ """get_response_from_llm returns a 3-tuple
+ (text, history, info)."""
+ sig = inspect.signature(
+ get_response_from_llm
+ )
+ ret = sig.return_annotation
+ assert "Tuple" in str(ret)
+
+ def test_info_dict_constructed_in_source(self):
+ """Source constructs info with finish_reason,
+ usage, model keys."""
+ src = inspect.getsource(
+ get_response_from_llm
+ )
+ assert '"finish_reason"' in src
+ assert '"usage"' in src
+ assert '"model"' in src
+
+ def test_info_dict_is_returned(self):
+ """The function returns (response_text,
+ new_msg_history, info)."""
+ src = inspect.getsource(
+ get_response_from_llm
+ )
+ assert (
+ "return response_text, "
+ "new_msg_history, info"
+ ) in src
+
+ def test_info_structure_via_mock(self):
+ """Mock litellm.completion and verify the
+ returned info dict shape."""
+ mock_response = MagicMock()
+ mock_response.choices = [MagicMock()]
+ mock_response.choices[0].message.content = (
+ "hello"
+ )
+ mock_response.choices[0].finish_reason = (
+ "stop"
+ )
+ mock_response.usage = MagicMock()
+ mock_response.model = "test-model"
+
+ # Make response subscriptable for the
+ # response['choices'][0]['message']['content']
+ # pattern used in the source.
+ choice_msg = {"content": "hello"}
+ choice = {"message": choice_msg}
+
+ def getitem(self, key):
+ if key == "choices":
+ return [choice]
+ return None
+
+ type(mock_response).__getitem__ = getitem
+
+ with patch(
+ "agent.llm.litellm.completion",
+ return_value=mock_response,
+ ):
+ text, history, info = (
+ get_response_from_llm(
+ msg="test", model="test-model"
+ )
+ )
+
+ assert isinstance(info, dict)
+ assert "finish_reason" in info
+ assert "usage" in info
+ assert "model" in info
+ assert info["finish_reason"] == "stop"
+ assert info["model"] == "test-model"
+ assert text == "hello"
diff --git a/tests/test_meta_agent_instruction.py b/tests/test_meta_agent_instruction.py
new file mode 100644
index 0000000..f4c7631
--- /dev/null
+++ b/tests/test_meta_agent_instruction.py
@@ -0,0 +1,69 @@
+"""Tests for MetaAgent instruction construction (F-04).
+
+Validates that the instruction string built inside
+MetaAgent.forward() contains eval_path, iterations_left,
+and is not trivially short.
+"""
+
+import inspect
+
+import pytest
+
+# conftest.py mocks backoff, litellm, dotenv,
+# thread_logger, etc.
+from meta_agent import MetaAgent
+
+
+class TestMetaAgentInstruction:
+ """F-04: Instruction string is comprehensive."""
+
+ def test_forward_accepts_eval_path(self):
+ """forward() signature includes eval_path."""
+ sig = inspect.signature(MetaAgent.forward)
+ params = list(sig.parameters.keys())
+ assert "eval_path" in params
+
+ def test_forward_accepts_iterations_left(self):
+ """forward() signature includes
+ iterations_left."""
+ sig = inspect.signature(MetaAgent.forward)
+ params = list(sig.parameters.keys())
+ assert "iterations_left" in params
+
+ def test_eval_path_appears_in_instruction(self):
+ """The source of forward() references
+ eval_path in the instruction string."""
+ src = inspect.getsource(MetaAgent.forward)
+ assert "eval_path" in src
+
+ def test_iterations_left_in_instruction(self):
+ """When iterations_left is provided, it
+ appears in the instruction."""
+ src = inspect.getsource(MetaAgent.forward)
+ assert "iterations_left" in src
+
+ def test_instruction_is_substantial(self):
+ """The instruction is built with multiple
+ concatenations, not just a few words."""
+ src = inspect.getsource(MetaAgent.forward)
+ plus_eq_count = src.count("instruction +=")
+ assert plus_eq_count >= 3, (
+ f"Expected 3+ instruction +=, got "
+ f"{plus_eq_count}"
+ )
+
+ def test_instruction_mentions_readme(self):
+ """Instruction tells the agent to read the
+ README for orientation."""
+ src = inspect.getsource(MetaAgent.forward)
+ assert "README" in src
+
+ def test_instruction_mentions_repo_path(self):
+ """Instruction references repo_path."""
+ src = inspect.getsource(MetaAgent.forward)
+ assert "repo_path" in src
+
+ def test_forward_calls_chat_with_agent(self):
+ """forward() delegates to chat_with_agent."""
+ src = inspect.getsource(MetaAgent.forward)
+ assert "chat_with_agent" in src
diff --git a/tests/test_metadata_atomic.py b/tests/test_metadata_atomic.py
new file mode 100644
index 0000000..59bcc92
--- /dev/null
+++ b/tests/test_metadata_atomic.py
@@ -0,0 +1,139 @@
+"""Tests for atomic metadata writes (F-11).
+
+Validates that update_node_metadata() uses the
+temp-file + os.replace pattern to avoid corruption.
+"""
+
+import json
+import os
+import sys
+
+import pytest
+
+# conftest.py mocks heavy deps (docker, etc.)
+from utils.gl_utils import (
+ update_node_metadata,
+ get_node_metadata_key,
+)
+
+
+class TestUpdateNodeMetadataAtomic:
+ """F-11: metadata writes use temp + rename."""
+
+ def _make_gen_dir(self, tmp_dir, genid, data):
+ """Helper: create gen_{genid}/metadata.json."""
+ gen_dir = os.path.join(
+ tmp_dir, f"gen_{genid}"
+ )
+ os.makedirs(gen_dir, exist_ok=True)
+ meta_path = os.path.join(
+ gen_dir, "metadata.json"
+ )
+ with open(meta_path, "w") as f:
+ json.dump(data, f)
+ return meta_path
+
+ def test_update_writes_correct_data(self, tmp_dir):
+ """After update, the file contains the merged
+ data."""
+ original = {"parent_genid": None, "score": 0.5}
+ self._make_gen_dir(tmp_dir, 0, original)
+
+ update_node_metadata(
+ tmp_dir, 0, {"score": 0.9, "new_key": True}
+ )
+
+ meta_path = os.path.join(
+ tmp_dir, "gen_0", "metadata.json"
+ )
+ with open(meta_path, "r") as f:
+ result = json.load(f)
+
+ assert result["score"] == 0.9
+ assert result["new_key"] is True
+ assert result["parent_genid"] is None
+
+ def test_no_temp_file_left_behind(self, tmp_dir):
+ """After a successful write, no .tmp file
+ remains."""
+ self._make_gen_dir(
+ tmp_dir, 0, {"key": "value"}
+ )
+ update_node_metadata(
+ tmp_dir, 0, {"key": "updated"}
+ )
+
+ gen_dir = os.path.join(tmp_dir, "gen_0")
+ files = os.listdir(gen_dir)
+ tmp_files = [f for f in files if f.endswith(".tmp")]
+ assert len(tmp_files) == 0, (
+ f"Temp files remain: {tmp_files}"
+ )
+
+ def test_atomic_pattern_in_source(self):
+ """Source code uses tmp_file + os.replace
+ pattern."""
+ import inspect
+ src = inspect.getsource(update_node_metadata)
+ assert "tmp" in src.lower(), (
+ "Should use a temp file"
+ )
+ assert "os.replace" in src or "os.rename" in src, (
+ "Should use os.replace or os.rename for "
+ "atomic swap"
+ )
+ assert "f.flush()" in src, (
+ "Should flush before fsync"
+ )
+ assert "os.fsync" in src, (
+ "Should fsync before replace"
+ )
+
+ def test_missing_metadata_is_noop(self, tmp_dir):
+ """If metadata.json doesn't exist,
+ update_node_metadata does nothing."""
+ # gen_99 does not exist
+ update_node_metadata(
+ tmp_dir, 99, {"key": "value"}
+ )
+ gen_dir = os.path.join(tmp_dir, "gen_99")
+ assert not os.path.exists(gen_dir)
+
+ def test_get_after_update_returns_new_value(
+ self, tmp_dir
+ ):
+ """get_node_metadata_key returns the updated
+ value after update_node_metadata."""
+ self._make_gen_dir(
+ tmp_dir, 1, {"status": "pending"}
+ )
+ update_node_metadata(
+ tmp_dir, 1, {"status": "complete"}
+ )
+ val = get_node_metadata_key(
+ tmp_dir, 1, "status"
+ )
+ assert val == "complete"
+
+ def test_concurrent_safety_no_partial_writes(
+ self, tmp_dir
+ ):
+ """Simulates that the original file stays
+ intact if we check it before os.replace
+ would run -- i.e., the tmp file is written
+ first, then swapped."""
+ original = {"step": 1, "data": "original"}
+ meta_path = self._make_gen_dir(
+ tmp_dir, 0, original
+ )
+
+ # Perform multiple updates sequentially
+ for i in range(2, 6):
+ update_node_metadata(
+ tmp_dir, 0, {"step": i}
+ )
+
+ with open(meta_path, "r") as f:
+ result = json.load(f)
+ assert result["step"] == 5
+ assert result["data"] == "original"
diff --git a/tests/test_smoke_test.py b/tests/test_smoke_test.py
new file mode 100644
index 0000000..3a56c57
--- /dev/null
+++ b/tests/test_smoke_test.py
@@ -0,0 +1,101 @@
+"""Tests for run_smoke_test() (F-15).
+
+Validates that run_smoke_test correctly interprets
+container.exec_run results: True on success,
+False on non-zero exit code or missing sentinel.
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# conftest.py mocks docker, etc.
+from utils.gl_utils import run_smoke_test
+
+
+def _make_mock_container(exit_code, output_text):
+ """Create a mock Docker container whose exec_run
+ returns the given exit code and output."""
+ container = MagicMock()
+ exec_result = MagicMock()
+ exec_result.exit_code = exit_code
+ exec_result.output = output_text.encode("utf-8")
+ container.exec_run.return_value = exec_result
+ return container
+
+
+class TestRunSmokeTest:
+ """F-15: run_smoke_test container validation."""
+
+ def _run(self, exit_code, output):
+ """Helper: run smoke test with mock."""
+ container = _make_mock_container(
+ exit_code, output
+ )
+ with patch(
+ "utils.gl_utils.log_container_output",
+ lambda *a, **kw: None,
+ ):
+ return run_smoke_test(container)
+
+ def test_success_returns_true(self):
+ """Exit code 0 + sentinel present -> True."""
+ result = self._run(
+ 0, "some output\nsmoke_test_passed\n"
+ )
+ assert result is True
+
+ def test_nonzero_exit_returns_false(self):
+ """Non-zero exit code -> False."""
+ result = self._run(
+ 1, "smoke_test_passed\n"
+ )
+ assert result is False
+
+ def test_missing_sentinel_returns_false(self):
+ """Exit code 0 but no sentinel string ->
+ False."""
+ result = self._run(
+ 0, "import succeeded\n"
+ )
+ assert result is False
+
+ def test_empty_output_returns_false(self):
+ """Empty output -> False."""
+ result = self._run(0, "")
+ assert result is False
+
+ def test_exception_returns_false(self):
+ """If exec_run raises, returns False."""
+ container = MagicMock()
+ container.exec_run.side_effect = (
+ RuntimeError("docker error")
+ )
+ with patch(
+ "utils.gl_utils.log_container_output",
+ lambda *a, **kw: None,
+ ):
+ result = run_smoke_test(container)
+ assert result is False
+
+ def test_calls_exec_run_with_command(self):
+ """exec_run is called with a command list
+ including 'python' and '-c'."""
+ container = _make_mock_container(
+ 0, "smoke_test_passed"
+ )
+ with patch(
+ "utils.gl_utils.log_container_output",
+ lambda *a, **kw: None,
+ ):
+ run_smoke_test(container)
+
+ call_args = container.exec_run.call_args
+ cmd = call_args.kwargs.get(
+ "cmd",
+ call_args.args[0]
+ if call_args.args
+ else None,
+ )
+ assert "python" in cmd
+ assert "-c" in cmd
diff --git a/tests/test_tool_output_format.py b/tests/test_tool_output_format.py
new file mode 100644
index 0000000..440b94b
--- /dev/null
+++ b/tests/test_tool_output_format.py
@@ -0,0 +1,158 @@
+"""Tests for tool output JSON serialization (F-03).
+
+Validates that tool output messages use json.dumps()
+for proper serialization, avoiding the old f-string
+approach which produced invalid JSON with unescaped
+quotes and special characters.
+"""
+
+import json
+
+import pytest
+
+
+class TestToolOutputJsonSerialization:
+ """Verify json.dumps produces valid JSON for tool
+ output messages."""
+
+ def test_basic_tool_output_is_valid_json(self):
+ """A simple tool output round-trips through
+ json.dumps / json.loads."""
+ tool_input = {
+ "command": "ls -la",
+ "path": "/tmp",
+ }
+ tool_msg_data = {
+ "tool_name": "bash",
+ "tool_input": tool_input,
+ "tool_output": "file1.txt\nfile2.txt",
+ }
+ serialized = json.dumps(tool_msg_data)
+ parsed = json.loads(serialized)
+ assert parsed["tool_name"] == "bash"
+ assert parsed["tool_input"] == tool_input
+ assert "file1.txt" in parsed["tool_output"]
+
+ def test_old_fstring_approach_produces_invalid_json(
+ self,
+ ):
+ """Demonstrate the bug: f-string interpolation
+ of dicts produces repr() output that is NOT
+ valid JSON (single quotes, unescaped chars)."""
+ tool_input = {
+ "command": "echo 'hello'",
+ "path": "/tmp",
+ }
+ tool_output = 'He said "hello"'
+
+ # Old broken approach: f-string with dict
+ old_msg = (
+ f'{{"tool_name": "bash", '
+ f'"tool_input": {tool_input}, '
+ f'"tool_output": "{tool_output}"}}'
+ )
+ with pytest.raises(json.JSONDecodeError):
+ json.loads(old_msg)
+
+ def test_special_chars_quotes(self):
+ """Double quotes in tool_output are properly
+ escaped by json.dumps."""
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "echo"},
+ "tool_output": 'He said "hello"',
+ }
+ serialized = json.dumps(data)
+ parsed = json.loads(serialized)
+ assert parsed["tool_output"] == (
+ 'He said "hello"'
+ )
+
+ def test_special_chars_newlines(self):
+ """Newlines in tool_output are escaped."""
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "ls"},
+ "tool_output": "line1\nline2\nline3",
+ }
+ serialized = json.dumps(data)
+ # Raw string should contain \\n, not newlines
+ assert "\\n" in serialized
+ parsed = json.loads(serialized)
+ assert parsed["tool_output"].count("\n") == 2
+
+ def test_special_chars_backslashes(self):
+ """Backslashes in tool_output are escaped."""
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "echo"},
+ "tool_output": "C:\\Users\\test\\file.txt",
+ }
+ serialized = json.dumps(data)
+ parsed = json.loads(serialized)
+ assert (
+ parsed["tool_output"]
+ == "C:\\Users\\test\\file.txt"
+ )
+
+ def test_special_chars_tabs_and_unicode(self):
+ """Tabs and unicode in tool_output are handled."""
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "cat"},
+ "tool_output": "col1\tcol2\n\u2603 snowman",
+ }
+ serialized = json.dumps(data)
+ parsed = json.loads(serialized)
+ assert "\t" in parsed["tool_output"]
+ assert "\u2603" in parsed["tool_output"]
+
+ def test_nested_json_in_output(self):
+ """Tool output containing JSON-like strings
+ serializes correctly."""
+ inner = json.dumps({"key": "value"})
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "curl"},
+ "tool_output": inner,
+ }
+ serialized = json.dumps(data)
+ parsed = json.loads(serialized)
+ # The output is a string, not a dict
+ assert isinstance(parsed["tool_output"], str)
+ inner_parsed = json.loads(
+ parsed["tool_output"]
+ )
+ assert inner_parsed["key"] == "value"
+
+ def test_empty_tool_output(self):
+ """Empty string tool output serializes."""
+ data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "true"},
+ "tool_output": "",
+ }
+ serialized = json.dumps(data)
+ parsed = json.loads(serialized)
+ assert parsed["tool_output"] == ""
+
+ def test_actual_format_with_xml_wrapper(self):
+ """Test the actual format used in
+ llm_withtools.py: ...json.dumps...
+ """
+ tool_msg_data = {
+ "tool_name": "bash",
+ "tool_input": {"command": "ls"},
+ "tool_output": "file.txt",
+ }
+ tool_msg = (
+ f"\n"
+ f"{json.dumps(tool_msg_data, indent=2)}"
+ f"\n"
+ )
+ # Extract the JSON between tags
+ start = tool_msg.index("\n") + 7
+ end = tool_msg.index("\n")
+ extracted = tool_msg[start:end]
+ parsed = json.loads(extracted)
+ assert parsed["tool_name"] == "bash"