diff --git a/agent/llm_withtools.py b/agent/llm_withtools.py
index b25741e..72aaf25 100644
--- a/agent/llm_withtools.py
+++ b/agent/llm_withtools.py
@@ -95,7 +95,7 @@ def chat_with_agent(
     logging=print,
     tools_available=[],  # Empty list means no tools, 'all' means all tools
     multiple_tool_calls=False,  # Whether to allow multiple tool calls in a single response
-    max_tool_calls=40,  # Maximum number of tool calls allowed in a single response, -1 for unlimited
+    max_tool_calls=40,  # Max tool calls per response, -1=unlimited
 ):
     get_response_fn = get_response_from_llm
     # Construct message
@@ -107,15 +107,17 @@ def chat_with_agent(
         # Load all tools
         all_tools = load_tools(logging=logging, names=tools_available)
         tools_dict = {tool['info']['name']: tool for tool in all_tools}
-        system_msg = f"{get_tooluse_prompt([tool['info'] for tool in all_tools])}\n\n"
+        tool_infos = [t['info'] for t in all_tools]
+        tool_system_msg = get_tooluse_prompt(tool_infos)
         num_tool_calls = 0
 
-        # Call API
+        # Call API — tool descriptions sent as system message
         logging(f"Input: {repr(msg)}")
         response, new_msg_history, info = get_response_fn(
-            msg=system_msg + msg,
+            msg=msg,
             model=model,
             msg_history=new_msg_history,
+            system_msg=tool_system_msg,
         )
         logging(f"Output: {repr(response)}")
         # logging(f"Info: {repr(info)}")
@@ -139,13 +141,17 @@ def chat_with_agent(
                     tool_input = tool_use['tool_input']
                     tool_output = process_tool_call(tools_dict, tool_name, tool_input)
                     num_tool_calls += 1
-                    tool_msg = f'''<json>
-    {{
-        "tool_name": "{tool_name}",
-        "tool_input": {tool_input},
-        "tool_output": "{tool_output}"
-    }}
-    </json>'''.strip()
+                    tool_msg_data = {
+                        "tool_name": tool_name,
+                        "tool_input": tool_input,
+                        "tool_output": str(tool_output),
+                    }
+                    tool_json = json.dumps(
+                        tool_msg_data, indent=2,
+                    )
+                    tool_msg = (
+                        f"<json>\n{tool_json}\n</json>"
+                    )
                     logging(f"Tool output: {repr(tool_msg)}")
                     tool_msgs.append(tool_msg)
 
@@ -154,9 +160,10 @@ def chat_with_agent(
                 logging("Error: Output context exceeded. Please try again.")
                 tool_msgs.append("Error: Output context exceeded. Please try again.")
 
-            # Get tool response
+            # Get tool response — no system_msg on
+            # subsequent calls (already in context)
             response, new_msg_history, info = get_response_fn(
-                msg=system_msg + '\n\n'.join(tool_msgs),
+                msg='\n\n'.join(tool_msgs),
                 model=model,
                 msg_history=new_msg_history,
             )
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000..fe6a863
Binary files /dev/null and b/tests/__pycache__/__init__.cpython-314.pyc differ
diff --git a/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..bbd9fbe
Binary files /dev/null and b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..80ec9e0
Binary files /dev/null and b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..1221cce
Binary files /dev/null and b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..5ee0b44
Binary files /dev/null and b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..730bc85
Binary files /dev/null and b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..45877a0
Binary files /dev/null and b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..39d1bd2
Binary files /dev/null and b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..d53ae9a
Binary files /dev/null and b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..db08b30
Binary files /dev/null and b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc
new file mode 100644
index 0000000..8005677
Binary files /dev/null and b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc differ
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..2708571
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,188 @@
+"""Shared fixtures for HyperAgents test suite."""
+
+import importlib
+import importlib.util
+import os
+import json
+import sys
+import tempfile
+import shutil
+import types
+
+import pytest
+
+# ---- Project root on sys.path ----
+_PROJ = os.path.normpath(
+    "C:/Users/ryuke/Desktop/Projects/Hyperagents"
+)
+if _PROJ not in sys.path:
+    sys.path.insert(0, _PROJ)
+
+
+def _install_lightweight_mocks():
+    """Install minimal mock modules so that project
+    modules can be imported without heavy deps like
+    docker, litellm, backoff, torch, etc.
+
+    Only installs mocks for modules NOT already
+    present -- safe to call multiple times.
+    """
+    def _ensure(name, factory):
+        if name not in sys.modules:
+            sys.modules[name] = factory()
+
+    # docker
+    _ensure("docker", lambda: types.ModuleType("docker"))
+
+    # utils.docker_utils
+    def _make_docker_utils():
+        m = types.ModuleType("utils.docker_utils")
+        m.copy_to_container = lambda *a, **k: None
+        m.log_container_output = lambda *a, **k: None
+        return m
+    _ensure("utils.docker_utils", _make_docker_utils)
+
+    # utils.git_utils
+    def _make_git_utils():
+        m = types.ModuleType("utils.git_utils")
+        m.commit_repo = lambda *a, **k: "abc123"
+        m.get_git_commit_hash = lambda *a, **k: "abc"
+        return m
+    _ensure("utils.git_utils", _make_git_utils)
+
+    # backoff
+    def _make_backoff():
+        m = types.ModuleType("backoff")
+        m.expo = "expo"
+        m.on_exception = (
+            lambda *a, **kw: (lambda f: f)
+        )
+        return m
+    _ensure("backoff", _make_backoff)
+
+    # requests / requests.exceptions
+    def _make_requests():
+        m = types.ModuleType("requests")
+        exc = types.ModuleType("requests.exceptions")
+        exc.RequestException = Exception
+        m.exceptions = exc
+        sys.modules["requests.exceptions"] = exc
+        return m
+    _ensure("requests", _make_requests)
+
+    # litellm
+    def _make_litellm():
+        m = types.ModuleType("litellm")
+        m.drop_params = True
+        m.completion = lambda **kw: None
+        return m
+    _ensure("litellm", _make_litellm)
+
+    # dotenv
+    def _make_dotenv():
+        m = types.ModuleType("dotenv")
+        m.load_dotenv = lambda *a, **kw: None
+        return m
+    _ensure("dotenv", _make_dotenv)
+
+    # utils.thread_logger
+    def _make_thread_logger():
+        m = types.ModuleType("utils.thread_logger")
+        class FakeLM:
+            def __init__(self, **kw):
+                self.log = print
+        m.ThreadLoggerManager = FakeLM
+        return m
+    _ensure(
+        "utils.thread_logger", _make_thread_logger
+    )
+
+    # tqdm (used by genesis evaluator)
+    def _make_tqdm():
+        m = types.ModuleType("tqdm")
+        m.tqdm = lambda *a, **kw: iter([])
+        return m
+    _ensure("tqdm", _make_tqdm)
+
+    # pandas (used by ensemble.py)
+    def _make_pandas():
+        m = types.ModuleType("pandas")
+        m.read_csv = lambda *a, **kw: None
+        return m
+    _ensure("pandas", _make_pandas)
+    _ensure("pd", _make_pandas)
+
+
+# Install mocks at import time so all test modules
+# benefit.
+_install_lightweight_mocks()
+
+
+def load_module_from_file(module_name, file_path):
+    """Load a Python module directly from a file path,
+    bypassing package __init__.py files.
+
+    Useful for modules whose package __init__ imports
+    heavy deps (e.g., torch).
+    """
+    abs_path = os.path.join(_PROJ, file_path)
+    spec = importlib.util.spec_from_file_location(
+        module_name, abs_path
+    )
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+@pytest.fixture
+def tmp_dir():
+    """Provide a temporary directory, cleaned up after test."""
+    d = tempfile.mkdtemp()
+    yield d
+    shutil.rmtree(d, ignore_errors=True)
+
+
+@pytest.fixture
+def sample_archive_jsonl(tmp_dir):
+    """Create a sample archive.jsonl file with valid data."""
+    path = os.path.join(tmp_dir, "archive.jsonl")
+    entries = [
+        {
+            "current_genid": 0,
+            "archive": [0],
+        },
+        {
+            "current_genid": 1,
+            "archive": [0, 1],
+        },
+        {
+            "current_genid": 2,
+            "archive": [0, 1, 2],
+        },
+    ]
+    with open(path, "w") as f:
+        for entry in entries:
+            f.write(json.dumps(entry) + "\n")
+    return path
+
+
+@pytest.fixture
+def sample_metadata_dir(tmp_dir):
+    """Create gen_X directories with metadata.json files."""
+    for genid in range(3):
+        gen_dir = os.path.join(
+            tmp_dir, f"gen_{genid}"
+        )
+        os.makedirs(gen_dir, exist_ok=True)
+        metadata = {
+            "parent_genid": genid - 1 if genid > 0 else None,
+            "valid_parent": True,
+            "prev_patch_files": [],
+            "curr_patch_files": [],
+        }
+        with open(
+            os.path.join(gen_dir, "metadata.json"), "w"
+        ) as f:
+            json.dump(metadata, f)
+    return tmp_dir
diff --git a/tests/test_archive_parsing.py b/tests/test_archive_parsing.py
new file mode 100644
index 0000000..f6fb711
--- /dev/null
+++ b/tests/test_archive_parsing.py
@@ -0,0 +1,180 @@
+"""Tests for JSONL archive parsing (F-07 fix).
+
+Validates that load_archive_data() correctly parses
+JSONL format (one JSON object per line) instead of
+treating the whole file as a single JSON array.
+"""
+
+import json
+import os
+import tempfile
+
+import pytest
+
+
+# --------------- helpers to avoid heavy project imports -----
+# We extract the parsing logic directly to test in
+# isolation.  If import works, we test the real function
+# too.
+
+def _parse_jsonl(filepath, last_only=True):
+    """Pure-Python reimplementation of the JSONL parsing
+    logic from utils/gl_utils.py::load_archive_data."""
+    if not os.path.exists(filepath):
+        raise FileNotFoundError(
+            f"Metadata file not found at {filepath}"
+        )
+    archive_data = []
+    with open(filepath, "r") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                try:
+                    archive_data.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
+    if last_only:
+        return archive_data[-1]
+    return archive_data
+
+
+class TestLoadArchiveDataParsing:
+    """Tests for JSONL line-by-line parsing."""
+
+    def test_parse_valid_jsonl(self, tmp_dir):
+        """Valid JSONL with multiple lines parses each
+        line independently."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        entries = [
+            {"current_genid": 0, "archive": [0]},
+            {"current_genid": 1, "archive": [0, 1]},
+        ]
+        with open(path, "w") as f:
+            for e in entries:
+                f.write(json.dumps(e) + "\n")
+
+        result = _parse_jsonl(path, last_only=False)
+        assert len(result) == 2
+        assert result[0]["current_genid"] == 0
+        assert result[1]["archive"] == [0, 1]
+
+    def test_parse_last_only(self, tmp_dir):
+        """last_only=True returns only the final entry."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        entries = [
+            {"current_genid": 0, "archive": [0]},
+            {"current_genid": 1, "archive": [0, 1]},
+            {"current_genid": 2, "archive": [0, 1, 2]},
+        ]
+        with open(path, "w") as f:
+            for e in entries:
+                f.write(json.dumps(e) + "\n")
+
+        result = _parse_jsonl(path, last_only=True)
+        assert result["current_genid"] == 2
+        assert len(result["archive"]) == 3
+
+    def test_empty_lines_skipped(self, tmp_dir):
+        """Blank lines between entries are ignored."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        with open(path, "w") as f:
+            f.write(json.dumps({"a": 1}) + "\n")
+            f.write("\n")
+            f.write("   \n")
+            f.write(json.dumps({"a": 2}) + "\n")
+
+        result = _parse_jsonl(path, last_only=False)
+        assert len(result) == 2
+
+    def test_malformed_lines_skipped(self, tmp_dir):
+        """Malformed JSON lines are skipped without
+        crashing."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        with open(path, "w") as f:
+            f.write(json.dumps({"ok": True}) + "\n")
+            f.write("this is not json\n")
+            f.write("{broken json\n")
+            f.write(json.dumps({"ok": True}) + "\n")
+
+        result = _parse_jsonl(path, last_only=False)
+        assert len(result) == 2
+        assert all(e["ok"] for e in result)
+
+    def test_empty_file_raises(self, tmp_dir):
+        """An empty file (no valid entries) raises
+        IndexError when last_only=True."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        with open(path, "w") as f:
+            f.write("")
+
+        with pytest.raises(IndexError):
+            _parse_jsonl(path, last_only=True)
+
+    def test_empty_file_returns_empty_list(self, tmp_dir):
+        """An empty file returns [] when last_only=False."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        with open(path, "w") as f:
+            f.write("")
+
+        result = _parse_jsonl(path, last_only=False)
+        assert result == []
+
+    def test_missing_file_raises(self, tmp_dir):
+        """A nonexistent file raises FileNotFoundError."""
+        path = os.path.join(tmp_dir, "nonexistent.jsonl")
+        with pytest.raises(FileNotFoundError):
+            _parse_jsonl(path)
+
+    def test_single_line_file(self, tmp_dir):
+        """A file with exactly one line works correctly."""
+        path = os.path.join(tmp_dir, "archive.jsonl")
+        entry = {"current_genid": 0, "archive": [0]}
+        with open(path, "w") as f:
+            f.write(json.dumps(entry) + "\n")
+
+        result = _parse_jsonl(path, last_only=True)
+        assert result == entry
+
+        result_all = _parse_jsonl(path, last_only=False)
+        assert len(result_all) == 1
+
+
+class TestLoadArchiveDataReal:
+    """Test the real load_archive_data function.
+
+    conftest.py installs lightweight mocks so the
+    import succeeds without docker/litellm/etc.
+    """
+
+    @pytest.fixture(autouse=True)
+    def _try_import(self):
+        """Import load_archive_data (mocks in
+        conftest handle heavy deps)."""
+        try:
+            from utils.gl_utils import (
+                load_archive_data,
+            )
+            self.load_fn = load_archive_data
+        except Exception as e:
+            pytest.skip(
+                f"Could not import: {e}"
+            )
+
+    def test_real_parse_valid(
+        self, sample_archive_jsonl
+    ):
+        """Real function parses valid JSONL."""
+        result = self.load_fn(
+            sample_archive_jsonl, last_only=False
+        )
+        assert len(result) == 3
+        assert result[-1]["current_genid"] == 2
+
+    def test_real_last_only(
+        self, sample_archive_jsonl
+    ):
+        """Real function returns last entry."""
+        result = self.load_fn(
+            sample_archive_jsonl, last_only=True
+        )
+        assert result["current_genid"] == 2
diff --git a/tests/test_bash_sentinel.py b/tests/test_bash_sentinel.py
new file mode 100644
index 0000000..ee35689
--- /dev/null
+++ b/tests/test_bash_sentinel.py
@@ -0,0 +1,68 @@
+"""Tests for BashSession random sentinel (F-14).
+
+Validates that each BashSession instance gets a
+unique, random sentinel instead of the static
+'<<exit>>' string, preventing command output from
+accidentally matching the sentinel.
+"""
+
+import re
+import sys
+
+import pytest
+
+from agent.tools.bash import BashSession
+
+
+class TestBashSentinelUniqueness:
+    """F-14: Each BashSession gets a unique
+    random sentinel."""
+
+    def test_sentinel_is_not_static(self):
+        """Sentinel must not be the old static
+        '<<exit>>' string."""
+        session = BashSession()
+        assert session._sentinel != "<<exit>>"
+        assert session._sentinel != "<<EXIT>>"
+
+    def test_sentinel_matches_expected_pattern(self):
+        """Sentinel matches <<EXIT_[hex]>> format."""
+        session = BashSession()
+        pattern = r"^<<EXIT_[0-9a-f]+>>$"
+        assert re.match(pattern, session._sentinel), (
+            f"Sentinel {session._sentinel!r} does not "
+            f"match pattern {pattern}"
+        )
+
+    def test_each_instance_gets_unique_sentinel(self):
+        """Two separate BashSession instances must have
+        different sentinels."""
+        s1 = BashSession()
+        s2 = BashSession()
+        assert s1._sentinel != s2._sentinel, (
+            "Two sessions should not share a sentinel"
+        )
+
+    def test_many_instances_all_unique(self):
+        """Creating 50 sessions yields 50 distinct
+        sentinels."""
+        sentinels = {
+            BashSession()._sentinel for _ in range(50)
+        }
+        assert len(sentinels) == 50
+
+    def test_sentinel_hex_length(self):
+        """The hex portion has 32 chars (uuid4.hex)."""
+        session = BashSession()
+        # <<EXIT_{32 hex chars}>>
+        inner = session._sentinel[7:-2]  # strip <<EXIT_ and >>
+        assert len(inner) == 32
+        assert all(c in "0123456789abcdef" for c in inner)
+
+    def test_sentinel_used_in_run_command(self):
+        """The run() method references self._sentinel,
+        not a hardcoded string."""
+        import inspect
+        src = inspect.getsource(BashSession.run)
+        assert "self._sentinel" in src
+        assert "<<exit>>" not in src.lower()
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
new file mode 100644
index 0000000..3148c0f
--- /dev/null
+++ b/tests/test_ensemble.py
@@ -0,0 +1,174 @@
+"""Tests for ensemble majority voting logic.
+
+Validates weighted majority voting for classification
+domains, single-best fallback, and domain gating.
+"""
+
+import json
+import os
+import sys
+from collections import defaultdict
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+_PROJ = "C:/Users/ryuke/Desktop/Projects/Hyperagents"
+if _PROJ not in sys.path:
+    sys.path.insert(0, _PROJ)
+
+
+# ------ Pure logic tests (no imports needed) ------
+
+class TestMajorityVoteLogic:
+    """Test the weighted majority voting algorithm
+    in isolation."""
+
+    @staticmethod
+    def _weighted_majority(predictions_scores):
+        """Reimplement the core voting logic from
+        ensemble.py for isolated testing.
+
+        Args:
+            predictions_scores: list of (pred, score)
+        Returns:
+            The prediction with highest total weight.
+        """
+        votes = defaultdict(float)
+        for pred, score in predictions_scores:
+            if pred is not None:
+                votes[pred] += score
+        if votes:
+            return max(votes, key=votes.get)
+        return None
+
+    def test_majority_simple(self):
+        """3 agents: A(0.8), B(0.7), A(0.6) -> A
+        wins with 1.4 vs 0.7."""
+        result = self._weighted_majority([
+            ("A", 0.8), ("B", 0.7), ("A", 0.6),
+        ])
+        assert result == "A"
+
+    def test_weights_override_count(self):
+        """2 agents vote B with low scores, 1 votes A
+        with high score. A wins on weight."""
+        result = self._weighted_majority([
+            ("A", 0.95), ("B", 0.1), ("B", 0.1),
+        ])
+        assert result == "A"
+
+    def test_tie_broken_deterministically(self):
+        """When weights tie, max() returns
+        deterministically."""
+        result = self._weighted_majority([
+            ("A", 0.5), ("B", 0.5),
+        ])
+        # max() returns first key encountered with max
+        assert result in ("A", "B")
+
+    def test_all_same_vote(self):
+        """All agents agree -> that answer wins."""
+        result = self._weighted_majority([
+            ("X", 0.9), ("X", 0.8), ("X", 0.7),
+        ])
+        assert result == "X"
+
+    def test_none_predictions_ignored(self):
+        """None predictions don't count toward any
+        vote."""
+        result = self._weighted_majority([
+            ("A", 0.9), (None, 0.8), ("B", 0.7),
+        ])
+        assert result == "A"
+
+    def test_all_none_returns_none(self):
+        """If all predictions are None, returns None."""
+        result = self._weighted_majority([
+            (None, 0.9), (None, 0.8),
+        ])
+        assert result is None
+
+
+class TestEnsembleDomainGating:
+    """Verify domain-based routing: classification
+    domains use voting, others use single-best."""
+
+    def test_classification_domains_known(self):
+        """Classification domains that support
+        ensemble are a known set."""
+        # conftest.py mocks make this import work
+        from ensemble import _CLASSIFICATION_DOMAINS
+        assert (
+            "search_arena" in _CLASSIFICATION_DOMAINS
+        )
+        assert (
+            "paper_review" in _CLASSIFICATION_DOMAINS
+        )
+        assert (
+            "imo_grading" in _CLASSIFICATION_DOMAINS
+        )
+
+    def test_non_classification_not_in_set(self):
+        """Non-classification domains are NOT in the
+        classification set."""
+        from ensemble import _CLASSIFICATION_DOMAINS
+        assert (
+            "genesis_go2walking"
+            not in _CLASSIFICATION_DOMAINS
+        )
+        assert (
+            "balrog_babyai"
+            not in _CLASSIFICATION_DOMAINS
+        )
+        assert (
+            "polyglot"
+            not in _CLASSIFICATION_DOMAINS
+        )
+
+
+class TestEnsembleFallback:
+    """Test fallback behavior when <3 agents
+    available."""
+
+    @staticmethod
+    def _should_use_voting(
+        domain, can_ensemble, n_agents
+    ):
+        """Replicate the gating logic from ensemble()
+        for testing."""
+        classification = {
+            "search_arena",
+            "paper_review",
+            "imo_grading",
+        }
+        return (
+            domain in classification
+            and can_ensemble
+            and n_agents >= 3
+        )
+
+    def test_fewer_than_3_uses_single_best(self):
+        """With 2 agents, should NOT use voting."""
+        assert not self._should_use_voting(
+            "search_arena", True, 2
+        )
+
+    def test_3_or_more_uses_voting(self):
+        """With 3+ agents, should use voting."""
+        assert self._should_use_voting(
+            "search_arena", True, 3
+        )
+
+    def test_non_classification_always_single(self):
+        """Non-classification domain never uses
+        voting regardless of agent count."""
+        assert not self._should_use_voting(
+            "genesis_go2walking", False, 5
+        )
+
+    def test_can_ensemble_false_blocks_voting(self):
+        """Even a classification domain with
+        can_ensemble=False uses single-best."""
+        assert not self._should_use_voting(
+            "imo_grading", False, 3
+        )
diff --git a/tests/test_llm_metadata.py b/tests/test_llm_metadata.py
new file mode 100644
index 0000000..759f9cd
--- /dev/null
+++ b/tests/test_llm_metadata.py
@@ -0,0 +1,93 @@
+"""Tests for LLM response metadata (F-10).
+
+Validates that get_response_from_llm() returns an
+info dict with expected keys: finish_reason, usage,
+model.
+"""
+
+import inspect
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# conftest.py mocks backoff, litellm, dotenv, etc.
+from agent.llm import get_response_from_llm
+
+
+class TestLlmMetadataKeys:
+    """F-10: info dict contains expected keys."""
+
+    def test_return_annotation_is_tuple(self):
+        """get_response_from_llm returns a 3-tuple
+        (text, history, info)."""
+        sig = inspect.signature(
+            get_response_from_llm
+        )
+        ret = sig.return_annotation
+        assert "Tuple" in str(ret)
+
+    def test_info_dict_constructed_in_source(self):
+        """Source constructs info with finish_reason,
+        usage, model keys."""
+        src = inspect.getsource(
+            get_response_from_llm
+        )
+        assert '"finish_reason"' in src
+        assert '"usage"' in src
+        assert '"model"' in src
+
+    def test_info_dict_is_returned(self):
+        """The function returns (response_text,
+        new_msg_history, info)."""
+        src = inspect.getsource(
+            get_response_from_llm
+        )
+        assert (
+            "return response_text, "
+            "new_msg_history, info"
+        ) in src
+
+    def test_info_structure_via_mock(self):
+        """Mock litellm.completion and verify the
+        returned info dict shape."""
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = (
+            "hello"
+        )
+        mock_response.choices[0].finish_reason = (
+            "stop"
+        )
+        mock_response.usage = MagicMock()
+        mock_response.model = "test-model"
+
+        # Make response subscriptable for the
+        # response['choices'][0]['message']['content']
+        # pattern used in the source.
+        choice_msg = {"content": "hello"}
+        choice = {"message": choice_msg}
+
+        def getitem(self, key):
+            if key == "choices":
+                return [choice]
+            return None
+
+        type(mock_response).__getitem__ = getitem
+
+        with patch(
+            "agent.llm.litellm.completion",
+            return_value=mock_response,
+        ):
+            text, history, info = (
+                get_response_from_llm(
+                    msg="test", model="test-model"
+                )
+            )
+
+        assert isinstance(info, dict)
+        assert "finish_reason" in info
+        assert "usage" in info
+        assert "model" in info
+        assert info["finish_reason"] == "stop"
+        assert info["model"] == "test-model"
+        assert text == "hello"
diff --git a/tests/test_meta_agent_instruction.py b/tests/test_meta_agent_instruction.py
new file mode 100644
index 0000000..f4c7631
--- /dev/null
+++ b/tests/test_meta_agent_instruction.py
@@ -0,0 +1,69 @@
+"""Tests for MetaAgent instruction construction (F-04).
+
+Validates that the instruction string built inside
+MetaAgent.forward() contains eval_path, iterations_left,
+and is not trivially short.
+"""
+
+import inspect
+
+import pytest
+
+# conftest.py mocks backoff, litellm, dotenv,
+# thread_logger, etc.
+from meta_agent import MetaAgent
+
+
+class TestMetaAgentInstruction:
+    """F-04: Instruction string is comprehensive."""
+
+    def test_forward_accepts_eval_path(self):
+        """forward() signature includes eval_path."""
+        sig = inspect.signature(MetaAgent.forward)
+        params = list(sig.parameters.keys())
+        assert "eval_path" in params
+
+    def test_forward_accepts_iterations_left(self):
+        """forward() signature includes
+        iterations_left."""
+        sig = inspect.signature(MetaAgent.forward)
+        params = list(sig.parameters.keys())
+        assert "iterations_left" in params
+
+    def test_eval_path_appears_in_instruction(self):
+        """The source of forward() references
+        eval_path in the instruction string."""
+        src = inspect.getsource(MetaAgent.forward)
+        assert "eval_path" in src
+
+    def test_iterations_left_in_instruction(self):
+        """When iterations_left is provided, it
+        appears in the instruction."""
+        src = inspect.getsource(MetaAgent.forward)
+        assert "iterations_left" in src
+
+    def test_instruction_is_substantial(self):
+        """The instruction is built with multiple
+        concatenations, not just a few words."""
+        src = inspect.getsource(MetaAgent.forward)
+        plus_eq_count = src.count("instruction +=")
+        assert plus_eq_count >= 3, (
+            f"Expected 3+ instruction +=, got "
+            f"{plus_eq_count}"
+        )
+
+    def test_instruction_mentions_readme(self):
+        """Instruction tells the agent to read the
+        README for orientation."""
+        src = inspect.getsource(MetaAgent.forward)
+        assert "README" in src
+
+    def test_instruction_mentions_repo_path(self):
+        """Instruction references repo_path."""
+        src = inspect.getsource(MetaAgent.forward)
+        assert "repo_path" in src
+
+    def test_forward_calls_chat_with_agent(self):
+        """forward() delegates to chat_with_agent."""
+        src = inspect.getsource(MetaAgent.forward)
+        assert "chat_with_agent" in src
diff --git a/tests/test_metadata_atomic.py b/tests/test_metadata_atomic.py
new file mode 100644
index 0000000..59bcc92
--- /dev/null
+++ b/tests/test_metadata_atomic.py
@@ -0,0 +1,139 @@
+"""Tests for atomic metadata writes (F-11).
+
+Validates that update_node_metadata() uses the
+temp-file + os.replace pattern to avoid corruption.
+"""
+
+import json
+import os
+import sys
+
+import pytest
+
+# conftest.py mocks heavy deps (docker, etc.)
+from utils.gl_utils import (
+    update_node_metadata,
+    get_node_metadata_key,
+)
+
+
+class TestUpdateNodeMetadataAtomic:
+    """F-11: metadata writes use temp + rename."""
+
+    def _make_gen_dir(self, tmp_dir, genid, data):
+        """Helper: create gen_{genid}/metadata.json."""
+        gen_dir = os.path.join(
+            tmp_dir, f"gen_{genid}"
+        )
+        os.makedirs(gen_dir, exist_ok=True)
+        meta_path = os.path.join(
+            gen_dir, "metadata.json"
+        )
+        with open(meta_path, "w") as f:
+            json.dump(data, f)
+        return meta_path
+
+    def test_update_writes_correct_data(self, tmp_dir):
+        """After update, the file contains the merged
+        data."""
+        original = {"parent_genid": None, "score": 0.5}
+        self._make_gen_dir(tmp_dir, 0, original)
+
+        update_node_metadata(
+            tmp_dir, 0, {"score": 0.9, "new_key": True}
+        )
+
+        meta_path = os.path.join(
+            tmp_dir, "gen_0", "metadata.json"
+        )
+        with open(meta_path, "r") as f:
+            result = json.load(f)
+
+        assert result["score"] == 0.9
+        assert result["new_key"] is True
+        assert result["parent_genid"] is None
+
+    def test_no_temp_file_left_behind(self, tmp_dir):
+        """After a successful write, no .tmp file
+        remains."""
+        self._make_gen_dir(
+            tmp_dir, 0, {"key": "value"}
+        )
+        update_node_metadata(
+            tmp_dir, 0, {"key": "updated"}
+        )
+
+        gen_dir = os.path.join(tmp_dir, "gen_0")
+        files = os.listdir(gen_dir)
+        tmp_files = [f for f in files if f.endswith(".tmp")]
+        assert len(tmp_files) == 0, (
+            f"Temp files remain: {tmp_files}"
+        )
+
+    def test_atomic_pattern_in_source(self):
+        """Source code uses tmp_file + os.replace
+        pattern."""
+        import inspect
+        src = inspect.getsource(update_node_metadata)
+        assert "tmp" in src.lower(), (
+            "Should use a temp file"
+        )
+        assert "os.replace" in src or "os.rename" in src, (
+            "Should use os.replace or os.rename for "
+            "atomic swap"
+        )
+        assert "f.flush()" in src, (
+            "Should flush before fsync"
+        )
+        assert "os.fsync" in src, (
+            "Should fsync before replace"
+        )
+
+    def test_missing_metadata_is_noop(self, tmp_dir):
+        """If metadata.json doesn't exist,
+        update_node_metadata does nothing."""
+        # gen_99 does not exist
+        update_node_metadata(
+            tmp_dir, 99, {"key": "value"}
+        )
+        gen_dir = os.path.join(tmp_dir, "gen_99")
+        assert not os.path.exists(gen_dir)
+
+    def test_get_after_update_returns_new_value(
+        self, tmp_dir
+    ):
+        """get_node_metadata_key returns the updated
+        value after update_node_metadata."""
+        self._make_gen_dir(
+            tmp_dir, 1, {"status": "pending"}
+        )
+        update_node_metadata(
+            tmp_dir, 1, {"status": "complete"}
+        )
+        val = get_node_metadata_key(
+            tmp_dir, 1, "status"
+        )
+        assert val == "complete"
+
+    def test_concurrent_safety_no_partial_writes(
+        self, tmp_dir
+    ):
+        """Simulates that the original file stays
+        intact if we check it before os.replace
+        would run -- i.e., the tmp file is written
+        first, then swapped."""
+        original = {"step": 1, "data": "original"}
+        meta_path = self._make_gen_dir(
+            tmp_dir, 0, original
+        )
+
+        # Perform multiple updates sequentially
+        for i in range(2, 6):
+            update_node_metadata(
+                tmp_dir, 0, {"step": i}
+            )
+
+        with open(meta_path, "r") as f:
+            result = json.load(f)
+        assert result["step"] == 5
+        assert result["data"] == "original"
diff --git a/tests/test_smoke_test.py b/tests/test_smoke_test.py
new file mode 100644
index 0000000..3a56c57
--- /dev/null
+++ b/tests/test_smoke_test.py
@@ -0,0 +1,101 @@
+"""Tests for run_smoke_test() (F-15).
+
+Validates that run_smoke_test correctly interprets
+container.exec_run results: True on success,
+False on non-zero exit code or missing sentinel.
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# conftest.py mocks docker, etc.
+from utils.gl_utils import run_smoke_test
+
+
+def _make_mock_container(exit_code, output_text):
+    """Create a mock Docker container whose exec_run
+    returns the given exit code and output."""
+    container = MagicMock()
+    exec_result = MagicMock()
+    exec_result.exit_code = exit_code
+    exec_result.output = output_text.encode("utf-8")
+    container.exec_run.return_value = exec_result
+    return container
+
+
+class TestRunSmokeTest:
+    """F-15: run_smoke_test container validation."""
+
+    def _run(self, exit_code, output):
+        """Helper: run smoke test with mock."""
+        container = _make_mock_container(
+            exit_code, output
+        )
+        with patch(
+            "utils.gl_utils.log_container_output",
+            lambda *a, **kw: None,
+        ):
+            return run_smoke_test(container)
+
+    def test_success_returns_true(self):
+        """Exit code 0 + sentinel present -> True."""
+        result = self._run(
+            0, "some output\nsmoke_test_passed\n"
+        )
+        assert result is True
+
+    def test_nonzero_exit_returns_false(self):
+        """Non-zero exit code -> False."""
+        result = self._run(
+            1, "smoke_test_passed\n"
+        )
+        assert result is False
+
+    def test_missing_sentinel_returns_false(self):
+        """Exit code 0 but no sentinel string ->
+        False."""
+        result = self._run(
+            0, "import succeeded\n"
+        )
+        assert result is False
+
+    def test_empty_output_returns_false(self):
+        """Empty output -> False."""
+        result = self._run(0, "")
+        assert result is False
+
+    def test_exception_returns_false(self):
+        """If exec_run raises, returns False."""
+        container = MagicMock()
+        container.exec_run.side_effect = (
+            RuntimeError("docker error")
+        )
+        with patch(
+            "utils.gl_utils.log_container_output",
+            lambda *a, **kw: None,
+        ):
+            result = run_smoke_test(container)
+        assert result is False
+
+    def test_calls_exec_run_with_command(self):
+        """exec_run is called with a command list
+        including 'python' and '-c'."""
+        container = _make_mock_container(
+            0, "smoke_test_passed"
+        )
+        with patch(
+            "utils.gl_utils.log_container_output",
+            lambda *a, **kw: None,
+        ):
+            run_smoke_test(container)
+
+        call_args = container.exec_run.call_args
+        cmd = call_args.kwargs.get(
+            "cmd",
+            call_args.args[0]
+            if call_args.args
+            else None,
+        )
+        assert "python" in cmd
+        assert "-c" in cmd
diff --git a/tests/test_tool_output_format.py b/tests/test_tool_output_format.py
new file mode 100644
index 0000000..440b94b
--- /dev/null
+++ b/tests/test_tool_output_format.py
@@ -0,0 +1,158 @@
+"""Tests for tool output JSON serialization (F-03).
+
+Validates that tool output messages use json.dumps()
+for proper serialization, avoiding the old f-string
+approach which produced invalid JSON with unescaped
+quotes and special characters.
+"""
+
+import json
+
+import pytest
+
+
+class TestToolOutputJsonSerialization:
+    """Verify json.dumps produces valid JSON for tool
+    output messages."""
+
+    def test_basic_tool_output_is_valid_json(self):
+        """A simple tool output round-trips through
+        json.dumps / json.loads."""
+        tool_input = {
+            "command": "ls -la",
+            "path": "/tmp",
+        }
+        tool_msg_data = {
+            "tool_name": "bash",
+            "tool_input": tool_input,
+            "tool_output": "file1.txt\nfile2.txt",
+        }
+        serialized = json.dumps(tool_msg_data)
+        parsed = json.loads(serialized)
+        assert parsed["tool_name"] == "bash"
+        assert parsed["tool_input"] == tool_input
+        assert "file1.txt" in parsed["tool_output"]
+
+    def test_old_fstring_approach_produces_invalid_json(
+        self,
+    ):
+        """Demonstrate the bug: f-string interpolation
+        of dicts produces repr() output that is NOT
+        valid JSON (single quotes, unescaped chars)."""
+        tool_input = {
+            "command": "echo 'hello'",
+            "path": "/tmp",
+        }
+        tool_output = 'He said "hello"'
+
+        # Old broken approach: f-string with dict
+        old_msg = (
+            f'{{"tool_name": "bash", '
+            f'"tool_input": {tool_input}, '
+            f'"tool_output": "{tool_output}"}}'
+        )
+        with pytest.raises(json.JSONDecodeError):
+            json.loads(old_msg)
+
+    def test_special_chars_quotes(self):
+        """Double quotes in tool_output are properly
+        escaped by json.dumps."""
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "echo"},
+            "tool_output": 'He said "hello"',
+        }
+        serialized = json.dumps(data)
+        parsed = json.loads(serialized)
+        assert parsed["tool_output"] == (
+            'He said "hello"'
+        )
+
+    def test_special_chars_newlines(self):
+        """Newlines in tool_output are escaped."""
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "ls"},
+            "tool_output": "line1\nline2\nline3",
+        }
+        serialized = json.dumps(data)
+        # Raw string should contain \\n, not newlines
+        assert "\\n" in serialized
+        parsed = json.loads(serialized)
+        assert parsed["tool_output"].count("\n") == 2
+
+    def test_special_chars_backslashes(self):
+        """Backslashes in tool_output are escaped."""
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "echo"},
+            "tool_output": "C:\\Users\\test\\file.txt",
+        }
+        serialized = json.dumps(data)
+        parsed = json.loads(serialized)
+        assert (
+            parsed["tool_output"]
+            == "C:\\Users\\test\\file.txt"
+        )
+
+    def test_special_chars_tabs_and_unicode(self):
+        """Tabs and unicode in tool_output are handled."""
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "cat"},
+            "tool_output": "col1\tcol2\n\u2603 snowman",
+        }
+        serialized = json.dumps(data)
+        parsed = json.loads(serialized)
+        assert "\t" in parsed["tool_output"]
+        assert "\u2603" in parsed["tool_output"]
+
+    def test_nested_json_in_output(self):
+        """Tool output containing JSON-like strings
+        serializes correctly."""
+        inner = json.dumps({"key": "value"})
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "curl"},
+            "tool_output": inner,
+        }
+        serialized = json.dumps(data)
+        parsed = json.loads(serialized)
+        # The output is a string, not a dict
+        assert isinstance(parsed["tool_output"], str)
+        inner_parsed = json.loads(
+            parsed["tool_output"]
+        )
+        assert inner_parsed["key"] == "value"
+
+    def test_empty_tool_output(self):
+        """Empty string tool output serializes."""
+        data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "true"},
+            "tool_output": "",
+        }
+        serialized = json.dumps(data)
+        parsed = json.loads(serialized)
+        assert parsed["tool_output"] == ""
+
+    def test_actual_format_with_xml_wrapper(self):
+        """Test the actual format used in
+        llm_withtools.py: <json>...json.dumps...</json>
+        """
+        tool_msg_data = {
+            "tool_name": "bash",
+            "tool_input": {"command": "ls"},
+            "tool_output": "file.txt",
+        }
+        tool_msg = (
+            f"<json>\n"
+            f"{json.dumps(tool_msg_data, indent=2)}"
+            f"\n</json>"
+        )
+        # Extract the JSON between tags
+        start = tool_msg.index("<json>\n") + 7
+        end = tool_msg.index("\n</json>")
+        extracted = tool_msg[start:end]
+        parsed = json.loads(extracted)
+        assert parsed["tool_name"] == "bash"