diff --git a/agent/llm_withtools.py b/agent/llm_withtools.py index b25741e..72aaf25 100644 --- a/agent/llm_withtools.py +++ b/agent/llm_withtools.py @@ -95,7 +95,7 @@ def chat_with_agent( logging=print, tools_available=[], # Empty list means no tools, 'all' means all tools multiple_tool_calls=False, # Whether to allow multiple tool calls in a single response - max_tool_calls=40, # Maximum number of tool calls allowed in a single response, -1 for unlimited + max_tool_calls=40, # Max tool calls per response, -1=unlimited ): get_response_fn = get_response_from_llm # Construct message @@ -107,15 +107,17 @@ def chat_with_agent( # Load all tools all_tools = load_tools(logging=logging, names=tools_available) tools_dict = {tool['info']['name']: tool for tool in all_tools} - system_msg = f"{get_tooluse_prompt([tool['info'] for tool in all_tools])}\n\n" + tool_infos = [t['info'] for t in all_tools] + tool_system_msg = get_tooluse_prompt(tool_infos) num_tool_calls = 0 - # Call API + # Call API — tool descriptions sent as system message logging(f"Input: {repr(msg)}") response, new_msg_history, info = get_response_fn( - msg=system_msg + msg, + msg=msg, model=model, msg_history=new_msg_history, + system_msg=tool_system_msg, ) logging(f"Output: {repr(response)}") # logging(f"Info: {repr(info)}") @@ -139,13 +141,17 @@ def chat_with_agent( tool_input = tool_use['tool_input'] tool_output = process_tool_call(tools_dict, tool_name, tool_input) num_tool_calls += 1 - tool_msg = f''' - {{ - "tool_name": "{tool_name}", - "tool_input": {tool_input}, - "tool_output": "{tool_output}" - }} - '''.strip() + tool_msg_data = { + "tool_name": tool_name, + "tool_input": tool_input, + "tool_output": str(tool_output), + } + tool_json = json.dumps( + tool_msg_data, indent=2, + ) + tool_msg = ( + f"\n{tool_json}\n" + ) logging(f"Tool output: {repr(tool_msg)}") tool_msgs.append(tool_msg) @@ -154,9 +160,10 @@ def chat_with_agent( logging("Error: Output context exceeded. 
Please try again.") tool_msgs.append("Error: Output context exceeded. Please try again.") - # Get tool response + # Get tool response — no system_msg on + # subsequent calls (already in context) response, new_msg_history, info = get_response_fn( - msg=system_msg + '\n\n'.join(tool_msgs), + msg='\n\n'.join(tool_msgs), model=model, msg_history=new_msg_history, ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..fe6a863 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-314.pyc differ diff --git a/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..bbd9fbe Binary files /dev/null and b/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..80ec9e0 Binary files /dev/null and b/tests/__pycache__/test_archive_parsing.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..1221cce Binary files /dev/null and b/tests/__pycache__/test_bash_sentinel.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..5ee0b44 Binary files /dev/null and b/tests/__pycache__/test_ensemble.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 
0000000..730bc85 Binary files /dev/null and b/tests/__pycache__/test_genesis_evaluator.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..45877a0 Binary files /dev/null and b/tests/__pycache__/test_llm_metadata.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..39d1bd2 Binary files /dev/null and b/tests/__pycache__/test_meta_agent_instruction.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..d53ae9a Binary files /dev/null and b/tests/__pycache__/test_metadata_atomic.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..db08b30 Binary files /dev/null and b/tests/__pycache__/test_smoke_test.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc new file mode 100644 index 0000000..8005677 Binary files /dev/null and b/tests/__pycache__/test_tool_output_format.cpython-314-pytest-9.0.2.pyc differ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..2708571 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,188 @@ +"""Shared fixtures for HyperAgents test suite.""" + +import importlib +import importlib.util +import os +import json +import sys +import tempfile +import shutil +import types + +import pytest + +# ---- Project root on 
sys.path ---- +_PROJ = os.path.normpath( + "C:/Users/ryuke/Desktop/Projects/Hyperagents" +) +if _PROJ not in sys.path: + sys.path.insert(0, _PROJ) + + +def _install_lightweight_mocks(): + """Install minimal mock modules so that project + modules can be imported without heavy deps like + docker, litellm, backoff, torch, etc. + + Only installs mocks for modules NOT already + present -- safe to call multiple times. + """ + def _ensure(name, factory): + if name not in sys.modules: + sys.modules[name] = factory() + + # docker + _ensure("docker", lambda: types.ModuleType("docker")) + + # utils.docker_utils + def _make_docker_utils(): + m = types.ModuleType("utils.docker_utils") + m.copy_to_container = lambda *a, **k: None + m.log_container_output = lambda *a, **k: None + return m + _ensure("utils.docker_utils", _make_docker_utils) + + # utils.git_utils + def _make_git_utils(): + m = types.ModuleType("utils.git_utils") + m.commit_repo = lambda *a, **k: "abc123" + m.get_git_commit_hash = lambda *a, **k: "abc" + return m + _ensure("utils.git_utils", _make_git_utils) + + # backoff + def _make_backoff(): + m = types.ModuleType("backoff") + m.expo = "expo" + m.on_exception = ( + lambda *a, **kw: (lambda f: f) + ) + return m + _ensure("backoff", _make_backoff) + + # requests / requests.exceptions + def _make_requests(): + m = types.ModuleType("requests") + exc = types.ModuleType("requests.exceptions") + exc.RequestException = Exception + m.exceptions = exc + sys.modules["requests.exceptions"] = exc + return m + _ensure("requests", _make_requests) + + # litellm + def _make_litellm(): + m = types.ModuleType("litellm") + m.drop_params = True + m.completion = lambda **kw: None + return m + _ensure("litellm", _make_litellm) + + # dotenv + def _make_dotenv(): + m = types.ModuleType("dotenv") + m.load_dotenv = lambda *a, **kw: None + return m + _ensure("dotenv", _make_dotenv) + + # utils.thread_logger + def _make_thread_logger(): + m = types.ModuleType("utils.thread_logger") + class 
FakeLM: + def __init__(self, **kw): + self.log = print + m.ThreadLoggerManager = FakeLM + return m + _ensure( + "utils.thread_logger", _make_thread_logger + ) + + # tqdm (used by genesis evaluator) + def _make_tqdm(): + m = types.ModuleType("tqdm") + m.tqdm = lambda *a, **kw: iter([]) + return m + _ensure("tqdm", _make_tqdm) + + # pandas (used by ensemble.py) + def _make_pandas(): + m = types.ModuleType("pandas") + m.read_csv = lambda *a, **kw: None + return m + _ensure("pandas", _make_pandas) + _ensure("pd", _make_pandas) + + +# Install mocks at import time so all test modules +# benefit. +_install_lightweight_mocks() + + +def load_module_from_file(module_name, file_path): + """Load a Python module directly from a file path, + bypassing package __init__.py files. + + Useful for modules whose package __init__ imports + heavy deps (e.g., torch). + """ + abs_path = os.path.join(_PROJ, file_path) + spec = importlib.util.spec_from_file_location( + module_name, abs_path + ) + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +@pytest.fixture +def tmp_dir(): + """Provide a temporary directory, cleaned up after test.""" + d = tempfile.mkdtemp() + yield d + shutil.rmtree(d, ignore_errors=True) + + +@pytest.fixture +def sample_archive_jsonl(tmp_dir): + """Create a sample archive.jsonl file with valid data.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + { + "current_genid": 0, + "archive": [0], + }, + { + "current_genid": 1, + "archive": [0, 1], + }, + { + "current_genid": 2, + "archive": [0, 1, 2], + }, + ] + with open(path, "w") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + return path + + +@pytest.fixture +def sample_metadata_dir(tmp_dir): + """Create gen_X directories with metadata.json files.""" + for genid in range(3): + gen_dir = os.path.join( + tmp_dir, f"gen_{genid}" + ) + os.makedirs(gen_dir, exist_ok=True) + metadata = { + "parent_genid": genid - 1 
if genid > 0 else None, + "valid_parent": True, + "prev_patch_files": [], + "curr_patch_files": [], + } + with open( + os.path.join(gen_dir, "metadata.json"), "w" + ) as f: + json.dump(metadata, f) + return tmp_dir diff --git a/tests/test_archive_parsing.py b/tests/test_archive_parsing.py new file mode 100644 index 0000000..f6fb711 --- /dev/null +++ b/tests/test_archive_parsing.py @@ -0,0 +1,180 @@ +"""Tests for JSONL archive parsing (F-07 fix). + +Validates that load_archive_data() correctly parses +JSONL format (one JSON object per line) instead of +treating the whole file as a single JSON array. +""" + +import json +import os +import tempfile + +import pytest + + +# --------------- helpers to avoid heavy project imports ----- +# We extract the parsing logic directly to test in +# isolation. If import works, we test the real function +# too. + +def _parse_jsonl(filepath, last_only=True): + """Pure-Python reimplementation of the JSONL parsing + logic from utils/gl_utils.py::load_archive_data.""" + if not os.path.exists(filepath): + raise FileNotFoundError( + f"Metadata file not found at {filepath}" + ) + archive_data = [] + with open(filepath, "r") as f: + for line in f: + line = line.strip() + if line: + try: + archive_data.append(json.loads(line)) + except json.JSONDecodeError: + continue + if last_only: + return archive_data[-1] + return archive_data + + +class TestLoadArchiveDataParsing: + """Tests for JSONL line-by-line parsing.""" + + def test_parse_valid_jsonl(self, tmp_dir): + """Valid JSONL with multiple lines parses each + line independently.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + {"current_genid": 0, "archive": [0]}, + {"current_genid": 1, "archive": [0, 1]}, + ] + with open(path, "w") as f: + for e in entries: + f.write(json.dumps(e) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + assert result[0]["current_genid"] == 0 + assert result[1]["archive"] == [0, 1] + + def 
test_parse_last_only(self, tmp_dir): + """last_only=True returns only the final entry.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + {"current_genid": 0, "archive": [0]}, + {"current_genid": 1, "archive": [0, 1]}, + {"current_genid": 2, "archive": [0, 1, 2]}, + ] + with open(path, "w") as f: + for e in entries: + f.write(json.dumps(e) + "\n") + + result = _parse_jsonl(path, last_only=True) + assert result["current_genid"] == 2 + assert len(result["archive"]) == 3 + + def test_empty_lines_skipped(self, tmp_dir): + """Blank lines between entries are ignored.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write(json.dumps({"a": 1}) + "\n") + f.write("\n") + f.write(" \n") + f.write(json.dumps({"a": 2}) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + + def test_malformed_lines_skipped(self, tmp_dir): + """Malformed JSON lines are skipped without + crashing.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write(json.dumps({"ok": True}) + "\n") + f.write("this is not json\n") + f.write("{broken json\n") + f.write(json.dumps({"ok": True}) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + assert all(e["ok"] for e in result) + + def test_empty_file_raises(self, tmp_dir): + """An empty file (no valid entries) raises + IndexError when last_only=True.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write("") + + with pytest.raises(IndexError): + _parse_jsonl(path, last_only=True) + + def test_empty_file_returns_empty_list(self, tmp_dir): + """An empty file returns [] when last_only=False.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write("") + + result = _parse_jsonl(path, last_only=False) + assert result == [] + + def test_missing_file_raises(self, tmp_dir): + """A nonexistent file raises FileNotFoundError.""" + path = 
os.path.join(tmp_dir, "nonexistent.jsonl") + with pytest.raises(FileNotFoundError): + _parse_jsonl(path) + + def test_single_line_file(self, tmp_dir): + """A file with exactly one line works correctly.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entry = {"current_genid": 0, "archive": [0]} + with open(path, "w") as f: + f.write(json.dumps(entry) + "\n") + + result = _parse_jsonl(path, last_only=True) + assert result == entry + + result_all = _parse_jsonl(path, last_only=False) + assert len(result_all) == 1 + + +class TestLoadArchiveDataReal: + """Test the real load_archive_data function. + + conftest.py installs lightweight mocks so the + import succeeds without docker/litellm/etc. + """ + + @pytest.fixture(autouse=True) + def _try_import(self): + """Import load_archive_data (mocks in + conftest handle heavy deps).""" + try: + from utils.gl_utils import ( + load_archive_data, + ) + self.load_fn = load_archive_data + except Exception as e: + pytest.skip( + f"Could not import: {e}" + ) + + def test_real_parse_valid( + self, sample_archive_jsonl + ): + """Real function parses valid JSONL.""" + result = self.load_fn( + sample_archive_jsonl, last_only=False + ) + assert len(result) == 3 + assert result[-1]["current_genid"] == 2 + + def test_real_last_only( + self, sample_archive_jsonl + ): + """Real function returns last entry.""" + result = self.load_fn( + sample_archive_jsonl, last_only=True + ) + assert result["current_genid"] == 2 diff --git a/tests/test_bash_sentinel.py b/tests/test_bash_sentinel.py new file mode 100644 index 0000000..ee35689 --- /dev/null +++ b/tests/test_bash_sentinel.py @@ -0,0 +1,68 @@ +"""Tests for BashSession random sentinel (F-14). + +Validates that each BashSession instance gets a +unique, random sentinel instead of the static +'<>' string, preventing command output from +accidentally matching the sentinel. 
+""" + +import re +import sys + +import pytest + +from agent.tools.bash import BashSession + + +class TestBashSentinelUniqueness: + """F-14: Each BashSession gets a unique + random sentinel.""" + + def test_sentinel_is_not_static(self): + """Sentinel must not be the old static + '<>' string.""" + session = BashSession() + assert session._sentinel != "<>" + assert session._sentinel != "<>" + + def test_sentinel_matches_expected_pattern(self): + """Sentinel matches <> format.""" + session = BashSession() + pattern = r"^<>$" + assert re.match(pattern, session._sentinel), ( + f"Sentinel {session._sentinel!r} does not " + f"match pattern {pattern}" + ) + + def test_each_instance_gets_unique_sentinel(self): + """Two separate BashSession instances must have + different sentinels.""" + s1 = BashSession() + s2 = BashSession() + assert s1._sentinel != s2._sentinel, ( + "Two sessions should not share a sentinel" + ) + + def test_many_instances_all_unique(self): + """Creating 50 sessions yields 50 distinct + sentinels.""" + sentinels = { + BashSession()._sentinel for _ in range(50) + } + assert len(sentinels) == 50 + + def test_sentinel_hex_length(self): + """The hex portion has 32 chars (uuid4.hex).""" + session = BashSession() + # <> + inner = session._sentinel[7:-2] # strip <> + assert len(inner) == 32 + assert all(c in "0123456789abcdef" for c in inner) + + def test_sentinel_used_in_run_command(self): + """The run() method references self._sentinel, + not a hardcoded string.""" + import inspect + src = inspect.getsource(BashSession.run) + assert "self._sentinel" in src + assert "<>" not in src.lower() diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py new file mode 100644 index 0000000..3148c0f --- /dev/null +++ b/tests/test_ensemble.py @@ -0,0 +1,174 @@ +"""Tests for ensemble majority voting logic. + +Validates weighted majority voting for classification +domains, single-best fallback, and domain gating. 
+""" + +import json +import os +import sys +from collections import defaultdict +from unittest.mock import patch, MagicMock + +import pytest + +_PROJ = "C:/Users/ryuke/Desktop/Projects/Hyperagents" +if _PROJ not in sys.path: + sys.path.insert(0, _PROJ) + + +# ------ Pure logic tests (no imports needed) ------ + +class TestMajorityVoteLogic: + """Test the weighted majority voting algorithm + in isolation.""" + + @staticmethod + def _weighted_majority(predictions_scores): + """Reimplement the core voting logic from + ensemble.py for isolated testing. + + Args: + predictions_scores: list of (pred, score) + Returns: + The prediction with highest total weight. + """ + votes = defaultdict(float) + for pred, score in predictions_scores: + if pred is not None: + votes[pred] += score + if votes: + return max(votes, key=votes.get) + return None + + def test_majority_simple(self): + """3 agents: A(0.8), B(0.7), A(0.6) -> A + wins with 1.4 vs 0.7.""" + result = self._weighted_majority([ + ("A", 0.8), ("B", 0.7), ("A", 0.6), + ]) + assert result == "A" + + def test_weights_override_count(self): + """2 agents vote B with low scores, 1 votes A + with high score. 
A wins on weight.""" + result = self._weighted_majority([ + ("A", 0.95), ("B", 0.1), ("B", 0.1), + ]) + assert result == "A" + + def test_tie_broken_deterministically(self): + """When weights tie, max() returns + deterministically.""" + result = self._weighted_majority([ + ("A", 0.5), ("B", 0.5), + ]) + # max() returns first key encountered with max + assert result in ("A", "B") + + def test_all_same_vote(self): + """All agents agree -> that answer wins.""" + result = self._weighted_majority([ + ("X", 0.9), ("X", 0.8), ("X", 0.7), + ]) + assert result == "X" + + def test_none_predictions_ignored(self): + """None predictions don't count toward any + vote.""" + result = self._weighted_majority([ + ("A", 0.9), (None, 0.8), ("B", 0.7), + ]) + assert result == "A" + + def test_all_none_returns_none(self): + """If all predictions are None, returns None.""" + result = self._weighted_majority([ + (None, 0.9), (None, 0.8), + ]) + assert result is None + + +class TestEnsembleDomainGating: + """Verify domain-based routing: classification + domains use voting, others use single-best.""" + + def test_classification_domains_known(self): + """Classification domains that support + ensemble are a known set.""" + # conftest.py mocks make this import work + from ensemble import _CLASSIFICATION_DOMAINS + assert ( + "search_arena" in _CLASSIFICATION_DOMAINS + ) + assert ( + "paper_review" in _CLASSIFICATION_DOMAINS + ) + assert ( + "imo_grading" in _CLASSIFICATION_DOMAINS + ) + + def test_non_classification_not_in_set(self): + """Non-classification domains are NOT in the + classification set.""" + from ensemble import _CLASSIFICATION_DOMAINS + assert ( + "genesis_go2walking" + not in _CLASSIFICATION_DOMAINS + ) + assert ( + "balrog_babyai" + not in _CLASSIFICATION_DOMAINS + ) + assert ( + "polyglot" + not in _CLASSIFICATION_DOMAINS + ) + + +class TestEnsembleFallback: + """Test fallback behavior when <3 agents + available.""" + + @staticmethod + def _should_use_voting( + domain, 
can_ensemble, n_agents + ): + """Replicate the gating logic from ensemble() + for testing.""" + classification = { + "search_arena", + "paper_review", + "imo_grading", + } + return ( + domain in classification + and can_ensemble + and n_agents >= 3 + ) + + def test_fewer_than_3_uses_single_best(self): + """With 2 agents, should NOT use voting.""" + assert not self._should_use_voting( + "search_arena", True, 2 + ) + + def test_3_or_more_uses_voting(self): + """With 3+ agents, should use voting.""" + assert self._should_use_voting( + "search_arena", True, 3 + ) + + def test_non_classification_always_single(self): + """Non-classification domain never uses + voting regardless of agent count.""" + assert not self._should_use_voting( + "genesis_go2walking", False, 5 + ) + + def test_can_ensemble_false_blocks_voting(self): + """Even a classification domain with + can_ensemble=False uses single-best.""" + assert not self._should_use_voting( + "imo_grading", False, 3 + ) diff --git a/tests/test_llm_metadata.py b/tests/test_llm_metadata.py new file mode 100644 index 0000000..759f9cd --- /dev/null +++ b/tests/test_llm_metadata.py @@ -0,0 +1,93 @@ +"""Tests for LLM response metadata (F-10). + +Validates that get_response_from_llm() returns an +info dict with expected keys: finish_reason, usage, +model. +""" + +import inspect +from unittest.mock import MagicMock, patch + +import pytest + +# conftest.py mocks backoff, litellm, dotenv, etc. 
+from agent.llm import get_response_from_llm + + +class TestLlmMetadataKeys: + """F-10: info dict contains expected keys.""" + + def test_return_annotation_is_tuple(self): + """get_response_from_llm returns a 3-tuple + (text, history, info).""" + sig = inspect.signature( + get_response_from_llm + ) + ret = sig.return_annotation + assert "Tuple" in str(ret) + + def test_info_dict_constructed_in_source(self): + """Source constructs info with finish_reason, + usage, model keys.""" + src = inspect.getsource( + get_response_from_llm + ) + assert '"finish_reason"' in src + assert '"usage"' in src + assert '"model"' in src + + def test_info_dict_is_returned(self): + """The function returns (response_text, + new_msg_history, info).""" + src = inspect.getsource( + get_response_from_llm + ) + assert ( + "return response_text, " + "new_msg_history, info" + ) in src + + def test_info_structure_via_mock(self): + """Mock litellm.completion and verify the + returned info dict shape.""" + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = ( + "hello" + ) + mock_response.choices[0].finish_reason = ( + "stop" + ) + mock_response.usage = MagicMock() + mock_response.model = "test-model" + + # Make response subscriptable for the + # response['choices'][0]['message']['content'] + # pattern used in the source. 
+ choice_msg = {"content": "hello"} + choice = {"message": choice_msg} + + def getitem(self, key): + if key == "choices": + return [choice] + return None + + type(mock_response).__getitem__ = getitem + + with patch( + "agent.llm.litellm.completion", + return_value=mock_response, + ): + text, history, info = ( + get_response_from_llm( + msg="test", model="test-model" + ) + ) + + assert isinstance(info, dict) + assert "finish_reason" in info + assert "usage" in info + assert "model" in info + assert info["finish_reason"] == "stop" + assert info["model"] == "test-model" + assert text == "hello" diff --git a/tests/test_meta_agent_instruction.py b/tests/test_meta_agent_instruction.py new file mode 100644 index 0000000..f4c7631 --- /dev/null +++ b/tests/test_meta_agent_instruction.py @@ -0,0 +1,69 @@ +"""Tests for MetaAgent instruction construction (F-04). + +Validates that the instruction string built inside +MetaAgent.forward() contains eval_path, iterations_left, +and is not trivially short. +""" + +import inspect + +import pytest + +# conftest.py mocks backoff, litellm, dotenv, +# thread_logger, etc. 
+from meta_agent import MetaAgent + + +class TestMetaAgentInstruction: + """F-04: Instruction string is comprehensive.""" + + def test_forward_accepts_eval_path(self): + """forward() signature includes eval_path.""" + sig = inspect.signature(MetaAgent.forward) + params = list(sig.parameters.keys()) + assert "eval_path" in params + + def test_forward_accepts_iterations_left(self): + """forward() signature includes + iterations_left.""" + sig = inspect.signature(MetaAgent.forward) + params = list(sig.parameters.keys()) + assert "iterations_left" in params + + def test_eval_path_appears_in_instruction(self): + """The source of forward() references + eval_path in the instruction string.""" + src = inspect.getsource(MetaAgent.forward) + assert "eval_path" in src + + def test_iterations_left_in_instruction(self): + """When iterations_left is provided, it + appears in the instruction.""" + src = inspect.getsource(MetaAgent.forward) + assert "iterations_left" in src + + def test_instruction_is_substantial(self): + """The instruction is built with multiple + concatenations, not just a few words.""" + src = inspect.getsource(MetaAgent.forward) + plus_eq_count = src.count("instruction +=") + assert plus_eq_count >= 3, ( + f"Expected 3+ instruction +=, got " + f"{plus_eq_count}" + ) + + def test_instruction_mentions_readme(self): + """Instruction tells the agent to read the + README for orientation.""" + src = inspect.getsource(MetaAgent.forward) + assert "README" in src + + def test_instruction_mentions_repo_path(self): + """Instruction references repo_path.""" + src = inspect.getsource(MetaAgent.forward) + assert "repo_path" in src + + def test_forward_calls_chat_with_agent(self): + """forward() delegates to chat_with_agent.""" + src = inspect.getsource(MetaAgent.forward) + assert "chat_with_agent" in src diff --git a/tests/test_metadata_atomic.py b/tests/test_metadata_atomic.py new file mode 100644 index 0000000..59bcc92 --- /dev/null +++ b/tests/test_metadata_atomic.py @@ 
-0,0 +1,139 @@ +"""Tests for atomic metadata writes (F-11). + +Validates that update_node_metadata() uses the +temp-file + os.replace pattern to avoid corruption. +""" + +import json +import os +import sys + +import pytest + +# conftest.py mocks heavy deps (docker, etc.) +from utils.gl_utils import ( + update_node_metadata, + get_node_metadata_key, +) + + +class TestUpdateNodeMetadataAtomic: + """F-11: metadata writes use temp + rename.""" + + def _make_gen_dir(self, tmp_dir, genid, data): + """Helper: create gen_{genid}/metadata.json.""" + gen_dir = os.path.join( + tmp_dir, f"gen_{genid}" + ) + os.makedirs(gen_dir, exist_ok=True) + meta_path = os.path.join( + gen_dir, "metadata.json" + ) + with open(meta_path, "w") as f: + json.dump(data, f) + return meta_path + + def test_update_writes_correct_data(self, tmp_dir): + """After update, the file contains the merged + data.""" + original = {"parent_genid": None, "score": 0.5} + self._make_gen_dir(tmp_dir, 0, original) + + update_node_metadata( + tmp_dir, 0, {"score": 0.9, "new_key": True} + ) + + meta_path = os.path.join( + tmp_dir, "gen_0", "metadata.json" + ) + with open(meta_path, "r") as f: + result = json.load(f) + + assert result["score"] == 0.9 + assert result["new_key"] is True + assert result["parent_genid"] is None + + def test_no_temp_file_left_behind(self, tmp_dir): + """After a successful write, no .tmp file + remains.""" + self._make_gen_dir( + tmp_dir, 0, {"key": "value"} + ) + update_node_metadata( + tmp_dir, 0, {"key": "updated"} + ) + + gen_dir = os.path.join(tmp_dir, "gen_0") + files = os.listdir(gen_dir) + tmp_files = [f for f in files if f.endswith(".tmp")] + assert len(tmp_files) == 0, ( + f"Temp files remain: {tmp_files}" + ) + + def test_atomic_pattern_in_source(self): + """Source code uses tmp_file + os.replace + pattern.""" + import inspect + src = inspect.getsource(update_node_metadata) + assert "tmp" in src.lower(), ( + "Should use a temp file" + ) + assert "os.replace" in src or 
"os.rename" in src, ( + "Should use os.replace or os.rename for " + "atomic swap" + ) + assert "f.flush()" in src, ( + "Should flush before fsync" + ) + assert "os.fsync" in src, ( + "Should fsync before replace" + ) + + def test_missing_metadata_is_noop(self, tmp_dir): + """If metadata.json doesn't exist, + update_node_metadata does nothing.""" + # gen_99 does not exist + update_node_metadata( + tmp_dir, 99, {"key": "value"} + ) + gen_dir = os.path.join(tmp_dir, "gen_99") + assert not os.path.exists(gen_dir) + + def test_get_after_update_returns_new_value( + self, tmp_dir + ): + """get_node_metadata_key returns the updated + value after update_node_metadata.""" + self._make_gen_dir( + tmp_dir, 1, {"status": "pending"} + ) + update_node_metadata( + tmp_dir, 1, {"status": "complete"} + ) + val = get_node_metadata_key( + tmp_dir, 1, "status" + ) + assert val == "complete" + + def test_concurrent_safety_no_partial_writes( + self, tmp_dir + ): + """Simulates that the original file stays + intact if we check it before os.replace + would run -- i.e., the tmp file is written + first, then swapped.""" + original = {"step": 1, "data": "original"} + meta_path = self._make_gen_dir( + tmp_dir, 0, original + ) + + # Perform multiple updates sequentially + for i in range(2, 6): + update_node_metadata( + tmp_dir, 0, {"step": i} + ) + + with open(meta_path, "r") as f: + result = json.load(f) + assert result["step"] == 5 + assert result["data"] == "original" diff --git a/tests/test_smoke_test.py b/tests/test_smoke_test.py new file mode 100644 index 0000000..3a56c57 --- /dev/null +++ b/tests/test_smoke_test.py @@ -0,0 +1,101 @@ +"""Tests for run_smoke_test() (F-15). + +Validates that run_smoke_test correctly interprets +container.exec_run results: True on success, +False on non-zero exit code or missing sentinel. +""" + +from unittest.mock import MagicMock, patch + +import pytest + +# conftest.py mocks docker, etc. 
from utils.gl_utils import run_smoke_test


def _make_mock_container(exit_code, output_text):
    """Create a mock Docker container whose exec_run returns the given
    exit code and output (encoded as UTF-8 bytes, as docker-py does)."""
    container = MagicMock()
    exec_result = MagicMock()
    exec_result.exit_code = exit_code
    exec_result.output = output_text.encode("utf-8")
    container.exec_run.return_value = exec_result
    return container


class TestRunSmokeTest:
    """F-15: run_smoke_test container validation."""

    def _run(self, exit_code, output):
        """Helper: run the smoke test against a mocked container,
        silencing log_container_output."""
        container = _make_mock_container(exit_code, output)
        with patch(
            "utils.gl_utils.log_container_output",
            lambda *a, **kw: None,
        ):
            return run_smoke_test(container)

    def test_success_returns_true(self):
        """Exit code 0 + sentinel present -> True."""
        assert self._run(0, "some output\nsmoke_test_passed\n") is True

    def test_nonzero_exit_returns_false(self):
        """Non-zero exit code -> False, even with the sentinel present."""
        assert self._run(1, "smoke_test_passed\n") is False

    def test_missing_sentinel_returns_false(self):
        """Exit code 0 but no sentinel string -> False."""
        assert self._run(0, "import succeeded\n") is False

    def test_empty_output_returns_false(self):
        """Empty output -> False."""
        assert self._run(0, "") is False

    def test_exception_returns_false(self):
        """If exec_run raises, run_smoke_test returns False."""
        container = MagicMock()
        container.exec_run.side_effect = RuntimeError("docker error")
        with patch(
            "utils.gl_utils.log_container_output",
            lambda *a, **kw: None,
        ):
            assert run_smoke_test(container) is False

    def test_calls_exec_run_with_command(self):
        """exec_run is called with a command list including
        'python' and '-c'."""
        container = _make_mock_container(0, "smoke_test_passed")
        with patch(
            "utils.gl_utils.log_container_output",
            lambda *a, **kw: None,
        ):
            run_smoke_test(container)

        call_args = container.exec_run.call_args
        # The command may have been passed positionally or as cmd=...
        cmd = call_args.kwargs.get(
            "cmd",
            call_args.args[0] if call_args.args else None,
        )
        assert "python" in cmd
        assert "-c" in cmd


# --- tests/test_tool_output_format.py -----------------------------------
# Tests for tool output JSON serialization (F-03).
#
# Validates that tool output messages use json.dumps() for proper
# serialization, avoiding the old f-string approach which produced
# invalid JSON with unescaped quotes and special characters.

import json

import pytest


class TestToolOutputJsonSerialization:
    """Verify json.dumps produces valid JSON for tool output messages."""

    def test_basic_tool_output_is_valid_json(self):
        """A simple tool output round-trips through json.dumps/loads."""
        tool_input = {"command": "ls -la", "path": "/tmp"}
        tool_msg_data = {
            "tool_name": "bash",
            "tool_input": tool_input,
            "tool_output": "file1.txt\nfile2.txt",
        }
        parsed = json.loads(json.dumps(tool_msg_data))
        assert parsed["tool_name"] == "bash"
        assert parsed["tool_input"] == tool_input
        assert "file1.txt" in parsed["tool_output"]

    def test_old_fstring_approach_produces_invalid_json(self):
        """Demonstrate the bug: f-string interpolation of dicts produces
        repr() output that is NOT valid JSON (single quotes, unescaped
        characters)."""
        tool_input = {"command": "echo 'hello'", "path": "/tmp"}
        tool_output = 'He said "hello"'

        # Old broken approach: f-string with dict repr()
        old_msg = (
            f'{{"tool_name": "bash", '
            f'"tool_input": {tool_input}, '
            f'"tool_output": "{tool_output}"}}'
        )
        with pytest.raises(json.JSONDecodeError):
            json.loads(old_msg)

    def test_special_chars_quotes(self):
        """Double quotes in tool_output are properly escaped by
        json.dumps."""
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "echo"},
            "tool_output": 'He said "hello"',
        }
        parsed = json.loads(json.dumps(data))
        assert parsed["tool_output"] == 'He said "hello"'

    def test_special_chars_newlines(self):
        """Newlines in tool_output are escaped."""
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "ls"},
            "tool_output": "line1\nline2\nline3",
        }
        serialized = json.dumps(data)
        # The raw string carries the two-char escape \n, not real newlines
        assert "\\n" in serialized
        parsed = json.loads(serialized)
        assert parsed["tool_output"].count("\n") == 2

    def test_special_chars_backslashes(self):
        """Backslashes in tool_output are escaped."""
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "echo"},
            "tool_output": "C:\\Users\\test\\file.txt",
        }
        parsed = json.loads(json.dumps(data))
        assert parsed["tool_output"] == "C:\\Users\\test\\file.txt"

    def test_special_chars_tabs_and_unicode(self):
        """Tabs and unicode in tool_output are handled."""
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "cat"},
            "tool_output": "col1\tcol2\n\u2603 snowman",
        }
        parsed = json.loads(json.dumps(data))
        assert "\t" in parsed["tool_output"]
        assert "\u2603" in parsed["tool_output"]

    def test_nested_json_in_output(self):
        """Tool output containing JSON-like strings serializes correctly
        as a string, not a dict."""
        inner = json.dumps({"key": "value"})
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "curl"},
            "tool_output": inner,
        }
        parsed = json.loads(json.dumps(data))
        # The output stays a string; its payload still parses
        assert isinstance(parsed["tool_output"], str)
        assert json.loads(parsed["tool_output"])["key"] == "value"

    def test_empty_tool_output(self):
        """Empty string tool output serializes."""
        data = {
            "tool_name": "bash",
            "tool_input": {"command": "true"},
            "tool_output": "",
        }
        parsed = json.loads(json.dumps(data))
        assert parsed["tool_output"] == ""

    def test_actual_format_with_xml_wrapper(self):
        """Test the actual wrapped format used in llm_withtools.py: an
        XML-style tag pair around json.dumps(..., indent=2) output.

        NOTE(review): the tag name below is reconstructed -- the
        angle-bracket tags were stripped from this copy of the file;
        confirm against llm_withtools.py.
        """
        open_tag = "<tool_call_result>"
        close_tag = "</tool_call_result>"
        tool_msg_data = {
            "tool_name": "bash",
            "tool_input": {"command": "ls"},
            "tool_output": "file.txt",
        }
        tool_msg = (
            f"{open_tag}\n"
            f"{json.dumps(tool_msg_data, indent=2)}\n"
            f"{close_tag}"
        )
        # Extract the JSON between the tags using the tag lengths. The
        # previous index("\n") + 7 / index("\n") arithmetic computed
        # start=7, end=0 (the first newline is at position 0 once the
        # tags were lost), so it sliced an empty string and json.loads
        # raised -- the test could never pass.
        extracted = tool_msg[len(open_tag):-len(close_tag)].strip()
        parsed = json.loads(extracted)
        assert parsed["tool_name"] == "bash"