"""Shared fixtures for HyperAgents test suite."""

import importlib
import importlib.util
import os
import json
import sys
import tempfile
import shutil
import types

import pytest

# ---- Project root on sys.path ----
# FIX: derive the project root from this file's location instead of a
# hard-coded, user-specific absolute path ("C:/Users/ryuke/...") that
# broke the suite on any other machine and in CI.
_PROJ = os.path.normpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)
if _PROJ not in sys.path:
    sys.path.insert(0, _PROJ)


def _install_lightweight_mocks():
    """Install minimal mock modules so that project
    modules can be imported without heavy deps like
    docker, litellm, backoff, torch, etc.

    Only installs mocks for modules NOT already
    present -- safe to call multiple times.
    """
    def _ensure(name, factory):
        # Never clobber a real (or previously installed mock) module.
        if name not in sys.modules:
            sys.modules[name] = factory()

    # docker
    _ensure("docker", lambda: types.ModuleType("docker"))

    # utils.docker_utils
    def _make_docker_utils():
        m = types.ModuleType("utils.docker_utils")
        m.copy_to_container = lambda *a, **k: None
        m.log_container_output = lambda *a, **k: None
        return m
    _ensure("utils.docker_utils", _make_docker_utils)

    # utils.git_utils
    def _make_git_utils():
        m = types.ModuleType("utils.git_utils")
        m.commit_repo = lambda *a, **k: "abc123"
        m.get_git_commit_hash = lambda *a, **k: "abc"
        return m
    _ensure("utils.git_utils", _make_git_utils)

    # backoff
    def _make_backoff():
        m = types.ModuleType("backoff")
        m.expo = "expo"
        # backoff.on_exception is used as a decorator factory:
        # mock it as an identity decorator.
        m.on_exception = lambda *a, **kw: (lambda f: f)
        return m
    _ensure("backoff", _make_backoff)

    # requests / requests.exceptions
    def _make_requests():
        m = types.ModuleType("requests")
        exc = types.ModuleType("requests.exceptions")
        exc.RequestException = Exception
        m.exceptions = exc
        # The submodule must be registered separately so that
        # "import requests.exceptions" also resolves.
        sys.modules["requests.exceptions"] = exc
        return m
    _ensure("requests", _make_requests)

    # litellm
    def _make_litellm():
        m = types.ModuleType("litellm")
        m.drop_params = True
        m.completion = lambda **kw: None
        return m
    _ensure("litellm", _make_litellm)

    # dotenv
    def _make_dotenv():
        m = types.ModuleType("dotenv")
        m.load_dotenv = lambda *a, **kw: None
        return m
    _ensure("dotenv", _make_dotenv)

    # utils.thread_logger
    def _make_thread_logger():
        m = types.ModuleType("utils.thread_logger")

        class FakeLM:
            def __init__(self, **kw):
                self.log = print

        m.ThreadLoggerManager = FakeLM
        return m
    _ensure("utils.thread_logger", _make_thread_logger)

    # tqdm (used by genesis evaluator)
    def _make_tqdm():
        m = types.ModuleType("tqdm")
        m.tqdm = lambda *a, **kw: iter([])
        return m
    _ensure("tqdm", _make_tqdm)

    # pandas (used by ensemble.py)
    def _make_pandas():
        m = types.ModuleType("pandas")
        m.read_csv = lambda *a, **kw: None
        return m
    _ensure("pandas", _make_pandas)
    # NOTE: the old _ensure("pd", ...) entry was dead code -- "pd" is an
    # import alias ("import pandas as pd"), never a module name, so
    # nothing ever looks up sys.modules["pd"].


# Install mocks at import time so all test modules benefit.
_install_lightweight_mocks()


def load_module_from_file(module_name, file_path):
    """Load a Python module directly from a file path,
    bypassing package __init__.py files.

    Useful for modules whose package __init__ imports
    heavy deps (e.g., torch).
    """
    abs_path = os.path.join(_PROJ, file_path)
    spec = importlib.util.spec_from_file_location(module_name, abs_path)
    mod = importlib.util.module_from_spec(spec)
    # Register before exec so self-referencing imports inside the
    # module resolve.
    sys.modules[module_name] = mod
    spec.loader.exec_module(mod)
    return mod


@pytest.fixture
def tmp_dir():
    """Provide a temporary directory, cleaned up after test."""
    d = tempfile.mkdtemp()
    yield d
    shutil.rmtree(d, ignore_errors=True)


@pytest.fixture
def sample_archive_jsonl(tmp_dir):
    """Create a sample archive.jsonl file with valid data."""
    path = os.path.join(tmp_dir, "archive.jsonl")
    entries = [
        {"current_genid": 0, "archive": [0]},
        {"current_genid": 1, "archive": [0, 1]},
        {"current_genid": 2, "archive": [0, 1, 2]},
    ]
    with open(path, "w") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")
    return path


@pytest.fixture
def sample_metadata_dir(tmp_dir):
    """Create gen_X directories with metadata.json files."""
    for genid in range(3):
        gen_dir = os.path.join(tmp_dir, f"gen_{genid}")
        os.makedirs(gen_dir, exist_ok=True)
        metadata = {
            "parent_genid": genid - 1 if genid > 0 else None,
            "valid_parent": True,
            "prev_patch_files": [],
            "curr_patch_files": [],
        }
        with open(os.path.join(gen_dir, "metadata.json"), "w") as f:
            json.dump(metadata, f)
    return tmp_dir
+ +def _parse_jsonl(filepath, last_only=True): + """Pure-Python reimplementation of the JSONL parsing + logic from utils/gl_utils.py::load_archive_data.""" + if not os.path.exists(filepath): + raise FileNotFoundError( + f"Metadata file not found at {filepath}" + ) + archive_data = [] + with open(filepath, "r") as f: + for line in f: + line = line.strip() + if line: + try: + archive_data.append(json.loads(line)) + except json.JSONDecodeError: + continue + if last_only: + return archive_data[-1] + return archive_data + + +class TestLoadArchiveDataParsing: + """Tests for JSONL line-by-line parsing.""" + + def test_parse_valid_jsonl(self, tmp_dir): + """Valid JSONL with multiple lines parses each + line independently.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + {"current_genid": 0, "archive": [0]}, + {"current_genid": 1, "archive": [0, 1]}, + ] + with open(path, "w") as f: + for e in entries: + f.write(json.dumps(e) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + assert result[0]["current_genid"] == 0 + assert result[1]["archive"] == [0, 1] + + def test_parse_last_only(self, tmp_dir): + """last_only=True returns only the final entry.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + {"current_genid": 0, "archive": [0]}, + {"current_genid": 1, "archive": [0, 1]}, + {"current_genid": 2, "archive": [0, 1, 2]}, + ] + with open(path, "w") as f: + for e in entries: + f.write(json.dumps(e) + "\n") + + result = _parse_jsonl(path, last_only=True) + assert result["current_genid"] == 2 + assert len(result["archive"]) == 3 + + def test_empty_lines_skipped(self, tmp_dir): + """Blank lines between entries are ignored.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write(json.dumps({"a": 1}) + "\n") + f.write("\n") + f.write(" \n") + f.write(json.dumps({"a": 2}) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + + def 
test_malformed_lines_skipped(self, tmp_dir): + """Malformed JSON lines are skipped without + crashing.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write(json.dumps({"ok": True}) + "\n") + f.write("this is not json\n") + f.write("{broken json\n") + f.write(json.dumps({"ok": True}) + "\n") + + result = _parse_jsonl(path, last_only=False) + assert len(result) == 2 + assert all(e["ok"] for e in result) + + def test_empty_file_raises(self, tmp_dir): + """An empty file (no valid entries) raises + IndexError when last_only=True.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write("") + + with pytest.raises(IndexError): + _parse_jsonl(path, last_only=True) + + def test_empty_file_returns_empty_list(self, tmp_dir): + """An empty file returns [] when last_only=False.""" + path = os.path.join(tmp_dir, "archive.jsonl") + with open(path, "w") as f: + f.write("") + + result = _parse_jsonl(path, last_only=False) + assert result == [] + + def test_missing_file_raises(self, tmp_dir): + """A nonexistent file raises FileNotFoundError.""" + path = os.path.join(tmp_dir, "nonexistent.jsonl") + with pytest.raises(FileNotFoundError): + _parse_jsonl(path) + + def test_single_line_file(self, tmp_dir): + """A file with exactly one line works correctly.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entry = {"current_genid": 0, "archive": [0]} + with open(path, "w") as f: + f.write(json.dumps(entry) + "\n") + + result = _parse_jsonl(path, last_only=True) + assert result == entry + + result_all = _parse_jsonl(path, last_only=False) + assert len(result_all) == 1 + + +class TestLoadArchiveDataReal: + """Test the real load_archive_data function. + + conftest.py installs lightweight mocks so the + import succeeds without docker/litellm/etc. 
+ """ + + @pytest.fixture(autouse=True) + def _try_import(self): + """Import load_archive_data (mocks in + conftest handle heavy deps).""" + try: + from utils.gl_utils import ( + load_archive_data, + ) + self.load_fn = load_archive_data + except Exception as e: + pytest.skip( + f"Could not import: {e}" + ) + + def test_real_parse_valid( + self, sample_archive_jsonl + ): + """Real function parses valid JSONL.""" + result = self.load_fn( + sample_archive_jsonl, last_only=False + ) + assert len(result) == 3 + assert result[-1]["current_genid"] == 2 + + def test_real_last_only( + self, sample_archive_jsonl + ): + """Real function returns last entry.""" + result = self.load_fn( + sample_archive_jsonl, last_only=True + ) + assert result["current_genid"] == 2 diff --git a/tests/test_metadata_atomic.py b/tests/test_metadata_atomic.py new file mode 100644 index 0000000..59bcc92 --- /dev/null +++ b/tests/test_metadata_atomic.py @@ -0,0 +1,139 @@ +"""Tests for atomic metadata writes (F-11). + +Validates that update_node_metadata() uses the +temp-file + os.replace pattern to avoid corruption. +""" + +import json +import os +import sys + +import pytest + +# conftest.py mocks heavy deps (docker, etc.) 
"""Tests for atomic metadata writes (F-11).

Validates that update_node_metadata() uses the
temp-file + os.replace pattern to avoid corruption.
"""

import json
import os
import sys

import pytest

# conftest.py mocks heavy deps (docker, etc.)
from utils.gl_utils import (
    update_node_metadata,
    get_node_metadata_key,
)


class TestUpdateNodeMetadataAtomic:
    """F-11: metadata writes use temp + rename."""

    def _make_gen_dir(self, tmp_dir, genid, data):
        """Helper: create gen_{genid}/metadata.json."""
        node_dir = os.path.join(tmp_dir, f"gen_{genid}")
        os.makedirs(node_dir, exist_ok=True)
        meta_path = os.path.join(node_dir, "metadata.json")
        with open(meta_path, "w") as fh:
            json.dump(data, fh)
        return meta_path

    def test_update_writes_correct_data(self, tmp_dir):
        """After update, the file contains the merged
        data."""
        self._make_gen_dir(
            tmp_dir, 0, {"parent_genid": None, "score": 0.5}
        )

        update_node_metadata(
            tmp_dir, 0, {"score": 0.9, "new_key": True}
        )

        meta_path = os.path.join(tmp_dir, "gen_0", "metadata.json")
        with open(meta_path, "r") as fh:
            merged = json.load(fh)

        # Updated keys win; untouched keys survive the merge.
        assert merged["score"] == 0.9
        assert merged["new_key"] is True
        assert merged["parent_genid"] is None

    def test_no_temp_file_left_behind(self, tmp_dir):
        """After a successful write, no .tmp file
        remains."""
        self._make_gen_dir(tmp_dir, 0, {"key": "value"})
        update_node_metadata(tmp_dir, 0, {"key": "updated"})

        leftovers = [
            name
            for name in os.listdir(os.path.join(tmp_dir, "gen_0"))
            if name.endswith(".tmp")
        ]
        assert not leftovers, f"Temp files remain: {leftovers}"

    def test_atomic_pattern_in_source(self):
        """Source code uses tmp_file + os.replace
        pattern."""
        import inspect
        src = inspect.getsource(update_node_metadata)
        assert "tmp" in src.lower(), "Should use a temp file"
        assert "os.replace" in src or "os.rename" in src, (
            "Should use os.replace or os.rename for "
            "atomic swap"
        )
        assert "f.flush()" in src, "Should flush before fsync"
        assert "os.fsync" in src, "Should fsync before replace"
metadata.json doesn't exist, + update_node_metadata does nothing.""" + # gen_99 does not exist + update_node_metadata( + tmp_dir, 99, {"key": "value"} + ) + gen_dir = os.path.join(tmp_dir, "gen_99") + assert not os.path.exists(gen_dir) + + def test_get_after_update_returns_new_value( + self, tmp_dir + ): + """get_node_metadata_key returns the updated + value after update_node_metadata.""" + self._make_gen_dir( + tmp_dir, 1, {"status": "pending"} + ) + update_node_metadata( + tmp_dir, 1, {"status": "complete"} + ) + val = get_node_metadata_key( + tmp_dir, 1, "status" + ) + assert val == "complete" + + def test_concurrent_safety_no_partial_writes( + self, tmp_dir + ): + """Simulates that the original file stays + intact if we check it before os.replace + would run -- i.e., the tmp file is written + first, then swapped.""" + original = {"step": 1, "data": "original"} + meta_path = self._make_gen_dir( + tmp_dir, 0, original + ) + + # Perform multiple updates sequentially + for i in range(2, 6): + update_node_metadata( + tmp_dir, 0, {"step": i} + ) + + with open(meta_path, "r") as f: + result = json.load(f) + assert result["step"] == 5 + assert result["data"] == "original" diff --git a/utils/gl_utils.py b/utils/gl_utils.py index 13c905d..e108825 100644 --- a/utils/gl_utils.py +++ b/utils/gl_utils.py @@ -141,9 +141,13 @@ def update_node_metadata(output_dir, genid, data_update): metadata = json.load(f) # Update metadata metadata.update(data_update) - # Save metadata - with open(metadata_file, "w") as f: + # Save metadata atomically + tmp_file = metadata_file + ".tmp" + with open(tmp_file, "w") as f: json.dump(metadata, f, indent=4) + f.flush() + os.fsync(f.fileno()) + os.replace(tmp_file, metadata_file) def get_node_metadata_key(output_dir, genid, key): @@ -180,18 +184,16 @@ def load_archive_data(filepath, last_only=True): # Load all archives from given metadata file if not os.path.exists(filepath): raise FileNotFoundError(f"Metadata file not found at {filepath}") - # 
def load_archive_data(filepath, last_only=True):
    """Load archive entries from a JSONL metadata file.

    Each line of the file holds one JSON object. Blank lines and
    lines that fail to parse are skipped silently.

    Args:
        filepath: Path to the JSONL metadata file.
        last_only: When True, return only the final entry
            (raises IndexError if no valid entries exist);
            otherwise return the full list of entries.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
    """
    # Load all archives from given metadata file
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Metadata file not found at {filepath}")
    # Read all JSON entries from the JSONL file (one JSON object per line)
    archive_data = []
    with open(filepath, "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                # Malformed lines are tolerated rather than fatal.
                continue
            archive_data.append(parsed)
    # Return the last entry
    if last_only:
        return archive_data[-1]
    return archive_data
""" + + # Add domain and evaluation info + if domains: + domain_metrics = { + "search_arena": "overall_accuracy", + "paper_review": "overall_accuracy", + "imo_grading": "overall_accuracy", + "imo_proof": "points_percentage", + } + desc += "\n## Target Domains\n\n" + for d in domains: + if d in domain_metrics: + metric = domain_metrics[d] + elif "balrog" in d: + metric = "average_progress" + elif "genesis" in d: + metric = "average_fitness" + elif "polyglot" in d: + metric = "accuracy_score" + else: + metric = "score" + desc += f"- **{d}** (metric: `{metric}`)\n" + + if generation is not None: + desc += ( + f"\n## Current State\n\n" + f"This is generation **{generation}**" + f" (initial setup).\n" + ) + + if eval_path: + desc += ( + f"\nEvaluation results from previous" + f" generations are stored at" + f" `{eval_path}`. Each generation" + f" folder (e.g., `gen_0/`, `gen_1/`)" + f" contains a" + f" `_eval/report.json`" + f" with scores.\n" + ) + + desc += """ +## Key Files + +- `meta_agent.py` — The meta agent that drives + self-improvement. It can modify any file. +- `task_agent.py` — The task agent that solves downstream domain tasks. +- `agent/` — LLM interface, tool definitions, and base agent classes. +- `domains/` — Domain-specific evaluation and task code. +""" + if ensemble: desc += """\n## Optimize the Ensemble of Agents @@ -605,6 +667,37 @@ def get_latest_can_select_parent(archive, output_dir, trunc_genid=None): print("shouldn't reach here") return None +def run_smoke_test(container, repo_name=REPO_NAME): + """ + Lightweight smoke test: verify TaskAgent is + instantiable and forward() has the right + signature. No actual LLM calls -- just import, + instantiate, and inspect. + Returns True on pass, False otherwise. 
def run_smoke_test(
    container,
    repo_name=REPO_NAME,
    model="claude-3-haiku-20240307",
    timeout_s=60,
):
    """
    Lightweight smoke test: verify TaskAgent is
    instantiable and forward() has the right
    signature. No actual LLM calls -- just import,
    instantiate, and inspect.

    Args:
        container: Docker container exposing exec_run().
        repo_name: Repo directory inside the container
            (workdir is "/{repo_name}").
        model: Model name passed to TaskAgent; generalized from the
            previously hard-coded value (default unchanged).
        timeout_s: Wall-clock limit for the in-container check,
            enforced via the `timeout` command (default unchanged).

    Returns True on pass, False otherwise.
    """
    smoke_test_code = (
        "from task_agent import TaskAgent; "
        f"agent = TaskAgent(model='{model}'); "
        "import inspect; "
        "sig = inspect.signature(agent.forward); "
        "print('smoke_test_passed')"
    )
    command = [
        "timeout",
        str(timeout_s),
        "python",
        "-c",
        smoke_test_code,
    ]
    try:
        exec_result = container.exec_run(cmd=command, workdir=f"/{repo_name}")
        log_container_output(exec_result)
        output = exec_result.output.decode() if exec_result.output else ""
        # Pass requires both a zero exit code and the sentinel string,
        # so a crash after partial output still counts as a failure.
        return exec_result.exit_code == 0 and "smoke_test_passed" in output
    except Exception:
        # Best-effort probe: any container/exec failure is a failed
        # smoke test, never an exception for the caller.
        return False