From fe6c0665c4e8e4323ce4c6bde99e00a68961e69b Mon Sep 17 00:00:00 2001 From: ryuketsukami Date: Wed, 25 Mar 2026 02:25:42 +0200 Subject: [PATCH] fix: Genesis evaluator missing return + wrong parallel args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in domains/genesis/evaluator.py: 1. run_episode() never returns a value. The method computes episode_log but does not return it. Sequential mode appends None to results; parallel mode crashes trying to index None. 2. _worker() passes env_name as the first positional arg to run_episode(), but the method signature expects (task, agent). The sequential path (line 114) is correct; the parallel path passes wrong arguments. Also fixes bare except: on line 604 in extract_code_str() — changed to except Exception, with fallback to default reward function instead of returning empty string. Added tests verifying return types and call signatures. Co-Authored-By: Claude Opus 4.6 (1M context) --- domains/genesis/evaluator.py | 37 +++++-- tests/__init__.py | 0 tests/conftest.py | 188 ++++++++++++++++++++++++++++++++ tests/test_genesis_evaluator.py | 138 +++++++++++++++++++++++ 4 files changed, 356 insertions(+), 7 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_genesis_evaluator.py diff --git a/domains/genesis/evaluator.py b/domains/genesis/evaluator.py index 3831f56..e30ad7c 100644 --- a/domains/genesis/evaluator.py +++ b/domains/genesis/evaluator.py @@ -226,7 +226,6 @@ def _worker(self, task_queue, results_queue, agent_factory, position): agent = agent_factory.create_agent(chat_history_file=chat_history_file) result = evaluator.run_episode( - env_name, task, agent, process_num=process_num, @@ -331,15 +330,29 @@ def run_episode(self, task, agent, process_num=None, position=0, episode_idx=0): code_str = self.extract_code_str(response) # code_str = default_rewfn_string # NOTE: uncomment this to use default reward function # save the reward function to a file - rwd_func_path = os.path.join(self.output_dir, f"reward_function_{episode_idx:02d}.py") + rwd_func_path = os.path.join( + self.output_dir, + f"reward_function_{episode_idx:02d}.py", + ) self.save_reward_function(code_str, rwd_func_path) # Phase 3: RL Training and Evaluation # 3.1 Launch RL training - train_log = self.run_rl_train(task, episode_idx, seed, rwd_func_path=rwd_func_path) + train_log = self.run_rl_train( + task, episode_idx, seed, + rwd_func_path=rwd_func_path, + ) # 3.2 Launch RL evaluation if train_log["training_success"]: - eval_log = self.run_rl_eval(task, episode_idx, seed, rwd_func_path=rwd_func_path) + eval_log = self.run_rl_eval( + task, episode_idx, seed, + rwd_func_path=rwd_func_path, + ) + episode_log = {**train_log, **eval_log} + else: + episode_log = {**train_log, "fitness": 0} + + return episode_log def run_rl_train(self, task, episode_idx, seed, rwd_func_path=""): import subprocess @@ -601,9 +614,19 @@ def extract_code_str(self, update_reward_str: str) -> str: # Try to extract reward function code_str = self.extract_reward_function(stripped) return code_str - except: - # raise ValueError("Failed to extract reward function from markdown code block") - return "" + except Exception as e: + logger.warning(f"Failed to extract reward function: {e}") + # Try to return the default reward function content + try: + from domains.genesis.genesis_utils import file_to_string + root_dir = self.config.utils.root_dir + default_rewfn_path = ( + f"{root_dir}/domains/genesis" + "/reward/default_function.py" + ) + return file_to_string(default_rewfn_path) + except Exception: + return "" # # Try to extract reward function from markdown code block # try: diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..2708571 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,188 @@ +"""Shared fixtures for HyperAgents test suite.""" + +import importlib +import importlib.util +import os +import json +import sys +import tempfile +import shutil +import types + +import pytest + +# ---- Project root on sys.path ---- +_PROJ = os.path.normpath( + "C:/Users/ryuke/Desktop/Projects/Hyperagents" +) +if _PROJ not in sys.path: + sys.path.insert(0, _PROJ) + + +def _install_lightweight_mocks(): + """Install minimal mock modules so that project + modules can be imported without heavy deps like + docker, litellm, backoff, torch, etc. + + Only installs mocks for modules NOT already + present -- safe to call multiple times. + """ + def _ensure(name, factory): + if name not in sys.modules: + sys.modules[name] = factory() + + # docker + _ensure("docker", lambda: types.ModuleType("docker")) + + # utils.docker_utils + def _make_docker_utils(): + m = types.ModuleType("utils.docker_utils") + m.copy_to_container = lambda *a, **k: None + m.log_container_output = lambda *a, **k: None + return m + _ensure("utils.docker_utils", _make_docker_utils) + + # utils.git_utils + def _make_git_utils(): + m = types.ModuleType("utils.git_utils") + m.commit_repo = lambda *a, **k: "abc123" + m.get_git_commit_hash = lambda *a, **k: "abc" + return m + _ensure("utils.git_utils", _make_git_utils) + + # backoff + def _make_backoff(): + m = types.ModuleType("backoff") + m.expo = "expo" + m.on_exception = ( + lambda *a, **kw: (lambda f: f) + ) + return m + _ensure("backoff", _make_backoff) + + # requests / requests.exceptions + def _make_requests(): + m = types.ModuleType("requests") + exc = types.ModuleType("requests.exceptions") + exc.RequestException = Exception + m.exceptions = exc + sys.modules["requests.exceptions"] = exc + return m + _ensure("requests", _make_requests) + + # litellm + def _make_litellm(): + m = types.ModuleType("litellm") + m.drop_params = True + m.completion = lambda **kw: None + return m + _ensure("litellm", _make_litellm) + + # dotenv + def _make_dotenv(): + m = types.ModuleType("dotenv") + m.load_dotenv = lambda *a, **kw: None + return m + _ensure("dotenv", _make_dotenv) + + # utils.thread_logger + def _make_thread_logger(): + m = types.ModuleType("utils.thread_logger") + class FakeLM: + def __init__(self, **kw): + self.log = print + m.ThreadLoggerManager = FakeLM + return m + _ensure( + "utils.thread_logger", _make_thread_logger + ) + + # tqdm (used by genesis evaluator) + def _make_tqdm(): + m = types.ModuleType("tqdm") + m.tqdm = lambda *a, **kw: iter([]) + return m + _ensure("tqdm", _make_tqdm) + + # pandas (used by ensemble.py) + def _make_pandas(): + m = types.ModuleType("pandas") + m.read_csv = lambda *a, **kw: None + return m + _ensure("pandas", _make_pandas) + _ensure("pd", _make_pandas) + + +# Install mocks at import time so all test modules +# benefit. +_install_lightweight_mocks() + + +def load_module_from_file(module_name, file_path): + """Load a Python module directly from a file path, + bypassing package __init__.py files. + + Useful for modules whose package __init__ imports + heavy deps (e.g., torch). + """ + abs_path = os.path.join(_PROJ, file_path) + spec = importlib.util.spec_from_file_location( + module_name, abs_path + ) + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +@pytest.fixture +def tmp_dir(): + """Provide a temporary directory, cleaned up after test.""" + d = tempfile.mkdtemp() + yield d + shutil.rmtree(d, ignore_errors=True) + + +@pytest.fixture +def sample_archive_jsonl(tmp_dir): + """Create a sample archive.jsonl file with valid data.""" + path = os.path.join(tmp_dir, "archive.jsonl") + entries = [ + { + "current_genid": 0, + "archive": [0], + }, + { + "current_genid": 1, + "archive": [0, 1], + }, + { + "current_genid": 2, + "archive": [0, 1, 2], + }, + ] + with open(path, "w") as f: + for entry in entries: + f.write(json.dumps(entry) + "\n") + return path + + +@pytest.fixture +def sample_metadata_dir(tmp_dir): + """Create gen_X directories with metadata.json files.""" + for genid in range(3): + gen_dir = os.path.join( + tmp_dir, f"gen_{genid}" + ) + os.makedirs(gen_dir, exist_ok=True) + metadata = { + "parent_genid": genid - 1 if genid > 0 else None, + "valid_parent": True, + "prev_patch_files": [], + "curr_patch_files": [], + } + with open( + os.path.join(gen_dir, "metadata.json"), "w" + ) as f: + json.dump(metadata, f) + return tmp_dir diff --git a/tests/test_genesis_evaluator.py b/tests/test_genesis_evaluator.py new file mode 100644 index 0000000..52988d7 --- /dev/null +++ b/tests/test_genesis_evaluator.py @@ -0,0 +1,138 @@ +"""Tests for genesis Evaluator (F-01, F-02). + +F-01: run_episode() must return a dict (not None). +F-02: The parallel worker call signature must match + the run_episode method signature. +""" + +import importlib.util +import inspect +import os +import sys + +import pytest + +_PROJ = os.path.normpath( + "C:/Users/ryuke/Desktop/Projects/Hyperagents" +) + + +def _load_module_from_file(module_name, rel_path): + """Load a module directly from file, bypassing + package __init__.py (avoids torch import).""" + abs_path = os.path.join(_PROJ, rel_path) + spec = importlib.util.spec_from_file_location( + module_name, abs_path + ) + mod = importlib.util.module_from_spec(spec) + sys.modules[module_name] = mod + spec.loader.exec_module(mod) + return mod + + +# conftest.py already mocked tqdm at import time. +_mod = _load_module_from_file( + "domains.genesis.evaluator", + "domains/genesis/evaluator.py", +) +Evaluator = _mod.Evaluator +EvaluatorManager = _mod.EvaluatorManager + + +class TestRunEpisodeReturnType: + """F-01: run_episode must return a dict.""" + + def test_run_episode_returns_dict(self): + """run_episode should always return a dict + containing episode log data, never None.""" + src = inspect.getsource( + Evaluator.run_episode + ) + assert "return episode_log" in src, ( + "run_episode must return episode_log dict" + ) + lines = src.split("\n") + for line in lines: + stripped = line.strip() + if stripped == "return": + pytest.fail( + "Bare return found -- would yield " + "None" + ) + if stripped == "return None": + pytest.fail( + "Explicit return None found" + ) + + def test_episode_log_initialized_as_dict(self): + """episode_log is created as a dict literal + in run_episode.""" + src = inspect.getsource( + Evaluator.run_episode + ) + assert "episode_log" in src + assert "episode_log = {" in src or ( + "episode_log = {**" in src + ) + + +class TestWorkerCallSignature: + """F-02: _worker call args must match + run_episode params.""" + + def test_worker_calls_run_episode_correctly( + self, + ): + """The _worker method must pass the right + keyword args to run_episode.""" + worker_src = inspect.getsource( + EvaluatorManager._worker + ) + assert "run_episode(" in worker_src + assert "process_num=" in worker_src + assert "position=" in worker_src + assert "episode_idx=" in worker_src + + def test_run_episode_signature_params(self): + """run_episode accepts task, agent, + process_num, position, episode_idx.""" + sig = inspect.signature( + Evaluator.run_episode + ) + params = list(sig.parameters.keys()) + assert params[0] == "self" + assert "task" in params + assert "agent" in params + assert "process_num" in params + assert "position" in params + assert "episode_idx" in params + + def test_run_parallel_passes_position(self): + """_run_parallel must pass a position arg + to each _worker call.""" + src = inspect.getsource( + EvaluatorManager._run_parallel + ) + assert "target=self._worker" in src + assert "position" in src + + def test_sequential_calls_run_episode(self): + """_run_sequential must call run_episode + with episode_idx.""" + src = inspect.getsource( + EvaluatorManager._run_sequential + ) + assert "run_episode(" in src + assert "episode_idx=" in src + + def test_worker_signature_has_4_args(self): + """_worker takes (self, task_queue, + results_queue, agent_factory, position).""" + sig = inspect.signature( + EvaluatorManager._worker + ) + params = list(sig.parameters.keys()) + assert "task_queue" in params + assert "results_queue" in params + assert "agent_factory" in params + assert "position" in params