domains/genesis/evaluator.py: 30 additions & 7 deletions
@@ -226,7 +226,6 @@ def _worker(self, task_queue, results_queue, agent_factory, position):
             agent = agent_factory.create_agent(chat_history_file=chat_history_file)
 
             result = evaluator.run_episode(
-                env_name,
                 task,
                 agent,
                 process_num=process_num,
@@ -331,15 +330,29 @@ def run_episode(self, task, agent, process_num=None, position=0, episode_idx=0):
         code_str = self.extract_code_str(response)
         # code_str = default_rewfn_string # NOTE: uncomment this to use default reward function
         # save the reward function to a file
-        rwd_func_path = os.path.join(self.output_dir, f"reward_function_{episode_idx:02d}.py")
+        rwd_func_path = os.path.join(
+            self.output_dir,
+            f"reward_function_{episode_idx:02d}.py",
+        )
         self.save_reward_function(code_str, rwd_func_path)
 
         # Phase 3: RL Training and Evaluation
         # 3.1 Launch RL training
-        train_log = self.run_rl_train(task, episode_idx, seed, rwd_func_path=rwd_func_path)
+        train_log = self.run_rl_train(
+            task, episode_idx, seed,
+            rwd_func_path=rwd_func_path,
+        )
         # 3.2 Launch RL evaluation
         if train_log["training_success"]:
-            eval_log = self.run_rl_eval(task, episode_idx, seed, rwd_func_path=rwd_func_path)
+            eval_log = self.run_rl_eval(
+                task, episode_idx, seed,
+                rwd_func_path=rwd_func_path,
+            )
+            episode_log = {**train_log, **eval_log}
+        else:
+            episode_log = {**train_log, "fitness": 0}
+
+        return episode_log
 
     def run_rl_train(self, task, episode_idx, seed, rwd_func_path=""):
         import subprocess
@@ -601,9 +614,19 @@ def extract_code_str(self, update_reward_str: str) -> str:
             # Try to extract reward function
             code_str = self.extract_reward_function(stripped)
             return code_str
-        except:
-            # raise ValueError("Failed to extract reward function from markdown code block")
-            return ""
+        except Exception as e:
+            logger.warning(f"Failed to extract reward function: {e}")
+            # Try to return the default reward function content
+            try:
+                from domains.genesis.genesis_utils import file_to_string
+                root_dir = self.config.utils.root_dir
+                default_rewfn_path = (
+                    f"{root_dir}/domains/genesis"
+                    "/reward/default_function.py"
+                )
+                return file_to_string(default_rewfn_path)
+            except Exception:
+                return ""
 
         # # Try to extract reward function from markdown code block
         # try:
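For context on the F-01 fix above: run_episode now always returns a dict, and a "fitness" key is present whether or not training succeeded. A standalone sketch of the merge semantics, with made-up log values (the real keys come from run_rl_train/run_rl_eval):

# Illustrative values only -- not the actual log contents.
train_log = {"training_success": True, "train_steps": 10000}
eval_log = {"fitness": 0.87}

if train_log["training_success"]:
    episode_log = {**train_log, **eval_log}    # eval keys override on collision
else:
    episode_log = {**train_log, "fitness": 0}  # failed runs still carry a fitness

assert isinstance(episode_log, dict) and "fitness" in episode_log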
tests/__init__.py: empty file added
tests/conftest.py: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
"""Shared fixtures for HyperAgents test suite."""

import importlib
import importlib.util
import os
import json
import sys
import tempfile
import shutil
import types

import pytest

# ---- Project root on sys.path ----
# Derived from this file's location (tests/ sits just
# below the project root), so the suite is not tied to
# one machine's absolute checkout path.
_PROJ = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
if _PROJ not in sys.path:
    sys.path.insert(0, _PROJ)


def _install_lightweight_mocks():
    """Install minimal mock modules so that project
    modules can be imported without heavy deps like
    docker, litellm, backoff, torch, etc.

    Only installs mocks for modules NOT already
    present -- safe to call multiple times.
    """
    def _ensure(name, factory):
        if name not in sys.modules:
            sys.modules[name] = factory()

    # docker
    _ensure("docker", lambda: types.ModuleType("docker"))

    # utils.docker_utils
    def _make_docker_utils():
        m = types.ModuleType("utils.docker_utils")
        m.copy_to_container = lambda *a, **k: None
        m.log_container_output = lambda *a, **k: None
        return m
    _ensure("utils.docker_utils", _make_docker_utils)

    # utils.git_utils
    def _make_git_utils():
        m = types.ModuleType("utils.git_utils")
        m.commit_repo = lambda *a, **k: "abc123"
        m.get_git_commit_hash = lambda *a, **k: "abc"
        return m
    _ensure("utils.git_utils", _make_git_utils)

    # backoff
    def _make_backoff():
        m = types.ModuleType("backoff")
        m.expo = "expo"
        m.on_exception = lambda *a, **kw: (lambda f: f)
        return m
    _ensure("backoff", _make_backoff)

    # requests / requests.exceptions
    def _make_requests():
        m = types.ModuleType("requests")
        exc = types.ModuleType("requests.exceptions")
        exc.RequestException = Exception
        m.exceptions = exc
        sys.modules["requests.exceptions"] = exc
        return m
    _ensure("requests", _make_requests)

    # litellm
    def _make_litellm():
        m = types.ModuleType("litellm")
        m.drop_params = True
        m.completion = lambda **kw: None
        return m
    _ensure("litellm", _make_litellm)

    # dotenv
    def _make_dotenv():
        m = types.ModuleType("dotenv")
        m.load_dotenv = lambda *a, **kw: None
        return m
    _ensure("dotenv", _make_dotenv)

    # utils.thread_logger
    def _make_thread_logger():
        m = types.ModuleType("utils.thread_logger")
        class FakeLM:
            def __init__(self, **kw):
                self.log = print
        m.ThreadLoggerManager = FakeLM
        return m
    _ensure("utils.thread_logger", _make_thread_logger)

    # tqdm (used by genesis evaluator)
    def _make_tqdm():
        m = types.ModuleType("tqdm")
        m.tqdm = lambda *a, **kw: iter([])
        return m
    _ensure("tqdm", _make_tqdm)

    # pandas (used by ensemble.py)
    def _make_pandas():
        m = types.ModuleType("pandas")
        m.read_csv = lambda *a, **kw: None
        return m
    _ensure("pandas", _make_pandas)
    _ensure("pd", _make_pandas)


# Install mocks at import time so all test modules benefit.
_install_lightweight_mocks()


def load_module_from_file(module_name, file_path):
    """Load a Python module directly from a file path,
    bypassing package __init__.py files.

    Useful for modules whose package __init__ imports
    heavy deps (e.g., torch).
    """
    abs_path = os.path.join(_PROJ, file_path)
    spec = importlib.util.spec_from_file_location(
        module_name, abs_path
    )
    mod = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = mod
    spec.loader.exec_module(mod)
    return mod


@pytest.fixture
def tmp_dir():
    """Provide a temporary directory, cleaned up after the test."""
    d = tempfile.mkdtemp()
    yield d
    shutil.rmtree(d, ignore_errors=True)


@pytest.fixture
def sample_archive_jsonl(tmp_dir):
    """Create a sample archive.jsonl file with valid data."""
    path = os.path.join(tmp_dir, "archive.jsonl")
    entries = [
        {"current_genid": 0, "archive": [0]},
        {"current_genid": 1, "archive": [0, 1]},
        {"current_genid": 2, "archive": [0, 1, 2]},
    ]
    with open(path, "w") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")
    return path


@pytest.fixture
def sample_metadata_dir(tmp_dir):
    """Create gen_X directories with metadata.json files."""
    for genid in range(3):
        gen_dir = os.path.join(tmp_dir, f"gen_{genid}")
        os.makedirs(gen_dir, exist_ok=True)
        metadata = {
            "parent_genid": genid - 1 if genid > 0 else None,
            "valid_parent": True,
            "prev_patch_files": [],
            "curr_patch_files": [],
        }
        with open(
            os.path.join(gen_dir, "metadata.json"), "w"
        ) as f:
            json.dump(metadata, f)
    return tmp_dir
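The `_ensure` pattern above is what keeps these mocks safe to install repeatedly: a stand-in goes into `sys.modules` only when no module of that name has been imported yet. A self-contained sketch of that guarantee (illustration only, not part of this PR):

import sys
import types

def _ensure(name, factory):
    # Install a stand-in only if no module by that name is loaded yet.
    if name not in sys.modules:
        sys.modules[name] = factory()

_ensure("docker", lambda: types.ModuleType("docker"))
import docker  # resolves to the stand-in unless docker was already imported
print(type(docker))  # <class 'module'>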
tests/test_genesis_evaluator.py: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
"""Tests for genesis Evaluator (F-01, F-02).

F-01: run_episode() must return a dict (not None).
F-02: The parallel worker call signature must match
the run_episode method signature.
"""

import importlib.util
import inspect
import os
import sys

import pytest

# Project root, derived from this file's location
# rather than a hardcoded absolute path.
_PROJ = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)


def _load_module_from_file(module_name, rel_path):
    """Load a module directly from file, bypassing
    package __init__.py (avoids torch import)."""
    abs_path = os.path.join(_PROJ, rel_path)
    spec = importlib.util.spec_from_file_location(
        module_name, abs_path
    )
    mod = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = mod
    spec.loader.exec_module(mod)
    return mod


# conftest.py already mocked tqdm at import time.
_mod = _load_module_from_file(
    "domains.genesis.evaluator",
    "domains/genesis/evaluator.py",
)
Evaluator = _mod.Evaluator
EvaluatorManager = _mod.EvaluatorManager


class TestRunEpisodeReturnType:
    """F-01: run_episode must return a dict."""

    def test_run_episode_returns_dict(self):
        """run_episode should always return a dict
        containing episode log data, never None."""
        src = inspect.getsource(Evaluator.run_episode)
        assert "return episode_log" in src, (
            "run_episode must return episode_log dict"
        )
        for line in src.split("\n"):
            stripped = line.strip()
            if stripped == "return":
                pytest.fail(
                    "Bare return found -- would yield None"
                )
            if stripped == "return None":
                pytest.fail("Explicit return None found")

    def test_episode_log_initialized_as_dict(self):
        """episode_log is created as a dict literal
        in run_episode."""
        src = inspect.getsource(Evaluator.run_episode)
        assert "episode_log" in src
        assert "episode_log = {" in src or (
            "episode_log = {**" in src
        )


class TestWorkerCallSignature:
    """F-02: _worker call args must match
    run_episode params."""

    def test_worker_calls_run_episode_correctly(self):
        """The _worker method must pass the right
        keyword args to run_episode."""
        worker_src = inspect.getsource(EvaluatorManager._worker)
        assert "run_episode(" in worker_src
        assert "process_num=" in worker_src
        assert "position=" in worker_src
        assert "episode_idx=" in worker_src

    def test_run_episode_signature_params(self):
        """run_episode accepts task, agent,
        process_num, position, episode_idx."""
        sig = inspect.signature(Evaluator.run_episode)
        params = list(sig.parameters.keys())
        assert params[0] == "self"
        assert "task" in params
        assert "agent" in params
        assert "process_num" in params
        assert "position" in params
        assert "episode_idx" in params

    def test_run_parallel_passes_position(self):
        """_run_parallel must pass a position arg
        to each _worker call."""
        src = inspect.getsource(EvaluatorManager._run_parallel)
        assert "target=self._worker" in src
        assert "position" in src

    def test_sequential_calls_run_episode(self):
        """_run_sequential must call run_episode
        with episode_idx."""
        src = inspect.getsource(EvaluatorManager._run_sequential)
        assert "run_episode(" in src
        assert "episode_idx=" in src

    def test_worker_signature_has_4_args(self):
        """_worker takes (self, task_queue,
        results_queue, agent_factory, position)."""
        sig = inspect.signature(EvaluatorManager._worker)
        params = list(sig.parameters.keys())
        assert "task_queue" in params
        assert "results_queue" in params
        assert "agent_factory" in params
        assert "position" in params
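Worth noting: these are static checks built on `inspect.getsource` and `inspect.signature`, so they exercise the F-01/F-02 contracts without running an episode or importing torch/Genesis. Assuming pytest is installed, the file can be run on its own:

python -m pytest tests/test_genesis_evaluator.py -q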