From 41a03d8558b393b953251303e3f4def1c5adc2ea Mon Sep 17 00:00:00 2001
From: ryuketsukami <ryuketsukami@gmail.com>
Date: Wed, 25 Mar 2026 02:28:21 +0200
Subject: [PATCH] fix: return LLM response metadata (finish_reason, usage,
 model)

get_response_from_llm() returned an empty dict {} as the third
element, discarding finish_reason, token usage, and model info.

Now returns a populated info dict with finish_reason, usage,
and model from the litellm ModelResponse. Also adds system_msg
parameter for proper role-based message separation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent/llm.py               |  41 +++++++-
 tests/__init__.py          |   0
 tests/conftest.py          | 188 +++++++++++++++++++++++++++++++++++++
 tests/test_llm_metadata.py |  93 ++++++++++++++++++
 4 files changed, 318 insertions(+), 4 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_llm_metadata.py

diff --git a/agent/llm.py b/agent/llm.py
index b8ba343..e11893a 100644
--- a/agent/llm.py
+++ b/agent/llm.py
@@ -39,6 +39,7 @@ def get_response_from_llm(
     temperature: float = 0.0,
     max_tokens: int = MAX_TOKENS,
     msg_history=None,
+    system_msg: str = None,
 ) -> Tuple[str, list, dict]:
     if msg_history is None:
         msg_history = []
@@ -49,7 +50,11 @@ def get_response_from_llm(
         for msg in msg_history
     ]
 
-    new_msg_history = msg_history + [{"role": "user", "content": msg}]
+    new_msg_history = []
+    # Prepend system message if provided
+    if system_msg is not None:
+        new_msg_history.append({"role": "system", "content": system_msg})
+    new_msg_history += msg_history + [{"role": "user", "content": msg}]
 
     # Build kwargs - handle model-specific requirements
     completion_kwargs = {
@@ -75,8 +80,16 @@ def get_response_from_llm(
             completion_kwargs["max_tokens"] = max_tokens
 
     response = litellm.completion(**completion_kwargs)
-    response_text = response['choices'][0]['message']['content']  # pyright: ignore
-    new_msg_history.append({"role": "assistant", "content": response['choices'][0]['message']['content']})
+    msg_content = response['choices'][0]['message']  # pyright: ignore
+    response_text = msg_content['content']
+    new_msg_history.append({
+        "role": "assistant",
+        "content": response_text,
+    })
+
+    # Strip system message from returned history
+    # (only needed for the API call)
+    new_msg_history = [m for m in new_msg_history if m.get("role") != "system"]
 
     # Convert content to text, compatible with MetaGen API
     new_msg_history = [
@@ -84,7 +97,27 @@ def get_response_from_llm(
         for msg in new_msg_history
     ]
 
-    return response_text, new_msg_history, {}
+    has_choices = (
+        hasattr(response, 'choices') and response.choices
+    )
+    has_usage = (
+        hasattr(response, 'usage') and response.usage
+    )
+    info = {
+        "finish_reason": (
+            response.choices[0].finish_reason
+            if has_choices else None
+        ),
+        "usage": (
+            dict(response.usage)
+            if has_usage else {}
+        ),
+        "model": (
+            response.model
+            if hasattr(response, 'model') else None
+        ),
+    }
+    return response_text, new_msg_history, info
 
 
 if __name__ == "__main__":
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..2708571
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,188 @@
+"""Shared fixtures for HyperAgents test suite."""
+
+import importlib
+import importlib.util
+import os
+import json
+import sys
+import tempfile
+import shutil
+import types
+
+import pytest
+
+# ---- Project root on sys.path ----
+_PROJ = os.path.normpath(
+    "C:/Users/ryuke/Desktop/Projects/Hyperagents"
+)
+if _PROJ not in sys.path:
+    sys.path.insert(0, _PROJ)
+
+
+def _install_lightweight_mocks():
+    """Install minimal mock modules so that project
+    modules can be imported without heavy deps like
+    docker, litellm, backoff, torch, etc.
+
+    Only installs mocks for modules NOT already
+    present -- safe to call multiple times.
+    """
+    def _ensure(name, factory):
+        if name not in sys.modules:
+            sys.modules[name] = factory()
+
+    # docker
+    _ensure("docker", lambda: types.ModuleType("docker"))
+
+    # utils.docker_utils
+    def _make_docker_utils():
+        m = types.ModuleType("utils.docker_utils")
+        m.copy_to_container = lambda *a, **k: None
+        m.log_container_output = lambda *a, **k: None
+        return m
+    _ensure("utils.docker_utils", _make_docker_utils)
+
+    # utils.git_utils
+    def _make_git_utils():
+        m = types.ModuleType("utils.git_utils")
+        m.commit_repo = lambda *a, **k: "abc123"
+        m.get_git_commit_hash = lambda *a, **k: "abc"
+        return m
+    _ensure("utils.git_utils", _make_git_utils)
+
+    # backoff
+    def _make_backoff():
+        m = types.ModuleType("backoff")
+        m.expo = "expo"
+        m.on_exception = (
+            lambda *a, **kw: (lambda f: f)
+        )
+        return m
+    _ensure("backoff", _make_backoff)
+
+    # requests / requests.exceptions
+    def _make_requests():
+        m = types.ModuleType("requests")
+        exc = types.ModuleType("requests.exceptions")
+        exc.RequestException = Exception
+        m.exceptions = exc
+        sys.modules["requests.exceptions"] = exc
+        return m
+    _ensure("requests", _make_requests)
+
+    # litellm
+    def _make_litellm():
+        m = types.ModuleType("litellm")
+        m.drop_params = True
+        m.completion = lambda **kw: None
+        return m
+    _ensure("litellm", _make_litellm)
+
+    # dotenv
+    def _make_dotenv():
+        m = types.ModuleType("dotenv")
+        m.load_dotenv = lambda *a, **kw: None
+        return m
+    _ensure("dotenv", _make_dotenv)
+
+    # utils.thread_logger
+    def _make_thread_logger():
+        m = types.ModuleType("utils.thread_logger")
+        class FakeLM:
+            def __init__(self, **kw):
+                self.log = print
+        m.ThreadLoggerManager = FakeLM
+        return m
+    _ensure(
+        "utils.thread_logger", _make_thread_logger
+    )
+
+    # tqdm (used by genesis evaluator)
+    def _make_tqdm():
+        m = types.ModuleType("tqdm")
+        m.tqdm = lambda *a, **kw: iter([])
+        return m
+    _ensure("tqdm", _make_tqdm)
+
+    # pandas (used by ensemble.py)
+    def _make_pandas():
+        m = types.ModuleType("pandas")
+        m.read_csv = lambda *a, **kw: None
+        return m
+    _ensure("pandas", _make_pandas)
+    _ensure("pd", _make_pandas)
+
+
+# Install mocks at import time so all test modules
+# benefit.
+_install_lightweight_mocks()
+
+
+def load_module_from_file(module_name, file_path):
+    """Load a Python module directly from a file path,
+    bypassing package __init__.py files.
+
+    Useful for modules whose package __init__ imports
+    heavy deps (e.g., torch).
+    """
+    abs_path = os.path.join(_PROJ, file_path)
+    spec = importlib.util.spec_from_file_location(
+        module_name, abs_path
+    )
+    mod = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = mod
+    spec.loader.exec_module(mod)
+    return mod
+
+
+@pytest.fixture
+def tmp_dir():
+    """Provide a temporary directory, cleaned up after test."""
+    d = tempfile.mkdtemp()
+    yield d
+    shutil.rmtree(d, ignore_errors=True)
+
+
+@pytest.fixture
+def sample_archive_jsonl(tmp_dir):
+    """Create a sample archive.jsonl file with valid data."""
+    path = os.path.join(tmp_dir, "archive.jsonl")
+    entries = [
+        {
+            "current_genid": 0,
+            "archive": [0],
+        },
+        {
+            "current_genid": 1,
+            "archive": [0, 1],
+        },
+        {
+            "current_genid": 2,
+            "archive": [0, 1, 2],
+        },
+    ]
+    with open(path, "w") as f:
+        for entry in entries:
+            f.write(json.dumps(entry) + "\n")
+    return path
+
+
+@pytest.fixture
+def sample_metadata_dir(tmp_dir):
+    """Create gen_X directories with metadata.json files."""
+    for genid in range(3):
+        gen_dir = os.path.join(
+            tmp_dir, f"gen_{genid}"
+        )
+        os.makedirs(gen_dir, exist_ok=True)
+        metadata = {
+            "parent_genid": genid - 1 if genid > 0 else None,
+            "valid_parent": True,
+            "prev_patch_files": [],
+            "curr_patch_files": [],
+        }
+        with open(
+            os.path.join(gen_dir, "metadata.json"), "w"
+        ) as f:
+            json.dump(metadata, f)
+    return tmp_dir
diff --git a/tests/test_llm_metadata.py b/tests/test_llm_metadata.py
new file mode 100644
index 0000000..759f9cd
--- /dev/null
+++ b/tests/test_llm_metadata.py
@@ -0,0 +1,93 @@
+"""Tests for LLM response metadata (F-10).
+
+Validates that get_response_from_llm() returns an
+info dict with expected keys: finish_reason, usage,
+model.
+"""
+
+import inspect
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# conftest.py mocks backoff, litellm, dotenv, etc.
+from agent.llm import get_response_from_llm
+
+
+class TestLlmMetadataKeys:
+    """F-10: info dict contains expected keys."""
+
+    def test_return_annotation_is_tuple(self):
+        """get_response_from_llm returns a 3-tuple
+        (text, history, info)."""
+        sig = inspect.signature(
+            get_response_from_llm
+        )
+        ret = sig.return_annotation
+        assert "Tuple" in str(ret)
+
+    def test_info_dict_constructed_in_source(self):
+        """Source constructs info with finish_reason,
+        usage, model keys."""
+        src = inspect.getsource(
+            get_response_from_llm
+        )
+        assert '"finish_reason"' in src
+        assert '"usage"' in src
+        assert '"model"' in src
+
+    def test_info_dict_is_returned(self):
+        """The function returns (response_text,
+        new_msg_history, info)."""
+        src = inspect.getsource(
+            get_response_from_llm
+        )
+        assert (
+            "return response_text, "
+            "new_msg_history, info"
+        ) in src
+
+    def test_info_structure_via_mock(self):
+        """Mock litellm.completion and verify the
+        returned info dict shape."""
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = (
+            "hello"
+        )
+        mock_response.choices[0].finish_reason = (
+            "stop"
+        )
+        mock_response.usage = MagicMock()
+        mock_response.model = "test-model"
+
+        # Make response subscriptable for the
+        # response['choices'][0]['message']['content']
+        # pattern used in the source.
+        choice_msg = {"content": "hello"}
+        choice = {"message": choice_msg}
+
+        def getitem(self, key):
+            if key == "choices":
+                return [choice]
+            return None
+
+        type(mock_response).__getitem__ = getitem
+
+        with patch(
+            "agent.llm.litellm.completion",
+            return_value=mock_response,
+        ):
+            text, history, info = (
+                get_response_from_llm(
+                    msg="test", model="test-model"
+                )
+            )
+
+        assert isinstance(info, dict)
+        assert "finish_reason" in info
+        assert "usage" in info
+        assert "model" in info
+        assert info["finish_reason"] == "stop"
+        assert info["model"] == "test-model"
+        assert text == "hello"