From a22d596920b4e51ddcea6fa3443ef8b44098dc05 Mon Sep 17 00:00:00 2001 From: Bryan Young Date: Fri, 6 Mar 2026 17:06:19 +0000 Subject: [PATCH 1/4] fix: add short-name package for E1 judge env --- Makefile | 2 +- configs/rl/e1_judge.toml | 10 +- environments/sv-env-netlogs-judge/README.md | 17 + .../sv-env-netlogs-judge/pyproject.toml | 50 +++ .../sv_env_network_logs_judge.py | 292 ++++++++++++++++++ .../sv_env_network_logs_judge_test.py | 160 ++++++++++ .../sv-env-netlogs-judge/sv_netlogs_judge.py | 22 ++ .../experiments/reward_source_comparison.md | 2 +- 8 files changed, 550 insertions(+), 5 deletions(-) create mode 100644 environments/sv-env-netlogs-judge/README.md create mode 100644 environments/sv-env-netlogs-judge/pyproject.toml create mode 100644 environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py create mode 100644 environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py create mode 100644 environments/sv-env-netlogs-judge/sv_netlogs_judge.py diff --git a/Makefile b/Makefile index b3805bd..5a4c400 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ TEAM ?= intertwine BUMP ?= patch # Optional Hub environment name override (defaults to pyproject.toml name) -# Usage: make hub-deploy E=network-logs NAME=sv-env-network-logs-judge +# Usage: make hub-deploy E=network-logs NAME=sv-netlogs-judge NAME ?= # ---------- Colors (portable) ---------- diff --git a/configs/rl/e1_judge.toml b/configs/rl/e1_judge.toml index f8d639b..6294289 100644 --- a/configs/rl/e1_judge.toml +++ b/configs/rl/e1_judge.toml @@ -1,7 +1,11 @@ -# Prime RL hosted training config for E1 Judge Variant (sv-env-network-logs-judge) +# Prime RL hosted training config for E1 Judge Variant (sv-netlogs-judge) # Launch: prime rl run configs/rl/e1_judge.toml # Or via make: make lab-run-e1-judge # +# Note: the original Hub name `sv-env-network-logs-judge` triggers a Prime +# Kubernetes label truncation bug (`sv-env-network-logs-` ends with a dash). +# We deploy this judge variant under the shorter Hub name `sv-netlogs-judge`. +# # WP3c: LLM-judge reward comparison experiment # This config is MATCHED to configs/rl/e1.toml for controlled comparison: # - Same model, same LoRA, same budget (max_steps, batch_size, rollouts_per_example) @@ -27,7 +31,7 @@ max_tokens = 2048 temperature = 0.7 [[env]] -id = "intertwine/sv-env-network-logs-judge" +id = "intertwine/sv-netlogs-judge" args = { max_examples = 200 } [wandb] @@ -42,7 +46,7 @@ rollouts_per_example = 1 eval_base_model = true [[eval.env]] -id = "intertwine/sv-env-network-logs-judge" +id = "intertwine/sv-netlogs-judge" args = { max_examples = 50 } num_examples = 50 rollouts_per_example = 1 diff --git a/environments/sv-env-netlogs-judge/README.md b/environments/sv-env-netlogs-judge/README.md new file mode 100644 index 0000000..32850d1 --- /dev/null +++ b/environments/sv-env-netlogs-judge/README.md @@ -0,0 +1,17 @@ +# Network Logs Judge Variant (WP3c) + +This is the short-name Hub package for the E1 LLM-judge variant. + +Why this package exists: +- Prime RL truncates long environment names when deriving Kubernetes labels. +- The original env id `sv-env-network-logs-judge` truncates to `sv-env-network-logs-`, which is invalid because it ends with `-`. +- This package publishes the same judge environment under the shorter env id `sv-netlogs-judge`. 
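+
+As a quick illustration of why the short name is safe (a sketch assuming
+standard Kubernetes label validation; Prime RL's exact truncation code is
+not shown here):
+
+```python
+import re
+
+# Kubernetes label values must begin and end with an alphanumeric character;
+# dashes, dots, and underscores are only allowed in the middle.
+K8S_LABEL = re.compile(r"^[A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?$")
+
+print(bool(K8S_LABEL.match("sv-env-network-logs-judge"[:20])))  # False: "sv-env-network-logs-"
+print(bool(K8S_LABEL.match("sv-netlogs-judge"[:20])))           # True: 16 chars, no truncation
+```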
+ +Install: +- `prime env install intertwine/sv-netlogs-judge` + +Usage: +- `from verifiers import load_environment` +- `env = load_environment("sv-netlogs-judge")` + +The environment reuses the same dataset, parser, and judge reward logic as the original E1 judge implementation. diff --git a/environments/sv-env-netlogs-judge/pyproject.toml b/environments/sv-env-netlogs-judge/pyproject.toml new file mode 100644 index 0000000..3de6fcd --- /dev/null +++ b/environments/sv-env-netlogs-judge/pyproject.toml @@ -0,0 +1,50 @@ +[project] +name = "sv-netlogs-judge" +description = "RL Security Verifiers: Network Logs Judge Reward Variant" +tags = [ + "intertwine", + "security-verifiers", + "security", + "cybersecurity", + "single-turn", + "network-security", + "anomaly-detection", + "logs", + "classification", + "judge", + "train", + "eval", +] +version = "0.2.17" +requires-python = ">=3.12" +dependencies = [ + "verifiers>=0.1.9", + "security-verifiers-utils>=0.3.1", +] + +[project.entry-points."verifiers.environments"] +sv-netlogs-judge = "sv_netlogs_judge:load_environment" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "ruff>=0.12.11", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = [ + "sv_netlogs_judge.py", + "sv_env_network_logs_judge.py", + "sv_env_network_logs_judge_test.py", +] + +[tool.pytest.ini_options] +python_files = ["*_test.py"] +testpaths = ["."] + +[tool.ruff] +line-length = 120 diff --git a/environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py b/environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py new file mode 100644 index 0000000..03ed745 --- /dev/null +++ b/environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py @@ -0,0 +1,292 @@ +"""sv_env_network_logs_judge: LLM-judge reward variant of Network Log environment. + +This module implements a JudgeRubric-based variant of PRD Environment #1 (WP3c). +It reuses the same dataset, prompt, and parsing as the executable-verifier variant, +but replaces the multi-component reward functions (accuracy, calibration, cost) with +a single LLM-judge binary assessment. + +Purpose: Compare structured executable rewards vs simpler LLM-judge rewards for RL +training, answering the key WP3c question of whether multi-component reward +decomposition outperforms holistic LLM judgment. + +The judge prompt evaluates correctness, calibration, and format in a single pass, +returning a binary yes/no verdict that maps to reward 1.0/0.0. 
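+
+An acceptable model completion looks like (illustrative example, matching the
+schema the shared JsonClassificationParser expects):
+
+    {"label": "Malicious", "confidence": 0.9, "rationale": "repeated auth failures"}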
+""" + +from __future__ import annotations + +from pathlib import Path + +try: + # Try importing from installed package first + from sv_shared import weave_init # type: ignore # noqa: F401 +except ImportError: + # Fall back to local development import + import sys + + sys.path.append(str(Path(__file__).resolve().parents[2])) + from sv_shared import weave_init # type: ignore # noqa: F401 + +import logging as _logging +import os + +import verifiers as vf +from datasets import Dataset + +try: + from sv_shared import ( + DatasetSource, + JsonClassificationParser, + RolloutLogger, + load_dataset_with_fallback, + ) +except ImportError: + # Fall back to local development import + import sys + + sys.path.append(str(Path(__file__).resolve().parents[2])) + from sv_shared import ( # type: ignore + DatasetSource, + JsonClassificationParser, + RolloutLogger, + load_dataset_with_fallback, + ) + + +# --------------------------------------------------------------------------- +# Judge prompt: security-domain-specific evaluation +# --------------------------------------------------------------------------- +# The judge prompt is designed to holistically evaluate the same dimensions +# that the executable verifier decomposes into separate reward functions: +# - Accuracy: Is the label correct? +# - Calibration: Is the confidence appropriate given correctness? +# - Format: Is the JSON schema valid? +# +# The judge returns "yes" or "no" — a binary signal vs the multi-component +# weighted reward from the executable verifier variant. +# --------------------------------------------------------------------------- +JUDGE_PROMPT = """\ +You are evaluating a network security classification model's response. + +The model was given a network log entry and asked to classify it as \ +"Benign", "Malicious", or "Abstain" with a confidence score (0.0-1.0) \ +in JSON format: {{"label": "...", "confidence": 0.0-1.0, "rationale": "..."}}. + +Network log entry: +``` +{question} +``` + +Correct classification: {answer} + +Model's parsed response: {response} + +Evaluate whether the model's response is acceptable by checking ALL of: +1. The label matches the correct classification (case-insensitive) +2. The response is valid JSON with required fields (label, confidence) +3. The confidence is reasonable (high if correct, low if uncertain) + +Respond "yes" if the response is correct and well-formed, or "no" otherwise.\ +""" + + +# --------------------------------------------------------------------------- +# Judge reward function +# --------------------------------------------------------------------------- +async def judge_reward(prompt, completion, answer, state, judge, **kwargs): + """Binary reward from LLM judge evaluation. + + Calls the JudgeRubric's judge method which formats the judge prompt, + sends it to the judge model, and returns the text response. + Returns 1.0 for "yes", 0.0 for "no" or any other response. 
+ """ + result = await judge(prompt, completion, answer, state) + return 1.0 if result.strip().lower().startswith("yes") else 0.0 + + +class NetworkLogParser(JsonClassificationParser): + """Parser for network log classification outputs (shared with executable variant).""" + + def __init__(self) -> None: + super().__init__(allowed_labels=["Benign", "Malicious", "Abstain"]) + + +def load_environment( + dataset_name: str = "iot23-train-dev-test-v1.jsonl", + dataset_source: DatasetSource = "auto", + max_examples: int = 1000, + logger: RolloutLogger | None = None, + judge_model: str = "gpt-4.1-nano", + **extra_kwargs, # Accept and log unknown kwargs for debugging +) -> vf.SingleTurnEnv: + """Load the Network Logs environment with LLM-judge rewards (WP3c variant). + + This environment is identical to sv-env-network-logs in dataset, prompt, and + parsing, but uses a JudgeRubric with a single LLM-judge binary reward instead + of multi-component executable reward functions. + + Args: + dataset_name: Dataset to load (same options as sv-env-network-logs). + dataset_source: Where to load the dataset from ("auto", "local", "hub", "synthetic"). + max_examples: Maximum number of examples to use from the dataset. + logger: Optional rollout logger for instrumenting environment metadata. + judge_model: LLM model to use as judge (default: gpt-4.1-nano for cost parity). + + Returns: + A Verifiers SingleTurnEnv configured with JudgeRubric reward. + + Notes: + The judge model default (gpt-4.1-nano) is chosen to keep per-step costs + comparable to the executable verifier variant. The judge is called once per + rollout, producing a binary 0/1 reward signal. + + For the matched-budget experiment (WP3c), use the same training config + parameters (max_steps, batch_size, rollouts_per_example) as e1.toml. 
+ """ + _log = _logging.getLogger("sv_env_network_logs_judge") + _debug = os.environ.get("SV_DEBUG", "") + + if extra_kwargs: + _log.warning( + "[SV_DEBUG] load_environment (judge) received unexpected kwargs: %s " + "(these are being IGNORED but may indicate config mismatch)", + extra_kwargs, + ) + + if _debug: + _log.warning( + "[SV_DEBUG] load_environment (judge) called with: " + "dataset_name=%r, dataset_source=%r, max_examples=%r, " + "judge_model=%r, logger=%s, extra_kwargs=%s, HF_TOKEN=%s", + dataset_name, + dataset_source, + max_examples, + judge_model, + "present" if logger else "None", + extra_kwargs or "none", + "set" if os.environ.get("HF_TOKEN") else "NOT SET", + ) + + env_root = Path(__file__).parent + + def _create_synthetic_dataset(): + """Create a synthetic dataset for testing purposes (shared with executable variant).""" + # Same synthetic data as the executable variant for controlled comparison + log_patterns = [ + { + "question": ("2024-01-15 10:23:45 SRC=192.168.1.105 DST=192.168.1.1 PROTO=HTTPS PORT=443 STATUS=OK"), + "answer": "Benign", + }, + { + "question": ( + "2024-01-15 10:24:12 SRC=10.0.0.15 DST=10.0.0.1 PROTO=DNS PORT=53 QUERY=google.com STATUS=OK" + ), + "answer": "Benign", + }, + { + "question": ( + "2024-01-15 10:25:33 SRC=172.16.0.100 DST=172.16.0.10 PROTO=SSH PORT=22 STATUS=AUTH_SUCCESS" + ), + "answer": "Benign", + }, + { + "question": ( + "2024-01-15 10:26:01 SRC=192.168.1.105 DST=192.168.1.1 " + "PROTO=SSH PORT=22 STATUS=AUTH_FAILED ATTEMPTS=50" + ), + "answer": "Malicious", + }, + { + "question": ("2024-01-15 10:27:15 SRC=10.0.0.99 DST=185.220.101.45 PROTO=TOR PORT=9001 BYTES=50000"), + "answer": "Malicious", + }, + { + "question": ( + "2024-01-15 10:28:42 SRC=172.16.0.200 DST=MULTIPLE PROTO=SCAN PORTS=1-65535 STATUS=SYN_SCAN" + ), + "answer": "Malicious", + }, + { + "question": ("2024-01-15 10:29:03 SRC=192.168.1.50 DST=8.8.8.8 PROTO=ICMP TYPE=ECHO_REQUEST STATUS=OK"), + "answer": "Benign", + }, + { + "question": ( + "2024-01-15 10:30:21 SRC=10.0.0.77 DST=198.51.100.23 PROTO=HTTPS PORT=443 CERT=SELF_SIGNED" + ), + "answer": "Malicious", + }, + { + "question": ( + "2024-01-15 10:31:55 SRC=172.16.0.150 DST=172.16.0.1 PROTO=HTTP PORT=80 STATUS=OK METHOD=GET" + ), + "answer": "Benign", + }, + { + "question": ( + "2024-01-15 10:32:18 SRC=192.168.1.200 DST=91.92.109.43 PROTO=UDP PORT=4444 PATTERN=C2_BEACON" + ), + "answer": "Malicious", + }, + ] + return Dataset.from_list(log_patterns) + + # Load dataset using shared multi-tiered loader (same as executable variant) + dataset, resolved_name = load_dataset_with_fallback( + dataset_name=dataset_name, + env_root=env_root, + dataset_source=dataset_source, + max_examples=max_examples, + field_mapping=None, + synthetic_generator=_create_synthetic_dataset, + ) + dataset_name = resolved_name + + if _debug and dataset is not None: + _log.warning( + "[SV_DEBUG] Dataset loaded (judge variant): name=%s, len=%d, columns=%s", + dataset_name, + len(dataset), + dataset.column_names, + ) + + parser = NetworkLogParser() + + # Use JudgeRubric instead of executable Rubric + # The judge_reward function uses the judge callable injected by JudgeRubric + rubric = vf.JudgeRubric( + parser=parser, + judge_model=judge_model, + judge_prompt=JUDGE_PROMPT, + judge_sampling_args={"max_tokens": 16, "temperature": 0.0}, + ) + # Add the judge_reward function that interprets the LLM judge's yes/no response + rubric.add_reward_func(judge_reward, weight=1.0) + + if logger and logger.enabled: + logger.log_environment_init( + 
environment_name="sv-env-network-logs-judge", + dataset_name=dataset_name, + total_examples=len(dataset) if dataset is not None else None, + metadata={ + "max_examples": max_examples, + "judge_model": judge_model, + "reward_type": "llm-judge", + }, + ) + + return vf.SingleTurnEnv( + name="sv-env-network-logs-judge", + description=( + "Classify network logs as 'Malicious', 'Benign', or 'Abstain' " + "(LLM-judge reward variant for WP3c comparison)." + ), + dataset=dataset, + parser=parser, + rubric=rubric, + system_prompt=( + "You are a network security analyst. Analyze the provided network log entry and " + "respond with a JSON object: {\"label\": 'Benign|Malicious|Abstain', " + '"confidence": 0.0-1.0, "rationale": optional}.' + ), + ) diff --git a/environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py b/environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py new file mode 100644 index 0000000..5bcd4c9 --- /dev/null +++ b/environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py @@ -0,0 +1,160 @@ +"""Tests for the LLM-judge variant of the network log environment.""" + +from __future__ import annotations + +import asyncio +from unittest.mock import Mock + +import pytest +import verifiers as vf + +from sv_env_network_logs_judge import ( + JUDGE_PROMPT, + NetworkLogParser, + judge_reward, + load_environment, +) +from sv_netlogs_judge import load_environment as load_short_environment + + +class TestJudgeVariantParser: + """Parser should be identical to the executable verifier variant.""" + + def test_extracts_label_and_confidence(self) -> None: + parser = NetworkLogParser() + completion = '{"label": "Malicious", "confidence": 0.9, "rationale": "scan"}' + assert parser.parse_answer(completion) == "Malicious" + assert parser.parse_confidence(completion) == pytest.approx(0.9) + + def test_rejects_invalid_labels(self) -> None: + parser = NetworkLogParser() + assert parser.parse_answer('{"label": "Unknown"}') == "" + + +class TestJudgePrompt: + """Verify the judge prompt has the required template variables.""" + + def test_has_required_placeholders(self) -> None: + assert "{question}" in JUDGE_PROMPT + assert "{answer}" in JUDGE_PROMPT + assert "{response}" in JUDGE_PROMPT + + def test_mentions_evaluation_criteria(self) -> None: + prompt_lower = JUDGE_PROMPT.lower() + assert "label" in prompt_lower + assert "confidence" in prompt_lower + assert "json" in prompt_lower + + +class TestJudgeReward: + """Test the judge_reward function's response parsing.""" + + def _run(self, coro): + """Helper to run async functions in sync tests.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + def test_yes_response(self) -> None: + """Judge responding 'yes' should yield reward 1.0.""" + + async def mock_judge(prompt, completion, answer, state): + return "yes" + + result = self._run(judge_reward(prompt="test", completion="test", answer="test", state={}, judge=mock_judge)) + assert result == 1.0 + + def test_no_response(self) -> None: + """Judge responding 'no' should yield reward 0.0.""" + + async def mock_judge(prompt, completion, answer, state): + return "no" + + result = self._run(judge_reward(prompt="test", completion="test", answer="test", state={}, judge=mock_judge)) + assert result == 0.0 + + def test_yes_with_trailing_text(self) -> None: + """Judge responding 'Yes, the response...' should yield 1.0.""" + + async def mock_judge(prompt, completion, answer, state): + return "Yes, the response is correct." 
+ + result = self._run(judge_reward(prompt="test", completion="test", answer="test", state={}, judge=mock_judge)) + assert result == 1.0 + + def test_unexpected_response(self) -> None: + """Unexpected judge response defaults to 0.0.""" + + async def mock_judge(prompt, completion, answer, state): + return "maybe" + + result = self._run(judge_reward(prompt="test", completion="test", answer="test", state={}, judge=mock_judge)) + assert result == 0.0 + + +class TestLoadEnvironment: + """Test environment loading with synthetic dataset. + + These tests require OPENAI_API_KEY because JudgeRubric creates an + AsyncOpenAI client at init time. We set a dummy key for unit tests + (the judge is never actually called). + """ + + @pytest.fixture(autouse=True) + def _set_dummy_openai_key(self, monkeypatch): + """Set a dummy OpenAI key so JudgeRubric can initialize.""" + monkeypatch.setenv("OPENAI_API_KEY", "sk-test-dummy-key-for-unit-tests") + + def _get_judge_rubric(self, env): + """Extract the JudgeRubric from the env's RubricGroup wrapper.""" + rubric = env.rubric + # SingleTurnEnv wraps rubrics in a RubricGroup + if hasattr(rubric, "rubrics"): + for r in rubric.rubrics: + if isinstance(r, vf.JudgeRubric): + return r + # Direct JudgeRubric (if verifiers changes wrapping behavior) + if isinstance(rubric, vf.JudgeRubric): + return rubric + pytest.fail(f"No JudgeRubric found in env.rubric (type={type(rubric).__name__})") + + def test_load_with_synthetic_dataset(self) -> None: + logger = Mock() + logger.enabled = True + + env = load_environment(dataset_name="synthetic", max_examples=5, logger=logger) + assert isinstance(env, vf.SingleTurnEnv) + assert env.rubric is not None + judge_rubric = self._get_judge_rubric(env) + assert isinstance(judge_rubric, vf.JudgeRubric) + assert env.dataset is not None + assert len(env.dataset) > 0 + + logger.log_environment_init.assert_called_once() + call_kwargs = logger.log_environment_init.call_args.kwargs + assert call_kwargs["environment_name"] == "sv-env-network-logs-judge" + assert "synthetic" in call_kwargs["dataset_name"] + assert call_kwargs["metadata"]["reward_type"] == "llm-judge" + + def test_rubric_has_judge_reward_func(self) -> None: + env = load_environment(dataset_name="synthetic", max_examples=5) + judge_rubric = self._get_judge_rubric(env) + func_names = judge_rubric._get_reward_func_names() + assert "judge_reward" in func_names + + def test_rubric_uses_correct_judge_model(self) -> None: + env = load_environment(dataset_name="synthetic", max_examples=5) + judge_rubric = self._get_judge_rubric(env) + assert judge_rubric.judge_model == "gpt-4.1-nano" + + def test_custom_judge_model(self) -> None: + env = load_environment(dataset_name="synthetic", max_examples=5, judge_model="gpt-4.1-mini") + judge_rubric = self._get_judge_rubric(env) + assert judge_rubric.judge_model == "gpt-4.1-mini" + + def test_short_alias_loads_environment(self) -> None: + env = load_short_environment(dataset_name="synthetic", max_examples=5) + assert isinstance(env, vf.SingleTurnEnv) + assert env.name == "sv-netlogs-judge" diff --git a/environments/sv-env-netlogs-judge/sv_netlogs_judge.py b/environments/sv-env-netlogs-judge/sv_netlogs_judge.py new file mode 100644 index 0000000..5f169e6 --- /dev/null +++ b/environments/sv-env-netlogs-judge/sv_netlogs_judge.py @@ -0,0 +1,22 @@ +"""Short-name wrapper for the network logs judge environment. + +Prime RL currently truncates environment names to 20 characters when deriving +Kubernetes labels. 
The original judge env id (`sv-env-network-logs-judge`) +truncates to `sv-env-network-logs-`, which is invalid because it ends with a +hyphen. This wrapper exposes a shorter stable module/env id that avoids the +label bug while reusing the same judge environment implementation. +""" + +from __future__ import annotations + +from sv_env_network_logs_judge import load_environment as _load_environment + +SHORT_ENV_ID = "sv-netlogs-judge" + + +def load_environment(**kwargs): + """Load the short-name alias of the E1 judge environment.""" + env = _load_environment(**kwargs) + env.name = SHORT_ENV_ID + env.env_id = SHORT_ENV_ID + return env diff --git a/research/experiments/reward_source_comparison.md b/research/experiments/reward_source_comparison.md index 9312ce1..255a485 100644 --- a/research/experiments/reward_source_comparison.md +++ b/research/experiments/reward_source_comparison.md @@ -42,7 +42,7 @@ on security classification tasks? ### Condition B: LLM-Judge (e1_judge.toml) -- **Environment**: `intertwine/sv-env-network-logs-judge` +- **Environment**: `intertwine/sv-netlogs-judge` - **Reward**: Single LLM-judge binary assessment - `judge_reward` (weight=1.0): 1.0 if judge says "yes", 0.0 otherwise - **Judge model**: gpt-4.1-nano (cheapest available, ~$0.10/1M tokens) From 45f98164042feef8ee519348d680da4c2b49e943 Mon Sep 17 00:00:00 2001 From: Bryan Young Date: Fri, 6 Mar 2026 17:11:58 +0000 Subject: [PATCH 2/4] fix: rename short-package judge test module --- environments/sv-env-netlogs-judge/pyproject.toml | 2 +- ..._env_network_logs_judge_test.py => sv_netlogs_judge_test.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename environments/sv-env-netlogs-judge/{sv_env_network_logs_judge_test.py => sv_netlogs_judge_test.py} (100%) diff --git a/environments/sv-env-netlogs-judge/pyproject.toml b/environments/sv-env-netlogs-judge/pyproject.toml index 3de6fcd..6a534a1 100644 --- a/environments/sv-env-netlogs-judge/pyproject.toml +++ b/environments/sv-env-netlogs-judge/pyproject.toml @@ -39,7 +39,7 @@ build-backend = "hatchling.build" include = [ "sv_netlogs_judge.py", "sv_env_network_logs_judge.py", - "sv_env_network_logs_judge_test.py", + "sv_netlogs_judge_test.py", ] [tool.pytest.ini_options] diff --git a/environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py b/environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py similarity index 100% rename from environments/sv-env-netlogs-judge/sv_env_network_logs_judge_test.py rename to environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py From b2f44c706eef3553df28cf8999ac19698d776e18 Mon Sep 17 00:00:00 2001 From: Bryan Young Date: Sat, 7 Mar 2026 18:52:05 +0000 Subject: [PATCH 3/4] fix: address PR #71 review feedback - wire env_name through the short judge loader for consistent telemetry - rename the short-package implementation module to avoid collisions - fix hub-deploy docs and exclude tests from the wheel --- Makefile | 2 +- .../sv-env-netlogs-judge/pyproject.toml | 3 +- .../sv-env-netlogs-judge/sv_netlogs_judge.py | 8 ++-- ...logs_judge.py => sv_netlogs_judge_impl.py} | 41 +++++++++++-------- .../sv_netlogs_judge_test.py | 29 +++++++++++-- 5 files changed, 55 insertions(+), 28 deletions(-) rename environments/sv-env-netlogs-judge/{sv_env_network_logs_judge.py => sv_netlogs_judge_impl.py} (93%) diff --git a/Makefile b/Makefile index 5a4c400..d7ca4ef 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ TEAM ?= intertwine BUMP ?= patch # Optional Hub environment name override (defaults to pyproject.toml name) -# Usage: 
make hub-deploy E=network-logs NAME=sv-netlogs-judge +# Usage: make hub-deploy E=netlogs-judge NAME ?= # ---------- Colors (portable) ---------- diff --git a/environments/sv-env-netlogs-judge/pyproject.toml b/environments/sv-env-netlogs-judge/pyproject.toml index 6a534a1..a23fa76 100644 --- a/environments/sv-env-netlogs-judge/pyproject.toml +++ b/environments/sv-env-netlogs-judge/pyproject.toml @@ -38,8 +38,7 @@ build-backend = "hatchling.build" [tool.hatch.build] include = [ "sv_netlogs_judge.py", - "sv_env_network_logs_judge.py", - "sv_netlogs_judge_test.py", + "sv_netlogs_judge_impl.py", ] [tool.pytest.ini_options] diff --git a/environments/sv-env-netlogs-judge/sv_netlogs_judge.py b/environments/sv-env-netlogs-judge/sv_netlogs_judge.py index 5f169e6..156a7f3 100644 --- a/environments/sv-env-netlogs-judge/sv_netlogs_judge.py +++ b/environments/sv-env-netlogs-judge/sv_netlogs_judge.py @@ -9,14 +9,12 @@ from __future__ import annotations -from sv_env_network_logs_judge import load_environment as _load_environment +from sv_netlogs_judge_impl import load_environment as _load_environment SHORT_ENV_ID = "sv-netlogs-judge" def load_environment(**kwargs): """Load the short-name alias of the E1 judge environment.""" - env = _load_environment(**kwargs) - env.name = SHORT_ENV_ID - env.env_id = SHORT_ENV_ID - return env + kwargs.setdefault("env_name", SHORT_ENV_ID) + return _load_environment(**kwargs) diff --git a/environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py b/environments/sv-env-netlogs-judge/sv_netlogs_judge_impl.py similarity index 93% rename from environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py rename to environments/sv-env-netlogs-judge/sv_netlogs_judge_impl.py index 03ed745..5a7f40b 100644 --- a/environments/sv-env-netlogs-judge/sv_env_network_logs_judge.py +++ b/environments/sv-env-netlogs-judge/sv_netlogs_judge_impl.py @@ -1,4 +1,4 @@ -"""sv_env_network_logs_judge: LLM-judge reward variant of Network Log environment. +"""sv_netlogs_judge_impl: LLM-judge reward variant of Network Log environment. This module implements a JudgeRubric-based variant of PRD Environment #1 (WP3c). 
It reuses the same dataset, prompt, and parsing as the executable-verifier variant, @@ -15,24 +15,25 @@ from __future__ import annotations +import logging as _logging +import os +import sys from pathlib import Path +import verifiers as vf +from datasets import Dataset + +REPO_ROOT = str(Path(__file__).resolve().parents[2]) + try: # Try importing from installed package first from sv_shared import weave_init # type: ignore # noqa: F401 except ImportError: # Fall back to local development import - import sys - - sys.path.append(str(Path(__file__).resolve().parents[2])) + if REPO_ROOT not in sys.path: + sys.path.append(REPO_ROOT) from sv_shared import weave_init # type: ignore # noqa: F401 -import logging as _logging -import os - -import verifiers as vf -from datasets import Dataset - try: from sv_shared import ( DatasetSource, @@ -42,9 +43,8 @@ ) except ImportError: # Fall back to local development import - import sys - - sys.path.append(str(Path(__file__).resolve().parents[2])) + if REPO_ROOT not in sys.path: + sys.path.append(REPO_ROOT) from sv_shared import ( # type: ignore DatasetSource, JsonClassificationParser, @@ -111,12 +111,16 @@ def __init__(self) -> None: super().__init__(allowed_labels=["Benign", "Malicious", "Abstain"]) +DEFAULT_ENV_NAME = "sv-env-network-logs-judge" + + def load_environment( dataset_name: str = "iot23-train-dev-test-v1.jsonl", dataset_source: DatasetSource = "auto", max_examples: int = 1000, logger: RolloutLogger | None = None, judge_model: str = "gpt-4.1-nano", + env_name: str = DEFAULT_ENV_NAME, **extra_kwargs, # Accept and log unknown kwargs for debugging ) -> vf.SingleTurnEnv: """Load the Network Logs environment with LLM-judge rewards (WP3c variant). @@ -131,6 +135,7 @@ def load_environment( max_examples: Maximum number of examples to use from the dataset. logger: Optional rollout logger for instrumenting environment metadata. judge_model: LLM model to use as judge (default: gpt-4.1-nano for cost parity). + env_name: Environment name/env_id to expose and log for this loader. Returns: A Verifiers SingleTurnEnv configured with JudgeRubric reward. @@ -143,7 +148,7 @@ def load_environment( For the matched-budget experiment (WP3c), use the same training config parameters (max_steps, batch_size, rollouts_per_example) as e1.toml. """ - _log = _logging.getLogger("sv_env_network_logs_judge") + _log = _logging.getLogger("sv_netlogs_judge_impl") _debug = os.environ.get("SV_DEBUG", "") if extra_kwargs: @@ -157,11 +162,12 @@ def load_environment( _log.warning( "[SV_DEBUG] load_environment (judge) called with: " "dataset_name=%r, dataset_source=%r, max_examples=%r, " - "judge_model=%r, logger=%s, extra_kwargs=%s, HF_TOKEN=%s", + "judge_model=%r, env_name=%r, logger=%s, extra_kwargs=%s, HF_TOKEN=%s", dataset_name, dataset_source, max_examples, judge_model, + env_name, "present" if logger else "None", extra_kwargs or "none", "set" if os.environ.get("HF_TOKEN") else "NOT SET", @@ -265,7 +271,7 @@ def _create_synthetic_dataset(): if logger and logger.enabled: logger.log_environment_init( - environment_name="sv-env-network-logs-judge", + environment_name=env_name, dataset_name=dataset_name, total_examples=len(dataset) if dataset is not None else None, metadata={ @@ -276,7 +282,8 @@ def _create_synthetic_dataset(): ) return vf.SingleTurnEnv( - name="sv-env-network-logs-judge", + name=env_name, + env_id=env_name, description=( "Classify network logs as 'Malicious', 'Benign', or 'Abstain' " "(LLM-judge reward variant for WP3c comparison)." 
diff --git a/environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py b/environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py index 5bcd4c9..a7795ee 100644 --- a/environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py +++ b/environments/sv-env-netlogs-judge/sv_netlogs_judge_test.py @@ -8,7 +8,8 @@ import pytest import verifiers as vf -from sv_env_network_logs_judge import ( +from sv_netlogs_judge_impl import ( + DEFAULT_ENV_NAME, JUDGE_PROMPT, NetworkLogParser, judge_reward, @@ -126,6 +127,8 @@ def test_load_with_synthetic_dataset(self) -> None: env = load_environment(dataset_name="synthetic", max_examples=5, logger=logger) assert isinstance(env, vf.SingleTurnEnv) + assert env.name == DEFAULT_ENV_NAME + assert env.env_id == DEFAULT_ENV_NAME assert env.rubric is not None judge_rubric = self._get_judge_rubric(env) assert isinstance(judge_rubric, vf.JudgeRubric) @@ -134,7 +137,7 @@ def test_load_with_synthetic_dataset(self) -> None: logger.log_environment_init.assert_called_once() call_kwargs = logger.log_environment_init.call_args.kwargs - assert call_kwargs["environment_name"] == "sv-env-network-logs-judge" + assert call_kwargs["environment_name"] == DEFAULT_ENV_NAME assert "synthetic" in call_kwargs["dataset_name"] assert call_kwargs["metadata"]["reward_type"] == "llm-judge" @@ -154,7 +157,27 @@ def test_custom_judge_model(self) -> None: judge_rubric = self._get_judge_rubric(env) assert judge_rubric.judge_model == "gpt-4.1-mini" + def test_custom_env_name_updates_logging_and_ids(self) -> None: + logger = Mock() + logger.enabled = True + + env = load_environment(dataset_name="synthetic", max_examples=5, logger=logger, env_name="custom-env-id") + assert env.name == "custom-env-id" + assert env.env_id == "custom-env-id" + + logger.log_environment_init.assert_called_once() + call_kwargs = logger.log_environment_init.call_args.kwargs + assert call_kwargs["environment_name"] == "custom-env-id" + def test_short_alias_loads_environment(self) -> None: - env = load_short_environment(dataset_name="synthetic", max_examples=5) + logger = Mock() + logger.enabled = True + + env = load_short_environment(dataset_name="synthetic", max_examples=5, logger=logger) assert isinstance(env, vf.SingleTurnEnv) assert env.name == "sv-netlogs-judge" + assert env.env_id == "sv-netlogs-judge" + + logger.log_environment_init.assert_called_once() + call_kwargs = logger.log_environment_init.call_args.kwargs + assert call_kwargs["environment_name"] == "sv-netlogs-judge" From e2220ae3accbab4eeda2ac6c12d4be88021ae250 Mon Sep 17 00:00:00 2001 From: Bryan Young Date: Sat, 7 Mar 2026 19:11:34 +0000 Subject: [PATCH 4/4] docs: refresh PR #71 research notes --- research/experiments/reward_source_comparison.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/research/experiments/reward_source_comparison.md b/research/experiments/reward_source_comparison.md index 255a485..a5a1356 100644 --- a/research/experiments/reward_source_comparison.md +++ b/research/experiments/reward_source_comparison.md @@ -99,10 +99,10 @@ tasks, favoring cheaper judge-based approaches for future environments. 
## Status -- [x] Environment implementation (sv_env_network_logs_judge.py) +- [x] Environment implementation (environments/sv-env-netlogs-judge/sv_netlogs_judge_impl.py) - [x] Training config (configs/rl/e1_judge.toml) - [x] Makefile target (lab-run-e1-judge) -- [ ] Deploy judge environment to Hub +- [x] Deploy judge environment to Hub - [ ] Run Condition A (executable verifier) - [ ] Run Condition B (LLM-judge) - [ ] Collect results and plot learning curves @@ -112,7 +112,7 @@ tasks, favoring cheaper judge-based approaches for future environments. | File | Purpose | |------|---------| -| `environments/sv-env-network-logs/sv_env_network_logs_judge.py` | Judge variant environment | +| `environments/sv-env-netlogs-judge/sv_netlogs_judge_impl.py` | Judge variant environment | | `environments/sv-env-network-logs/sv_env_network_logs.py` | Executable verifier environment (baseline) | | `configs/rl/e1.toml` | Condition A training config | | `configs/rl/e1_judge.toml` | Condition B training config |