From 8af716f93f1e9cd79edab33103d385e15cee357c Mon Sep 17 00:00:00 2001
From: eitanporat
Date: Fri, 30 Jan 2026 21:21:30 +0200
Subject: [PATCH] Add Action0 environment for credit assignment testing

Simple environment where the agent must pick action 0 on step 1 to win.
Episode terminates at step 128; reward is given only at termination.

- 2 discrete actions
- 50% random baseline
- Tests long-horizon credit assignment with BPTT
---
 pufferlib/config/ocean/action0.ini | 16 +++++++
 pufferlib/ocean/action0/action0.h  | 73 ++++++++++++++++++++++++++++++
 pufferlib/ocean/action0/action0.py | 73 ++++++++++++++++++++++++++++++
 pufferlib/ocean/action0/binding.c  | 14 ++++++
 pufferlib/ocean/environment.py     |  1 +
 test_action0.py                    | 53 ++++++++++++++++++++++
 6 files changed, 230 insertions(+)
 create mode 100644 pufferlib/config/ocean/action0.ini
 create mode 100644 pufferlib/ocean/action0/action0.h
 create mode 100644 pufferlib/ocean/action0/action0.py
 create mode 100644 pufferlib/ocean/action0/binding.c
 create mode 100644 test_action0.py

diff --git a/pufferlib/config/ocean/action0.ini b/pufferlib/config/ocean/action0.ini
new file mode 100644
index 000000000..638560192
--- /dev/null
+++ b/pufferlib/config/ocean/action0.ini
@@ -0,0 +1,16 @@
+[base]
+package = ocean
+env_name = puffer_action0
+policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 1024
+
+[vec]
+num_envs = 8
+
+[train]
+device = mps
+total_timesteps = 100_000_000
+minibatch_size = 32768
diff --git a/pufferlib/ocean/action0/action0.h b/pufferlib/ocean/action0/action0.h
new file mode 100644
index 000000000..9686729fa
--- /dev/null
+++ b/pufferlib/ocean/action0/action0.h
@@ -0,0 +1,73 @@
+/*
+ * Action0 Environment
+ *
+ * A simple credit assignment test. The agent must take action 0
+ * on the first step to win. The episode terminates at step 128 and
+ * the reward is given only at termination.
+ *
+ * Observation: Box(0, 1, (1,)) - always 1
+ * Action: Discrete(2)
+ * Win condition: action 0 on step 1
+ * Horizon: 128 steps
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+// Log struct - only floats, ends with n
+typedef struct {
+    float score;
+    float n;
+} Log;
+
+typedef struct {
+    Log log;                  // Required field
+    float* observations;      // Required field
+    int* actions;             // Required field
+    float* rewards;           // Required field
+    unsigned char* terminals; // Required field
+    int horizon;
+    int tick;
+    int won;
+} Action0;
+
+void c_reset(Action0* env) {
+    env->observations[0] = 1.0f;
+    env->tick = 0;
+    env->won = 0;
+    env->rewards[0] = 0.0f;
+    env->terminals[0] = 0;
+}
+
+void c_step(Action0* env) {
+    env->tick++;
+
+    // Check if first step and correct action
+    if (env->tick == 1 && env->actions[0] == 0) {
+        env->won = 1;
+    }
+
+    // Clear reward/terminal every step; they are only set at episode end
+    env->rewards[0] = 0.0f;
+    env->terminals[0] = 0;
+
+    // Check if episode is done
+    if (env->tick >= env->horizon) {
+        env->rewards[0] = env->won ? 1.0f : 0.0f;
+        env->terminals[0] = 1;
+        env->log.score += env->won ? 1.0f : 0.0f;
+        env->log.n += 1.0f;
+        // Reset state for next episode but DON'T overwrite reward/terminal
+        env->tick = 0;
+        env->won = 0;
+        env->observations[0] = 1.0f;
+    }
+}
+
+void c_render(Action0* env) {
+    // No rendering for this simple env
+}
+
+void c_close(Action0* env) {
+    // Nothing to clean up
+}
diff --git a/pufferlib/ocean/action0/action0.py b/pufferlib/ocean/action0/action0.py
new file mode 100644
index 000000000..24ab608ce
--- /dev/null
+++ b/pufferlib/ocean/action0/action0.py
@@ -0,0 +1,73 @@
+'''Action0 - A simple credit assignment test environment.
+
+The agent must take action 0 on the first step to win.
+The episode terminates at step 128 and reward is given only at termination.
+'''
+
+import gymnasium
+import numpy as np
+
+import pufferlib
+from pufferlib.ocean.action0 import binding
+
+
+class Action0(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128,
+            horizon=128, buf=None, seed=0):
+        self.single_observation_space = gymnasium.spaces.Box(
+            low=0, high=1, shape=(1,), dtype=np.float32)
+        self.single_action_space = gymnasium.spaces.Discrete(2)
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+
+        super().__init__(buf)
+        self.c_envs = binding.vec_init(
+            self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed,
+            horizon=horizon)
+
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.log_interval == 0:
+            log = binding.vec_log(self.c_envs)
+            if log:
+                info.append(log)
+
+        return (self.observations, self.rewards,
+            self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
+
+
+if __name__ == '__main__':
+    N = 4096
+
+    env = Action0(num_envs=N)
+    env.reset()
+    steps = 0
+
+    CACHE = 1024
+    actions = np.random.randint(0, 2, (CACHE, N))
+
+    import time
+    start = time.time()
+    while time.time() - start < 10:
+        env.step(actions[steps % CACHE])
+        steps += 1
+
+    print('Action0 SPS:', int(env.num_agents * steps / (time.time() - start)))
diff --git a/pufferlib/ocean/action0/binding.c b/pufferlib/ocean/action0/binding.c
new file mode 100644
index 000000000..fac267f57
--- /dev/null
+++ b/pufferlib/ocean/action0/binding.c
@@ -0,0 +1,14 @@
+#include "action0.h"
+
+#define Env Action0
+#include "../env_binding.h"
+
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    env->horizon = (int)unpack(kwargs, "horizon");
+    return 0;
+}
+
+static int my_log(PyObject* dict, Log* log) {
+    assign_to_dict(dict, "score", log->score);
+    return 0;
+}
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
index 6c56a4ea2..f3c922ee5 100644
--- a/pufferlib/ocean/environment.py
+++ b/pufferlib/ocean/environment.py
@@ -117,6 +117,7 @@ def make_multiagent(buf=None, **kwargs):
     return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf)
 
 MAKE_FUNCTIONS = {
+    'action0': 'Action0',
     'battle': 'Battle',
     'breakout': 'Breakout',
     'blastar': 'Blastar',
diff --git a/test_action0.py b/test_action0.py
new file mode 100644
index 000000000..4bd50bd40
--- /dev/null
+++ b/test_action0.py
@@ -0,0 +1,53 @@
+"""Test script to prove Action0 env can be beaten."""
+import numpy as np
+from pufferlib.ocean.action0.action0 import Action0
+
+# Create environment
+env = Action0(num_envs=1, horizon=128)
+
+# Test 1: Agent that always picks action 0 on step 1
+print("=== Test 1: Always pick action 0 on step 1 ===")
+wins = 0
+episodes = 1000
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        if step == 0:
+            action = np.array([0])  # Pick action 0 on first step
+        else:
+            action = np.array([1])  # Any action after
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
+
+# Test 2: Agent that picks randomly
+print("\n=== Test 2: Random agent ===")
+wins = 0
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        action = np.random.randint(0, 2, size=(1,))
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
+print(f"Expected random: 1/2 = 50.00%")
+
+# Test 3: Agent that always picks wrong action
+print("\n=== Test 3: Always pick action 1 on step 1 ===")
+wins = 0
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        action = np.array([1])  # Always pick 1 (wrong)
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
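
Usage note (not part of the patch): the sketch below is a minimal vectorized sanity check of the reward timing, written only against the Action0 class added above. It assumes the C binding has been built as usual for ocean environments and uses the default horizon of 128; the batch size of 256 and variable names are illustrative.

import numpy as np
from pufferlib.ocean.action0.action0 import Action0

N = 256
env = Action0(num_envs=N, horizon=128)
obs, _ = env.reset()

for step in range(128):
    if step == 0:
        actions = np.zeros(N, dtype=np.int32)      # correct action on the first step
    else:
        actions = np.random.randint(0, 2, size=N)  # later actions should not matter
    obs, rewards, terminals, truncations, _ = env.step(actions)
    if step < 127:
        # No reward or terminal signal before the horizon is reached
        assert not terminals.any() and rewards.sum() == 0.0

print('mean terminal reward:', rewards.mean())  # expected: 1.0
env.close()

Running the same loop with a uniformly random first action instead of zeros should bring the mean terminal reward down to roughly 0.5, matching the 50% random baseline quoted in the commit message.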