From 8af716f93f1e9cd79edab33103d385e15cee357c Mon Sep 17 00:00:00 2001
From: eitanporat
Date: Fri, 30 Jan 2026 21:21:30 +0200
Subject: [PATCH] Add Action0 environment for credit assignment testing

Simple environment where the agent must pick action 0 on step 1 to win.
Episode terminates at step 128; reward is given only at termination.

- 2 discrete actions
- 50% random baseline
- Tests long-horizon credit assignment with BPTT
---
 pufferlib/config/ocean/action0.ini | 16 +++++++
 pufferlib/ocean/action0/action0.h  | 73 ++++++++++++++++++++++++++++++
 pufferlib/ocean/action0/action0.py | 73 ++++++++++++++++++++++++++++++
 pufferlib/ocean/action0/binding.c  | 14 ++++++
 pufferlib/ocean/environment.py     |  1 +
 test_action0.py                    | 53 ++++++++++++++++++++++
 6 files changed, 230 insertions(+)
 create mode 100644 pufferlib/config/ocean/action0.ini
 create mode 100644 pufferlib/ocean/action0/action0.h
 create mode 100644 pufferlib/ocean/action0/action0.py
 create mode 100644 pufferlib/ocean/action0/binding.c
 create mode 100644 test_action0.py

diff --git a/pufferlib/config/ocean/action0.ini b/pufferlib/config/ocean/action0.ini
new file mode 100644
index 000000000..638560192
--- /dev/null
+++ b/pufferlib/config/ocean/action0.ini
@@ -0,0 +1,16 @@
+[base]
+package = ocean
+env_name = puffer_action0
+policy_name = Policy
+rnn_name = Recurrent
+
+[env]
+num_envs = 1024
+
+[vec]
+num_envs = 8
+
+[train]
+device = mps
+total_timesteps = 100_000_000
+minibatch_size = 32768
diff --git a/pufferlib/ocean/action0/action0.h b/pufferlib/ocean/action0/action0.h
new file mode 100644
index 000000000..9686729fa
--- /dev/null
+++ b/pufferlib/ocean/action0/action0.h
@@ -0,0 +1,73 @@
+/*
+ * Action0 Environment
+ *
+ * A simple credit assignment test. The agent must take action 0
+ * on the first step to win. The episode terminates at step 128 and
+ * the reward is given only at termination.
+ *
+ * Observation: Box(0, 1, (1,)) - always 1
+ * Action: Discrete(2)
+ * Win condition: action 0 on step 1
+ * Horizon: 128 steps
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+// Log struct - only floats, ends with n
+typedef struct {
+    float score;
+    float n;
+} Log;
+
+typedef struct {
+    Log log;                  // Required field
+    float* observations;      // Required field
+    int* actions;             // Required field
+    float* rewards;           // Required field
+    unsigned char* terminals; // Required field
+    int horizon;
+    int tick;
+    int won;
+} Action0;
+
+void c_reset(Action0* env) {
+    env->observations[0] = 1.0f;
+    env->tick = 0;
+    env->won = 0;
+    env->rewards[0] = 0.0f;
+    env->terminals[0] = 0;
+}
+
+void c_step(Action0* env) {
+    env->tick++;
+
+    // Check if first step and correct action
+    if (env->tick == 1 && env->actions[0] == 0) {
+        env->won = 1;
+    }
+
+    // Clear reward/terminal every step; they are only set at episode end
+    env->rewards[0] = 0.0f;
+    env->terminals[0] = 0;
+
+    // Check if episode is done
+    if (env->tick >= env->horizon) {
+        env->rewards[0] = env->won ? 1.0f : 0.0f;
+        env->terminals[0] = 1;
+        env->log.score += env->won ? 1.0f : 0.0f;
+        env->log.n += 1.0f;
+        // Reset state for next episode but DON'T overwrite reward/terminal
+        env->tick = 0;
+        env->won = 0;
+        env->observations[0] = 1.0f;
+    }
+}
+
+void c_render(Action0* env) {
+    // No rendering for this simple env
+}
+
+void c_close(Action0* env) {
+    // Nothing to clean up
+}
diff --git a/pufferlib/ocean/action0/action0.py b/pufferlib/ocean/action0/action0.py
new file mode 100644
index 000000000..24ab608ce
--- /dev/null
+++ b/pufferlib/ocean/action0/action0.py
@@ -0,0 +1,73 @@
+'''Action0 - A simple credit assignment test environment.
+
+The agent must take action 0 on the first step to win.
+The episode terminates at step 128 and reward is given only at termination.
+'''
+
+import gymnasium
+import numpy as np
+
+import pufferlib
+from pufferlib.ocean.action0 import binding
+
+
+class Action0(pufferlib.PufferEnv):
+    def __init__(self, num_envs=1, render_mode=None, log_interval=128,
+            horizon=128, buf=None, seed=0):
+        self.single_observation_space = gymnasium.spaces.Box(
+            low=0, high=1, shape=(1,), dtype=np.float32)
+        self.single_action_space = gymnasium.spaces.Discrete(2)
+        self.render_mode = render_mode
+        self.num_agents = num_envs
+        self.log_interval = log_interval
+
+        super().__init__(buf)
+        self.c_envs = binding.vec_init(
+            self.observations, self.actions, self.rewards,
+            self.terminals, self.truncations, num_envs, seed,
+            horizon=horizon)
+
+    def reset(self, seed=0):
+        binding.vec_reset(self.c_envs, seed)
+        self.tick = 0
+        return self.observations, []
+
+    def step(self, actions):
+        self.tick += 1
+
+        self.actions[:] = actions
+        binding.vec_step(self.c_envs)
+
+        info = []
+        if self.tick % self.log_interval == 0:
+            log = binding.vec_log(self.c_envs)
+            if log:
+                info.append(log)
+
+        return (self.observations, self.rewards,
+            self.terminals, self.truncations, info)
+
+    def render(self):
+        binding.vec_render(self.c_envs, 0)
+
+    def close(self):
+        binding.vec_close(self.c_envs)
+
+
+if __name__ == '__main__':
+    N = 4096
+
+    env = Action0(num_envs=N)
+    env.reset()
+    steps = 0
+
+    CACHE = 1024
+    actions = np.random.randint(0, 2, (CACHE, N))
+
+    import time
+    start = time.time()
+    while time.time() - start < 10:
+        env.step(actions[steps % CACHE])
+        steps += 1
+
+    print('Action0 SPS:', int(env.num_agents * steps / (time.time() - start)))
diff --git a/pufferlib/ocean/action0/binding.c b/pufferlib/ocean/action0/binding.c
new file mode 100644
index 000000000..fac267f57
--- /dev/null
+++ b/pufferlib/ocean/action0/binding.c
@@ -0,0 +1,14 @@
+#include "action0.h"
+
+#define Env Action0
+#include "../env_binding.h"
+
+static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+    env->horizon = (int)unpack(kwargs, "horizon");
+    return 0;
+}
+
+static int my_log(PyObject* dict, Log* log) {
+    assign_to_dict(dict, "score", log->score);
+    return 0;
+}
diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py
index 6c56a4ea2..f3c922ee5 100644
--- a/pufferlib/ocean/environment.py
+++ b/pufferlib/ocean/environment.py
@@ -117,6 +117,7 @@ def make_multiagent(buf=None, **kwargs):
     return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf)
 
 MAKE_FUNCTIONS = {
+    'action0': 'Action0',
     'battle': 'Battle',
     'breakout': 'Breakout',
     'blastar': 'Blastar',
diff --git a/test_action0.py b/test_action0.py
new file mode 100644
index 000000000..4bd50bd40
--- /dev/null
+++ b/test_action0.py
@@ -0,0 +1,53 @@
+"""Test script to prove Action0 env can be beaten."""
+import numpy as np
+from pufferlib.ocean.action0.action0 import Action0
+
+# Create environment
+env = Action0(num_envs=1, horizon=128)
+
+# Test 1: Agent that always picks action 0 on step 1
+print("=== Test 1: Always pick action 0 on step 1 ===")
+wins = 0
+episodes = 1000
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        if step == 0:
+            action = np.array([0])  # Pick action 0 on first step
+        else:
+            action = np.array([1])  # Any action after
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
+
+# Test 2: Agent that picks randomly
+print("\n=== Test 2: Random agent ===")
+wins = 0
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        action = np.random.randint(0, 2, size=(1,))
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
+print(f"Expected random: 1/2 = 50.00%")
+
+# Test 3: Agent that always picks wrong action
+print("\n=== Test 3: Always pick action 1 on step 1 ===")
+wins = 0
+for ep in range(episodes):
+    obs, _ = env.reset()
+    for step in range(128):
+        action = np.array([1])  # Always pick 1 (wrong)
+        obs, reward, terminal, truncated, info = env.step(action)
+        if terminal[0]:
+            if reward[0] > 0:
+                wins += 1
+            break
+print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
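
Usage note (not part of the patch): the sketch below is a minimal vectorized sanity check of the reward timing, written only against the Action0 class added above. It assumes the C binding has been built as usual for ocean environments and uses the default horizon of 128; the batch size of 256 and variable names are illustrative.

import numpy as np
from pufferlib.ocean.action0.action0 import Action0

N = 256
env = Action0(num_envs=N, horizon=128)
obs, _ = env.reset()

for step in range(128):
    if step == 0:
        actions = np.zeros(N, dtype=np.int32)      # correct action on the first step
    else:
        actions = np.random.randint(0, 2, size=N)  # later actions should not matter
    obs, rewards, terminals, truncations, _ = env.step(actions)
    if step < 127:
        # No reward or terminal signal before the horizon is reached
        assert not terminals.any() and rewards.sum() == 0.0

print('mean terminal reward:', rewards.mean())  # expected: 1.0
env.close()

Running the same loop with a uniformly random first action instead of zeros should bring the mean terminal reward down to roughly 0.5, matching the 50% random baseline quoted in the commit message.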