16 changes: 16 additions & 0 deletions pufferlib/config/ocean/action0.ini
@@ -0,0 +1,16 @@
[base]
package = ocean
env_name = puffer_action0
policy_name = Policy
rnn_name = Recurrent

[env]
num_envs = 1024

[vec]
num_envs = 8

[train]
device = mps
total_timesteps = 100_000_000
minibatch_size = 32768
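
For orientation, here is a minimal sketch (not part of the diff) of what the [env] num_envs setting above corresponds to when the Action0 class added later in this PR is constructed directly; the training entry point that actually consumes this config file is not shown in this diff, so treat the snippet as illustrative only.

# Illustrative only: direct construction with the [env] num_envs value above,
# mirroring the __main__ benchmark in action0.py below.
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

env = Action0(num_envs=1024)
obs, _ = env.reset()
actions = np.zeros(env.num_agents, dtype=np.int32)  # action 0 for every env
obs, rewards, terminals, truncations, info = env.step(actions)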
73 changes: 73 additions & 0 deletions pufferlib/ocean/action0/action0.h
@@ -0,0 +1,73 @@
/*
* Action0 Environment
*
* A simple credit assignment test. The agent must take action 0
* on the first step to win. The episode terminates at step 128 and
* the reward is given only at termination.
*
* Observation: Box(0, 1, (1,)) - always 1
* Action: Discrete(2)
* Win condition: action 0 on step 1
* Horizon: 128 steps
*/

#include <stdlib.h>
#include <string.h>

// Log struct - only floats, ends with n
typedef struct {
    float score;
    float n;
} Log;

typedef struct {
    Log log;                  // Required field
    float* observations;      // Required field
    int* actions;             // Required field
    float* rewards;           // Required field
    unsigned char* terminals; // Required field
    int horizon;
    int tick;
    int won;
} Action0;

void c_reset(Action0* env) {
    env->observations[0] = 1.0f;
    env->tick = 0;
    env->won = 0;
    env->rewards[0] = 0.0f;
    env->terminals[0] = 0;
}

void c_step(Action0* env) {
    env->tick++;

    // Check if first step and correct action
    if (env->tick == 1 && env->actions[0] == 0) {
        env->won = 1;
    }

    // Always reset reward/terminal first
    env->rewards[0] = 0.0f;
    env->terminals[0] = 0;

    // Check if episode is done
    if (env->tick >= env->horizon) {
        env->rewards[0] = env->won ? 1.0f : 0.0f;
        env->terminals[0] = 1;
        env->log.score += env->won ? 1.0f : 0.0f;
        env->log.n += 1.0f;
        // Reset state for next episode but DON'T overwrite reward/terminal
        env->tick = 0;
        env->won = 0;
        env->observations[0] = 1.0f;
    }
}

void c_render(Action0* env) {
    // No rendering for this simple env
}

void c_close(Action0* env) {
    // Nothing to clean up
}
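
To make the reward timing described in the header comment concrete, here is a short sketch (an assumption, not part of the diff) run through the Python wrapper added later in this PR: the win is decided on the first step, the reward only arrives when the episode terminates at the horizon, and c_step rolls straight into the next episode without an explicit reset.

# Sketch: delayed reward and auto-reset behavior, via the Action0 wrapper from this PR.
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

env = Action0(num_envs=1, horizon=128)
obs, _ = env.reset()

# Episode 1: action 0 on the first step, then arbitrary actions afterwards.
for t in range(128):
    action = np.array([0 if t == 0 else 1])
    obs, reward, terminal, truncated, info = env.step(action)
print(terminal[0], reward[0])  # expected: 1, 1.0 (win, paid only at termination)

# Episode 2 starts automatically; action 1 on its first step should lose.
for t in range(128):
    obs, reward, terminal, truncated, info = env.step(np.array([1]))
print(terminal[0], reward[0])  # expected: 1, 0.0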
73 changes: 73 additions & 0 deletions pufferlib/ocean/action0/action0.py
@@ -0,0 +1,73 @@
'''Action0 - A simple credit assignment test environment.

The agent must take action 0 on the first step to win.
The episode terminates at step 128 and reward is given only at termination.
'''

import gymnasium
import numpy as np

import pufferlib
from pufferlib.ocean.action0 import binding


class Action0(pufferlib.PufferEnv):
    def __init__(self, num_envs=1, render_mode=None, log_interval=128,
            horizon=128, buf=None, seed=0):
        self.single_observation_space = gymnasium.spaces.Box(
            low=0, high=1, shape=(1,), dtype=np.float32)
        self.single_action_space = gymnasium.spaces.Discrete(2)
        self.render_mode = render_mode
        self.num_agents = num_envs
        self.log_interval = log_interval

        super().__init__(buf)
        self.c_envs = binding.vec_init(
            self.observations, self.actions, self.rewards,
            self.terminals, self.truncations, num_envs, seed,
            horizon=horizon)

    def reset(self, seed=0):
        binding.vec_reset(self.c_envs, seed)
        self.tick = 0
        return self.observations, []

    def step(self, actions):
        self.tick += 1

        self.actions[:] = actions
        binding.vec_step(self.c_envs)

        info = []
        if self.tick % self.log_interval == 0:
            log = binding.vec_log(self.c_envs)
            if log:
                info.append(log)

        return (self.observations, self.rewards,
            self.terminals, self.truncations, info)

    def render(self):
        binding.vec_render(self.c_envs, 0)

    def close(self):
        binding.vec_close(self.c_envs)


if __name__ == '__main__':
    N = 4096

    env = Action0(num_envs=N)
    env.reset()
    steps = 0

    CACHE = 1024
    actions = np.random.randint(0, 2, (CACHE, N))

    import time
    start = time.time()
    while time.time() - start < 10:
        env.step(actions[steps % CACHE])
        steps += 1

    print('Action0 SPS:', int(env.num_agents * steps / (time.time() - start)))
14 changes: 14 additions & 0 deletions pufferlib/ocean/action0/binding.c
@@ -0,0 +1,14 @@
#include "action0.h"

#define Env Action0
#include "../env_binding.h"

static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
    env->horizon = (int)unpack(kwargs, "horizon");
    return 0;
}

static int my_log(PyObject* dict, Log* log) {
    assign_to_dict(dict, "score", log->score);
    return 0;
}
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
@@ -117,6 +117,7 @@ def make_multiagent(buf=None, **kwargs):
    return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf)

MAKE_FUNCTIONS = {
    'action0': 'Action0',
    'battle': 'Battle',
    'breakout': 'Breakout',
    'blastar': 'Blastar',
53 changes: 53 additions & 0 deletions test_action0.py
@@ -0,0 +1,53 @@
"""Test script to prove Action0 env can be beaten."""
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

# Create environment
env = Action0(num_envs=1, horizon=128)

# Test 1: Agent that always picks action 0 on step 1
print("=== Test 1: Always pick action 0 on step 1 ===")
wins = 0
episodes = 1000
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        if step == 0:
            action = np.array([0])  # Pick action 0 on first step
        else:
            action = np.array([1])  # Any action after
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")

# Test 2: Agent that picks randomly
print("\n=== Test 2: Random agent ===")
wins = 0
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        action = np.random.randint(0, 2, size=(1,))
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
print("Expected random: 1/2 = 50.00%")

# Test 3: Agent that always picks wrong action
print("\n=== Test 3: Always pick action 1 on step 1 ===")
wins = 0
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        action = np.array([1])  # Always pick 1 (wrong)
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")