16 changes: 16 additions & 0 deletions pufferlib/config/ocean/action0.ini
@@ -0,0 +1,16 @@
[base]
package = ocean
env_name = puffer_action0
policy_name = Policy
rnn_name = Recurrent

[env]
num_envs = 1024

[vec]
num_envs = 8

[train]
device = mps
total_timesteps = 100_000_000
minibatch_size = 32768
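
For orientation, here is a minimal sketch (not part of the diff) of what the [env] num_envs setting above corresponds to when the Action0 class added later in this PR is constructed directly; the training entry point that actually consumes this config file is not shown in this diff, so treat the snippet as illustrative only.

# Illustrative only: direct construction with the [env] num_envs value above,
# mirroring the __main__ benchmark in action0.py below.
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

env = Action0(num_envs=1024)
obs, _ = env.reset()
actions = np.zeros(env.num_agents, dtype=np.int32)  # action 0 for every env
obs, rewards, terminals, truncations, info = env.step(actions)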
73 changes: 73 additions & 0 deletions pufferlib/ocean/action0/action0.h
@@ -0,0 +1,73 @@
/*
* Action0 Environment
*
* A simple credit assignment test. The agent must take action 0
* on the first step to win. The episode terminates at step 128 and
* the reward is given only at termination.
*
* Observation: Box(0, 1, (1,)) - always 1
* Action: Discrete(2)
* Win condition: action 0 on step 1
* Horizon: 128 steps
*/

#include <stdlib.h>
#include <string.h>

// Log struct - only floats, ends with n
typedef struct {
    float score;
    float n;
} Log;

typedef struct {
    Log log;                  // Required field
    float* observations;      // Required field
    int* actions;             // Required field
    float* rewards;           // Required field
    unsigned char* terminals; // Required field
    int horizon;
    int tick;
    int won;
} Action0;

void c_reset(Action0* env) {
    env->observations[0] = 1.0f;
    env->tick = 0;
    env->won = 0;
    env->rewards[0] = 0.0f;
    env->terminals[0] = 0;
}

void c_step(Action0* env) {
    env->tick++;

    // Check if first step and correct action
    if (env->tick == 1 && env->actions[0] == 0) {
        env->won = 1;
    }

    // Always reset reward/terminal first
    env->rewards[0] = 0.0f;
    env->terminals[0] = 0;

    // Check if episode is done
    if (env->tick >= env->horizon) {
        env->rewards[0] = env->won ? 1.0f : 0.0f;
        env->terminals[0] = 1;
        env->log.score += env->won ? 1.0f : 0.0f;
        env->log.n += 1.0f;
        // Reset state for next episode but DON'T overwrite reward/terminal
        env->tick = 0;
        env->won = 0;
        env->observations[0] = 1.0f;
    }
}

void c_render(Action0* env) {
    // No rendering for this simple env
}

void c_close(Action0* env) {
    // Nothing to clean up
}
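
To make the reward timing described in the header comment concrete, here is a short sketch (an assumption, not part of the diff) run through the Python wrapper added later in this PR: the win is decided on the first step, the reward only arrives when the episode terminates at the horizon, and c_step rolls straight into the next episode without an explicit reset.

# Sketch: delayed reward and auto-reset behavior, via the Action0 wrapper from this PR.
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

env = Action0(num_envs=1, horizon=128)
obs, _ = env.reset()

# Episode 1: action 0 on the first step, then arbitrary actions afterwards.
for t in range(128):
    action = np.array([0 if t == 0 else 1])
    obs, reward, terminal, truncated, info = env.step(action)
print(terminal[0], reward[0])  # expected: 1, 1.0 (win, paid only at termination)

# Episode 2 starts automatically; action 1 on its first step should lose.
for t in range(128):
    obs, reward, terminal, truncated, info = env.step(np.array([1]))
print(terminal[0], reward[0])  # expected: 1, 0.0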
73 changes: 73 additions & 0 deletions pufferlib/ocean/action0/action0.py
@@ -0,0 +1,73 @@
'''Action0 - A simple credit assignment test environment.

The agent must take action 0 on the first step to win.
The episode terminates at step 128 and reward is given only at termination.
'''

import gymnasium
import numpy as np

import pufferlib
from pufferlib.ocean.action0 import binding


class Action0(pufferlib.PufferEnv):
    def __init__(self, num_envs=1, render_mode=None, log_interval=128,
            horizon=128, buf=None, seed=0):
        self.single_observation_space = gymnasium.spaces.Box(
            low=0, high=1, shape=(1,), dtype=np.float32)
        self.single_action_space = gymnasium.spaces.Discrete(2)
        self.render_mode = render_mode
        self.num_agents = num_envs
        self.log_interval = log_interval

        super().__init__(buf)
        self.c_envs = binding.vec_init(
            self.observations, self.actions, self.rewards,
            self.terminals, self.truncations, num_envs, seed,
            horizon=horizon)

    def reset(self, seed=0):
        binding.vec_reset(self.c_envs, seed)
        self.tick = 0
        return self.observations, []

    def step(self, actions):
        self.tick += 1

        self.actions[:] = actions
        binding.vec_step(self.c_envs)

        info = []
        if self.tick % self.log_interval == 0:
            log = binding.vec_log(self.c_envs)
            if log:
                info.append(log)

        return (self.observations, self.rewards,
            self.terminals, self.truncations, info)

    def render(self):
        binding.vec_render(self.c_envs, 0)

    def close(self):
        binding.vec_close(self.c_envs)


if __name__ == '__main__':
    N = 4096

    env = Action0(num_envs=N)
    env.reset()
    steps = 0

    CACHE = 1024
    actions = np.random.randint(0, 2, (CACHE, N))

    import time
    start = time.time()
    while time.time() - start < 10:
        env.step(actions[steps % CACHE])
        steps += 1

    print('Action0 SPS:', int(env.num_agents * steps / (time.time() - start)))
14 changes: 14 additions & 0 deletions pufferlib/ocean/action0/binding.c
@@ -0,0 +1,14 @@
#include "action0.h"

#define Env Action0
#include "../env_binding.h"

static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
    env->horizon = (int)unpack(kwargs, "horizon");
    return 0;
}

static int my_log(PyObject* dict, Log* log) {
    assign_to_dict(dict, "score", log->score);
    return 0;
}
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
@@ -117,6 +117,7 @@ def make_multiagent(buf=None, **kwargs):
    return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf)

MAKE_FUNCTIONS = {
    'action0': 'Action0',
    'battle': 'Battle',
    'breakout': 'Breakout',
    'blastar': 'Blastar',
53 changes: 53 additions & 0 deletions test_action0.py
@@ -0,0 +1,53 @@
"""Test script to prove Action0 env can be beaten."""
import numpy as np
from pufferlib.ocean.action0.action0 import Action0

# Create environment
env = Action0(num_envs=1, horizon=128)

# Test 1: Agent that always picks action 0 on step 1
print("=== Test 1: Always pick action 0 on step 1 ===")
wins = 0
episodes = 1000
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        if step == 0:
            action = np.array([0])  # Pick action 0 on first step
        else:
            action = np.array([1])  # Any action after
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")

# Test 2: Agent that picks randomly
print("\n=== Test 2: Random agent ===")
wins = 0
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        action = np.random.randint(0, 2, size=(1,))
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")
print("Expected random: 1/2 = 50.00%")

# Test 3: Agent that always picks wrong action
print("\n=== Test 3: Always pick action 1 on step 1 ===")
wins = 0
for ep in range(episodes):
    obs, _ = env.reset()
    for step in range(128):
        action = np.array([1])  # Always pick 1 (wrong)
        obs, reward, terminal, truncated, info = env.step(action)
        if terminal[0]:
            if reward[0] > 0:
                wins += 1
            break
print(f"Win rate: {wins}/{episodes} = {wins/episodes:.2%}")