From 11ebb26a5e51c25a69828d0ada6425399d3e2f16 Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 13 Jan 2026 02:47:44 +0500 Subject: [PATCH 01/29] init boss fight env --- TODO.md | 18 ++ pufferlib/config/boss_fight.ini | 15 ++ pufferlib/ocean/boss_fight/README.md | 218 ++++++++++++++++++ pufferlib/ocean/boss_fight/binding.c | 14 ++ pufferlib/ocean/boss_fight/boss_fight.c | 32 +++ pufferlib/ocean/boss_fight/boss_fight.h | 80 +++++++ pufferlib/ocean/boss_fight/boss_fight.py | 67 ++++++ pufferlib/ocean/environment.py | 279 ++++++++++++++++------- 8 files changed, 640 insertions(+), 83 deletions(-) create mode 100644 TODO.md create mode 100644 pufferlib/config/boss_fight.ini create mode 100644 pufferlib/ocean/boss_fight/README.md create mode 100644 pufferlib/ocean/boss_fight/binding.c create mode 100644 pufferlib/ocean/boss_fight/boss_fight.c create mode 100644 pufferlib/ocean/boss_fight/boss_fight.h create mode 100644 pufferlib/ocean/boss_fight/boss_fight.py diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..938ca7614 --- /dev/null +++ b/TODO.md @@ -0,0 +1,18 @@ +## Notes for for my Boss Fight environment + +### Setup + +1. Fork pufferlib, create new branch + +2. Run these: + ``` + uv venv + uv pip install -e . + ``` + +3. Setup files using templates, update `environment.py` + +4. Not sure what this does yet: + ``` + python setup.py build_boss_fight --inplace + ``` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini new file mode 100644 index 000000000..d7f426abf --- /dev/null +++ b/pufferlib/config/boss_fight.ini @@ -0,0 +1,15 @@ +[base] +package = ocean +env_name = puffer_boss_fight +policy_name = Policy + +[env] +num_envs = 14 + +[train] +total_timesteps = 1_000_000 +minibatch_size=1024 + +[sweep] +goal = maximize +metric = episode_return diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md new file mode 100644 index 000000000..e81c5f874 --- /dev/null +++ b/pufferlib/ocean/boss_fight/README.md @@ -0,0 +1,218 @@ +# SoulsRL Minimal — RL-Focused Boss Fight Environment + +## Goal + +Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. +Focus: **observation design, reward shaping, and training experiments** — not game engine complexity. + +The boss has **1 attack** (AOE burst). All hitboxes are circles. No rendering required. 
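A minimal sketch of the Gym interface implied by the spec below, assuming the 14-float observation and 7 discrete actions described in this document (names and bounds here are illustrative, not the final implementation):

```python
import gymnasium
import numpy as np

# Assumed from the spec: 14 observation features, 7 discrete actions.
# Most features are normalized to [0, 1] or [-1, 1], hence the symmetric bounds.
single_observation_space = gymnasium.spaces.Box(
    low=-1.0, high=1.0, shape=(14,), dtype=np.float32
)
single_action_space = gymnasium.spaces.Discrete(7)
```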
+ +--- + +## Core Mechanics (Simplified) + +### Constants + +``` +Tick rate: 30 ticks/sec (dt = 1/30) +Arena: 10 x 10 units (centered at origin, so bounds are -5 to +5) + +Player: + - radius: 0.3 + - HP: 100 + - speed: 3.0 units/sec (~0.1 units/tick) + +Boss: + - radius: 0.5 + - HP: 100 + - position: fixed at (0, 0) — does not move +``` + +### Player Actions (Discrete, 7 total) + +``` +0: NOOP +1: UP +2: DOWN +3: LEFT +4: RIGHT +5: DODGE +6: ATTACK +``` + +### Player States + +``` +FREE — can move, can act +DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_dir +ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement +``` + +**Cooldowns:** + +- Dodge: 15 ticks after dodge ends +- Attack: No cooldown (but you're locked for 13 ticks) + +**Attack hitbox (during ACTIVE):** + +- Circle at `player_pos + facing * 0.7`, radius `0.4` +- `facing` = direction to boss at attack start +- Damage: 10 + +### Boss Behavior (Single Attack) + +Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` + +``` +IDLE: 12 ticks (0.4s) — does nothing +WINDUP: 18 ticks (0.6s) — telegraphing, no damage +ACTIVE: 3 ticks (0.1s) — AOE hits +RECOVERY: 15 ticks (0.5s) — vulnerable, no damage +``` + +**AOE Attack:** + +- Circle centered on boss, radius `1.5` +- Damage: 20 +- Player takes damage if: in AOE radius AND not in i-frames + +--- + +## Observation Space (14 floats) + +Keep it minimal. You can ablate later. + +``` +Geometry (3): + 0: rel_boss_x = boss_x - player_x (normalized by arena half-size) + 1: rel_boss_y = boss_y - player_y + 2: distance = clamp(dist / 5.0, 0, 1) + +Player (5): + 3: player_hp = hp / 100 + 4: dodge_ready = 1.0 if can dodge, else 0.0 + 5: player_state = {FREE: 0, DODGE: 0.33, ATTACK: 0.66} # scalar encoding + 6: state_progress = ticks_in_state / state_duration + 7: move_dir_x = -1 to 1 + +Boss (6): + 8: boss_hp = hp / 100 + 9: boss_phase = {IDLE: 0, WINDUP: 0.33, ACTIVE: 0.66, RECOVERY: 1.0} + 10: phase_progress = ticks_in_phase / phase_duration + 11: time_to_damage = ticks until ACTIVE starts / 18 (1.0 during IDLE/RECOVERY) + 12: in_aoe_range = 1.0 if distance < 1.5, else 0.0 + 13: boss_attacking = 1.0 if in WINDUP/ACTIVE, else 0.0 +``` + +--- + +## Reward Function (v1 — HP delta) + +```python +# Per step +reward = 0 +reward += (boss_hp_prev - boss_hp_now) * 0.1 # +1.0 per hit landed +reward += (player_hp_prev - player_hp_now) * -0.1 # -2.0 per AOE hit taken +reward += -0.001 # time penalty + +# Terminal +if boss_hp <= 0: reward += 1.0 # win bonus +if player_hp <= 0: reward -= 1.0 # lose penalty +``` + +--- + +## Episode Termination + +- `terminated = True` if player or boss HP <= 0 +- `truncated = True` if ticks >= 900 (30 seconds) + +--- + +## Implementation (Single File) + +Everything in `soulsrl.py` (~250-300 lines): + +```python +class SoulsEnv(pufferlib.PufferEnv): + # Player state machine + # Boss state machine + # Collision detection (circle-circle only) + # Observation building + # Reward calculation +``` + +No separate core.py, no rendering, no curriculum stages. 
+ +--- + +## RL Experiments + +Once v1 is working, run these experiments to learn RL concepts: + +### Experiment 1: Observation Ablations + +| Variant | Change | Hypothesis | +| --------- | --------------------------------------------------------------- | -------------------------------------- | +| no_timing | Remove `time_to_damage`, `phase_progress` | Agent can't learn precise dodge timing | +| no_range | Remove `in_aoe_range`, `distance` | Agent can't learn spacing | +| minimal | Only: `distance`, `time_to_damage`, `dodge_ready`, `boss_phase` | Test minimum viable obs | +| noisy | Add 5 uniform random floats | Network should ignore noise | + +### Experiment 2: Reward Shaping + +| Variant | Change | Hypothesis | +| --------------- | -------------------------------- | -------------------------- | +| sparse | Only win/lose bonus, no HP delta | Much slower learning | +| no_time_penalty | Remove -0.001/step | Agent becomes passive | +| dodge_bonus | +0.2 for dodging during ACTIVE | Might create dodge spam | +| proximity | +0.01 for being close to boss | Might discourage safe play | + +### Experiment 3: Hyperparameters + +| Param | Values | What to observe | +| ------------- | ---------------- | --------------------------- | +| learning_rate | 1e-3, 3e-4, 1e-4 | Learning speed vs stability | +| ent_coef | 0.0, 0.01, 0.05 | Exploration vs exploitation | +| num_envs | 8, 32, 128 | Sample efficiency | +| hidden_size | 32, 64, 128 | Model capacity | + +--- + +## Success Criteria + +1. **Baseline works**: Random agent wins ~0%, trained agent wins >80% +2. **Learned timing**: Agent dodges during WINDUP, not randomly +3. **Learned punish**: Agent attacks during RECOVERY, not during ACTIVE +4. **Experiments complete**: At least 3 ablations run with plotted comparisons + +--- + +## Optional Extensions (After Experiments) + +Only add these if baseline experiments are done: + +1. **Sweep attack**: Cone hitbox, tests directional dodging +2. **Boss movement**: Slow drift toward player +3. **Combo attack**: Multi-hit sequence, tests dodge timing +4. **ASCII rendering**: For debugging/demo +5. **Curriculum**: Start with longer windup, tighten over training + +--- + +## Deliverables + +1. `soulsrl.py` — Environment (PufferEnv) +2. `train.py` — Training script with logging +3. `experiments/` — Saved runs with different configs +4. 
`results.md` — Summary of what you learned from experiments + +--- + +## Timeline Estimate + +- Day 1: Implement `soulsrl.py`, verify with random agent +- Day 2: Train baseline, confirm learning +- Day 3-4: Run observation ablations +- Day 5-6: Run reward experiments +- Day 7: Document findings, optional extensions diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c new file mode 100644 index 000000000..812e31bb7 --- /dev/null +++ b/pufferlib/ocean/boss_fight/binding.c @@ -0,0 +1,14 @@ +#include "boss_fight.h" + +#define Env BossFight +#include "../env_binding.h" + +static int my_init(Env *env, PyObject *args, PyObject *kwargs) { + env->size = unpack(kwargs, "size"); + return 0; +} + +static int my_log(PyObject *dict, Log *log) { + assign_to_dict(dict, "score", log->score); + return 0; +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c new file mode 100644 index 000000000..0e1e152a2 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -0,0 +1,32 @@ +#include "boss_fight.h" + +int main() { + BossFight env = {.size = 5}; + env.observations = (unsigned char *)calloc(1, sizeof(unsigned char)); + env.actions = (int *)calloc(1, sizeof(int)); + env.rewards = (float *)calloc(1, sizeof(float)); + env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); + + c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + if (IsKeyDown(KEY_LEFT_SHIFT)) { + if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) { + env.actions[0] = 0; + } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) { + env.actions[0] = 1; + } else { + env.actions[0] = -1; + } + } else { + env.actions[0] = rand() % 2; + } + c_step(&env); + c_render(&env); + } + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + c_close(&env); +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h new file mode 100644 index 000000000..75d2932b1 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -0,0 +1,80 @@ +#include "raylib.h" +#include +#include + +const Color PUFF_RED = (Color){187, 0, 0, 255}; +const Color PUFF_CYAN = (Color){0, 187, 187, 255}; +const Color PUFF_WHITE = (Color){241, 241, 241, 241}; +const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; + +// Only use floats! +typedef struct { + float score; + float n; // Required as the last field +} Log; + +typedef struct { + Log log; // Required field + unsigned char + *observations; // Required field. Ensure type matches in .py and .c + int *actions; // Required field. Ensure type matches in .py and .c + float *rewards; // Required field + unsigned char *terminals; // Required field + int size; + int x; + int goal; +} BossFight; + +void c_reset(BossFight *env) { + env->x = 0; + env->goal = (rand() % 2 == 0) ? env->size : -env->size; +} + +void c_step(BossFight *env) { + env->rewards[0] = 0; + env->terminals[0] = 0; + if (env->actions[0] == 0) { + env->x -= 1; + } else if (env->actions[0] == 1) { + env->x += 1; + } + if (env->x == env->goal) { + c_reset(env); + env->rewards[0] = 1; + env->terminals[0] = 1; + env->log.score += 1; + env->log.n += 1; + } else if (env->x == -env->goal) { + c_reset(env); + env->rewards[0] = -1; + env->terminals[0] = 1; + env->log.score -= 1; + env->log.n += 1; + } + env->observations[0] = (env->goal > 0) ? 
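  /* Reviewer note: `observations` is an unsigned char buffer, so the -1 branch
     below wraps to 255, which falls outside the Box(low=0, high=1, dtype=uint8)
     space declared in boss_fight.py; mapping that case to 0 is probably intended. */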
1 : -1; +} + +void c_render(BossFight *env) { + if (!IsWindowReady()) { + InitWindow(1080, 720, "PufferLib Template"); + SetTargetFPS(5); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + DrawText("Go to the red square!", 20, 20, 20, PUFF_WHITE); + DrawRectangle(540 - 32 + 64 * env->goal, 360 - 32, 64, 64, PUFF_RED); + DrawRectangle(540 - 32 + 64 * env->x, 360 - 32, 64, 64, PUFF_CYAN); + + BeginDrawing(); + ClearBackground(PUFF_BACKGROUND); + EndDrawing(); +} + +void c_close(BossFight *env) { + if (IsWindowReady()) { + CloseWindow(); + } +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py new file mode 100644 index 000000000..4f0bcdbb3 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -0,0 +1,67 @@ +"""A minimal template for your own envs.""" + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.template import binding + + +class BossFight(pufferlib.PufferEnv): + def __init__( + self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 + ): + self.single_observation_space = gymnasium.spaces.Box( + low=0, high=1, shape=(1,), dtype=np.uint8 + ) + self.single_action_space = gymnasium.spaces.Discrete(2) + self.render_mode = render_mode + self.num_agents = num_envs + + super().__init__(buf) + self.c_envs = binding.vec_init( + self.observations, + self.actions, + self.rewards, + self.terminals, + self.truncations, + num_envs, + seed, + size=size, + ) + self.size = size + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + return self.observations, [] + + def step(self, actions): + self.actions[:] = actions + binding.vec_step(self.c_envs) + info = [binding.vec_log(self.c_envs)] + return (self.observations, self.rewards, self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + + +if __name__ == "__main__": + N = 4096 + env = BossFight(num_envs=N) + env.reset() + steps = 0 + + CACHE = 1024 + actions = np.random.randint(0, 5, (CACHE, N)) + + import time + + start = time.time() + while time.time() - start < 10: + env.step(actions[steps % CACHE]) + steps += 1 + + print("Squared SPS:", int(env.num_agents * steps / (time.time() - start))) diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 6c56a4ea2..51f131ac1 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,177 +1,290 @@ import importlib import pufferlib.emulation + def lazy_import(module_path, attr): """ Returns a callable that, when called with any arguments, will import the module, retrieve the attribute (usually a class or factory) and then call it with the given arguments. 
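    Illustrative usage (the module path and constructor kwargs below are
    examples only, not a guaranteed API):

        make_pong = lazy_import('pufferlib.ocean.pong.pong', 'Pong')
        env = make_pong(num_envs=1)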
""" - return lambda *args, **kwargs: getattr(__import__(module_path, fromlist=[attr]), attr)(*args, **kwargs) + return lambda *args, **kwargs: getattr( + __import__(module_path, fromlist=[attr]), attr + )(*args, **kwargs) + -def make_foraging(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_foraging( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_foraging reward_fn = grid.reward_foraging - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, init_fn=init_fn, reward_fn=reward_fn, render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_predator_prey(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_predator_prey( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_predator_prey reward_fn = grid.reward_predator_prey - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_group(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_group( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_group reward_fn = grid.reward_group - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_puffer(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_puffer( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_puffer reward_fn = grid.reward_puffer - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_puffergrid( + render_mode="raylib", + vision_range=5, + num_envs=4096, + num_maps=1000, + max_map_size=9, + report_interval=128, + buf=None, +): + return PufferGrid( + render_mode, + vision_range, + num_envs, + num_maps, + max_map_size, + report_interval, + buf, + ) -def make_puffergrid(render_mode='raylib', vision_range=5, - 
num_envs=4096, num_maps=1000, max_map_size=9, - report_interval=128, buf=None): - return PufferGrid(render_mode, vision_range, num_envs, - num_maps, max_map_size, report_interval, buf) def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity + env = sanity.Continuous(discretize=discretize) if not discretize: env = pufferlib.ClipAction(env) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity - env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) + + env = sanity.Squared( + distance_to_target=distance_to_target, num_targets=num_targets, **kwargs + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . import sanity - env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, - reward_noise=reward_noise) + + env = sanity.Bandit( + num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity + env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_password(password_length=5, buf=None, **kwargs): from . import sanity + env = sanity.Password(password_length=password_length) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) + + env = sanity.Performance( + delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) + + env = sanity.PerformanceEmpiric( + count_n=count_n, count_std=count_std, bandwidth=bandwidth + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity + env = sanity.Stochastic(p=p, horizon=100) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_spaces(buf=None, **kwargs): from . import sanity + env = sanity.Spaces() env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_multiagent(buf=None, **kwargs): from . 
import sanity + env = sanity.Multiagent() env = pufferlib.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) + MAKE_FUNCTIONS = { - 'battle': 'Battle', - 'breakout': 'Breakout', - 'blastar': 'Blastar', - 'convert': 'Convert', - 'convert_circle': 'ConvertCircle', - 'pong': 'Pong', - 'freeway': 'Freeway', - 'enduro': 'Enduro', - 'tetris': 'Tetris', - 'cartpole': 'Cartpole', - 'moba': 'Moba', - 'matsci': 'Matsci', - 'memory': 'Memory', - 'boids': 'Boids', - 'drone': 'Drone', - 'nmmo3': 'NMMO3', - 'snake': 'Snake', - 'squared': 'Squared', - 'pysquared': 'PySquared', - 'connect4': 'Connect4', - 'g2048': 'G2048', - 'terraform': 'Terraform', - 'template': 'Template', - 'tripletriad': 'TripleTriad', - 'tactical': 'Tactical', - 'target': 'Target', - 'go': 'Go', - 'rware': 'Rware', - 'trash_pickup': 'TrashPickupEnv', - 'tower_climb': 'TowerClimb', - 'grid': 'Grid', - 'shared_pool': 'PyCPR', - 'impulse_wars': 'ImpulseWars', - 'drive': 'Drive', - 'pacman': 'Pacman', - 'tmaze': 'TMaze', - 'checkers': 'Checkers', - 'asteroids': 'Asteroids', - 'whisker_racer': 'WhiskerRacer', - 'onestateworld': 'World', - 'onlyfish': 'OnlyFish', - 'chain_mdp': 'Chain', - 'spaces': make_spaces, - 'multiagent': make_multiagent, - 'slimevolley': 'SlimeVolley', + "battle": "Battle", + "breakout": "Breakout", + "blastar": "Blastar", + "boss_fight": "BossFight", + "convert": "Convert", + "convert_circle": "ConvertCircle", + "pong": "Pong", + "freeway": "Freeway", + "enduro": "Enduro", + "tetris": "Tetris", + "cartpole": "Cartpole", + "moba": "Moba", + "matsci": "Matsci", + "memory": "Memory", + "boids": "Boids", + "drone": "Drone", + "nmmo3": "NMMO3", + "snake": "Snake", + "squared": "Squared", + "pysquared": "PySquared", + "connect4": "Connect4", + "g2048": "G2048", + "terraform": "Terraform", + "template": "Template", + "tripletriad": "TripleTriad", + "tactical": "Tactical", + "target": "Target", + "go": "Go", + "rware": "Rware", + "trash_pickup": "TrashPickupEnv", + "tower_climb": "TowerClimb", + "grid": "Grid", + "shared_pool": "PyCPR", + "impulse_wars": "ImpulseWars", + "drive": "Drive", + "pacman": "Pacman", + "tmaze": "TMaze", + "checkers": "Checkers", + "asteroids": "Asteroids", + "whisker_racer": "WhiskerRacer", + "onestateworld": "World", + "onlyfish": "OnlyFish", + "chain_mdp": "Chain", + "spaces": make_spaces, + "multiagent": make_multiagent, + "slimevolley": "SlimeVolley", } -def env_creator(name='squared', *args, **kwargs): - if 'puffer_' not in name: - raise pufferlib.APIUsageError(f'Invalid environment name: {name}') + +def env_creator(name="squared", *args, **kwargs): + if "puffer_" not in name: + raise pufferlib.APIUsageError(f"Invalid environment name: {name}") # TODO: Robust sanity / ocean imports - name = name.replace('puffer_', '') + name = name.replace("puffer_", "") try: - module = importlib.import_module(f'pufferlib.ocean.{name}.{name}') + module = importlib.import_module(f"pufferlib.ocean.{name}.{name}") return getattr(module, MAKE_FUNCTIONS[name]) except ModuleNotFoundError: return MAKE_FUNCTIONS[name] From ec5efca9c9d89af192bea700290eb8bbb696ceb4 Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 13 Jan 2026 16:19:17 +0500 Subject: [PATCH 02/29] setup fixes for bossfight env --- LEARN_TODO.md | 259 +++++++++++++++++++++++ TODO.md | 29 ++- pufferlib/config/boss_fight.ini | 96 ++++++++- pufferlib/ocean/boss_fight/__init__.py | 3 + pufferlib/ocean/boss_fight/boss_fight.py | 2 +- 5 files changed, 380 insertions(+), 9 deletions(-) create mode 100644 LEARN_TODO.md 
create mode 100644 pufferlib/ocean/boss_fight/__init__.py diff --git a/LEARN_TODO.md b/LEARN_TODO.md new file mode 100644 index 000000000..35c99eebc --- /dev/null +++ b/LEARN_TODO.md @@ -0,0 +1,259 @@ +# Learning TODO: RL Foundations + +Everything you need to understand `bptt_horizon` and RL training in general. + +--- + +## Level 1: Basic ML Concepts + +### 1.1 What is a Neural Network? +- Function that takes numbers in, spits numbers out +- Has "weights" (parameters) that get adjusted during training +- `input → [neural network] → output` + +### 1.2 What is Training / Learning? +- Adjusting weights so the network gives better outputs +- Done by computing "loss" (how wrong it was) and updating weights to reduce loss + +### 1.3 What is Backpropagation? +- Algorithm to figure out HOW to adjust each weight +- Flows backwards through the network: output → hidden layers → input +- "If the output was wrong, which weights were responsible?" + +### 1.4 What is a Batch? +- Group of training examples processed together +- Instead of: train on example 1, then example 2, then example 3... +- Do: train on [example 1, 2, 3, 4, 5] at once +- Why? Faster (GPU parallelism) + more stable learning + +### 1.5 What is Minibatch? +- When your batch is too big for GPU memory +- Split batch into smaller "minibatches" +- `batch_size = 1024, minibatch_size = 256` → 4 gradient updates per batch + +--- + +## Level 2: RL Basics + +### 2.1 What is a Timestep? +- One tick of the game/simulation +- Agent observes state → takes action → gets reward → new state +- `t=0: see game → press button → get +1 point → game changes` + +### 2.2 What is an Episode? +- One complete playthrough from start to end +- Boss fight: episode = one full fight (win or lose) +- `[spawn] → step → step → step → ... → [death or victory]` + +``` +Episode 1: t0 → t1 → t2 → t3 → DEAD (4 steps) +Episode 2: t0 → t1 → t2 → t3 → t4 → t5 → WIN (6 steps) +``` + +### 2.3 What is an Observation? +- What the agent "sees" at each timestep +- Your boss_fight: 14 numbers (player pos, boss HP, etc.) + +### 2.4 What is a Policy? +- The neural network that decides actions +- `observation (14 floats) → [policy network] → action (0-6)` +- Training = making this network choose better actions + +### 2.5 What is a Value Function? +- Predicts "how good is this situation?" +- "I have full HP, boss is low" → high value +- "I'm almost dead, boss is full HP" → low value +- Helps the agent learn which states to aim for + +--- + +## Level 3: How RL Training Works + +### 3.1 Collect Experience +``` +Run 56 environments in parallel: + Env 1: obs → action → reward → obs → action → reward → ... + Env 2: obs → action → reward → obs → action → reward → ... + ... + Env 56: obs → action → reward → obs → action → reward → ... + +After N steps, you have a "batch" of experience +``` + +### 3.2 Compute Advantages +- "Was this action better or worse than expected?" +- `advantage = actual_reward - predicted_value` +- Positive advantage → reinforce this action +- Negative advantage → discourage this action + +### 3.3 Update the Network +- Use collected experience to adjust policy weights +- Make good actions more likely, bad actions less likely + +### 3.4 Repeat +``` +while not done: + 1. Collect batch of experience (many timesteps) + 2. Compute advantages + 3. Update network with minibatches + 4. 
Go to 1 +``` + +--- + +## Level 4: Sequential Data & Memory + +### 4.1 Why Sequence Matters +In games, the PAST affects what you should do NOW: + +``` +Timestep 1: Boss starts wind-up animation +Timestep 2: Boss still winding up +Timestep 3: Boss about to attack! ← YOU SHOULD DODGE NOW +Timestep 4: Boss attacks + +If you only see timestep 3 in isolation, you might not know to dodge. +But if you saw timesteps 1-2-3 together, you'd see the pattern. +``` + +### 4.2 MLP (Multi-Layer Perceptron) — No Memory +- Standard neural network +- Only sees CURRENT observation +- `obs_t → [MLP] → action` +- No memory of previous timesteps +- Fine if observation contains all needed info + +### 4.3 RNN (Recurrent Neural Network) — Has Memory +- Sees current observation + remembers past +- `obs_t + memory → [RNN] → action + updated_memory` +- Can learn patterns over time +- Types: LSTM, GRU (different memory mechanisms) + +``` +MLP: sees [___] [___] [_X_] ← only current frame +RNN: sees [_X_] [_X_] [_X_] ← current + memory of past +``` + +### 4.4 When Do You Need RNN? +- When current observation is INCOMPLETE +- Example: "Boss is standing still" — is he about to attack or recovering? +- If your observation includes `boss_phase` and `time_to_damage`, MLP might be enough +- If observation only has positions, RNN helps learn timing + +--- + +## Level 5: BPTT (Backpropagation Through Time) + +### 5.1 The Problem +RNN has memory that flows through time: + +``` +t1 → t2 → t3 → t4 → t5 → t6 → ... → t1000 + +To train RNN, backprop must flow backwards through ALL these connections. +1000 timesteps = 1000 layers of backprop = VERY slow, uses tons of memory +``` + +### 5.2 The Solution: Truncated BPTT +Don't backprop through entire episode. Cut it into chunks: + +``` +Episode: [t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12] + +bptt_horizon = 4: + +Chunk 1: [t1 → t2 → t3 → t4] ← backprop only through these 4 +Chunk 2: [t5 → t6 → t7 → t8] ← backprop only through these 4 +Chunk 3: [t9 → t10 → t11 → t12] ← backprop only through these 4 +``` + +### 5.3 What bptt_horizon Controls +``` +bptt_horizon = 16 means: +- RNN sees 16 consecutive timesteps during training +- Gradients flow back through 16 steps max +- RNN can learn patterns up to ~16 steps long +``` + +### 5.4 Trade-offs +``` +Small horizon (8): + ✓ Fast, low memory + ✗ RNN can't learn long patterns (>8 steps) + +Large horizon (128): + ✓ RNN learns longer patterns + ✗ Slow, high memory usage +``` + +--- + +## Level 6: Putting It Together + +### 6.1 The Batch Math +``` +num_envs = 56 (parallel environments) +bptt_horizon = 16 (timesteps per chunk) + +batch_size = num_envs × bptt_horizon + = 56 × 16 + = 896 total samples per training batch +``` + +### 6.2 Why minibatch_size Must Be ≤ batch_size +``` +batch_size = 896 (you collected 896 samples) +minibatch_size = 2048 (you want to train on 2048 at a time) + +ERROR: Can't take 2048 samples from a pile of 896! + +Fix: minibatch_size = 256 or 512 (smaller than 896) +``` + +### 6.3 For Your Boss Fight (No RNN) +You're using MLP, so `bptt_horizon` just affects batch math: + +```ini +[vec] +num_envs = 56 + +[train] +bptt_horizon = 16 # 56 × 16 = 896 batch +minibatch_size = 256 # Must be ≤ 896 +``` + +Or increase horizon if you want bigger batches: + +```ini +bptt_horizon = 64 # 56 × 64 = 3584 batch +minibatch_size = 2048 # Now this works +``` + +--- + +## Summary: What You Actually Need to Know + +1. **batch_size** = total samples collected before training +2. 
**minibatch_size** = chunk size for each gradient update (must be ≤ batch_size) +3. **bptt_horizon** = consecutive timesteps kept together + - For RNN: determines how far back it can learn patterns + - For MLP: just affects batch_size math +4. **Your boss_fight uses MLP** — bptt_horizon is just a number to make the math work + +--- + +## Learning Resources + +### Videos (start here) +- [ ] 3Blue1Brown: "Neural Networks" series (YouTube) +- [ ] Mutual Information: "Reinforcement Learning" series (YouTube) + +### Interactive +- [ ] Andrej Karpathy: "Neural Networks: Zero to Hero" (YouTube + code) + +### Reading +- [ ] Spinning Up in Deep RL (OpenAI) — https://spinningup.openai.com +- [ ] CleanRL documentation — similar to PufferLib + +### Hands-on +- [ ] Train boss_fight, watch the numbers, build intuition diff --git a/TODO.md b/TODO.md index 938ca7614..f98518639 100644 --- a/TODO.md +++ b/TODO.md @@ -5,14 +5,33 @@ 1. Fork pufferlib, create new branch 2. Run these: - ``` - uv venv - uv pip install -e . - ``` + +``` +uv venv +uv pip install -e . +``` 3. Setup files using templates, update `environment.py` 4. Not sure what this does yet: + +``` +python setup.py build_boss_fight --inplace +``` + +### Testing + +- Make sure shit's running: ``` - python setup.py build_boss_fight --inplace + uv pip install -e . && python -c " + from pufferlib.ocean.boss_fight import BossFight + import numpy as np + env = BossFight(num_envs=2) + env.reset() + for _ in range(100): + env.step(np.random.randint(0, 7, size=2)) + print('ok') + env.close() + " ``` +- Train and check scores: `puffer train puffer_boss_fight --train.total-timesteps 50000` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index d7f426abf..f6d97ae12 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -2,14 +2,104 @@ package = ocean env_name = puffer_boss_fight policy_name = Policy +# rnn_name = Recurrent # Uncomment if adding LSTM/GRU + +[vec] +num_envs = 56 +num_workers = 14 +batch_size = auto +zero_copy = True +seed = 42 [env] -num_envs = 14 +# Environment-specific params (passed to env constructor) +# None needed - using defaults from README + +[policy] +# Policy constructor args (e.g., hidden_size) +# hidden_size = 64 # Experiment: 32, 64, 128 [train] -total_timesteps = 1_000_000 -minibatch_size=1024 +# Experiment tracking +name = boss_fight +project = boss_fight_experiments +data_dir = experiments +checkpoint_interval = 200 + +# Reproducibility +seed = 42 +# TODO: disable for sweep or speed +torch_deterministic = True +device = mps + +# Optimization +# TODO: try muon with 0.015 lr +optimizer = adam +precision = float32 +compile = False + +# Core PPO hyperparameters +total_timesteps = 10_000_000 +learning_rate = 0.0003 +anneal_lr = True +min_lr_ratio = 0.0 +gamma = 0.99 +gae_lambda = 0.95 +update_epochs = 4 +clip_coef = 0.2 +vf_coef = 0.5 +vf_clip_coef = 0.2 +max_grad_norm = 0.5 +ent_coef = 0.01 + +# Batch sizes +minibatch_size = 512 +max_minibatch_size = 32768 +bptt_horizon = 16 + +# Adam parameters (if optimizer = adam) +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-8 + +# V-trace (for off-policy correction) +# vtrace_rho_clip = 1.0 +# vtrace_c_clip = 1.0 [sweep] goal = maximize metric = episode_return +method = Protein +metric_distribution = linear +max_suggestion_cost = 3600 +use_gpu = True + +# Learning rate sweep +[sweep.train.learning_rate] +distribution = log_normal +min = 0.0001 +max = 0.003 + +# Entropy coefficient sweep (exploration vs exploitation) 
+[sweep.train.ent_coef] +distribution = log_normal +min = 0.0001 +max = 0.05 + +# Discount factor sweep +[sweep.train.gamma] +distribution = logit_normal +min = 0.95 +max = 0.999 + +# GAE lambda sweep +[sweep.train.gae_lambda] +distribution = logit_normal +min = 0.9 +max = 0.99 + +# Minibatch size sweep +[sweep.train.minibatch_size] +distribution = uniform_pow2 +min = 1024 +max = 8192 diff --git a/pufferlib/ocean/boss_fight/__init__.py b/pufferlib/ocean/boss_fight/__init__.py new file mode 100644 index 000000000..4a93af7f9 --- /dev/null +++ b/pufferlib/ocean/boss_fight/__init__.py @@ -0,0 +1,3 @@ +"""BossFight Ocean Environment.""" + +from .boss_fight import BossFight diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 4f0bcdbb3..a952dbf41 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -4,7 +4,7 @@ import numpy as np import pufferlib -from pufferlib.ocean.template import binding +from pufferlib.ocean.boss_fight import binding class BossFight(pufferlib.PufferEnv): From 094fda8f552ccb04ce4cd5c28fd577be482243fc Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 14 Jan 2026 02:10:08 +0500 Subject: [PATCH 03/29] prep work --- AGENTS.md | 52 ++++++++++++++++++++ TODO.md | 37 -------------- pufferlib/config/boss_fight.ini | 6 +-- pufferlib/ocean/boss_fight/README.md | 4 +- pufferlib/ocean/boss_fight/compile_flags.txt | 1 + 5 files changed, 58 insertions(+), 42 deletions(-) create mode 100644 AGENTS.md delete mode 100644 TODO.md create mode 100644 pufferlib/ocean/boss_fight/compile_flags.txt diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..184f98caa --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,52 @@ +# BossFight Reinforcement Learning project + +I'm implementing a RL environment using PufferLib in C + Python. + +Environment spec file is in `./pufferlib/ocean/boss_fight/README.md`. + +You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. + +The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` + +### Setup + +1. Fork pufferlib, create new branch + +2. Run these: + +``` +uv venv +uv pip install -e . +``` + +3. Setup files using templates, update `environment.py` + +4. Not sure what this does yet: + +``` +python setup.py build_boss_fight --inplace +``` + +### Testing + +Make sure shit's running: + +``` +uv pip install -e . +python -c " +from pufferlib.ocean.boss_fight import BossFight +import numpy as np +env = BossFight(num_envs=2) +env.reset() +for _ in range(100): + env.step(np.random.randint(0, 7, size=2)) +print('ok') +env.close() +" +``` + +Train and check scores: + +``` +puffer train puffer_boss_fight --train.total-timesteps 50000 +``` diff --git a/TODO.md b/TODO.md deleted file mode 100644 index f98518639..000000000 --- a/TODO.md +++ /dev/null @@ -1,37 +0,0 @@ -## Notes for for my Boss Fight environment - -### Setup - -1. Fork pufferlib, create new branch - -2. Run these: - -``` -uv venv -uv pip install -e . -``` - -3. Setup files using templates, update `environment.py` - -4. Not sure what this does yet: - -``` -python setup.py build_boss_fight --inplace -``` - -### Testing - -- Make sure shit's running: - ``` - uv pip install -e . 
&& python -c " - from pufferlib.ocean.boss_fight import BossFight - import numpy as np - env = BossFight(num_envs=2) - env.reset() - for _ in range(100): - env.step(np.random.randint(0, 7, size=2)) - print('ok') - env.close() - " - ``` -- Train and check scores: `puffer train puffer_boss_fight --train.total-timesteps 50000` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index f6d97ae12..fcfe697cc 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,7 +5,7 @@ policy_name = Policy # rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 56 +num_envs = 112 num_workers = 14 batch_size = auto zero_copy = True @@ -53,9 +53,9 @@ max_grad_norm = 0.5 ent_coef = 0.01 # Batch sizes -minibatch_size = 512 +minibatch_size = 2048 max_minibatch_size = 32768 -bptt_horizon = 16 +bptt_horizon = 32 # Adam parameters (if optimizer = adam) adam_beta1 = 0.9 diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index e81c5f874..a9cbcd8b8 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -3,9 +3,9 @@ ## Goal Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, and training experiments** — not game engine complexity. +Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** -The boss has **1 attack** (AOE burst). All hitboxes are circles. No rendering required. +The boss has **1 attack** (AOE burst). All hitboxes are circles. --- diff --git a/pufferlib/ocean/boss_fight/compile_flags.txt b/pufferlib/ocean/boss_fight/compile_flags.txt new file mode 100644 index 000000000..ea96eb002 --- /dev/null +++ b/pufferlib/ocean/boss_fight/compile_flags.txt @@ -0,0 +1 @@ +-I../../../raylib-5.5_macos/include From 09a79ad159e8fd96847b41546bb70d71227a9066 Mon Sep 17 00:00:00 2001 From: frixaco Date: Fri, 16 Jan 2026 18:14:39 +0500 Subject: [PATCH 04/29] implement reset --- AGENTS.md | 2 +- learn-pufferlib.py | 1175 +++++++++++++++++++++++ pufferlib/ocean/boss_fight/README.md | 148 ++- pufferlib/ocean/boss_fight/binding.c | 2 +- pufferlib/ocean/boss_fight/boss_fight.c | 38 +- pufferlib/ocean/boss_fight/boss_fight.h | 136 ++- 6 files changed, 1372 insertions(+), 129 deletions(-) create mode 100644 learn-pufferlib.py diff --git a/AGENTS.md b/AGENTS.md index 184f98caa..0712dc874 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,7 +24,7 @@ uv pip install -e . 4. Not sure what this does yet: ``` -python setup.py build_boss_fight --inplace +python setup.py build_boss_fight --inplace --force ``` ### Testing diff --git a/learn-pufferlib.py b/learn-pufferlib.py new file mode 100644 index 000000000..4091fb6fb --- /dev/null +++ b/learn-pufferlib.py @@ -0,0 +1,1175 @@ +""" +LEARN_V2.PY - RL with PufferLib (The Right Way) +================================================ + +PURPOSE: Learn reinforcement learning using PufferLib's patterns and infrastructure. + +This is the "full PufferLib" version of learn.py. Instead of implementing PPO +from scratch, we use PufferLib's pufferl.PuffeRL trainer which handles: +- Rollout collection +- GAE advantage computation +- PPO loss calculation +- Gradient updates +- Logging and metrics + +HOW TO USE: +1. Read each section's comments (the WHY and WHAT) +2. Fill in the TODO sections +3. Run and test after each section: python learn_v2.py +4. 
Only move to next section when current one works + +The environment is the same as learn.py: +- 2D arena where an agent must reach a target +- Agent can move UP/DOWN/LEFT/RIGHT or stay still +- Episode ends when: agent reaches target, hits wall, or 200 steps pass + +DEPENDENCIES: + pip install pufferlib torch numpy gymnasium +""" + +import os +import numpy as np +import gymnasium +import torch +import torch.nn as nn +import pufferlib +import pufferlib.vector +import pufferlib.pytorch +from pufferlib import pufferl + + +# ============================================================================= +# SECTION 1: PUFFERLIB ENVIRONMENT +# ============================================================================= +""" +WHY inherit from pufferlib.PufferEnv? +------------------------------------- +PufferLib provides optimized environment vectorization. When you inherit from +PufferEnv, you get: + +1. AUTOMATIC BUFFER MANAGEMENT: PufferLib creates shared memory buffers for + observations, rewards, terminals, truncations. You just write to them. + +2. MULTI-AGENT SUPPORT: The same pattern works for 1 agent or 100 agents. + You define `num_agents` and PufferLib handles the rest. + +3. VECTORIZATION COMPATIBILITY: Your env works with pufferlib.vector.make() + which can run multiple copies in parallel (Serial or Multiprocessing). + +KEY DIFFERENCES from Gymnasium: +------------------------------- +- Define `single_observation_space` and `single_action_space` (not plural) +- Set `self.num_agents` (1 for single-agent) +- Call `super().__init__(buf)` which creates self.observations, self.rewards, etc. +- Update arrays IN-PLACE: `self.observations[:] = ...` not `return obs` +- reset() and step() still return values, but also update internal buffers +""" + + +class MoveToTargetEnv(pufferlib.PufferEnv): + """ + A simple environment where an agent navigates to a target position. + + This is identical to learn.py's MoveToTargetEnv, but adapted to PufferLib's + patterns. The game logic is the same, only the interface changes. + + GAME RULES: + - Agent starts at random position in [-0.8, 0.8] x [-0.8, 0.8] + - Target is at random position (at least 0.3 units away from agent) + - Agent can: NOOP (0), UP (1), DOWN (2), LEFT (3), RIGHT (4) + - Episode ends when: agent reaches target, hits wall (|x|>1 or |y|>1), or 200 steps + - Reward: -0.01/step + distance shaping + terminal bonuses + """ + + # Type hints for attributes created by super().__init__() + observations: np.ndarray + rewards: np.ndarray + terminals: np.ndarray + truncations: np.ndarray + + def __init__(self, buf=None, seed=0): + """ + WHY these parameters? + --------------------- + - buf: Optional shared memory buffer from PufferLib's vectorization. + When running multiple envs, they share memory for efficiency. + If None, PufferLib creates a buffer automatically. + + - seed: Random seed for reproducibility. Essential for debugging! + + WHAT to do in __init__: + 1. Define single_observation_space (what ONE agent sees) + 2. Define single_action_space (what actions ONE agent can take) + 3. Set self.num_agents (1 for single-agent env) + 4. Call super().__init__(buf) - THIS CREATES self.observations, etc. + 5. Initialize game state variables + 6. 
Set up random number generator + """ + # ----------------------------------------------------------------- + # TODO 1.1: Define the observation space + # ----------------------------------------------------------------- + # WHAT the agent sees: [agent_x, agent_y, target_x, target_y, dx, dy] + # - Positions are in [-1, 1] (arena bounds) + # - dx, dy (direction to target) can be in [-2, 2] + # + # WHY "single_observation_space" not "observation_space"? + # PufferLib distinguishes single-agent spaces from joint spaces. + # For multi-agent, observation_space would be (num_agents, obs_dim). + # We define the SINGLE agent's view, PufferLib handles batching. + # + # YOUR CODE: Create self.single_observation_space as gymnasium.spaces.Box + # Hint: Box(low=-2.0, high=2.0, shape=(6,), dtype=np.float32) + + self.single_observation_space = gymnasium.spaces.Box( + low=-2.0, high=2.0, shape=(6,), dtype=np.float32 + ) + + # ----------------------------------------------------------------- + # TODO 1.2: Define the action space + # ----------------------------------------------------------------- + # WHAT actions are available: 0=NOOP, 1=UP, 2=DOWN, 3=LEFT, 4=RIGHT + # + # YOUR CODE: Create self.single_action_space as gymnasium.spaces.Discrete(5) + + self.single_action_space = gymnasium.spaces.Discrete(5) + + # ----------------------------------------------------------------- + # TODO 1.3: Set the number of agents + # ----------------------------------------------------------------- + # For single-agent environments, num_agents = 1. + # PufferLib uses this to allocate the right buffer sizes. + # + # YOUR CODE: Set self.num_agents = 1 + + self.num_agents = 1 + + # ----------------------------------------------------------------- + # CRITICAL: Call super().__init__(buf) + # ----------------------------------------------------------------- + # This MUST come after defining spaces and num_agents! + # It creates: + # - self.observations: array of shape (num_agents, *obs_shape) + # - self.rewards: array of shape (num_agents,) + # - self.terminals: array of shape (num_agents,) + # - self.truncations: array of shape (num_agents,) + # + # These are the buffers you'll update in reset() and step(). + super().__init__(buf) + + # ----------------------------------------------------------------- + # TODO 1.4: Initialize game state variables + # ----------------------------------------------------------------- + # Track the actual game state (not observations, those are derived). + # For single-agent, these are simple arrays of shape (2,) for positions. + # + # WHAT to initialize: + # - self.agent_pos: np.zeros(2, dtype=np.float32) - agent's [x, y] + # - self.target_pos: np.zeros(2, dtype=np.float32) - target's [x, y] + # - self.tick: 0 - step counter within episode + # + # Also initialize constants: + # - self.max_steps = 200 + # - self.target_radius = 0.1 (how close to count as "reached") + # - self.move_speed = 0.05 (movement per action) + # - self.arena_size = 1.0 (arena is [-1, 1] x [-1, 1]) + # + # YOUR CODE: Initialize game state + + self.agent_pos = np.zeros(2, dtype=np.float32) + self.target_pos = np.zeros(2, dtype=np.float32) + self.tick = 0 + + self.max_steps = 200 + self.target_radius = 0.1 + self.move_speed = 0.05 + self.arena_size = 1.0 + + # Set up random number generator for reproducibility + self.rng = np.random.default_rng(seed=seed) + + # Track previous distance for reward shaping + self.prev_dist = 0.0 + + def reset(self, seed=None): + """ + WHY reset()? + ------------ + Start a fresh episode. 
Called at the beginning and after each episode ends. + + WHAT to do: + 1. Randomize agent position + 2. Randomize target position (not too close to agent!) + 3. Reset step counter + 4. Compute initial distance (for reward shaping) + 5. Fill self.observations[:] with initial state + + WHY update self.observations[:] in-place? + PufferLib uses shared memory buffers. By updating in-place, we avoid + copying data. The [:] syntax means "update the existing array contents". + + RETURNS: + - self.observations: the observation buffer (now filled with initial state) + - []: empty list of infos (PufferLib expects a list) + """ + # ----------------------------------------------------------------- + # TODO 2.1: Implement reset() + # ----------------------------------------------------------------- + # Step 1: Randomize agent position + # self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 2: Randomize target position + # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 3: Ensure target is far enough from agent (at least 0.3 units) + # while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: + # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 4: Reset step counter + # self.tick = 0 + # + # Step 5: Compute initial distance + # self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) + # + # Step 6: Fill observations buffer + # self.observations[0, 0] = self.agent_pos[0] # agent_x + # self.observations[0, 1] = self.agent_pos[1] # agent_y + # self.observations[0, 2] = self.target_pos[0] # target_x + # self.observations[0, 3] = self.target_pos[1] # target_y + # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] # dx + # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] # dy + # + # Note: We index [0, :] because num_agents=1, so observations has shape (1, 6) + # + # YOUR CODE: + + self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + + while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: + self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + + self.tick = 0 + + self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) + + self.observations[0, 0] = self.agent_pos[0] + self.observations[0, 1] = self.agent_pos[1] + self.observations[0, 2] = self.target_pos[0] + self.observations[0, 3] = self.target_pos[1] + self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + + return self.observations, [] + + def step(self, actions): + """ + WHY step()? + ----------- + The core game loop. Called every timestep with the agent's chosen action. + + WHAT to do: + 1. Apply the action (move agent) + 2. Compute reward (time penalty + distance shaping + terminal bonus) + 3. Check terminal conditions (reached target? hit wall? timeout?) + 4. Update buffers (observations, rewards, terminals, truncations) + 5. 
Auto-reset if episode ended + + PARAMETERS: + - actions: numpy array of shape (num_agents,) = (1,) for us + Each value is an integer 0-4 + + RETURNS: + - self.observations: updated observation buffer + - self.rewards: updated reward buffer + - self.terminals: updated terminal buffer + - self.truncations: updated truncation buffer + - infos: list of dicts with episode stats for finished episodes + """ + # ----------------------------------------------------------------- + # TODO 2.2: Implement step() + # ----------------------------------------------------------------- + # Step 1: Get the action (we only have 1 agent) + # action = actions[0] + # + # Step 2: Convert action to movement + # dx, dy = 0.0, 0.0 + # if action == 1: dy = self.move_speed # UP + # elif action == 2: dy = -self.move_speed # DOWN + # elif action == 3: dx = -self.move_speed # LEFT + # elif action == 4: dx = self.move_speed # RIGHT + # + # Step 3: Apply movement + # self.agent_pos[0] += dx + # self.agent_pos[1] += dy + # self.tick += 1 + # + # Step 4: Compute distance and rewards + # distance = np.linalg.norm(self.agent_pos - self.target_pos) + # reward = -0.01 # Time penalty + # reward += 2.0 * (self.prev_dist - distance) # Distance shaping + # self.prev_dist = distance + # + # Step 5: Check terminal conditions + # reached_target = distance < self.target_radius + # hit_wall = (abs(self.agent_pos[0]) > self.arena_size or + # abs(self.agent_pos[1]) > self.arena_size) + # timed_out = self.tick >= self.max_steps + # + # Step 6: Apply terminal rewards + # if reached_target: reward += 1.0 + # if hit_wall: reward -= 0.5 + # + # Step 7: Set terminal and truncation flags + # terminal = reached_target or hit_wall + # truncation = timed_out and not terminal + # + # Step 8: Update buffers + # self.rewards[0] = reward + # self.terminals[0] = terminal + # self.truncations[0] = truncation + # + # Step 9: Build info dict for finished episodes + # infos = [] + # if terminal or truncation: + # infos.append({ + # 'episode_length': self.tick, + # 'reached_target': reached_target, + # 'hit_wall': hit_wall, + # 'reward': reward, + # }) + # # Auto-reset for next episode + # self.reset() + # + # Step 10: Update observations (whether reset or not) + # self.observations[0, 0] = self.agent_pos[0] + # self.observations[0, 1] = self.agent_pos[1] + # self.observations[0, 2] = self.target_pos[0] + # self.observations[0, 3] = self.target_pos[1] + # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + # + # YOUR CODE: + + action = actions[0] + + dx, dy = 0.0, 0.0 + if action == 1: + dy = self.move_speed + elif action == 2: + dy = -self.move_speed # DOWN + elif action == 3: + dx = -self.move_speed # LEFT + elif action == 4: + dx = self.move_speed # RIGHT + + self.agent_pos[0] += dx + self.agent_pos[1] += dy + self.tick += 1 + + distance = np.linalg.norm(self.target_pos - self.agent_pos) + reward = -0.01 + reward += 2 * (self.prev_dist - distance) + self.prev_dist = distance + + reached_target = distance < self.target_radius + hit_wall = ( + abs(self.agent_pos[0]) > self.arena_size + or abs(self.agent_pos[1]) > self.arena_size + ) + timed_out = self.tick >= self.max_steps + + if reached_target: + reward += 1.0 + if hit_wall: + reward -= 0.5 + + terminal = reached_target or hit_wall + truncation = timed_out and not terminal + + self.rewards[0] = reward + self.terminals[0] = terminal + self.truncations[0] = truncation + + infos = [] + if terminal or truncation: + 
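            # An episode just ended: record its stats, then auto-reset in place so
            # the observation written below already belongs to the next episode
            # (the env resets itself here rather than relying on the trainer).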
infos.append( + { + "episode_length": self.tick, + "reached_target": reached_target, + "hit_wall": hit_wall, + "reward": reward, + } + ) + self.reset() + + self.observations[0, 0] = self.agent_pos[0] + self.observations[0, 1] = self.agent_pos[1] + self.observations[0, 2] = self.target_pos[0] + self.observations[0, 3] = self.target_pos[1] + self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + + return self.observations, self.rewards, self.terminals, self.truncations, infos + + def render(self): + """ + Simple ASCII rendering for debugging. + Shows a 20x20 grid with agent (A) and target (T). + """ + grid_size = 20 + grid = [["." for _ in range(grid_size)] for _ in range(grid_size)] + + # Convert positions from [-1, 1] to grid indices [0, grid_size-1] + def to_grid(pos): + x = int((pos[0] + 1) / 2 * (grid_size - 1)) + y = int((1 - (pos[1] + 1) / 2) * (grid_size - 1)) # Flip y for display + return max(0, min(grid_size - 1, x)), max(0, min(grid_size - 1, y)) + + tx, ty = to_grid(self.target_pos) + ax, ay = to_grid(self.agent_pos) + + grid[ty][tx] = "T" + grid[ay][ax] = "A" + + print(f"\nStep {self.tick}:") + print("+" + "-" * grid_size + "+") + for row in grid: + print("|" + "".join(row) + "|") + print("+" + "-" * grid_size + "+") + + def close(self): + pass + + +# ============================================================================= +# SECTION 2: TESTING ENVIRONMENT +# ============================================================================= +""" +WHY test before training? +------------------------- +If your environment is broken, RL will silently fail to learn. +You'll waste hours wondering why training doesn't work. + +ALWAYS verify: +1. Environment creates without errors +2. reset() returns correct shapes +3. step() works with valid actions +4. Episodes actually terminate +5. 
A simple heuristic can solve it +""" + + +def test_environment(): + """Run basic sanity checks on the PufferLib environment.""" + print("=" * 60) + print("TESTING MoveToTargetEnv (PufferLib)") + print("=" * 60) + + # Test 1: Creation + print("\n[TEST 1] Creating environment...") + try: + env = MoveToTargetEnv(seed=42) + print(f" OK: Created env") + print(f" Observation space: {env.single_observation_space}") + print(f" Action space: {env.single_action_space}") + print(f" Num agents: {env.num_agents}") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 2: Reset + print("\n[TEST 2] Testing reset()...") + try: + obs, info = env.reset() + print(f" OK: reset() returned observations with shape {obs.shape}") + print(f" Sample observation: {obs[0]}") + assert obs.shape == (1, 6), f"Wrong shape: {obs.shape}, expected (1, 6)" + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 3: Step with random actions + print("\n[TEST 3] Testing step() with random actions...") + try: + for i in range(5): + actions = np.array([np.random.randint(0, 5)]) # Shape (1,) + obs, rewards, terminals, truncations, infos = env.step(actions) + print(f" Step {i + 1}: reward={rewards[0]:.3f}, terminal={terminals[0]}") + print(f" OK: step() works") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 4: Run until episode terminates using heuristic + print("\n[TEST 4] Running until episode terminates...") + try: + obs, _ = env.reset() + total_steps = 0 + episodes_finished = 0 + + while episodes_finished < 2 and total_steps < 500: + # Simple heuristic: move toward target + dx = obs[0, 4] # target_x - agent_x + dy = obs[0, 5] # target_y - agent_y + + if abs(dx) > abs(dy): + action = 4 if dx > 0 else 3 # RIGHT or LEFT + else: + action = 1 if dy > 0 else 2 # UP or DOWN + + actions = np.array([action]) + obs, rewards, terminals, truncations, infos = env.step(actions) + total_steps += 1 + + if infos: + for info in infos: + episodes_finished += 1 + print(f" Episode finished: {info}") + + print(f" OK: Completed {episodes_finished} episodes in {total_steps} steps") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 5: Test with PufferLib vectorization + print("\n[TEST 5] Testing with pufferlib.vector.make()...") + try: + vecenv = pufferlib.vector.make( + MoveToTargetEnv, + num_envs=4, + backend=pufferlib.vector.Serial, + ) + obs, _ = vecenv.reset() + print(f" OK: Created vectorized env with 4 copies") + print(f" Vectorized observation shape: {obs.shape}") + + # Take a few steps + for i in range(3): + actions = np.random.randint(0, 5, size=4) + obs, rewards, terminals, truncations, infos = vecenv.step(actions) + print(f" OK: Vectorized stepping works") + vecenv.close() + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + print("\n" + "=" * 60) + print("ALL ENVIRONMENT TESTS PASSED!") + print("=" * 60) + return True + + +# ============================================================================= +# SECTION 3: POLICY NETWORK +# ============================================================================= +""" +WHY this specific architecture? +------------------------------- +PufferLib expects policies to follow certain conventions: + +1. 
forward_eval(observations, state=None) -> (logits, values) + - This is what the trainer calls during rollout collection + - Returns action LOGITS (not probabilities) and value estimates + - The `state` parameter is for RNNs (we return None for feedforward) + +2. Use pufferlib.pytorch.layer_init() for weight initialization + - Proper initialization is crucial for stable learning + - Different std values for actor vs critic heads + +WHY layer_init? +--------------- +Neural network initialization matters A LOT for RL: +- Too large weights -> exploding gradients, unstable training +- Too small weights -> vanishing gradients, slow learning +- layer_init uses orthogonal initialization which works well for RL + +ARCHITECTURE: +observation (6) -> encoder (64 -> 64) -> actor head (5) + critic head (1) +""" + + +class Policy(nn.Module): + """ + Actor-Critic policy network following PufferLib conventions. + + The network has: + - Shared encoder: processes observations into features + - Actor head: outputs action logits (5 actions) + - Critic head: outputs value estimate (1 value) + """ + + def __init__(self, env, hidden_size=64): + """ + WHY take env as parameter? + -------------------------- + We extract observation and action sizes from the environment. + This is more robust than hardcoding dimensions. + + PufferLib's vectorized envs provide: + - env.single_observation_space: shape of one agent's observation + - env.single_action_space: the action space for one agent + + For regular Gymnasium envs, these would be observation_space/action_space. + """ + super().__init__() + + # Get dimensions from environment + obs_size = env.single_observation_space.shape[0] + action_size = env.single_action_space.n + + # ----------------------------------------------------------------- + # TODO 3.1: Create the encoder (shared backbone) + # ----------------------------------------------------------------- + # The encoder processes observations into a feature vector. + # Both actor and critic will use these features. + # + # Architecture: Linear(obs_size, hidden_size) -> ReLU -> Linear(hidden_size, hidden_size) -> ReLU + # + # Use pufferlib.pytorch.layer_init() for each Linear layer. + # Default std works for hidden layers. + # + # Example: + # self.encoder = nn.Sequential( + # pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), + # nn.ReLU(), + # pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + # nn.ReLU(), + # ) + # + # YOUR CODE: + + self.encoder = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + nn.ReLU(), + ) + + # ----------------------------------------------------------------- + # TODO 3.2: Create the actor head + # ----------------------------------------------------------------- + # Outputs action logits. Use std=0.01 for small initial outputs. + # WHY small std? We want initial actions to be nearly uniform. + # + # self.actor = pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, action_size), std=0.01 + # ) + # + # YOUR CODE: + + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, action_size), std=0.01 + ) + + # ----------------------------------------------------------------- + # TODO 3.3: Create the critic head + # ----------------------------------------------------------------- + # Outputs value estimate. Use std=1.0 for reasonable initial values. 
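+        # Why the asymmetry between the two heads? The critic only needs
+        # small, roughly unit-scale initial value estimates, so std=1.0 is
+        # fine here, whereas the actor head above uses std=0.01 so the
+        # initial logits sit near zero and the starting policy is close to
+        # uniform.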
+ # + # self.critic = pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, 1), std=1.0 + # ) + # + # YOUR CODE: + + self.critic = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0) + + def forward_eval(self, observations, state=None): + """ + WHY forward_eval specifically? + ------------------------------ + PufferLib's trainer calls forward_eval() during rollout collection. + It expects (logits, values) as return value. + + The state parameter is for recurrent networks (LSTMs). For feedforward + networks like ours, we ignore it and return None. + + PARAMETERS: + - observations: tensor of shape (batch_size, obs_size) + - state: For RNN/LSTM policies, carries hidden state between steps. + For feedforward networks (like ours), always None. + + RETURNS: + - logits: tensor of shape (batch_size, action_size) - unnormalized action scores + - values: tensor of shape (batch_size, 1) - value estimates + """ + # ----------------------------------------------------------------- + # TODO 3.4: Implement forward_eval + # ----------------------------------------------------------------- + # Step 1: Pass observations through encoder + # hidden = self.encoder(observations) + # + # Step 2: Get action logits from actor head + # logits = self.actor(hidden) + # + # Step 3: Get value estimate from critic head + # values = self.critic(hidden) + # + # Step 4: Return (logits, values) + + hidden = self.encoder(observations) + logits = self.actor(hidden) + values = self.critic(hidden) + + return logits, values + + def forward(self, observations, state=None): + """Standard PyTorch forward - required by PufferLib trainer.""" + return self.forward_eval(observations, state) + + +# ============================================================================= +# SECTION 4: TESTING POLICY +# ============================================================================= +""" +WHY test the policy? +-------------------- +Verify the network architecture is correct before training. +Common bugs: +- Wrong input/output dimensions +- Missing activations +- NaN in outputs +""" + + +def test_policy(): + """Run basic sanity checks on the Policy network.""" + print("\n" + "=" * 60) + print("TESTING Policy Network") + print("=" * 60) + + # Test 1: Creation + print("\n[TEST 1] Creating policy...") + try: + # Create a dummy env to get dimensions + env = MoveToTargetEnv() + env.reset() # Initialize the env + + policy = Policy(env, hidden_size=64) + print(f" OK: Created policy") + + # Count parameters + total_params = sum(p.numel() for p in policy.parameters()) + print(f" Total parameters: {total_params}") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 2: forward_eval + print("\n[TEST 2] Testing forward_eval()...") + try: + # Create batch of observations + obs = torch.randn(4, 6) # batch of 4 + logits, values = policy.forward_eval(obs) + + print(f" Input shape: {obs.shape}") + print(f" Logits shape: {logits.shape} (expected: [4, 5])") + print(f" Values shape: {values.shape} (expected: [4, 1])") + + assert logits.shape == (4, 5), f"Wrong logits shape: {logits.shape}" + assert values.shape == (4, 1), f"Wrong values shape: {values.shape}" + + # Check for NaN + assert not torch.isnan(logits).any(), "NaN in logits!" + assert not torch.isnan(values).any(), "NaN in values!" 
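+        # Extra sanity check: backprop a dummy loss so we know gradients
+        # reach every parameter (a detached or frozen head would fail this
+        # assert long before training silently stalls).
+        (logits.sum() + values.sum()).backward()
+        assert all(p.grad is not None for p in policy.parameters()), "Missing gradients!"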
+ + print(" OK: Shapes correct, no NaN") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 3: Single observation + print("\n[TEST 3] Testing with single observation...") + try: + obs = torch.randn(1, 6) + logits, values = policy.forward_eval(obs) + + print(f" Logits: {logits}") + print(f" Value: {values}") + print(" OK: Single observation works") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + print("\n" + "=" * 60) + print("ALL POLICY TESTS PASSED!") + print("=" * 60) + return True + + +# ============================================================================= +# SECTION 5: TRAINING WITH PUFFERLIB +# ============================================================================= +""" +WHY use pufferl.PuffeRL? +------------------------ +PufferLib's trainer handles ALL the RL internals: +- Rollout collection (running envs, storing experiences) +- GAE advantage computation +- PPO loss calculation (clipped surrogate, value loss, entropy) +- Gradient updates with clipping +- Logging and metrics + +This means our training code is MUCH simpler than learn.py! + +THE TRAINING LOOP: +----------------- +1. Create vectorized environment +2. Create policy +3. Create config dict with hyperparameters +4. Create PuffeRL trainer +5. Loop: trainer.evaluate() -> trainer.train() + +WHAT trainer.evaluate() does: +- Runs the policy in all environments +- Collects experiences into buffers +- Computes advantages and returns + +WHAT trainer.train() does: +- Runs PPO update on collected experiences +- Updates policy weights +- Logs metrics +""" + + +def train(quick_test=False): + """ + Main training function using PufferLib's trainer. + + PARAMETERS: + - quick_test: if True, run short training to verify code works + if False, run full training to see actual learning + """ + # ----------------------------------------------------------------- + # Hyperparameters + # ----------------------------------------------------------------- + if quick_test: + total_timesteps = 10000 + num_envs = 4 + else: + total_timesteps = 100000 + num_envs = 8 + + # Detect device + # device = "mps" if torch.backends.mps.is_available() else "cpu" + device = "cpu" + + print("=" * 60) + print("TRAINING WITH PUFFERLIB") + print("=" * 60) + print(f"MPS available: {torch.backends.mps.is_available()}") + print(f"Using device: {device}") + print(f"Total timesteps: {total_timesteps}") + print(f"Num environments: {num_envs}") + print("=" * 60) + + # ----------------------------------------------------------------- + # TODO 5.1: Create vectorized environment + # ----------------------------------------------------------------- + # PufferLib's vector.make() creates multiple environment copies. + # + # Backend options: + # - Serial: Runs envs sequentially. Good for debugging because errors + # appear in the main process with full stack traces. + # - Multiprocessing: Runs envs in parallel. Much faster for many envs, + # but errors in subprocesses are harder to debug. + # + # Tip: Use Serial until your code works, then switch to Multiprocessing. 
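+    # Related knob: the config dict below sets batch_size = num_envs * 128
+    # with bptt_horizon = 128 (presumably one 128-step rollout segment per
+    # environment), so changing num_envs here also scales the batch collected
+    # per update.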
+ # + # vecenv = pufferlib.vector.make( + # MoveToTargetEnv, + # num_envs=num_envs, + # backend=pufferlib.vector.Serial, + # ) + # + # YOUR CODE: + + vecenv = pufferlib.vector.make( + MoveToTargetEnv, num_envs=num_envs, backend=pufferlib.vector.Multiprocessing + ) + + # ----------------------------------------------------------------- + # TODO 5.2: Create policy + # ----------------------------------------------------------------- + # Use vecenv.driver_env to get a reference to one of the environment copies. + # This lets us access single_observation_space and single_action_space + # for creating the policy with correct input/output dimensions. + # Move policy to device for GPU training. + # + # policy = Policy(vecenv.driver_env, hidden_size=64).to(device) + # + # YOUR CODE: + + policy = Policy(vecenv.driver_env, hidden_size=64).to(device) + next(policy.parameters()).device + + # ----------------------------------------------------------------- + # TODO 5.3: Create config + # ----------------------------------------------------------------- + # PufferLib's trainer uses a Config object for hyperparameters. + # These are standard PPO values that work well. + # + # config = pufferl.Config( + # total_timesteps=total_timesteps, + # learning_rate=3e-4, + # num_steps=128, # Steps per rollout + # num_minibatches=4, # Minibatches per update + # update_epochs=4, # PPO epochs per update + # gamma=0.99, # Discount factor + # gae_lambda=0.95, # GAE parameter + # clip_coef=0.2, # PPO clipping + # vf_coef=0.5, # Value loss coefficient + # ent_coef=0.01, # Entropy bonus coefficient + # max_grad_norm=0.5, # Gradient clipping + # ) + # + # YOUR CODE: + + config = { + "env": "MoveToTarget", + "total_timesteps": total_timesteps, + "learning_rate": 3e-4, + "batch_size": num_envs * 128, + "bptt_horizon": 128, + "minibatch_size": 512, + "max_minibatch_size": 512, + "update_epochs": 4, + "gamma": 0.99, + "gae_lambda": 0.95, + "clip_coef": 0.2, + "vf_coef": 0.5, + "vf_clip_coef": 0.2, + "ent_coef": 0.01, + "max_grad_norm": 0.5, + "device": device, + "seed": 42, + "torch_deterministic": True, + "cpu_offload": False, + "use_rnn": False, + "compile": False, + "optimizer": "adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_eps": 1e-8, + "anneal_lr": True, + "vtrace_rho_clip": 1.0, + "vtrace_c_clip": 1.0, + "prio_alpha": 0.8, + "prio_beta0": 0.2, + "checkpoint_interval": 200, + "data_dir": "experiments", + "precision": "float32", + } + + # ----------------------------------------------------------------- + # TODO 5.4: Create trainer + # ----------------------------------------------------------------- + # The PuffeRL trainer handles the entire training loop internals. + # + # trainer = pufferl.PuffeRL( + # config=config, + # vecenv=vecenv, + # policy=policy, + # optimizer=torch.optim.Adam(policy.parameters(), lr=config.learning_rate), + # ) + # + # YOUR CODE: + + trainer = pufferl.PuffeRL(config, vecenv, policy) + + # ----------------------------------------------------------------- + # TODO 5.5: Training loop + # ----------------------------------------------------------------- + # The training loop is very simple with PufferLib: + # 1. trainer.evaluate() - collect experiences + # 2. trainer.train() - run PPO update + # 3. 
Repeat until done + # + # Example: + # while not trainer.done: + # trainer.evaluate() + # trainer.train() + # + # # Print progress every 10 epochs + # if trainer.epoch % 10 == 0: + # # Get metrics from trainer + # metrics = trainer.metrics + # print(f"Epoch {trainer.epoch} | " + # f"reward: {metrics.get('episode_reward', 0):.2f} | " + # f"length: {metrics.get('episode_length', 0):.1f}") + # + # Or use the built-in dashboard: + # while not trainer.done: + # trainer.evaluate() + # trainer.train() + # trainer.print_dashboard() # Pretty-printed metrics + # + # YOUR CODE: + + while trainer.global_step < total_timesteps: + trainer.evaluate() + trainer.train() + + # Cleanup + trainer.close() + vecenv.close() + + print("\n" + "=" * 60) + print("TRAINING COMPLETE!") + print("=" * 60) + + return policy + + +# ============================================================================= +# SECTION 6: EVALUATION WITH ASCII RENDERING +# ============================================================================= + + +def eval_policy(num_episodes=3, delay=0.1): + """ + Run the trained policy and watch it play with ASCII rendering. + + PARAMETERS: + - num_episodes: number of episodes to run + - delay: seconds between frames (for watchability) + """ + import time + import glob + + print("=" * 60) + print("EVALUATING TRAINED POLICY") + print("=" * 60) + + # Find latest checkpoint + checkpoints = glob.glob("experiments/**/model.pt", recursive=True) + if not checkpoints: + print( + "No checkpoint found in experiments/. Train first with 'python learn_v2.py train'" + ) + return + + latest_checkpoint = max(checkpoints, key=lambda x: os.path.getmtime(x)) + print(f"Loading checkpoint: {latest_checkpoint}") + + # Create environment (single, not vectorized) + env = MoveToTargetEnv(seed=int(time.time())) + + # Create and load policy + policy = Policy(env, hidden_size=64) + checkpoint = torch.load(latest_checkpoint, map_location="cpu", weights_only=True) + policy.load_state_dict(checkpoint) + policy.eval() + + print(f"Running {num_episodes} episodes...\n") + + for ep in range(num_episodes): + print(f"\n{'=' * 60}") + print(f"EPISODE {ep + 1}") + print(f"{'=' * 60}") + + obs, _ = env.reset() + env.render() + time.sleep(delay) + + done = False + total_reward = 0.0 + + while not done: + # Get action from policy + with torch.no_grad(): + obs_tensor = torch.from_numpy(obs).float() + logits, _ = policy(obs_tensor) + action = torch.argmax(logits, dim=-1).item() + + # Step environment + obs, rewards, terminals, truncations, infos = env.step(np.array([action])) + total_reward += rewards[0] + done = terminals[0] or truncations[0] + + # Render + env.render() + action_names = ["NOOP", "UP", "DOWN", "LEFT", "RIGHT"] + print(f"Action: {action_names[action]}, Reward: {rewards[0]:.3f}") + time.sleep(delay) + + # Episode summary + if infos: + info = infos[0] + result = ( + "REACHED TARGET!" 
+ if info.get("reached_target") + else "Failed (wall/timeout)" + ) + print(f"\nResult: {result}") + print(f"Episode length: {info.get('episode_length', 'N/A')}") + print(f"Total reward: {total_reward:.3f}") + + env.close() + print("\n" + "=" * 60) + print("EVALUATION COMPLETE!") + print("=" * 60) + + +# ============================================================================= +# MAIN EXECUTION +# ============================================================================= + +if __name__ == "__main__": + import sys + + # Parse command line arguments + if len(sys.argv) > 1: + command = sys.argv[1] + if command == "test": + # Run all tests + env_ok = test_environment() + if env_ok: + test_policy() + elif command == "train": + # Run full training + test_environment() + test_policy() + train(quick_test=False) + elif command == "quick": + # Quick training test + # test_environment() + # test_policy() + train(quick_test=True) + elif command == "eval": + # Evaluate trained policy with ASCII rendering + eval_policy(num_episodes=3, delay=0.1) + else: + print(f"Unknown command: {command}") + print("Usage: python learn_v2.py [test|train|quick|eval]") + else: + # Default: run tests only + print("Running tests... (use 'python learn_v2.py train' for full training)") + print() + env_ok = test_environment() + if env_ok: + test_policy() diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index a9cbcd8b8..48999f6de 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -5,7 +5,7 @@ Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** -The boss has **1 attack** (AOE burst). All hitboxes are circles. +The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). --- @@ -57,6 +57,8 @@ ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement - Circle at `player_pos + facing * 0.7`, radius `0.4` - `facing` = direction to boss at attack start +- Hits boss if circles overlap: `dist(attack, boss) < 0.4 + 0.5` +- **Effective range: 1.6 units from boss center** - Damage: 10 ### Boss Behavior (Single Attack) @@ -73,109 +75,102 @@ RECOVERY: 15 ticks (0.5s) — vulnerable, no damage **AOE Attack:** - Circle centered on boss, radius `1.5` +- Hits player if circles overlap: `dist(player, boss) < 1.5 + 0.3` +- **Effective range: 1.8 units from boss center** - Damage: 20 -- Player takes damage if: in AOE radius AND not in i-frames +- Player avoids damage if: outside range OR in i-frames --- -## Observation Space (14 floats) +## Observation Space (13 floats) -Keep it minimal. You can ablate later. +Raw game state values — let the network learn its own representations. 
``` -Geometry (3): - 0: rel_boss_x = boss_x - player_x (normalized by arena half-size) - 1: rel_boss_y = boss_y - player_y - 2: distance = clamp(dist / 5.0, 0, 1) +Geometry (6): + 0: dx = boss_x - player_x (relative position) + 1: dy = boss_y - player_y + 2: player_x = absolute position [-5, 5] + 3: player_y = absolute position [-5, 5] + 4: boss_x = absolute position (fixed at 0) + 5: boss_y = absolute position (fixed at 0) Player (5): - 3: player_hp = hp / 100 - 4: dodge_ready = 1.0 if can dodge, else 0.0 - 5: player_state = {FREE: 0, DODGE: 0.33, ATTACK: 0.66} # scalar encoding - 6: state_progress = ticks_in_state / state_duration - 7: move_dir_x = -1 to 1 - -Boss (6): - 8: boss_hp = hp / 100 - 9: boss_phase = {IDLE: 0, WINDUP: 0.33, ACTIVE: 0.66, RECOVERY: 1.0} - 10: phase_progress = ticks_in_phase / phase_duration - 11: time_to_damage = ticks until ACTIVE starts / 18 (1.0 during IDLE/RECOVERY) - 12: in_aoe_range = 1.0 if distance < 1.5, else 0.0 - 13: boss_attacking = 1.0 if in WINDUP/ACTIVE, else 0.0 + 6: player_hp = raw HP [0, 100] + 7: boss_hp = raw HP [0, 100] + 8: player_state = enum {IDLING: 0, DODGING: 1, ATTACKING: 2} + 9: player_dodge_cooldown = ticks remaining [0, 15] + 10: player_state_ticks = ticks in current state + +Boss (2): + 11: boss_state = enum {IDLING: 0, WINDING_UP: 1, ATTACKING: 2, RECOVERING: 3} + 12: boss_phase_ticks = ticks in current phase ``` --- -## Reward Function (v1 — HP delta) +## Reward Function -```python -# Per step -reward = 0 -reward += (boss_hp_prev - boss_hp_now) * 0.1 # +1.0 per hit landed -reward += (player_hp_prev - player_hp_now) * -0.1 # -2.0 per AOE hit taken -reward += -0.001 # time penalty +Design your own! Consider these questions: -# Terminal -if boss_hp <= 0: reward += 1.0 # win bonus -if player_hp <= 0: reward -= 1.0 # lose penalty -``` +- **What behaviors do you want to encourage?** (dealing damage, staying alive, winning) +- **What behaviors do you want to discourage?** (taking hits, timing out, being passive) +- **Dense vs sparse?** Should the agent get feedback every step, or only at episode end? +- **Scaling?** How do you balance different reward components so one doesn't dominate? + +Hint: Track HP changes between steps. Think about terminal bonuses. --- ## Episode Termination -- `terminated = True` if player or boss HP <= 0 -- `truncated = True` if ticks >= 900 (30 seconds) +Episodes end when: + +- Someone wins (HP reaches 0) +- Time runs out (prevent infinite episodes) --- -## Implementation (Single File) +## Implementation (C + Python) -Everything in `soulsrl.py` (~250-300 lines): +Core game logic in C with Python bindings: -```python -class SoulsEnv(pufferlib.PufferEnv): - # Player state machine - # Boss state machine - # Collision detection (circle-circle only) - # Observation building - # Reward calculation +``` +boss_fight.h — Game state struct, enums, c_reset(), c_step(), c_render() +boss_fight.c — Standalone test with keyboard input (Shift+WASD/Space/J) +boss_fight.py — PufferLib environment wrapper ``` -No separate core.py, no rendering, no curriculum stages. +Uses Raylib for rendering (1080x720 window @ 30 FPS). --- ## RL Experiments -Once v1 is working, run these experiments to learn RL concepts: +Once v1 is working, design experiments to understand RL concepts: + +### Experiment Ideas -### Experiment 1: Observation Ablations +**Observation Ablations** — Which observations actually matter? 
-| Variant | Change | Hypothesis | -| --------- | --------------------------------------------------------------- | -------------------------------------- | -| no_timing | Remove `time_to_damage`, `phase_progress` | Agent can't learn precise dodge timing | -| no_range | Remove `in_aoe_range`, `distance` | Agent can't learn spacing | -| minimal | Only: `distance`, `time_to_damage`, `dodge_ready`, `boss_phase` | Test minimum viable obs | -| noisy | Add 5 uniform random floats | Network should ignore noise | +- What happens if the agent can't see timing information? +- Does it need absolute position, or is relative enough? +- What's the minimum viable observation space? +- Can the network learn to ignore irrelevant/noisy inputs? -### Experiment 2: Reward Shaping +**Reward Shaping** — How does reward design affect behavior? -| Variant | Change | Hypothesis | -| --------------- | -------------------------------- | -------------------------- | -| sparse | Only win/lose bonus, no HP delta | Much slower learning | -| no_time_penalty | Remove -0.001/step | Agent becomes passive | -| dodge_bonus | +0.2 for dodging during ACTIVE | Might create dodge spam | -| proximity | +0.01 for being close to boss | Might discourage safe play | +- What if you only reward winning/losing (sparse)? +- What happens without a time penalty? +- Can you incentivize specific behaviors (dodging at the right time)? +- What unintended behaviors might reward bonuses create? -### Experiment 3: Hyperparameters +**Hyperparameters** — See `boss_fight.ini` for the sweep config -| Param | Values | What to observe | -| ------------- | ---------------- | --------------------------- | -| learning_rate | 1e-3, 3e-4, 1e-4 | Learning speed vs stability | -| ent_coef | 0.0, 0.01, 0.05 | Exploration vs exploitation | -| num_envs | 8, 32, 128 | Sample efficiency | -| hidden_size | 32, 64, 128 | Model capacity | +- Learning rate: stability vs speed +- Entropy coefficient: exploration vs exploitation +- Batch size / num_envs: sample efficiency +- Network size: capacity vs overfitting --- @@ -202,17 +197,18 @@ Only add these if baseline experiments are done: ## Deliverables -1. `soulsrl.py` — Environment (PufferEnv) -2. `train.py` — Training script with logging -3. `experiments/` — Saved runs with different configs -4. `results.md` — Summary of what you learned from experiments +1. `boss_fight.h` — Core game logic in C +2. `boss_fight.c` — Standalone test binary +3. `boss_fight.py` — PufferLib environment wrapper +4. `experiments/` — Saved runs with different configs +5. `results.md` — Summary of what you learned from experiments --- -## Timeline Estimate +## Milestones -- Day 1: Implement `soulsrl.py`, verify with random agent -- Day 2: Train baseline, confirm learning -- Day 3-4: Run observation ablations -- Day 5-6: Run reward experiments -- Day 7: Document findings, optional extensions +1. **Environment works**: `c_step()` implemented, can play manually with keyboard +2. **Random baseline**: Random agent wins ~0%, confirms game is non-trivial +3. **Learning signal**: Trained agent shows improvement over random +4. **Competent agent**: Win rate >80% +5. 
**Experiments**: At least 3 ablations with documented findings diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index 812e31bb7..45731694b 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -4,7 +4,7 @@ #include "../env_binding.h" static int my_init(Env *env, PyObject *args, PyObject *kwargs) { - env->size = unpack(kwargs, "size"); + // No special init needed for now return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 0e1e152a2..67e6e8d8b 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -1,25 +1,39 @@ #include "boss_fight.h" +#include "raylib.h" int main() { - BossFight env = {.size = 5}; - env.observations = (unsigned char *)calloc(1, sizeof(unsigned char)); - env.actions = (int *)calloc(1, sizeof(int)); - env.rewards = (float *)calloc(1, sizeof(float)); - env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); + int num_obs = 13; + int num_actions = 1; + int num_agents = 1; + BossFight env = {}; + env.observations = (float *)calloc(num_obs, sizeof(unsigned char)); + env.actions = (float *)calloc(num_actions, sizeof(int)); + env.rewards = (float *)calloc(num_agents, sizeof(float)); + env.terminals = (unsigned char *)calloc(num_agents, sizeof(unsigned char)); + + // Always call reset and render first c_reset(&env); c_render(&env); + while (!WindowShouldClose()) { if (IsKeyDown(KEY_LEFT_SHIFT)) { - if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) { - env.actions[0] = 0; - } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) { + if (IsKeyDown(KEY_W)) env.actions[0] = 1; - } else { - env.actions[0] = -1; - } + else if (IsKeyDown(KEY_S)) + env.actions[0] = 2; + else if (IsKeyDown(KEY_A)) + env.actions[0] = 3; + else if (IsKeyDown(KEY_D)) + env.actions[0] = 4; + else if (IsKeyDown(KEY_SPACE)) + env.actions[0] = 5; + else if (IsKeyDown(KEY_J)) + env.actions[0] = 6; + else + env.actions[0] = 0; } else { - env.actions[0] = rand() % 2; + env.actions[0] = rand() % 7; } c_step(&env); c_render(&env); diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 75d2932b1..2c37cb275 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -1,11 +1,29 @@ #include "raylib.h" +#include +#include #include #include -const Color PUFF_RED = (Color){187, 0, 0, 255}; -const Color PUFF_CYAN = (Color){0, 187, 187, 255}; -const Color PUFF_WHITE = (Color){241, 241, 241, 241}; -const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; +#define ARENA_HALF_SIZE 5.0f +#define MAX_HP 100 +#define PLAYER_SPEED_PER_TICK 0.1f +#define PLAYER_SIZE 0.3f +#define BOSS_SIZE 0.5f + +const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; +const Color BOSS_COLOR = (Color){0, 187, 187, 255}; +const Color TEXT_COLOR = (Color){241, 241, 241, 241}; +const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; +const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; + +typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; + +typedef enum { + BOSS_IDLING, + BOSS_WINDING_UP, + BOSS_ATTACKING, + BOSS_RECOVERING, +} BossState; // Only use floats! typedef struct { @@ -14,62 +32,102 @@ typedef struct { } Log; typedef struct { - Log log; // Required field - unsigned char - *observations; // Required field. Ensure type matches in .py and .c - int *actions; // Required field. 
Ensure type matches in .py and .c + Log log; // Required field + float *observations; // Required field. Ensure type matches in .py and .c + float *actions; // Required field. Ensure type matches in .py and .c float *rewards; // Required field unsigned char *terminals; // Required field - int size; - int x; - int goal; + + int tick; + float player_x; + float player_y; + float boss_x; + float boss_y; + float distance; + + PlayerState player_state; + int player_hp; + int player_dodge_cooldown; + int player_state_ticks; + + BossState boss_state; + int boss_hp; + int boss_phase_ticks; + } BossFight; +float rand_uniform(float low, float high) { + return low + (high - low) * ((float)rand() / ((float)RAND_MAX + 1.0f)); +} + +float distance(float x1, float y1, float x2, float y2) { + float dx = x1 - x2; + float dy = y1 - y2; + return sqrtf(dx * dx + dy * dy); +} + void c_reset(BossFight *env) { - env->x = 0; - env->goal = (rand() % 2 == 0) ? env->size : -env->size; + env->tick = 0; + env->player_x = 0; + env->player_y = 0; + env->boss_x = 0; + env->boss_y = 0; + env->player_hp = 100; + env->boss_hp = 100; + env->player_state = PLAYER_IDLING; + env->player_dodge_cooldown = 0; + env->player_state_ticks = 0; + env->boss_state = BOSS_IDLING; + env->boss_phase_ticks = 0; + env->distance = 0; + + env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + + while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) < + 0.1) { + env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + } + + env->distance = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + int obs_idx = 0; + + env->observations[obs_idx++] = 0; // dx + env->observations[obs_idx++] = 0; // dy + env->observations[obs_idx++] = env->player_x; + env->observations[obs_idx++] = env->player_y; + env->observations[obs_idx++] = env->boss_x; + env->observations[obs_idx++] = env->boss_y; + env->observations[obs_idx++] = 100; + env->observations[obs_idx++] = 100; + env->observations[obs_idx++] = PLAYER_IDLING; + env->observations[obs_idx++] = 0; // player_dodge_cooldown + env->observations[obs_idx++] = 0; // player_state_ticks + env->observations[obs_idx++] = BOSS_IDLING; + env->observations[obs_idx++] = 0; // boss_phase_ticks } void c_step(BossFight *env) { env->rewards[0] = 0; env->terminals[0] = 0; - if (env->actions[0] == 0) { - env->x -= 1; - } else if (env->actions[0] == 1) { - env->x += 1; - } - if (env->x == env->goal) { - c_reset(env); - env->rewards[0] = 1; - env->terminals[0] = 1; - env->log.score += 1; - env->log.n += 1; - } else if (env->x == -env->goal) { - c_reset(env); - env->rewards[0] = -1; - env->terminals[0] = 1; - env->log.score -= 1; - env->log.n += 1; - } - env->observations[0] = (env->goal > 0) ? 
1 : -1; } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(1080, 720, "PufferLib Template"); - SetTargetFPS(5); + InitWindow(1080, 720, "BossFight"); + SetTargetFPS(30); } if (IsKeyDown(KEY_ESCAPE)) { exit(0); } - DrawText("Go to the red square!", 20, 20, 20, PUFF_WHITE); - DrawRectangle(540 - 32 + 64 * env->goal, 360 - 32, 64, 64, PUFF_RED); - DrawRectangle(540 - 32 + 64 * env->x, 360 - 32, 64, 64, PUFF_CYAN); - BeginDrawing(); - ClearBackground(PUFF_BACKGROUND); + ClearBackground(BACKGROUND_COLOR); + DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); EndDrawing(); } From 1a67976d811bc6ee9e64607ce9d2688691a6581a Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 20 Jan 2026 02:11:24 +0500 Subject: [PATCH 05/29] wip: step function --- pufferlib/ocean/boss_fight/boss_fight.c | 4 +- pufferlib/ocean/boss_fight/boss_fight.h | 72 +++++++++++++++++++----- pufferlib/ocean/boss_fight/boss_fight.py | 4 +- 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 67e6e8d8b..49198b733 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -7,8 +7,8 @@ int main() { int num_agents = 1; BossFight env = {}; - env.observations = (float *)calloc(num_obs, sizeof(unsigned char)); - env.actions = (float *)calloc(num_actions, sizeof(int)); + env.observations = (float *)calloc(num_obs, sizeof(float)); + env.actions = (int *)calloc(num_actions, sizeof(float)); env.rewards = (float *)calloc(num_agents, sizeof(float)); env.terminals = (unsigned char *)calloc(num_agents, sizeof(unsigned char)); diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 2c37cb275..353e98db3 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -34,7 +34,7 @@ typedef struct { typedef struct { Log log; // Required field float *observations; // Required field. Ensure type matches in .py and .c - float *actions; // Required field. Ensure type matches in .py and .c + int *actions; // Required field. 
Ensure type matches in .py and .c float *rewards; // Required field unsigned char *terminals; // Required field @@ -72,8 +72,8 @@ void c_reset(BossFight *env) { env->player_y = 0; env->boss_x = 0; env->boss_y = 0; - env->player_hp = 100; - env->boss_hp = 100; + env->player_hp = MAX_HP; + env->boss_hp = MAX_HP; env->player_state = PLAYER_IDLING; env->player_dodge_cooldown = 0; env->player_state_ticks = 0; @@ -95,24 +95,70 @@ void c_reset(BossFight *env) { int obs_idx = 0; - env->observations[obs_idx++] = 0; // dx - env->observations[obs_idx++] = 0; // dy + env->observations[obs_idx++] = env->boss_x - env->player_x; + env->observations[obs_idx++] = env->boss_y - env->player_y; env->observations[obs_idx++] = env->player_x; env->observations[obs_idx++] = env->player_y; env->observations[obs_idx++] = env->boss_x; env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = 100; - env->observations[obs_idx++] = 100; - env->observations[obs_idx++] = PLAYER_IDLING; - env->observations[obs_idx++] = 0; // player_dodge_cooldown - env->observations[obs_idx++] = 0; // player_state_ticks - env->observations[obs_idx++] = BOSS_IDLING; - env->observations[obs_idx++] = 0; // boss_phase_ticks + env->observations[obs_idx++] = (float)env->player_hp; + env->observations[obs_idx++] = (float)env->boss_hp; + env->observations[obs_idx++] = (float)env->player_state; + env->observations[obs_idx++] = (float)env->player_dodge_cooldown; + env->observations[obs_idx++] = (float)env->player_state_ticks; + env->observations[obs_idx++] = (float)env->boss_state; + env->observations[obs_idx++] = (float)env->boss_phase_ticks; } void c_step(BossFight *env) { - env->rewards[0] = 0; + float reward = -0.01; env->terminals[0] = 0; + + int action = env->actions[0]; + float dx = 0; + float dy = 0; + + if (action == 1) { + dy = PLAYER_SPEED_PER_TICK; + } else if (action == 2) { + dy = -PLAYER_SPEED_PER_TICK; + } else if (action == 3) { + dx = -PLAYER_SPEED_PER_TICK; + } else if (action == 4) { + dx = PLAYER_SPEED_PER_TICK; + } + + env->player_x += dx; + env->player_y += dy; + + bool wanna_idle = action == 0; + bool wanna_dodge = action == 5; + bool wanna_attack = action == 6; + bool can_dodge = + env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; + bool can_attack = env->player_state == PLAYER_IDLING; + + bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE && + fabsf(env->player_y) > ARENA_HALF_SIZE; + if (hit_wall) { + reward -= 0.5; + } + + // TODO: here i should handle "player attacks and reduces boss hp" case + + bool killed_boss = env->boss_hp == 0; + if (killed_boss) { + reward += 2; + } + + env->rewards[0] = reward; + + bool player_died = env->player_hp == 0; + if (player_died) { + env->terminals[0] = 1; + } + + env->tick++; } void c_render(BossFight *env) { diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index a952dbf41..2e6ce6681 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -12,9 +12,9 @@ def __init__( self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( - low=0, high=1, shape=(1,), dtype=np.uint8 + low=-10, high=110, shape=(13,), dtype=np.float32 ) - self.single_action_space = gymnasium.spaces.Discrete(2) + self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = render_mode self.num_agents = num_envs From b884f4b0d0af71b3949df72201930b5bcf52a09a Mon Sep 17 00:00:00 2001 From: 
frixaco Date: Tue, 20 Jan 2026 23:55:29 +0500 Subject: [PATCH 06/29] finish step function --- pufferlib/ocean/boss_fight/README.md | 86 ---------------- pufferlib/ocean/boss_fight/boss_fight.h | 127 +++++++++++++++++++----- 2 files changed, 103 insertions(+), 110 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index 48999f6de..3d5d189e6 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -48,19 +48,6 @@ DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_di ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement ``` -**Cooldowns:** - -- Dodge: 15 ticks after dodge ends -- Attack: No cooldown (but you're locked for 13 ticks) - -**Attack hitbox (during ACTIVE):** - -- Circle at `player_pos + facing * 0.7`, radius `0.4` -- `facing` = direction to boss at attack start -- Hits boss if circles overlap: `dist(attack, boss) < 0.4 + 0.5` -- **Effective range: 1.6 units from boss center** -- Damage: 10 - ### Boss Behavior (Single Attack) Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` @@ -72,79 +59,6 @@ ACTIVE: 3 ticks (0.1s) — AOE hits RECOVERY: 15 ticks (0.5s) — vulnerable, no damage ``` -**AOE Attack:** - -- Circle centered on boss, radius `1.5` -- Hits player if circles overlap: `dist(player, boss) < 1.5 + 0.3` -- **Effective range: 1.8 units from boss center** -- Damage: 20 -- Player avoids damage if: outside range OR in i-frames - ---- - -## Observation Space (13 floats) - -Raw game state values — let the network learn its own representations. - -``` -Geometry (6): - 0: dx = boss_x - player_x (relative position) - 1: dy = boss_y - player_y - 2: player_x = absolute position [-5, 5] - 3: player_y = absolute position [-5, 5] - 4: boss_x = absolute position (fixed at 0) - 5: boss_y = absolute position (fixed at 0) - -Player (5): - 6: player_hp = raw HP [0, 100] - 7: boss_hp = raw HP [0, 100] - 8: player_state = enum {IDLING: 0, DODGING: 1, ATTACKING: 2} - 9: player_dodge_cooldown = ticks remaining [0, 15] - 10: player_state_ticks = ticks in current state - -Boss (2): - 11: boss_state = enum {IDLING: 0, WINDING_UP: 1, ATTACKING: 2, RECOVERING: 3} - 12: boss_phase_ticks = ticks in current phase -``` - ---- - -## Reward Function - -Design your own! Consider these questions: - -- **What behaviors do you want to encourage?** (dealing damage, staying alive, winning) -- **What behaviors do you want to discourage?** (taking hits, timing out, being passive) -- **Dense vs sparse?** Should the agent get feedback every step, or only at episode end? -- **Scaling?** How do you balance different reward components so one doesn't dominate? - -Hint: Track HP changes between steps. Think about terminal bonuses. - ---- - -## Episode Termination - -Episodes end when: - -- Someone wins (HP reaches 0) -- Time runs out (prevent infinite episodes) - ---- - -## Implementation (C + Python) - -Core game logic in C with Python bindings: - -``` -boss_fight.h — Game state struct, enums, c_reset(), c_step(), c_render() -boss_fight.c — Standalone test with keyboard input (Shift+WASD/Space/J) -boss_fight.py — PufferLib environment wrapper -``` - -Uses Raylib for rendering (1080x720 window @ 30 FPS). 
- ---- - ## RL Experiments Once v1 is working, design experiments to understand RL concepts: diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 353e98db3..ec8357067 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -9,6 +9,17 @@ #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f +#define PLAYER_ATTACK_RADIUS 0.1f +#define PLAYER_ATTACK_TICKS 3 +#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_COOLDOWN 15 +#define PLAYER_ATTACK_DMG 3 +#define BOSS_ATTACK_DMG 3 +#define BOSS_AOE_ATTACK_RADIUS 0.7f +#define BOSS_IDLE_TICKS 12 +#define BOSS_WINDUP_TICKS 18 +#define BOSS_ACTIVE_TICKS 3 +#define BOSS_RECOVERY_TICKS 12 const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -43,7 +54,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - float distance; + // float distance; PlayerState player_state; int player_hp; @@ -66,6 +77,23 @@ float distance(float x1, float y1, float x2, float y2) { return sqrtf(dx * dx + dy * dy); } +void update_observations(BossFight *env) { + int obs_idx = 0; + env->observations[obs_idx++] = env->boss_x - env->player_x; + env->observations[obs_idx++] = env->boss_y - env->player_y; + env->observations[obs_idx++] = env->player_x; + env->observations[obs_idx++] = env->player_y; + env->observations[obs_idx++] = env->boss_x; + env->observations[obs_idx++] = env->boss_y; + env->observations[obs_idx++] = (float)env->player_hp; + env->observations[obs_idx++] = (float)env->boss_hp; + env->observations[obs_idx++] = (float)env->player_state; + env->observations[obs_idx++] = (float)env->player_dodge_cooldown; + env->observations[obs_idx++] = (float)env->player_state_ticks; + env->observations[obs_idx++] = (float)env->boss_state; + env->observations[obs_idx++] = (float)env->boss_phase_ticks; +} + void c_reset(BossFight *env) { env->tick = 0; env->player_x = 0; @@ -79,7 +107,7 @@ void c_reset(BossFight *env) { env->player_state_ticks = 0; env->boss_state = BOSS_IDLING; env->boss_phase_ticks = 0; - env->distance = 0; + // env->distance = 0; env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); @@ -90,24 +118,10 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - env->distance = - distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - - int obs_idx = 0; + // env->distance = + // distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - env->observations[obs_idx++] = env->boss_x - env->player_x; - env->observations[obs_idx++] = env->boss_y - env->player_y; - env->observations[obs_idx++] = env->player_x; - env->observations[obs_idx++] = env->player_y; - env->observations[obs_idx++] = env->boss_x; - env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = (float)env->player_hp; - env->observations[obs_idx++] = (float)env->boss_hp; - env->observations[obs_idx++] = (float)env->player_state; - env->observations[obs_idx++] = (float)env->player_dodge_cooldown; - env->observations[obs_idx++] = (float)env->player_state_ticks; - env->observations[obs_idx++] = (float)env->boss_state; - env->observations[obs_idx++] = (float)env->boss_phase_ticks; + update_observations(env); } void c_step(BossFight *env) { @@ -137,33 +151,98 @@ void c_step(BossFight *env) { bool can_dodge = env->player_state == PLAYER_IDLING && env->player_dodge_cooldown 
== 0; bool can_attack = env->player_state == PLAYER_IDLING; + bool close_enough = distance(env->player_x, env->player_y, env->boss_x, + env->boss_y) < PLAYER_ATTACK_RADIUS; - bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE && + bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { reward -= 0.5; } + // can't walk out of bounds + env->player_x = + fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); + env->player_y = + fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + + if (wanna_attack && can_attack && close_enough) { + env->boss_hp -= PLAYER_ATTACK_DMG; + } - // TODO: here i should handle "player attacks and reduces boss hp" case + bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, + env->boss_y) <= BOSS_AOE_ATTACK_RADIUS; + bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; + bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; + if (boss_can_damage) { + env->player_hp -= BOSS_ATTACK_DMG; + } - bool killed_boss = env->boss_hp == 0; + bool killed_boss = env->boss_hp <= 0; if (killed_boss) { reward += 2; + env->terminals[0] = 1; } env->rewards[0] = reward; - bool player_died = env->player_hp == 0; + bool player_died = env->player_hp <= 0; if (player_died) { env->terminals[0] = 1; } + if (wanna_attack && can_attack) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; + } + if (wanna_dodge && can_dodge) { + env->player_state_ticks = PLAYER_DODGE_TICKS; + env->player_state = PLAYER_DODGING; + } + if (env->player_state == PLAYER_DODGING && env->player_state_ticks == 0) { + env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; + env->player_state = PLAYER_IDLING; + } + if (env->player_state == PLAYER_ATTACKING && env->player_state_ticks == 0) { + env->player_state = PLAYER_IDLING; + } + + if (env->boss_phase_ticks == 0) { + if (env->boss_state == BOSS_IDLING) { + env->boss_state = BOSS_WINDING_UP; + env->boss_phase_ticks = BOSS_WINDUP_TICKS; + } else if (env->boss_state == BOSS_WINDING_UP) { + env->boss_state = BOSS_ATTACKING; + env->boss_phase_ticks = BOSS_ACTIVE_TICKS; + } else if (env->boss_state == BOSS_ATTACKING) { + env->boss_state = BOSS_RECOVERING; + env->boss_phase_ticks = BOSS_RECOVERY_TICKS; + } else if (env->boss_state == BOSS_RECOVERING) { + env->boss_state = BOSS_IDLING; + env->boss_phase_ticks = BOSS_IDLE_TICKS; + } + } + env->tick++; + if (env->boss_phase_ticks > 0) { + env->boss_phase_ticks--; + } + if (env->player_state_ticks > 0) { + env->player_state_ticks--; + } + if (env->player_dodge_cooldown > 0) { + env->player_dodge_cooldown--; + } + + if (env->tick >= 1500) { + env->terminals[0] = 1; + } + + update_observations(env); } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(1080, 720, "BossFight"); + InitWindow(720, 720, "BossFight"); SetTargetFPS(30); } From e80ecdb446983b60872d38db96b8257ca2a98818 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 00:28:02 +0500 Subject: [PATCH 07/29] attemps to fix agent not learning --- pufferlib/ocean/boss_fight/binding.c | 2 ++ pufferlib/ocean/boss_fight/boss_fight.h | 42 +++++++++++++++++++----- pufferlib/ocean/boss_fight/boss_fight.py | 7 +++- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index 45731694b..b037d8c14 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -10,5 
+10,7 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { static int my_log(PyObject *dict, Log *log) { assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index ec8357067..fbc9a4767 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -9,7 +9,7 @@ #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.1f +#define PLAYER_ATTACK_RADIUS 0.5f #define PLAYER_ATTACK_TICKS 3 #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 @@ -38,8 +38,11 @@ typedef enum { // Only use floats! typedef struct { - float score; - float n; // Required as the last field + float perf; // 0-1 normalized metric + float score; // unnormalized metric + float episode_return; // sum of rewards + float episode_length; // steps per episode + float n; // Required as last field } Log; typedef struct { @@ -65,6 +68,8 @@ typedef struct { int boss_hp; int boss_phase_ticks; + float episode_return; // track within episode + } BossFight; float rand_uniform(float low, float high) { @@ -77,6 +82,13 @@ float distance(float x1, float y1, float x2, float y2) { return sqrtf(dx * dx + dy * dy); } +void add_log(BossFight *env) { + env->log.episode_return += env->episode_return; + env->log.episode_length += env->tick; + env->log.score += env->episode_return; + env->log.n++; +} + void update_observations(BossFight *env) { int obs_idx = 0; env->observations[obs_idx++] = env->boss_x - env->player_x; @@ -106,8 +118,8 @@ void c_reset(BossFight *env) { env->player_dodge_cooldown = 0; env->player_state_ticks = 0; env->boss_state = BOSS_IDLING; - env->boss_phase_ticks = 0; - // env->distance = 0; + env->boss_phase_ticks = BOSS_IDLE_TICKS; + env->episode_return = 0; env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); @@ -167,6 +179,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; + reward += 0.5; } bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, @@ -184,12 +197,23 @@ void c_step(BossFight *env) { } env->rewards[0] = reward; + env->episode_return += reward; bool player_died = env->player_hp <= 0; if (player_died) { env->terminals[0] = 1; } + if (env->tick >= 300) { + env->terminals[0] = 1; + } + + if (env->terminals[0] == 1) { + add_log(env); + c_reset(env); + return; + } + if (wanna_attack && can_attack) { env->player_state_ticks = PLAYER_ATTACK_TICKS; env->player_state = PLAYER_ATTACKING; @@ -233,10 +257,6 @@ void c_step(BossFight *env) { env->player_dodge_cooldown--; } - if (env->tick >= 1500) { - env->terminals[0] = 1; - } - update_observations(env); } @@ -251,8 +271,12 @@ void c_render(BossFight *env) { } BeginDrawing(); + ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + + // DrawCircle(int centerX, int centerY, float radius, Color color) + EndDrawing(); } diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 2e6ce6681..7f6b51667 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -17,6 +17,8 @@ def __init__( self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = 
render_mode self.num_agents = num_envs + self.log_interval = log_interval + self.tick = 0 super().__init__(buf) self.c_envs = binding.vec_init( @@ -38,7 +40,10 @@ def reset(self, seed=0): def step(self, actions): self.actions[:] = actions binding.vec_step(self.c_envs) - info = [binding.vec_log(self.c_envs)] + self.tick += 1 + info = [] + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): From 8d9a6c0555301ec70eb34493584e867a6dd164de Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:06:52 +0500 Subject: [PATCH 08/29] adjust distance calcs --- pufferlib/ocean/boss_fight/boss_fight.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index fbc9a4767..1d2be24b5 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -124,8 +124,9 @@ void c_reset(BossFight *env) { env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); - while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) < - 0.1) { + while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) <= + PLAYER_SIZE + PLAYER_ATTACK_RADIUS + BOSS_SIZE + + BOSS_AOE_ATTACK_RADIUS) { env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } @@ -163,8 +164,11 @@ void c_step(BossFight *env) { bool can_dodge = env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; - bool close_enough = distance(env->player_x, env->player_y, env->boss_x, - env->boss_y) < PLAYER_ATTACK_RADIUS; + + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + bool close_enough = dist < BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE && + dist > BOSS_SIZE + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; @@ -182,8 +186,9 @@ void c_step(BossFight *env) { reward += 0.5; } - bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, - env->boss_y) <= BOSS_AOE_ATTACK_RADIUS; + bool in_aoe_attack = + dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS && + dist > BOSS_SIZE + PLAYER_SIZE; bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { From 8edc6055d92d95d9893b728e5b933e908845f48f Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:07:00 +0500 Subject: [PATCH 09/29] add raylib ui --- pufferlib/ocean/boss_fight/boss_fight.h | 28 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 1d2be24b5..95e6806ce 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -167,8 +167,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - bool close_enough = dist < BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE && - dist > BOSS_SIZE + PLAYER_SIZE; + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || 
fabsf(env->player_y) > ARENA_HALF_SIZE; @@ -186,9 +185,7 @@ void c_step(BossFight *env) { reward += 0.5; } - bool in_aoe_attack = - dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS && - dist > BOSS_SIZE + PLAYER_SIZE; + bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { @@ -265,6 +262,15 @@ void c_step(BossFight *env) { update_observations(env); } +int world_to_screen(float world_coord) { + return (int)((world_coord + ARENA_HALF_SIZE) / (2 * ARENA_HALF_SIZE) * + 720.0f); +} + +float radius_to_screen(float world_radius) { + return world_radius / (2 * ARENA_HALF_SIZE) * 720.0f; +} + void c_render(BossFight *env) { if (!IsWindowReady()) { InitWindow(720, 720, "BossFight"); @@ -280,7 +286,17 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - // DrawCircle(int centerX, int centerY, float radius, Color color) + DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), + radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); + + DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), + radius_to_screen(BOSS_SIZE), BOSS_COLOR); EndDrawing(); } From 8c78b11aeaff7d2c62c06b483bc66cb9597a28e3 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:10:50 +0500 Subject: [PATCH 10/29] reward for getting closer --- pufferlib/ocean/boss_fight/boss_fight.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 95e6806ce..0c62149e4 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -57,7 +57,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - // float distance; + float prev_distance; PlayerState player_state; int player_hp; @@ -131,8 +131,8 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - // env->distance = - // distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + env->prev_distance = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); update_observations(env); } @@ -167,6 +167,11 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + if (dist < env->prev_distance) { + reward += 0.5; + } + env->prev_distance = dist; + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || From 9f273df2d73fee376eba7105077ac58a139e9dcd Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:33:29 +0500 Subject: [PATCH 11/29] adjust rewards; speed up training --- pufferlib/config/boss_fight.ini | 4 ++-- pufferlib/ocean/boss_fight/boss_fight.h | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index fcfe697cc..c0913bb62 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,7 +5,7 @@ policy_name = Policy # 
rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 112 +num_envs = 448 num_workers = 14 batch_size = auto zero_copy = True @@ -30,7 +30,7 @@ checkpoint_interval = 200 seed = 42 # TODO: disable for sweep or speed torch_deterministic = True -device = mps +device = cpu # Optimization # TODO: try muon with 0.015 lr diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 0c62149e4..271c72676 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -42,6 +42,7 @@ typedef struct { float score; // unnormalized metric float episode_return; // sum of rewards float episode_length; // steps per episode + float wins; // episodes where boss died float n; // Required as last field } Log; @@ -168,7 +169,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.5; + reward += 0.3; } env->prev_distance = dist; @@ -177,7 +178,7 @@ void c_step(BossFight *env) { bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { - reward -= 0.5; + reward -= 1; } // can't walk out of bounds env->player_x = @@ -187,7 +188,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; - reward += 0.5; + reward += 1; } bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; @@ -195,6 +196,7 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; + reward -= 0.5; } bool killed_boss = env->boss_hp <= 0; From 828073d938960c225f2a97682a4ae410279dccf3 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:25:20 +0500 Subject: [PATCH 12/29] actually fix agent not learning --- pufferlib/ocean/boss_fight/binding.c | 1 + pufferlib/ocean/boss_fight/boss_fight.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index b037d8c14..011e50928 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -12,5 +12,6 @@ static int my_log(PyObject *dict, Log *log) { assign_to_dict(dict, "score", log->score); assign_to_dict(dict, "episode_return", log->episode_return); assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "wins", log->wins); return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 271c72676..f225f2272 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -87,6 +87,7 @@ void add_log(BossFight *env) { env->log.episode_return += env->episode_return; env->log.episode_length += env->tick; env->log.score += env->episode_return; + env->log.wins += (env->boss_hp <= 0) ? 
1.0f : 0.0f; env->log.n++; } @@ -169,7 +170,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.3; + reward += 0.01; // small hint, not main reward } env->prev_distance = dist; @@ -201,7 +202,7 @@ void c_step(BossFight *env) { bool killed_boss = env->boss_hp <= 0; if (killed_boss) { - reward += 2; + reward += 10; // main goal - make it big env->terminals[0] = 1; } From f3c132d5f8a543b517d7b3974491c5c11712c805 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:25:45 +0500 Subject: [PATCH 13/29] add collision; add hp bars --- AGENTS.md | 6 +++ pufferlib/ocean/boss_fight/boss_fight.h | 53 ++++++++++++++++++++----- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0712dc874..52f2b5d9a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,3 +50,9 @@ Train and check scores: ``` puffer train puffer_boss_fight --train.total-timesteps 50000 ``` + +## Eval + +``` +puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) +``` diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index f225f2272..1cf5d8c76 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -26,6 +26,7 @@ const Color BOSS_COLOR = (Color){0, 187, 187, 255}; const Color TEXT_COLOR = (Color){241, 241, 241, 241}; const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; +const Color HP_COLOR = (Color){0, 255, 0, 255}; typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; @@ -187,6 +188,17 @@ void c_step(BossFight *env) { env->player_y = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + // push player out if clipping into boss + if (dist < BOSS_SIZE + PLAYER_SIZE) { + float overlap = BOSS_SIZE + PLAYER_SIZE - dist; + float dx = env->player_x - env->boss_x; + float dy = env->player_y - env->boss_y; + env->player_x += (dx / dist) * overlap; + env->player_y += (dy / dist) * overlap; + // recalculate distance after push + dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + } + if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; reward += 1; @@ -294,17 +306,36 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), - radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); - - DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), - radius_to_screen(BOSS_SIZE), BOSS_COLOR); + #define HP_BAR_WIDTH 40 + #define HP_BAR_HEIGHT 5 + + // Player + int player_sx = world_to_screen(env->player_x); + int player_sy = world_to_screen(env->player_y); + int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; + int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + + DrawCircle(player_sx, player_sy, + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), HITBOX_COLOR); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), 
PLAYER_COLOR); + DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, + HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, + player_hp_width, HP_BAR_HEIGHT, HP_COLOR); + + // Boss + int boss_sx = world_to_screen(env->boss_x); + int boss_sy = world_to_screen(env->boss_y); + int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; + int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + + DrawCircle(boss_sx, boss_sy, + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), HITBOX_COLOR); + DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), BOSS_COLOR); + DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, + HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, + boss_hp_width, HP_BAR_HEIGHT, HP_COLOR); EndDrawing(); } From 20ce3fad36029f37e19adba2a8a33a5179027a40 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:53:16 +0500 Subject: [PATCH 14/29] add reward for dodging --- pufferlib/ocean/boss_fight/boss_fight.h | 57 +++++++++++++++---------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 1cf5d8c76..9698ef558 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,17 +14,19 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 3 +#define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 -#define BOSS_WINDUP_TICKS 18 +#define BOSS_WINDUP_TICKS 10 #define BOSS_ACTIVE_TICKS 3 -#define BOSS_RECOVERY_TICKS 12 +#define BOSS_RECOVERY_TICKS 10 +#define HP_BAR_WIDTH 40 +#define HP_BAR_HEIGHT 5 -const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; +const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; -const Color TEXT_COLOR = (Color){241, 241, 241, 241}; -const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; +const Color TEXT_COLOR = (Color){241, 241, 241, 255}; +const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; const Color HP_COLOR = (Color){0, 255, 0, 255}; @@ -209,7 +211,15 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 0.5; + reward -= 5; // make tanking hurt more + } + + // reward for successfully dodging an attack + bool dodged_attack = env->player_state == PLAYER_DODGING && + env->boss_state == BOSS_ATTACKING && + in_aoe_attack; + if (dodged_attack) { + reward += 2; // incentivize dodge timing } bool killed_boss = env->boss_hp <= 0; @@ -306,22 +316,17 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - #define HP_BAR_WIDTH 40 - #define HP_BAR_HEIGHT 5 - // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + Color player_color = env->player_hp <= 0 ? 
RED : PLAYER_COLOR; DrawCircle(player_sx, player_sy, - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), HITBOX_COLOR); - DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); - DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, - HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, - player_hp_width, HP_BAR_HEIGHT, HP_COLOR); + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_color); // Boss int boss_sx = world_to_screen(env->boss_x); @@ -329,13 +334,21 @@ void c_render(BossFight *env) { int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + Color boss_color = env->boss_hp <= 0 ? RED : BOSS_COLOR; DrawCircle(boss_sx, boss_sy, - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), HITBOX_COLOR); - DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), BOSS_COLOR); - DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, - HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, - boss_hp_width, HP_BAR_HEIGHT, HP_COLOR); + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), boss_color); + + // Player HP bar - bottom left + DrawText("Player", 20, 680, 16, TEXT_COLOR); + DrawRectangle(20, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(20, 700, player_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + + // Boss HP bar - bottom right + DrawText("Boss", 580, 680, 16, TEXT_COLOR); + DrawRectangle(580, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(580, 700, boss_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); EndDrawing(); } From 125e492acf489841f032e7942e0e461d6b8a0529 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 03:02:39 +0500 Subject: [PATCH 15/29] show wins/losses/timeouts in ui --- pufferlib/ocean/boss_fight/boss_fight.h | 31 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 9698ef558..2255c72ed 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -74,6 +74,10 @@ typedef struct { float episode_return; // track within episode + // stats + int player_wins; + int boss_wins; + int timeouts; } BossFight; float rand_uniform(float low, float high) { @@ -223,23 +227,24 @@ void c_step(BossFight *env) { } bool killed_boss = env->boss_hp <= 0; + bool player_died = env->player_hp <= 0; + bool timed_out = env->tick >= 300; + if (killed_boss) { - reward += 10; // main goal - make it big + reward += 10; + env->terminals[0] = 1; + env->player_wins++; + } else if (player_died) { env->terminals[0] = 1; + env->boss_wins++; + } else if (timed_out) { + env->terminals[0] = 1; + env->timeouts++; } env->rewards[0] = reward; env->episode_return += reward; - bool player_died = env->player_hp <= 0; - if (player_died) { - env->terminals[0] = 1; - } - - if (env->tick >= 300) { - env->terminals[0] = 1; - } - if (env->terminals[0] == 1) { add_log(env); c_reset(env); @@ -316,6 +321,12 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + // Stats top-right + char stats[64]; + snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", + env->player_wins, 
env->boss_wins, env->timeouts); + DrawText(stats, 580, 20, 20, TEXT_COLOR); + // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); From 5669bea91e4cd49f262d625d2b4f996b2d8646bb Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 04:23:38 +0500 Subject: [PATCH 16/29] wip: reward shaping --- pufferlib/ocean/boss_fight/boss_fight.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 2255c72ed..67caa7af7 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,7 +14,7 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 10 +#define BOSS_ATTACK_DMG 30 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 #define BOSS_WINDUP_TICKS 10 @@ -215,15 +215,14 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 5; // make tanking hurt more + reward -= 5; // make tanking hurt more } // reward for successfully dodging an attack - bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && - in_aoe_attack; + bool dodged_attack = env->player_state == PLAYER_DODGING && + env->boss_state == BOSS_ATTACKING && in_aoe_attack; if (dodged_attack) { - reward += 2; // incentivize dodge timing + reward += 5; // incentivize dodge timing } bool killed_boss = env->boss_hp <= 0; @@ -323,8 +322,8 @@ void c_render(BossFight *env) { // Stats top-right char stats[64]; - snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", - env->player_wins, env->boss_wins, env->timeouts); + snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", env->player_wins, + env->boss_wins, env->timeouts); DrawText(stats, 580, 20, 20, TEXT_COLOR); // Player From 7a2438718d3be210bf70eb28e00b52a8f9ec78f4 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 10:33:55 +0500 Subject: [PATCH 17/29] runpod instructions; more live logging --- pufferlib/ocean/boss_fight/boss_fight.py | 2 +- runpod.md | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 runpod.md diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 7f6b51667..fdb4bfb4f 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -9,7 +9,7 @@ class BossFight(pufferlib.PufferEnv): def __init__( - self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 + self, num_envs=1, render_mode=None, log_interval=1, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( low=-10, high=110, shape=(13,), dtype=np.float32 diff --git a/runpod.md b/runpod.md new file mode 100644 index 000000000..2ce8814ec --- /dev/null +++ b/runpod.md @@ -0,0 +1,12 @@ +curl -LsSf https://astral.sh/uv/install.sh | sh +source ~/.bashrc +git clone https://github.com/frixaco/PufferLib +cd PufferLib +git switch boss-fight +uv venv +source .venv/bin/activate +uv pip install -e . 
+python setup.py build_boss_fight --inplace --force +puffer train puffer_boss_fight --train.total-timestamps 5000000 --train.device cuda --vec.num-envs 8192 --vec.workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 + +puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 46077143ac79b5f1ab14a53d2ecad2f3f4f5a68a Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 10:41:37 +0500 Subject: [PATCH 18/29] force player to dodge more --- pufferlib/ocean/boss_fight/boss_fight.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 67caa7af7..4ed14d83f 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,7 +14,7 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 30 +#define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 #define BOSS_WINDUP_TICKS 10 @@ -215,7 +215,7 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 5; // make tanking hurt more + reward -= 8; // make tanking expensive but survivable } // reward for successfully dodging an attack From 615afdb1e00f8d09cd911dc383ab10ea41905fc1 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 12:05:54 +0500 Subject: [PATCH 19/29] wip: agent not learning --- pufferlib/ocean/boss_fight/boss_fight.h | 80 +++++++++++++++---------- runpod.md | 2 +- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 4ed14d83f..37348fe9c 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -16,13 +16,24 @@ #define PLAYER_ATTACK_DMG 3 #define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f -#define BOSS_IDLE_TICKS 12 -#define BOSS_WINDUP_TICKS 10 -#define BOSS_ACTIVE_TICKS 3 -#define BOSS_RECOVERY_TICKS 10 +#define BOSS_IDLE_TICKS 7 +#define BOSS_WINDUP_TICKS 5 +#define BOSS_ACTIVE_TICKS 5 +#define BOSS_RECOVERY_TICKS 5 #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 +// Rewards +#define REWARD_APPROACH 0.01f +#define REWARD_HIT_WALL -1.0f +#define REWARD_PLAYER_HIT_BOSS 1.0f +#define REWARD_BOSS_HIT_PLAYER -2.0f +#define REWARD_DODGE_SUCCESS 1.0f +#define REWARD_KILL_BOSS 10.0f +#define REWARD_PLAYER_DIED -10.0f +#define REWARD_TIMEOUT -10.0f +#define EPISODE_LENGTH 500 + const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; const Color TEXT_COLOR = (Color){241, 241, 241, 255}; @@ -174,10 +185,19 @@ void c_step(BossFight *env) { env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; + if (wanna_attack && can_attack) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; + } + if (wanna_dodge && can_dodge) { + env->player_state_ticks = PLAYER_DODGE_TICKS; + env->player_state = PLAYER_DODGING; + } + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.01; // small hint, not main reward + reward += REWARD_APPROACH; } env->prev_distance = dist; @@ -186,7 +206,7 @@ void c_step(BossFight *env) { bool hit_wall = 
fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { - reward -= 1; + reward += REWARD_HIT_WALL; } // can't walk out of bounds env->player_x = @@ -207,7 +227,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; - reward += 1; + reward += REWARD_PLAYER_HIT_BOSS; } bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; @@ -215,28 +235,31 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 8; // make tanking expensive but survivable + reward += REWARD_BOSS_HIT_PLAYER; } // reward for successfully dodging an attack bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && in_aoe_attack; + env->boss_state == BOSS_ATTACKING && in_aoe_attack && + env->boss_phase_ticks == BOSS_ACTIVE_TICKS; if (dodged_attack) { - reward += 5; // incentivize dodge timing + reward += REWARD_DODGE_SUCCESS; } bool killed_boss = env->boss_hp <= 0; bool player_died = env->player_hp <= 0; - bool timed_out = env->tick >= 300; + bool timed_out = env->tick >= EPISODE_LENGTH; if (killed_boss) { - reward += 10; + reward += REWARD_KILL_BOSS; env->terminals[0] = 1; env->player_wins++; } else if (player_died) { + reward += REWARD_PLAYER_DIED; env->terminals[0] = 1; env->boss_wins++; } else if (timed_out) { + reward += REWARD_TIMEOUT; env->terminals[0] = 1; env->timeouts++; } @@ -250,22 +273,13 @@ void c_step(BossFight *env) { return; } - if (wanna_attack && can_attack) { - env->player_state_ticks = PLAYER_ATTACK_TICKS; - env->player_state = PLAYER_ATTACKING; - } - if (wanna_dodge && can_dodge) { - env->player_state_ticks = PLAYER_DODGE_TICKS; - env->player_state = PLAYER_DODGING; - } - if (env->player_state == PLAYER_DODGING && env->player_state_ticks == 0) { - env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; - env->player_state = PLAYER_IDLING; + env->tick++; + if (env->boss_phase_ticks > 0) { + env->boss_phase_ticks--; } - if (env->player_state == PLAYER_ATTACKING && env->player_state_ticks == 0) { - env->player_state = PLAYER_IDLING; + if (env->player_state_ticks > 0) { + env->player_state_ticks--; } - if (env->boss_phase_ticks == 0) { if (env->boss_state == BOSS_IDLING) { env->boss_state = BOSS_WINDING_UP; @@ -281,13 +295,13 @@ void c_step(BossFight *env) { env->boss_phase_ticks = BOSS_IDLE_TICKS; } } - - env->tick++; - if (env->boss_phase_ticks > 0) { - env->boss_phase_ticks--; - } - if (env->player_state_ticks > 0) { - env->player_state_ticks--; + if (env->player_state_ticks == 0) { + if (env->player_state == PLAYER_DODGING) { + env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; + env->player_state = PLAYER_IDLING; + } else if (env->player_state == PLAYER_ATTACKING) { + env->player_state = PLAYER_IDLING; + } } if (env->player_dodge_cooldown > 0) { env->player_dodge_cooldown--; diff --git a/runpod.md b/runpod.md index 2ce8814ec..81859f0f9 100644 --- a/runpod.md +++ b/runpod.md @@ -7,6 +7,6 @@ uv venv source .venv/bin/activate uv pip install -e . 
python setup.py build_boss_fight --inplace --force -puffer train puffer_boss_fight --train.total-timestamps 5000000 --train.device cuda --vec.num-envs 8192 --vec.workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 +puffer train puffer_boss_fight --train.total-timesteps 5000000 --train.device cuda --vec.num-envs 8192 --vec.num-workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 8a50f7ed58eaf236862f6861a7dc42d376769595 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 21:14:23 +0500 Subject: [PATCH 20/29] wip: agent not learning --- AGENTS.md | 55 +++++++++++++++++++ pufferlib/ocean/boss_fight/boss_fight.h | 57 ++++++++++---------- pufferlib/ocean/boss_fight/compile_flags.txt | 1 + 3 files changed, 85 insertions(+), 28 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 52f2b5d9a..9a339eb5c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -56,3 +56,58 @@ puffer train puffer_boss_fight --train.total-timesteps 50000 ``` puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) ``` + +## Environment + +**Gameplay**: 2D boss fight. Player moves around a 10x10 arena (-5 to +5), dodges boss AOE attacks, and attacks back. Boss is stationary at center (0,0). Tick rate: 30/sec. + +**Actions** (7 discrete): NOOP, UP, DOWN, LEFT, RIGHT, DODGE, ATTACK + +**Player States**: +- `IDLING` — can move, dodge, or attack +- `DODGING` — 6 ticks, invincible, can't act +- `ATTACKING` — 3 ticks, stationary, can't act + +**Boss Behavior** (cycles continuously): +- `IDLE` (7 ticks) — does nothing +- `WINDUP` (5 ticks) — telegraph, player should prepare to dodge +- `ACTIVE` (5 ticks) — AOE damage zone active, dodge or get hit +- `RECOVERY` (5 ticks) — safe window to attack boss + +**Rewards**: +- `+10.0` — kill boss +- `+0.5` — hit boss with attack +- `+0.5` — successfully dodge during boss attack +- `+0.05` — approach boss (distance shaping) +- `-0.01` — per-step penalty +- `-0.5` — get hit by boss (10 dmg) +- `-1.0` — hit arena wall +- `-10.0` — die +- `-10.0` — timeout + +**Episode termination**: +- Boss HP ≤ 0 (player wins) +- Player HP ≤ 0 (player dies) +- 300 ticks timeout (~10 sec) + +**Parameters**: +- Player/Boss HP: 100 +- Player attack dmg: 3, Boss AOE dmg: 10 +- Player speed: 0.1 units/tick +- Dodge: 6 ticks duration, 15 tick cooldown +- Boss cycle: IDLE(7) → WINDUP(5) → ACTIVE(5) → RECOVERY(5) = 22 ticks/cycle + +**Observations** (13 floats): +1. `boss_x - player_x` — relative X to boss +2. `boss_y - player_y` — relative Y to boss +3. `player_x` — absolute X position +4. `player_y` — absolute Y position +5. `boss_x` — boss X (always 0) +6. `boss_y` — boss Y (always 0) +7. `player_hp` — player health +8. `boss_hp` — boss health +9. `player_state` — 0=IDLING, 1=DODGING, 2=ATTACKING +10. `player_dodge_cooldown` — ticks until dodge available +11. `player_state_ticks` — ticks remaining in current state +12. `boss_state` — 0=IDLE, 1=WINDUP, 2=ATTACKING, 3=RECOVERY +13. 
`boss_phase_ticks` — ticks remaining in current boss phase diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 37348fe9c..abb0af890 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -2,19 +2,18 @@ #include #include #include -#include #define ARENA_HALF_SIZE 5.0f -#define MAX_HP 100 +#define MAX_HP 1.0f #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.5f +#define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_TICKS 4 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 10 +#define PLAYER_ATTACK_DMG 0.1f +#define BOSS_ATTACK_DMG 0.05f #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 @@ -24,15 +23,16 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.01f -#define REWARD_HIT_WALL -1.0f -#define REWARD_PLAYER_HIT_BOSS 1.0f -#define REWARD_BOSS_HIT_PLAYER -2.0f -#define REWARD_DODGE_SUCCESS 1.0f -#define REWARD_KILL_BOSS 10.0f -#define REWARD_PLAYER_DIED -10.0f -#define REWARD_TIMEOUT -10.0f -#define EPISODE_LENGTH 500 +#define REWARD_APPROACH 0.5f +#define REWARD_HIT_WALL -0.1f +#define REWARD_PLAYER_HIT_BOSS 5.0f +#define REWARD_BOSS_HIT_PLAYER -0.5f +#define REWARD_DODGE_SUCCESS 2.0f +#define REWARD_KILL_BOSS 50.0f +#define REWARD_PLAYER_DIED -5.0f +#define REWARD_TIMEOUT -20.0f +#define REWARD_TICK -0.001f +#define EPISODE_LENGTH 300 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -75,12 +75,12 @@ typedef struct { float prev_distance; PlayerState player_state; - int player_hp; + float player_hp; int player_dodge_cooldown; int player_state_ticks; BossState boss_state; - int boss_hp; + float boss_hp; int boss_phase_ticks; float episode_return; // track within episode @@ -158,7 +158,7 @@ void c_reset(BossFight *env) { } void c_step(BossFight *env) { - float reward = -0.01; + float reward = REWARD_TICK; env->terminals[0] = 0; int action = env->actions[0]; @@ -175,8 +175,10 @@ void c_step(BossFight *env) { dx = PLAYER_SPEED_PER_TICK; } - env->player_x += dx; - env->player_y += dy; + if (env->player_state == PLAYER_IDLING) { + env->player_x += dx; + env->player_y += dy; + } bool wanna_idle = action == 0; bool wanna_dodge = action == 5; @@ -196,9 +198,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - if (dist < env->prev_distance) { - reward += REWARD_APPROACH; - } + reward += REWARD_APPROACH * (env->prev_distance - dist); env->prev_distance = dist; bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; @@ -238,11 +238,12 @@ void c_step(BossFight *env) { reward += REWARD_BOSS_HIT_PLAYER; } - // reward for successfully dodging an attack - bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && in_aoe_attack && - env->boss_phase_ticks == BOSS_ACTIVE_TICKS; - if (dodged_attack) { + bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; + + bool successfully_dodging = + would_be_hit && env->player_state == PLAYER_DODGING; + + if (successfully_dodging) { reward += REWARD_DODGE_SUCCESS; } diff --git a/pufferlib/ocean/boss_fight/compile_flags.txt b/pufferlib/ocean/boss_fight/compile_flags.txt index ea96eb002..c6fecbb72 100644 --- a/pufferlib/ocean/boss_fight/compile_flags.txt +++ 
b/pufferlib/ocean/boss_fight/compile_flags.txt @@ -1 +1,2 @@ -I../../../raylib-5.5_macos/include +-I../../../raylib-5.5_linux_amd64/include From 286d3788a67f7646548549ee9502c9b64384c581 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 17:22:07 +0500 Subject: [PATCH 21/29] fix agent not learning --- AGENTS.md | 107 ++------------------ pufferlib/ocean/boss_fight/README.md | 128 ------------------------ pufferlib/ocean/boss_fight/boss_fight.h | 23 +++-- 3 files changed, 18 insertions(+), 240 deletions(-) delete mode 100644 pufferlib/ocean/boss_fight/README.md diff --git a/AGENTS.md b/AGENTS.md index 9a339eb5c..321f95f8d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,112 +2,17 @@ I'm implementing a RL environment using PufferLib in C + Python. -Environment spec file is in `./pufferlib/ocean/boss_fight/README.md`. +It's a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. +Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** + +The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` -### Setup - -1. Fork pufferlib, create new branch - -2. Run these: - -``` -uv venv -uv pip install -e . -``` - -3. Setup files using templates, update `environment.py` - -4. Not sure what this does yet: - -``` -python setup.py build_boss_fight --inplace --force -``` - -### Testing - -Make sure shit's running: - -``` -uv pip install -e . -python -c " -from pufferlib.ocean.boss_fight import BossFight -import numpy as np -env = BossFight(num_envs=2) -env.reset() -for _ in range(100): - env.step(np.random.randint(0, 7, size=2)) -print('ok') -env.close() -" -``` - -Train and check scores: - -``` -puffer train puffer_boss_fight --train.total-timesteps 50000 -``` - -## Eval +After modifying C files, to test you can run: ``` -puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) +python setup.py build_boss_fight --inplace --force && puffer train puffer_boss_fight --train.device cpu --vec.num-workers 8 --vec.num-envs 1024 --train.total-timesteps 5000000 && puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) ``` - -## Environment - -**Gameplay**: 2D boss fight. Player moves around a 10x10 arena (-5 to +5), dodges boss AOE attacks, and attacks back. Boss is stationary at center (0,0). Tick rate: 30/sec. 
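Since every hitbox is a circle, "collision = circles overlap" reduces to comparing the center distance against the summed radii. A minimal C sketch of that check — the `circles_overlap` helper is illustrative only; boss_fight.h inlines the same comparison via its `distance()` function:

```c
#include <stdbool.h>

// Two circles overlap when the squared center distance is no larger than the
// squared sum of their radii. Squaring both sides avoids a sqrtf call.
static bool circles_overlap(float x1, float y1, float r1,
                            float x2, float y2, float r2) {
    float dx = x2 - x1;
    float dy = y2 - y1;
    float r = r1 + r2;
    return dx * dx + dy * dy <= r * r;
}

// Example uses, mirroring the checks in c_step():
//   player body vs. boss AOE:
//     circles_overlap(px, py, PLAYER_SIZE, bx, by, BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS)
//   player melee reach vs. boss body:
//     circles_overlap(px, py, PLAYER_SIZE + PLAYER_ATTACK_RADIUS, bx, by, BOSS_SIZE)
```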
- -**Actions** (7 discrete): NOOP, UP, DOWN, LEFT, RIGHT, DODGE, ATTACK - -**Player States**: -- `IDLING` — can move, dodge, or attack -- `DODGING` — 6 ticks, invincible, can't act -- `ATTACKING` — 3 ticks, stationary, can't act - -**Boss Behavior** (cycles continuously): -- `IDLE` (7 ticks) — does nothing -- `WINDUP` (5 ticks) — telegraph, player should prepare to dodge -- `ACTIVE` (5 ticks) — AOE damage zone active, dodge or get hit -- `RECOVERY` (5 ticks) — safe window to attack boss - -**Rewards**: -- `+10.0` — kill boss -- `+0.5` — hit boss with attack -- `+0.5` — successfully dodge during boss attack -- `+0.05` — approach boss (distance shaping) -- `-0.01` — per-step penalty -- `-0.5` — get hit by boss (10 dmg) -- `-1.0` — hit arena wall -- `-10.0` — die -- `-10.0` — timeout - -**Episode termination**: -- Boss HP ≤ 0 (player wins) -- Player HP ≤ 0 (player dies) -- 300 ticks timeout (~10 sec) - -**Parameters**: -- Player/Boss HP: 100 -- Player attack dmg: 3, Boss AOE dmg: 10 -- Player speed: 0.1 units/tick -- Dodge: 6 ticks duration, 15 tick cooldown -- Boss cycle: IDLE(7) → WINDUP(5) → ACTIVE(5) → RECOVERY(5) = 22 ticks/cycle - -**Observations** (13 floats): -1. `boss_x - player_x` — relative X to boss -2. `boss_y - player_y` — relative Y to boss -3. `player_x` — absolute X position -4. `player_y` — absolute Y position -5. `boss_x` — boss X (always 0) -6. `boss_y` — boss Y (always 0) -7. `player_hp` — player health -8. `boss_hp` — boss health -9. `player_state` — 0=IDLING, 1=DODGING, 2=ATTACKING -10. `player_dodge_cooldown` — ticks until dodge available -11. `player_state_ticks` — ticks remaining in current state -12. `boss_state` — 0=IDLE, 1=WINDUP, 2=ATTACKING, 3=RECOVERY -13. `boss_phase_ticks` — ticks remaining in current boss phase diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md deleted file mode 100644 index 3d5d189e6..000000000 --- a/pufferlib/ocean/boss_fight/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# SoulsRL Minimal — RL-Focused Boss Fight Environment - -## Goal - -Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** - -The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). - ---- - -## Core Mechanics (Simplified) - -### Constants - -``` -Tick rate: 30 ticks/sec (dt = 1/30) -Arena: 10 x 10 units (centered at origin, so bounds are -5 to +5) - -Player: - - radius: 0.3 - - HP: 100 - - speed: 3.0 units/sec (~0.1 units/tick) - -Boss: - - radius: 0.5 - - HP: 100 - - position: fixed at (0, 0) — does not move -``` - -### Player Actions (Discrete, 7 total) - -``` -0: NOOP -1: UP -2: DOWN -3: LEFT -4: RIGHT -5: DODGE -6: ATTACK -``` - -### Player States - -``` -FREE — can move, can act -DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_dir -ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement -``` - -### Boss Behavior (Single Attack) - -Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` - -``` -IDLE: 12 ticks (0.4s) — does nothing -WINDUP: 18 ticks (0.6s) — telegraphing, no damage -ACTIVE: 3 ticks (0.1s) — AOE hits -RECOVERY: 15 ticks (0.5s) — vulnerable, no damage -``` - -## RL Experiments - -Once v1 is working, design experiments to understand RL concepts: - -### Experiment Ideas - -**Observation Ablations** — Which observations actually matter? - -- What happens if the agent can't see timing information? 
-- Does it need absolute position, or is relative enough? -- What's the minimum viable observation space? -- Can the network learn to ignore irrelevant/noisy inputs? - -**Reward Shaping** — How does reward design affect behavior? - -- What if you only reward winning/losing (sparse)? -- What happens without a time penalty? -- Can you incentivize specific behaviors (dodging at the right time)? -- What unintended behaviors might reward bonuses create? - -**Hyperparameters** — See `boss_fight.ini` for the sweep config - -- Learning rate: stability vs speed -- Entropy coefficient: exploration vs exploitation -- Batch size / num_envs: sample efficiency -- Network size: capacity vs overfitting - ---- - -## Success Criteria - -1. **Baseline works**: Random agent wins ~0%, trained agent wins >80% -2. **Learned timing**: Agent dodges during WINDUP, not randomly -3. **Learned punish**: Agent attacks during RECOVERY, not during ACTIVE -4. **Experiments complete**: At least 3 ablations run with plotted comparisons - ---- - -## Optional Extensions (After Experiments) - -Only add these if baseline experiments are done: - -1. **Sweep attack**: Cone hitbox, tests directional dodging -2. **Boss movement**: Slow drift toward player -3. **Combo attack**: Multi-hit sequence, tests dodge timing -4. **ASCII rendering**: For debugging/demo -5. **Curriculum**: Start with longer windup, tighten over training - ---- - -## Deliverables - -1. `boss_fight.h` — Core game logic in C -2. `boss_fight.c` — Standalone test binary -3. `boss_fight.py` — PufferLib environment wrapper -4. `experiments/` — Saved runs with different configs -5. `results.md` — Summary of what you learned from experiments - ---- - -## Milestones - -1. **Environment works**: `c_step()` implemented, can play manually with keyboard -2. **Random baseline**: Random agent wins ~0%, confirms game is non-trivial -3. **Learning signal**: Trained agent shows improvement over random -4. **Competent agent**: Win rate >80% -5. 
**Experiments**: At least 3 ablations with documented findings diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index abb0af890..3a3f0fd5e 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -23,15 +23,15 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.5f +#define REWARD_APPROACH 0.1f #define REWARD_HIT_WALL -0.1f -#define REWARD_PLAYER_HIT_BOSS 5.0f -#define REWARD_BOSS_HIT_PLAYER -0.5f -#define REWARD_DODGE_SUCCESS 2.0f -#define REWARD_KILL_BOSS 50.0f -#define REWARD_PLAYER_DIED -5.0f -#define REWARD_TIMEOUT -20.0f -#define REWARD_TICK -0.001f +#define REWARD_PLAYER_HIT_BOSS 0.4f +#define REWARD_BOSS_HIT_PLAYER -0.35f +#define REWARD_DODGE_SUCCESS 0.0f +#define REWARD_KILL_BOSS 1.0f +#define REWARD_PLAYER_DIED -1.0f +#define REWARD_TIMEOUT -1.0f +#define REWARD_TICK -0.01f #define EPISODE_LENGTH 300 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; @@ -240,10 +240,11 @@ void c_step(BossFight *env) { bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; - bool successfully_dodging = - would_be_hit && env->player_state == PLAYER_DODGING; + bool started_successful_dodge = would_be_hit && + env->player_state == PLAYER_DODGING && + env->player_state_ticks == PLAYER_DODGE_TICKS; - if (successfully_dodging) { + if (started_successful_dodge) { reward += REWARD_DODGE_SUCCESS; } From 3fcaf2ceef83f7d635db4a414f2af69361804d15 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 19:09:16 +0500 Subject: [PATCH 22/29] tune numbers --- pufferlib/ocean/boss_fight/boss_fight.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 3a3f0fd5e..6667810e3 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -5,16 +5,16 @@ #define ARENA_HALF_SIZE 5.0f #define MAX_HP 1.0f -#define PLAYER_SPEED_PER_TICK 0.1f +#define PLAYER_SPEED_PER_TICK 0.25f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f #define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 4 +#define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 0.1f -#define BOSS_ATTACK_DMG 0.05f -#define BOSS_AOE_ATTACK_RADIUS 0.7f +#define PLAYER_ATTACK_DMG 0.02f +#define BOSS_ATTACK_DMG 0.15f +#define BOSS_AOE_ATTACK_RADIUS 0.8f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 #define BOSS_ACTIVE_TICKS 5 @@ -23,11 +23,11 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.1f -#define REWARD_HIT_WALL -0.1f -#define REWARD_PLAYER_HIT_BOSS 0.4f -#define REWARD_BOSS_HIT_PLAYER -0.35f -#define REWARD_DODGE_SUCCESS 0.0f +#define REWARD_APPROACH 0.05f +#define REWARD_HIT_WALL -0.05f +#define REWARD_PLAYER_HIT_BOSS 0.07f +#define REWARD_BOSS_HIT_PLAYER -0.05f +#define REWARD_DODGE_SUCCESS 0.07f #define REWARD_KILL_BOSS 1.0f #define REWARD_PLAYER_DIED -1.0f #define REWARD_TIMEOUT -1.0f From 04ca78e404c7ff9fefc77208173cff208c2b2a09 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 22:32:06 +0500 Subject: [PATCH 23/29] tune numbers even more; good results --- pufferlib/ocean/boss_fight/boss_fight.h | 76 ++++++++++++++++++------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 6667810e3..55799cf94 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ 
b/pufferlib/ocean/boss_fight/boss_fight.h @@ -10,9 +10,11 @@ #define BOSS_SIZE 0.5f #define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_TICKS 4 +#define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 0.02f +#define PLAYER_DODGE_SPEED_PER_TICK 0.35f +#define PLAYER_ATTACK_DMG 0.05f #define BOSS_ATTACK_DMG 0.15f #define BOSS_AOE_ATTACK_RADIUS 0.8f #define BOSS_IDLE_TICKS 7 @@ -32,7 +34,7 @@ #define REWARD_PLAYER_DIED -1.0f #define REWARD_TIMEOUT -1.0f #define REWARD_TICK -0.01f -#define EPISODE_LENGTH 300 +#define EPISODE_LENGTH 600 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -78,6 +80,7 @@ typedef struct { float player_hp; int player_dodge_cooldown; int player_state_ticks; + int dodge_escape_pending; BossState boss_state; float boss_hp; @@ -137,6 +140,7 @@ void c_reset(BossFight *env) { env->player_state = PLAYER_IDLING; env->player_dodge_cooldown = 0; env->player_state_ticks = 0; + env->dodge_escape_pending = 0; env->boss_state = BOSS_IDLING; env->boss_phase_ticks = BOSS_IDLE_TICKS; env->episode_return = 0; @@ -184,38 +188,59 @@ void c_step(BossFight *env) { bool wanna_dodge = action == 5; bool wanna_attack = action == 6; bool can_dodge = - env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; + env->player_state != PLAYER_DODGING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; if (wanna_attack && can_attack) { env->player_state_ticks = PLAYER_ATTACK_TICKS; env->player_state = PLAYER_ATTACKING; } + + float aoe_dist = BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; + bool boss_threatening = + env->boss_state == BOSS_WINDING_UP || env->boss_state == BOSS_ATTACKING; + + float pre_dodge_dist = 0.0f; if (wanna_dodge && can_dodge) { + pre_dodge_dist = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + env->dodge_escape_pending = + boss_threatening && pre_dodge_dist <= aoe_dist ? 
1 : 0; + env->player_state_ticks = PLAYER_DODGE_TICKS; env->player_state = PLAYER_DODGING; } - float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - - reward += REWARD_APPROACH * (env->prev_distance - dist); - env->prev_distance = dist; - - bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; + // Dodge = multi-tick movement out of the AOE (no i-frames) + if (env->player_state == PLAYER_DODGING) { + float away_x = env->player_x - env->boss_x; + float away_y = env->player_y - env->boss_y; + float away_norm = sqrtf(away_x * away_x + away_y * away_y); + if (away_norm > 1e-6f) { + env->player_x += (away_x / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; + env->player_y += (away_y / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; + } + } bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { reward += REWARD_HIT_WALL; } + // can't walk out of bounds env->player_x = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); env->player_y = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + reward += REWARD_APPROACH * (env->prev_distance - dist); + env->prev_distance = dist; + // push player out if clipping into boss - if (dist < BOSS_SIZE + PLAYER_SIZE) { + if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; @@ -225,27 +250,35 @@ void c_step(BossFight *env) { dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); } + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; + if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; reward += REWARD_PLAYER_HIT_BOSS; } - bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; - bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; + bool in_aoe_attack = dist <= aoe_dist; + bool player_iframed = + env->player_state == PLAYER_DODGING && + env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + + // Souls-like: you can i-frame briefly, but the AOE persists longer than the + // i-frame window; if you're still in the hitbox after i-frames, you get hit. 
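  // Worked example of the i-frame window (values taken from the defines above,
  // not new behavior): a dodge starts with player_state_ticks == PLAYER_DODGE_TICKS (4),
  // and player_iframed requires ticks > PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS (2),
  // so only the first two dodge ticks (4, then 3) are invulnerable. The AOE stays
  // ACTIVE for BOSS_ACTIVE_TICKS (5), so a dodge that never leaves the radius is
  // still hit once the i-frames expire.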
+ bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; reward += REWARD_BOSS_HIT_PLAYER; } - bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; - - bool started_successful_dodge = would_be_hit && - env->player_state == PLAYER_DODGING && - env->player_state_ticks == PLAYER_DODGE_TICKS; - - if (started_successful_dodge) { - reward += REWARD_DODGE_SUCCESS; + // Reward dodges that actually exit the AOE during the danger window + if (env->dodge_escape_pending) { + if (!boss_threatening) { + env->dodge_escape_pending = 0; + } else if (dist > aoe_dist) { + reward += REWARD_DODGE_SUCCESS; + env->dodge_escape_pending = 0; + } } bool killed_boss = env->boss_hp <= 0; @@ -301,6 +334,7 @@ void c_step(BossFight *env) { if (env->player_state == PLAYER_DODGING) { env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; env->player_state = PLAYER_IDLING; + env->dodge_escape_pending = 0; } else if (env->player_state == PLAYER_ATTACKING) { env->player_state = PLAYER_IDLING; } From 6ca0be54adcc68503b361d50962c46dbf86fe0b0 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 22:37:26 +0500 Subject: [PATCH 24/29] cleanup --- AGENTS.md | 18 - LEARN_TODO.md | 259 ----- learn-pufferlib.py | 1175 ----------------------- pufferlib/ocean/boss_fight/boss_fight.h | 22 +- runpod.md | 12 - 5 files changed, 9 insertions(+), 1477 deletions(-) delete mode 100644 AGENTS.md delete mode 100644 LEARN_TODO.md delete mode 100644 learn-pufferlib.py delete mode 100644 runpod.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 321f95f8d..000000000 --- a/AGENTS.md +++ /dev/null @@ -1,18 +0,0 @@ -# BossFight Reinforcement Learning project - -I'm implementing a RL environment using PufferLib in C + Python. - -It's a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** - -The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). - -You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. - -The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` - -After modifying C files, to test you can run: - -``` -python setup.py build_boss_fight --inplace --force && puffer train puffer_boss_fight --train.device cpu --vec.num-workers 8 --vec.num-envs 1024 --train.total-timesteps 5000000 && puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) -``` diff --git a/LEARN_TODO.md b/LEARN_TODO.md deleted file mode 100644 index 35c99eebc..000000000 --- a/LEARN_TODO.md +++ /dev/null @@ -1,259 +0,0 @@ -# Learning TODO: RL Foundations - -Everything you need to understand `bptt_horizon` and RL training in general. - ---- - -## Level 1: Basic ML Concepts - -### 1.1 What is a Neural Network? -- Function that takes numbers in, spits numbers out -- Has "weights" (parameters) that get adjusted during training -- `input → [neural network] → output` - -### 1.2 What is Training / Learning? -- Adjusting weights so the network gives better outputs -- Done by computing "loss" (how wrong it was) and updating weights to reduce loss - -### 1.3 What is Backpropagation? 
-- Algorithm to figure out HOW to adjust each weight -- Flows backwards through the network: output → hidden layers → input -- "If the output was wrong, which weights were responsible?" - -### 1.4 What is a Batch? -- Group of training examples processed together -- Instead of: train on example 1, then example 2, then example 3... -- Do: train on [example 1, 2, 3, 4, 5] at once -- Why? Faster (GPU parallelism) + more stable learning - -### 1.5 What is Minibatch? -- When your batch is too big for GPU memory -- Split batch into smaller "minibatches" -- `batch_size = 1024, minibatch_size = 256` → 4 gradient updates per batch - ---- - -## Level 2: RL Basics - -### 2.1 What is a Timestep? -- One tick of the game/simulation -- Agent observes state → takes action → gets reward → new state -- `t=0: see game → press button → get +1 point → game changes` - -### 2.2 What is an Episode? -- One complete playthrough from start to end -- Boss fight: episode = one full fight (win or lose) -- `[spawn] → step → step → step → ... → [death or victory]` - -``` -Episode 1: t0 → t1 → t2 → t3 → DEAD (4 steps) -Episode 2: t0 → t1 → t2 → t3 → t4 → t5 → WIN (6 steps) -``` - -### 2.3 What is an Observation? -- What the agent "sees" at each timestep -- Your boss_fight: 14 numbers (player pos, boss HP, etc.) - -### 2.4 What is a Policy? -- The neural network that decides actions -- `observation (14 floats) → [policy network] → action (0-6)` -- Training = making this network choose better actions - -### 2.5 What is a Value Function? -- Predicts "how good is this situation?" -- "I have full HP, boss is low" → high value -- "I'm almost dead, boss is full HP" → low value -- Helps the agent learn which states to aim for - ---- - -## Level 3: How RL Training Works - -### 3.1 Collect Experience -``` -Run 56 environments in parallel: - Env 1: obs → action → reward → obs → action → reward → ... - Env 2: obs → action → reward → obs → action → reward → ... - ... - Env 56: obs → action → reward → obs → action → reward → ... - -After N steps, you have a "batch" of experience -``` - -### 3.2 Compute Advantages -- "Was this action better or worse than expected?" -- `advantage = actual_reward - predicted_value` -- Positive advantage → reinforce this action -- Negative advantage → discourage this action - -### 3.3 Update the Network -- Use collected experience to adjust policy weights -- Make good actions more likely, bad actions less likely - -### 3.4 Repeat -``` -while not done: - 1. Collect batch of experience (many timesteps) - 2. Compute advantages - 3. Update network with minibatches - 4. Go to 1 -``` - ---- - -## Level 4: Sequential Data & Memory - -### 4.1 Why Sequence Matters -In games, the PAST affects what you should do NOW: - -``` -Timestep 1: Boss starts wind-up animation -Timestep 2: Boss still winding up -Timestep 3: Boss about to attack! ← YOU SHOULD DODGE NOW -Timestep 4: Boss attacks - -If you only see timestep 3 in isolation, you might not know to dodge. -But if you saw timesteps 1-2-3 together, you'd see the pattern. 
-``` - -### 4.2 MLP (Multi-Layer Perceptron) — No Memory -- Standard neural network -- Only sees CURRENT observation -- `obs_t → [MLP] → action` -- No memory of previous timesteps -- Fine if observation contains all needed info - -### 4.3 RNN (Recurrent Neural Network) — Has Memory -- Sees current observation + remembers past -- `obs_t + memory → [RNN] → action + updated_memory` -- Can learn patterns over time -- Types: LSTM, GRU (different memory mechanisms) - -``` -MLP: sees [___] [___] [_X_] ← only current frame -RNN: sees [_X_] [_X_] [_X_] ← current + memory of past -``` - -### 4.4 When Do You Need RNN? -- When current observation is INCOMPLETE -- Example: "Boss is standing still" — is he about to attack or recovering? -- If your observation includes `boss_phase` and `time_to_damage`, MLP might be enough -- If observation only has positions, RNN helps learn timing - ---- - -## Level 5: BPTT (Backpropagation Through Time) - -### 5.1 The Problem -RNN has memory that flows through time: - -``` -t1 → t2 → t3 → t4 → t5 → t6 → ... → t1000 - -To train RNN, backprop must flow backwards through ALL these connections. -1000 timesteps = 1000 layers of backprop = VERY slow, uses tons of memory -``` - -### 5.2 The Solution: Truncated BPTT -Don't backprop through entire episode. Cut it into chunks: - -``` -Episode: [t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12] - -bptt_horizon = 4: - -Chunk 1: [t1 → t2 → t3 → t4] ← backprop only through these 4 -Chunk 2: [t5 → t6 → t7 → t8] ← backprop only through these 4 -Chunk 3: [t9 → t10 → t11 → t12] ← backprop only through these 4 -``` - -### 5.3 What bptt_horizon Controls -``` -bptt_horizon = 16 means: -- RNN sees 16 consecutive timesteps during training -- Gradients flow back through 16 steps max -- RNN can learn patterns up to ~16 steps long -``` - -### 5.4 Trade-offs -``` -Small horizon (8): - ✓ Fast, low memory - ✗ RNN can't learn long patterns (>8 steps) - -Large horizon (128): - ✓ RNN learns longer patterns - ✗ Slow, high memory usage -``` - ---- - -## Level 6: Putting It Together - -### 6.1 The Batch Math -``` -num_envs = 56 (parallel environments) -bptt_horizon = 16 (timesteps per chunk) - -batch_size = num_envs × bptt_horizon - = 56 × 16 - = 896 total samples per training batch -``` - -### 6.2 Why minibatch_size Must Be ≤ batch_size -``` -batch_size = 896 (you collected 896 samples) -minibatch_size = 2048 (you want to train on 2048 at a time) - -ERROR: Can't take 2048 samples from a pile of 896! - -Fix: minibatch_size = 256 or 512 (smaller than 896) -``` - -### 6.3 For Your Boss Fight (No RNN) -You're using MLP, so `bptt_horizon` just affects batch math: - -```ini -[vec] -num_envs = 56 - -[train] -bptt_horizon = 16 # 56 × 16 = 896 batch -minibatch_size = 256 # Must be ≤ 896 -``` - -Or increase horizon if you want bigger batches: - -```ini -bptt_horizon = 64 # 56 × 64 = 3584 batch -minibatch_size = 2048 # Now this works -``` - ---- - -## Summary: What You Actually Need to Know - -1. **batch_size** = total samples collected before training -2. **minibatch_size** = chunk size for each gradient update (must be ≤ batch_size) -3. **bptt_horizon** = consecutive timesteps kept together - - For RNN: determines how far back it can learn patterns - - For MLP: just affects batch_size math -4. 
**Your boss_fight uses MLP** — bptt_horizon is just a number to make the math work - ---- - -## Learning Resources - -### Videos (start here) -- [ ] 3Blue1Brown: "Neural Networks" series (YouTube) -- [ ] Mutual Information: "Reinforcement Learning" series (YouTube) - -### Interactive -- [ ] Andrej Karpathy: "Neural Networks: Zero to Hero" (YouTube + code) - -### Reading -- [ ] Spinning Up in Deep RL (OpenAI) — https://spinningup.openai.com -- [ ] CleanRL documentation — similar to PufferLib - -### Hands-on -- [ ] Train boss_fight, watch the numbers, build intuition diff --git a/learn-pufferlib.py b/learn-pufferlib.py deleted file mode 100644 index 4091fb6fb..000000000 --- a/learn-pufferlib.py +++ /dev/null @@ -1,1175 +0,0 @@ -""" -LEARN_V2.PY - RL with PufferLib (The Right Way) -================================================ - -PURPOSE: Learn reinforcement learning using PufferLib's patterns and infrastructure. - -This is the "full PufferLib" version of learn.py. Instead of implementing PPO -from scratch, we use PufferLib's pufferl.PuffeRL trainer which handles: -- Rollout collection -- GAE advantage computation -- PPO loss calculation -- Gradient updates -- Logging and metrics - -HOW TO USE: -1. Read each section's comments (the WHY and WHAT) -2. Fill in the TODO sections -3. Run and test after each section: python learn_v2.py -4. Only move to next section when current one works - -The environment is the same as learn.py: -- 2D arena where an agent must reach a target -- Agent can move UP/DOWN/LEFT/RIGHT or stay still -- Episode ends when: agent reaches target, hits wall, or 200 steps pass - -DEPENDENCIES: - pip install pufferlib torch numpy gymnasium -""" - -import os -import numpy as np -import gymnasium -import torch -import torch.nn as nn -import pufferlib -import pufferlib.vector -import pufferlib.pytorch -from pufferlib import pufferl - - -# ============================================================================= -# SECTION 1: PUFFERLIB ENVIRONMENT -# ============================================================================= -""" -WHY inherit from pufferlib.PufferEnv? -------------------------------------- -PufferLib provides optimized environment vectorization. When you inherit from -PufferEnv, you get: - -1. AUTOMATIC BUFFER MANAGEMENT: PufferLib creates shared memory buffers for - observations, rewards, terminals, truncations. You just write to them. - -2. MULTI-AGENT SUPPORT: The same pattern works for 1 agent or 100 agents. - You define `num_agents` and PufferLib handles the rest. - -3. VECTORIZATION COMPATIBILITY: Your env works with pufferlib.vector.make() - which can run multiple copies in parallel (Serial or Multiprocessing). - -KEY DIFFERENCES from Gymnasium: -------------------------------- -- Define `single_observation_space` and `single_action_space` (not plural) -- Set `self.num_agents` (1 for single-agent) -- Call `super().__init__(buf)` which creates self.observations, self.rewards, etc. -- Update arrays IN-PLACE: `self.observations[:] = ...` not `return obs` -- reset() and step() still return values, but also update internal buffers -""" - - -class MoveToTargetEnv(pufferlib.PufferEnv): - """ - A simple environment where an agent navigates to a target position. - - This is identical to learn.py's MoveToTargetEnv, but adapted to PufferLib's - patterns. The game logic is the same, only the interface changes. 
- - GAME RULES: - - Agent starts at random position in [-0.8, 0.8] x [-0.8, 0.8] - - Target is at random position (at least 0.3 units away from agent) - - Agent can: NOOP (0), UP (1), DOWN (2), LEFT (3), RIGHT (4) - - Episode ends when: agent reaches target, hits wall (|x|>1 or |y|>1), or 200 steps - - Reward: -0.01/step + distance shaping + terminal bonuses - """ - - # Type hints for attributes created by super().__init__() - observations: np.ndarray - rewards: np.ndarray - terminals: np.ndarray - truncations: np.ndarray - - def __init__(self, buf=None, seed=0): - """ - WHY these parameters? - --------------------- - - buf: Optional shared memory buffer from PufferLib's vectorization. - When running multiple envs, they share memory for efficiency. - If None, PufferLib creates a buffer automatically. - - - seed: Random seed for reproducibility. Essential for debugging! - - WHAT to do in __init__: - 1. Define single_observation_space (what ONE agent sees) - 2. Define single_action_space (what actions ONE agent can take) - 3. Set self.num_agents (1 for single-agent env) - 4. Call super().__init__(buf) - THIS CREATES self.observations, etc. - 5. Initialize game state variables - 6. Set up random number generator - """ - # ----------------------------------------------------------------- - # TODO 1.1: Define the observation space - # ----------------------------------------------------------------- - # WHAT the agent sees: [agent_x, agent_y, target_x, target_y, dx, dy] - # - Positions are in [-1, 1] (arena bounds) - # - dx, dy (direction to target) can be in [-2, 2] - # - # WHY "single_observation_space" not "observation_space"? - # PufferLib distinguishes single-agent spaces from joint spaces. - # For multi-agent, observation_space would be (num_agents, obs_dim). - # We define the SINGLE agent's view, PufferLib handles batching. - # - # YOUR CODE: Create self.single_observation_space as gymnasium.spaces.Box - # Hint: Box(low=-2.0, high=2.0, shape=(6,), dtype=np.float32) - - self.single_observation_space = gymnasium.spaces.Box( - low=-2.0, high=2.0, shape=(6,), dtype=np.float32 - ) - - # ----------------------------------------------------------------- - # TODO 1.2: Define the action space - # ----------------------------------------------------------------- - # WHAT actions are available: 0=NOOP, 1=UP, 2=DOWN, 3=LEFT, 4=RIGHT - # - # YOUR CODE: Create self.single_action_space as gymnasium.spaces.Discrete(5) - - self.single_action_space = gymnasium.spaces.Discrete(5) - - # ----------------------------------------------------------------- - # TODO 1.3: Set the number of agents - # ----------------------------------------------------------------- - # For single-agent environments, num_agents = 1. - # PufferLib uses this to allocate the right buffer sizes. - # - # YOUR CODE: Set self.num_agents = 1 - - self.num_agents = 1 - - # ----------------------------------------------------------------- - # CRITICAL: Call super().__init__(buf) - # ----------------------------------------------------------------- - # This MUST come after defining spaces and num_agents! - # It creates: - # - self.observations: array of shape (num_agents, *obs_shape) - # - self.rewards: array of shape (num_agents,) - # - self.terminals: array of shape (num_agents,) - # - self.truncations: array of shape (num_agents,) - # - # These are the buffers you'll update in reset() and step(). 
- super().__init__(buf) - - # ----------------------------------------------------------------- - # TODO 1.4: Initialize game state variables - # ----------------------------------------------------------------- - # Track the actual game state (not observations, those are derived). - # For single-agent, these are simple arrays of shape (2,) for positions. - # - # WHAT to initialize: - # - self.agent_pos: np.zeros(2, dtype=np.float32) - agent's [x, y] - # - self.target_pos: np.zeros(2, dtype=np.float32) - target's [x, y] - # - self.tick: 0 - step counter within episode - # - # Also initialize constants: - # - self.max_steps = 200 - # - self.target_radius = 0.1 (how close to count as "reached") - # - self.move_speed = 0.05 (movement per action) - # - self.arena_size = 1.0 (arena is [-1, 1] x [-1, 1]) - # - # YOUR CODE: Initialize game state - - self.agent_pos = np.zeros(2, dtype=np.float32) - self.target_pos = np.zeros(2, dtype=np.float32) - self.tick = 0 - - self.max_steps = 200 - self.target_radius = 0.1 - self.move_speed = 0.05 - self.arena_size = 1.0 - - # Set up random number generator for reproducibility - self.rng = np.random.default_rng(seed=seed) - - # Track previous distance for reward shaping - self.prev_dist = 0.0 - - def reset(self, seed=None): - """ - WHY reset()? - ------------ - Start a fresh episode. Called at the beginning and after each episode ends. - - WHAT to do: - 1. Randomize agent position - 2. Randomize target position (not too close to agent!) - 3. Reset step counter - 4. Compute initial distance (for reward shaping) - 5. Fill self.observations[:] with initial state - - WHY update self.observations[:] in-place? - PufferLib uses shared memory buffers. By updating in-place, we avoid - copying data. The [:] syntax means "update the existing array contents". 
- - RETURNS: - - self.observations: the observation buffer (now filled with initial state) - - []: empty list of infos (PufferLib expects a list) - """ - # ----------------------------------------------------------------- - # TODO 2.1: Implement reset() - # ----------------------------------------------------------------- - # Step 1: Randomize agent position - # self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 2: Randomize target position - # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 3: Ensure target is far enough from agent (at least 0.3 units) - # while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: - # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 4: Reset step counter - # self.tick = 0 - # - # Step 5: Compute initial distance - # self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) - # - # Step 6: Fill observations buffer - # self.observations[0, 0] = self.agent_pos[0] # agent_x - # self.observations[0, 1] = self.agent_pos[1] # agent_y - # self.observations[0, 2] = self.target_pos[0] # target_x - # self.observations[0, 3] = self.target_pos[1] # target_y - # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] # dx - # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] # dy - # - # Note: We index [0, :] because num_agents=1, so observations has shape (1, 6) - # - # YOUR CODE: - - self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - - while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: - self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - - self.tick = 0 - - self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) - - self.observations[0, 0] = self.agent_pos[0] - self.observations[0, 1] = self.agent_pos[1] - self.observations[0, 2] = self.target_pos[0] - self.observations[0, 3] = self.target_pos[1] - self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - - return self.observations, [] - - def step(self, actions): - """ - WHY step()? - ----------- - The core game loop. Called every timestep with the agent's chosen action. - - WHAT to do: - 1. Apply the action (move agent) - 2. Compute reward (time penalty + distance shaping + terminal bonus) - 3. Check terminal conditions (reached target? hit wall? timeout?) - 4. Update buffers (observations, rewards, terminals, truncations) - 5. 
Auto-reset if episode ended - - PARAMETERS: - - actions: numpy array of shape (num_agents,) = (1,) for us - Each value is an integer 0-4 - - RETURNS: - - self.observations: updated observation buffer - - self.rewards: updated reward buffer - - self.terminals: updated terminal buffer - - self.truncations: updated truncation buffer - - infos: list of dicts with episode stats for finished episodes - """ - # ----------------------------------------------------------------- - # TODO 2.2: Implement step() - # ----------------------------------------------------------------- - # Step 1: Get the action (we only have 1 agent) - # action = actions[0] - # - # Step 2: Convert action to movement - # dx, dy = 0.0, 0.0 - # if action == 1: dy = self.move_speed # UP - # elif action == 2: dy = -self.move_speed # DOWN - # elif action == 3: dx = -self.move_speed # LEFT - # elif action == 4: dx = self.move_speed # RIGHT - # - # Step 3: Apply movement - # self.agent_pos[0] += dx - # self.agent_pos[1] += dy - # self.tick += 1 - # - # Step 4: Compute distance and rewards - # distance = np.linalg.norm(self.agent_pos - self.target_pos) - # reward = -0.01 # Time penalty - # reward += 2.0 * (self.prev_dist - distance) # Distance shaping - # self.prev_dist = distance - # - # Step 5: Check terminal conditions - # reached_target = distance < self.target_radius - # hit_wall = (abs(self.agent_pos[0]) > self.arena_size or - # abs(self.agent_pos[1]) > self.arena_size) - # timed_out = self.tick >= self.max_steps - # - # Step 6: Apply terminal rewards - # if reached_target: reward += 1.0 - # if hit_wall: reward -= 0.5 - # - # Step 7: Set terminal and truncation flags - # terminal = reached_target or hit_wall - # truncation = timed_out and not terminal - # - # Step 8: Update buffers - # self.rewards[0] = reward - # self.terminals[0] = terminal - # self.truncations[0] = truncation - # - # Step 9: Build info dict for finished episodes - # infos = [] - # if terminal or truncation: - # infos.append({ - # 'episode_length': self.tick, - # 'reached_target': reached_target, - # 'hit_wall': hit_wall, - # 'reward': reward, - # }) - # # Auto-reset for next episode - # self.reset() - # - # Step 10: Update observations (whether reset or not) - # self.observations[0, 0] = self.agent_pos[0] - # self.observations[0, 1] = self.agent_pos[1] - # self.observations[0, 2] = self.target_pos[0] - # self.observations[0, 3] = self.target_pos[1] - # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - # - # YOUR CODE: - - action = actions[0] - - dx, dy = 0.0, 0.0 - if action == 1: - dy = self.move_speed - elif action == 2: - dy = -self.move_speed # DOWN - elif action == 3: - dx = -self.move_speed # LEFT - elif action == 4: - dx = self.move_speed # RIGHT - - self.agent_pos[0] += dx - self.agent_pos[1] += dy - self.tick += 1 - - distance = np.linalg.norm(self.target_pos - self.agent_pos) - reward = -0.01 - reward += 2 * (self.prev_dist - distance) - self.prev_dist = distance - - reached_target = distance < self.target_radius - hit_wall = ( - abs(self.agent_pos[0]) > self.arena_size - or abs(self.agent_pos[1]) > self.arena_size - ) - timed_out = self.tick >= self.max_steps - - if reached_target: - reward += 1.0 - if hit_wall: - reward -= 0.5 - - terminal = reached_target or hit_wall - truncation = timed_out and not terminal - - self.rewards[0] = reward - self.terminals[0] = terminal - self.truncations[0] = truncation - - infos = [] - if terminal or truncation: - 
infos.append( - { - "episode_length": self.tick, - "reached_target": reached_target, - "hit_wall": hit_wall, - "reward": reward, - } - ) - self.reset() - - self.observations[0, 0] = self.agent_pos[0] - self.observations[0, 1] = self.agent_pos[1] - self.observations[0, 2] = self.target_pos[0] - self.observations[0, 3] = self.target_pos[1] - self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - - return self.observations, self.rewards, self.terminals, self.truncations, infos - - def render(self): - """ - Simple ASCII rendering for debugging. - Shows a 20x20 grid with agent (A) and target (T). - """ - grid_size = 20 - grid = [["." for _ in range(grid_size)] for _ in range(grid_size)] - - # Convert positions from [-1, 1] to grid indices [0, grid_size-1] - def to_grid(pos): - x = int((pos[0] + 1) / 2 * (grid_size - 1)) - y = int((1 - (pos[1] + 1) / 2) * (grid_size - 1)) # Flip y for display - return max(0, min(grid_size - 1, x)), max(0, min(grid_size - 1, y)) - - tx, ty = to_grid(self.target_pos) - ax, ay = to_grid(self.agent_pos) - - grid[ty][tx] = "T" - grid[ay][ax] = "A" - - print(f"\nStep {self.tick}:") - print("+" + "-" * grid_size + "+") - for row in grid: - print("|" + "".join(row) + "|") - print("+" + "-" * grid_size + "+") - - def close(self): - pass - - -# ============================================================================= -# SECTION 2: TESTING ENVIRONMENT -# ============================================================================= -""" -WHY test before training? -------------------------- -If your environment is broken, RL will silently fail to learn. -You'll waste hours wondering why training doesn't work. - -ALWAYS verify: -1. Environment creates without errors -2. reset() returns correct shapes -3. step() works with valid actions -4. Episodes actually terminate -5. 
A simple heuristic can solve it -""" - - -def test_environment(): - """Run basic sanity checks on the PufferLib environment.""" - print("=" * 60) - print("TESTING MoveToTargetEnv (PufferLib)") - print("=" * 60) - - # Test 1: Creation - print("\n[TEST 1] Creating environment...") - try: - env = MoveToTargetEnv(seed=42) - print(f" OK: Created env") - print(f" Observation space: {env.single_observation_space}") - print(f" Action space: {env.single_action_space}") - print(f" Num agents: {env.num_agents}") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 2: Reset - print("\n[TEST 2] Testing reset()...") - try: - obs, info = env.reset() - print(f" OK: reset() returned observations with shape {obs.shape}") - print(f" Sample observation: {obs[0]}") - assert obs.shape == (1, 6), f"Wrong shape: {obs.shape}, expected (1, 6)" - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 3: Step with random actions - print("\n[TEST 3] Testing step() with random actions...") - try: - for i in range(5): - actions = np.array([np.random.randint(0, 5)]) # Shape (1,) - obs, rewards, terminals, truncations, infos = env.step(actions) - print(f" Step {i + 1}: reward={rewards[0]:.3f}, terminal={terminals[0]}") - print(f" OK: step() works") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 4: Run until episode terminates using heuristic - print("\n[TEST 4] Running until episode terminates...") - try: - obs, _ = env.reset() - total_steps = 0 - episodes_finished = 0 - - while episodes_finished < 2 and total_steps < 500: - # Simple heuristic: move toward target - dx = obs[0, 4] # target_x - agent_x - dy = obs[0, 5] # target_y - agent_y - - if abs(dx) > abs(dy): - action = 4 if dx > 0 else 3 # RIGHT or LEFT - else: - action = 1 if dy > 0 else 2 # UP or DOWN - - actions = np.array([action]) - obs, rewards, terminals, truncations, infos = env.step(actions) - total_steps += 1 - - if infos: - for info in infos: - episodes_finished += 1 - print(f" Episode finished: {info}") - - print(f" OK: Completed {episodes_finished} episodes in {total_steps} steps") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 5: Test with PufferLib vectorization - print("\n[TEST 5] Testing with pufferlib.vector.make()...") - try: - vecenv = pufferlib.vector.make( - MoveToTargetEnv, - num_envs=4, - backend=pufferlib.vector.Serial, - ) - obs, _ = vecenv.reset() - print(f" OK: Created vectorized env with 4 copies") - print(f" Vectorized observation shape: {obs.shape}") - - # Take a few steps - for i in range(3): - actions = np.random.randint(0, 5, size=4) - obs, rewards, terminals, truncations, infos = vecenv.step(actions) - print(f" OK: Vectorized stepping works") - vecenv.close() - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - print("\n" + "=" * 60) - print("ALL ENVIRONMENT TESTS PASSED!") - print("=" * 60) - return True - - -# ============================================================================= -# SECTION 3: POLICY NETWORK -# ============================================================================= -""" -WHY this specific architecture? -------------------------------- -PufferLib expects policies to follow certain conventions: - -1. 
forward_eval(observations, state=None) -> (logits, values) - - This is what the trainer calls during rollout collection - - Returns action LOGITS (not probabilities) and value estimates - - The `state` parameter is for RNNs (we return None for feedforward) - -2. Use pufferlib.pytorch.layer_init() for weight initialization - - Proper initialization is crucial for stable learning - - Different std values for actor vs critic heads - -WHY layer_init? ---------------- -Neural network initialization matters A LOT for RL: -- Too large weights -> exploding gradients, unstable training -- Too small weights -> vanishing gradients, slow learning -- layer_init uses orthogonal initialization which works well for RL - -ARCHITECTURE: -observation (6) -> encoder (64 -> 64) -> actor head (5) + critic head (1) -""" - - -class Policy(nn.Module): - """ - Actor-Critic policy network following PufferLib conventions. - - The network has: - - Shared encoder: processes observations into features - - Actor head: outputs action logits (5 actions) - - Critic head: outputs value estimate (1 value) - """ - - def __init__(self, env, hidden_size=64): - """ - WHY take env as parameter? - -------------------------- - We extract observation and action sizes from the environment. - This is more robust than hardcoding dimensions. - - PufferLib's vectorized envs provide: - - env.single_observation_space: shape of one agent's observation - - env.single_action_space: the action space for one agent - - For regular Gymnasium envs, these would be observation_space/action_space. - """ - super().__init__() - - # Get dimensions from environment - obs_size = env.single_observation_space.shape[0] - action_size = env.single_action_space.n - - # ----------------------------------------------------------------- - # TODO 3.1: Create the encoder (shared backbone) - # ----------------------------------------------------------------- - # The encoder processes observations into a feature vector. - # Both actor and critic will use these features. - # - # Architecture: Linear(obs_size, hidden_size) -> ReLU -> Linear(hidden_size, hidden_size) -> ReLU - # - # Use pufferlib.pytorch.layer_init() for each Linear layer. - # Default std works for hidden layers. - # - # Example: - # self.encoder = nn.Sequential( - # pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), - # nn.ReLU(), - # pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - # nn.ReLU(), - # ) - # - # YOUR CODE: - - self.encoder = nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), - nn.ReLU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.ReLU(), - ) - - # ----------------------------------------------------------------- - # TODO 3.2: Create the actor head - # ----------------------------------------------------------------- - # Outputs action logits. Use std=0.01 for small initial outputs. - # WHY small std? We want initial actions to be nearly uniform. - # - # self.actor = pufferlib.pytorch.layer_init( - # nn.Linear(hidden_size, action_size), std=0.01 - # ) - # - # YOUR CODE: - - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, action_size), std=0.01 - ) - - # ----------------------------------------------------------------- - # TODO 3.3: Create the critic head - # ----------------------------------------------------------------- - # Outputs value estimate. Use std=1.0 for reasonable initial values. 
- # - # self.critic = pufferlib.pytorch.layer_init( - # nn.Linear(hidden_size, 1), std=1.0 - # ) - # - # YOUR CODE: - - self.critic = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0) - - def forward_eval(self, observations, state=None): - """ - WHY forward_eval specifically? - ------------------------------ - PufferLib's trainer calls forward_eval() during rollout collection. - It expects (logits, values) as return value. - - The state parameter is for recurrent networks (LSTMs). For feedforward - networks like ours, we ignore it and return None. - - PARAMETERS: - - observations: tensor of shape (batch_size, obs_size) - - state: For RNN/LSTM policies, carries hidden state between steps. - For feedforward networks (like ours), always None. - - RETURNS: - - logits: tensor of shape (batch_size, action_size) - unnormalized action scores - - values: tensor of shape (batch_size, 1) - value estimates - """ - # ----------------------------------------------------------------- - # TODO 3.4: Implement forward_eval - # ----------------------------------------------------------------- - # Step 1: Pass observations through encoder - # hidden = self.encoder(observations) - # - # Step 2: Get action logits from actor head - # logits = self.actor(hidden) - # - # Step 3: Get value estimate from critic head - # values = self.critic(hidden) - # - # Step 4: Return (logits, values) - - hidden = self.encoder(observations) - logits = self.actor(hidden) - values = self.critic(hidden) - - return logits, values - - def forward(self, observations, state=None): - """Standard PyTorch forward - required by PufferLib trainer.""" - return self.forward_eval(observations, state) - - -# ============================================================================= -# SECTION 4: TESTING POLICY -# ============================================================================= -""" -WHY test the policy? --------------------- -Verify the network architecture is correct before training. -Common bugs: -- Wrong input/output dimensions -- Missing activations -- NaN in outputs -""" - - -def test_policy(): - """Run basic sanity checks on the Policy network.""" - print("\n" + "=" * 60) - print("TESTING Policy Network") - print("=" * 60) - - # Test 1: Creation - print("\n[TEST 1] Creating policy...") - try: - # Create a dummy env to get dimensions - env = MoveToTargetEnv() - env.reset() # Initialize the env - - policy = Policy(env, hidden_size=64) - print(f" OK: Created policy") - - # Count parameters - total_params = sum(p.numel() for p in policy.parameters()) - print(f" Total parameters: {total_params}") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 2: forward_eval - print("\n[TEST 2] Testing forward_eval()...") - try: - # Create batch of observations - obs = torch.randn(4, 6) # batch of 4 - logits, values = policy.forward_eval(obs) - - print(f" Input shape: {obs.shape}") - print(f" Logits shape: {logits.shape} (expected: [4, 5])") - print(f" Values shape: {values.shape} (expected: [4, 1])") - - assert logits.shape == (4, 5), f"Wrong logits shape: {logits.shape}" - assert values.shape == (4, 1), f"Wrong values shape: {values.shape}" - - # Check for NaN - assert not torch.isnan(logits).any(), "NaN in logits!" - assert not torch.isnan(values).any(), "NaN in values!" 
- - print(" OK: Shapes correct, no NaN") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 3: Single observation - print("\n[TEST 3] Testing with single observation...") - try: - obs = torch.randn(1, 6) - logits, values = policy.forward_eval(obs) - - print(f" Logits: {logits}") - print(f" Value: {values}") - print(" OK: Single observation works") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - print("\n" + "=" * 60) - print("ALL POLICY TESTS PASSED!") - print("=" * 60) - return True - - -# ============================================================================= -# SECTION 5: TRAINING WITH PUFFERLIB -# ============================================================================= -""" -WHY use pufferl.PuffeRL? ------------------------- -PufferLib's trainer handles ALL the RL internals: -- Rollout collection (running envs, storing experiences) -- GAE advantage computation -- PPO loss calculation (clipped surrogate, value loss, entropy) -- Gradient updates with clipping -- Logging and metrics - -This means our training code is MUCH simpler than learn.py! - -THE TRAINING LOOP: ------------------ -1. Create vectorized environment -2. Create policy -3. Create config dict with hyperparameters -4. Create PuffeRL trainer -5. Loop: trainer.evaluate() -> trainer.train() - -WHAT trainer.evaluate() does: -- Runs the policy in all environments -- Collects experiences into buffers -- Computes advantages and returns - -WHAT trainer.train() does: -- Runs PPO update on collected experiences -- Updates policy weights -- Logs metrics -""" - - -def train(quick_test=False): - """ - Main training function using PufferLib's trainer. - - PARAMETERS: - - quick_test: if True, run short training to verify code works - if False, run full training to see actual learning - """ - # ----------------------------------------------------------------- - # Hyperparameters - # ----------------------------------------------------------------- - if quick_test: - total_timesteps = 10000 - num_envs = 4 - else: - total_timesteps = 100000 - num_envs = 8 - - # Detect device - # device = "mps" if torch.backends.mps.is_available() else "cpu" - device = "cpu" - - print("=" * 60) - print("TRAINING WITH PUFFERLIB") - print("=" * 60) - print(f"MPS available: {torch.backends.mps.is_available()}") - print(f"Using device: {device}") - print(f"Total timesteps: {total_timesteps}") - print(f"Num environments: {num_envs}") - print("=" * 60) - - # ----------------------------------------------------------------- - # TODO 5.1: Create vectorized environment - # ----------------------------------------------------------------- - # PufferLib's vector.make() creates multiple environment copies. - # - # Backend options: - # - Serial: Runs envs sequentially. Good for debugging because errors - # appear in the main process with full stack traces. - # - Multiprocessing: Runs envs in parallel. Much faster for many envs, - # but errors in subprocesses are harder to debug. - # - # Tip: Use Serial until your code works, then switch to Multiprocessing. 
- # - # vecenv = pufferlib.vector.make( - # MoveToTargetEnv, - # num_envs=num_envs, - # backend=pufferlib.vector.Serial, - # ) - # - # YOUR CODE: - - vecenv = pufferlib.vector.make( - MoveToTargetEnv, num_envs=num_envs, backend=pufferlib.vector.Multiprocessing - ) - - # ----------------------------------------------------------------- - # TODO 5.2: Create policy - # ----------------------------------------------------------------- - # Use vecenv.driver_env to get a reference to one of the environment copies. - # This lets us access single_observation_space and single_action_space - # for creating the policy with correct input/output dimensions. - # Move policy to device for GPU training. - # - # policy = Policy(vecenv.driver_env, hidden_size=64).to(device) - # - # YOUR CODE: - - policy = Policy(vecenv.driver_env, hidden_size=64).to(device) - next(policy.parameters()).device - - # ----------------------------------------------------------------- - # TODO 5.3: Create config - # ----------------------------------------------------------------- - # PufferLib's trainer uses a Config object for hyperparameters. - # These are standard PPO values that work well. - # - # config = pufferl.Config( - # total_timesteps=total_timesteps, - # learning_rate=3e-4, - # num_steps=128, # Steps per rollout - # num_minibatches=4, # Minibatches per update - # update_epochs=4, # PPO epochs per update - # gamma=0.99, # Discount factor - # gae_lambda=0.95, # GAE parameter - # clip_coef=0.2, # PPO clipping - # vf_coef=0.5, # Value loss coefficient - # ent_coef=0.01, # Entropy bonus coefficient - # max_grad_norm=0.5, # Gradient clipping - # ) - # - # YOUR CODE: - - config = { - "env": "MoveToTarget", - "total_timesteps": total_timesteps, - "learning_rate": 3e-4, - "batch_size": num_envs * 128, - "bptt_horizon": 128, - "minibatch_size": 512, - "max_minibatch_size": 512, - "update_epochs": 4, - "gamma": 0.99, - "gae_lambda": 0.95, - "clip_coef": 0.2, - "vf_coef": 0.5, - "vf_clip_coef": 0.2, - "ent_coef": 0.01, - "max_grad_norm": 0.5, - "device": device, - "seed": 42, - "torch_deterministic": True, - "cpu_offload": False, - "use_rnn": False, - "compile": False, - "optimizer": "adam", - "adam_beta1": 0.9, - "adam_beta2": 0.999, - "adam_eps": 1e-8, - "anneal_lr": True, - "vtrace_rho_clip": 1.0, - "vtrace_c_clip": 1.0, - "prio_alpha": 0.8, - "prio_beta0": 0.2, - "checkpoint_interval": 200, - "data_dir": "experiments", - "precision": "float32", - } - - # ----------------------------------------------------------------- - # TODO 5.4: Create trainer - # ----------------------------------------------------------------- - # The PuffeRL trainer handles the entire training loop internals. - # - # trainer = pufferl.PuffeRL( - # config=config, - # vecenv=vecenv, - # policy=policy, - # optimizer=torch.optim.Adam(policy.parameters(), lr=config.learning_rate), - # ) - # - # YOUR CODE: - - trainer = pufferl.PuffeRL(config, vecenv, policy) - - # ----------------------------------------------------------------- - # TODO 5.5: Training loop - # ----------------------------------------------------------------- - # The training loop is very simple with PufferLib: - # 1. trainer.evaluate() - collect experiences - # 2. trainer.train() - run PPO update - # 3. 
Repeat until done - # - # Example: - # while not trainer.done: - # trainer.evaluate() - # trainer.train() - # - # # Print progress every 10 epochs - # if trainer.epoch % 10 == 0: - # # Get metrics from trainer - # metrics = trainer.metrics - # print(f"Epoch {trainer.epoch} | " - # f"reward: {metrics.get('episode_reward', 0):.2f} | " - # f"length: {metrics.get('episode_length', 0):.1f}") - # - # Or use the built-in dashboard: - # while not trainer.done: - # trainer.evaluate() - # trainer.train() - # trainer.print_dashboard() # Pretty-printed metrics - # - # YOUR CODE: - - while trainer.global_step < total_timesteps: - trainer.evaluate() - trainer.train() - - # Cleanup - trainer.close() - vecenv.close() - - print("\n" + "=" * 60) - print("TRAINING COMPLETE!") - print("=" * 60) - - return policy - - -# ============================================================================= -# SECTION 6: EVALUATION WITH ASCII RENDERING -# ============================================================================= - - -def eval_policy(num_episodes=3, delay=0.1): - """ - Run the trained policy and watch it play with ASCII rendering. - - PARAMETERS: - - num_episodes: number of episodes to run - - delay: seconds between frames (for watchability) - """ - import time - import glob - - print("=" * 60) - print("EVALUATING TRAINED POLICY") - print("=" * 60) - - # Find latest checkpoint - checkpoints = glob.glob("experiments/**/model.pt", recursive=True) - if not checkpoints: - print( - "No checkpoint found in experiments/. Train first with 'python learn_v2.py train'" - ) - return - - latest_checkpoint = max(checkpoints, key=lambda x: os.path.getmtime(x)) - print(f"Loading checkpoint: {latest_checkpoint}") - - # Create environment (single, not vectorized) - env = MoveToTargetEnv(seed=int(time.time())) - - # Create and load policy - policy = Policy(env, hidden_size=64) - checkpoint = torch.load(latest_checkpoint, map_location="cpu", weights_only=True) - policy.load_state_dict(checkpoint) - policy.eval() - - print(f"Running {num_episodes} episodes...\n") - - for ep in range(num_episodes): - print(f"\n{'=' * 60}") - print(f"EPISODE {ep + 1}") - print(f"{'=' * 60}") - - obs, _ = env.reset() - env.render() - time.sleep(delay) - - done = False - total_reward = 0.0 - - while not done: - # Get action from policy - with torch.no_grad(): - obs_tensor = torch.from_numpy(obs).float() - logits, _ = policy(obs_tensor) - action = torch.argmax(logits, dim=-1).item() - - # Step environment - obs, rewards, terminals, truncations, infos = env.step(np.array([action])) - total_reward += rewards[0] - done = terminals[0] or truncations[0] - - # Render - env.render() - action_names = ["NOOP", "UP", "DOWN", "LEFT", "RIGHT"] - print(f"Action: {action_names[action]}, Reward: {rewards[0]:.3f}") - time.sleep(delay) - - # Episode summary - if infos: - info = infos[0] - result = ( - "REACHED TARGET!" 
- if info.get("reached_target") - else "Failed (wall/timeout)" - ) - print(f"\nResult: {result}") - print(f"Episode length: {info.get('episode_length', 'N/A')}") - print(f"Total reward: {total_reward:.3f}") - - env.close() - print("\n" + "=" * 60) - print("EVALUATION COMPLETE!") - print("=" * 60) - - -# ============================================================================= -# MAIN EXECUTION -# ============================================================================= - -if __name__ == "__main__": - import sys - - # Parse command line arguments - if len(sys.argv) > 1: - command = sys.argv[1] - if command == "test": - # Run all tests - env_ok = test_environment() - if env_ok: - test_policy() - elif command == "train": - # Run full training - test_environment() - test_policy() - train(quick_test=False) - elif command == "quick": - # Quick training test - # test_environment() - # test_policy() - train(quick_test=True) - elif command == "eval": - # Evaluate trained policy with ASCII rendering - eval_policy(num_episodes=3, delay=0.1) - else: - print(f"Unknown command: {command}") - print("Usage: python learn_v2.py [test|train|quick|eval]") - else: - # Default: run tests only - print("Running tests... (use 'python learn_v2.py train' for full training)") - print() - env_ok = test_environment() - if env_ok: - test_policy() diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 55799cf94..6df80098c 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -24,7 +24,6 @@ #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 -// Rewards #define REWARD_APPROACH 0.05f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f @@ -52,13 +51,12 @@ typedef enum { BOSS_RECOVERING, } BossState; -// Only use floats! 
typedef struct { float perf; // 0-1 normalized metric - float score; // unnormalized metric - float episode_return; // sum of rewards - float episode_length; // steps per episode - float wins; // episodes where boss died + float score; // Unnormalized metric + float episode_return; // Sum of rewards + float episode_length; // Steps per episode + float wins; // Episodes where boss died float n; // Required as last field } Log; @@ -86,9 +84,8 @@ typedef struct { float boss_hp; int boss_phase_ticks; - float episode_return; // track within episode + float episode_return; - // stats int player_wins; int boss_wins; int timeouts; @@ -228,7 +225,7 @@ void c_step(BossFight *env) { reward += REWARD_HIT_WALL; } - // can't walk out of bounds + // Can't walk out of bounds env->player_x = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); env->player_y = @@ -239,14 +236,13 @@ void c_step(BossFight *env) { reward += REWARD_APPROACH * (env->prev_distance - dist); env->prev_distance = dist; - // push player out if clipping into boss + // Push player out if clipping into boss if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; env->player_x += (dx / dist) * overlap; env->player_y += (dy / dist) * overlap; - // recalculate distance after push dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); } @@ -262,8 +258,8 @@ void c_step(BossFight *env) { env->player_state == PLAYER_DODGING && env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); - // Souls-like: you can i-frame briefly, but the AOE persists longer than the - // i-frame window; if you're still in the hitbox after i-frames, you get hit. + // AOE persists longer than the i-frame window + // If player is still in the hitbox after i-frames, you get hit. bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { diff --git a/runpod.md b/runpod.md deleted file mode 100644 index 81859f0f9..000000000 --- a/runpod.md +++ /dev/null @@ -1,12 +0,0 @@ -curl -LsSf https://astral.sh/uv/install.sh | sh -source ~/.bashrc -git clone https://github.com/frixaco/PufferLib -cd PufferLib -git switch boss-fight -uv venv -source .venv/bin/activate -uv pip install -e . 
-python setup.py build_boss_fight --inplace --force -puffer train puffer_boss_fight --train.total-timesteps 5000000 --train.device cuda --vec.num-envs 8192 --vec.num-workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 - -puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 27ede234995413f863bdeda97780471208a6ec87 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 23:06:22 +0500 Subject: [PATCH 25/29] add readme --- pufferlib/config/boss_fight.ini | 6 +- pufferlib/ocean/boss_fight/README.md | 100 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 pufferlib/ocean/boss_fight/README.md diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index c0913bb62..bb7f8859d 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,8 +5,8 @@ policy_name = Policy # rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 448 -num_workers = 14 +num_envs = 1024 +num_workers = 8 batch_size = auto zero_copy = True seed = 42 @@ -39,7 +39,7 @@ precision = float32 compile = False # Core PPO hyperparameters -total_timesteps = 10_000_000 +total_timesteps = 5_000_000 learning_rate = 0.0003 anneal_lr = True min_lr_ratio = 0.0 diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md new file mode 100644 index 000000000..958cdcd57 --- /dev/null +++ b/pufferlib/ocean/boss_fight/README.md @@ -0,0 +1,100 @@ +# BossFight (PufferLib Ocean) + +BossFight is a simple 2D boss-fight reinforcement learning environment. + +The boss currently has **one attack**: a circular **AOE burst** and cycles between 4 states. +Player (agent) has to defeat the boss by attacking and avoiding AoE attacks by dodging (has i-frames). +All hitboxes are circles (collision = circles overlap). + +## Game rules + +- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `5.0`) +- **Boss:** stationary at `(0, 0)` +- **Episode ends on:** + - win: boss HP reaches 0 + - loss: player HP reaches 0 + - timeout: `EPISODE_LENGTH` steps + +### Boss attack cycle + +The boss cycles through: + +`IDLE (BOSS_IDLE_TICKS) -> WINDUP (BOSS_WINDUP_TICKS) -> ACTIVE (BOSS_ACTIVE_TICKS) -> RECOVERY (BOSS_RECOVERY_TICKS) -> ...` + +During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. 
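
The cycle is driven by a per-tick counter (`boss_phase_ticks`) in `c_step`. A minimal sketch of the idea (the helper name below is invented for illustration; only the enum values and tick constants come from `boss_fight.h`):

```c
// Illustrative only: decrement the current phase counter each tick; when it
// hits zero, move to the next state in the cycle and reload its duration.
static void boss_advance_phase(BossState *state, int *phase_ticks) {
    if (--(*phase_ticks) > 0) {
        return; // still inside the current phase
    }
    switch (*state) {
    case BOSS_IDLING:     *state = BOSS_WINDING_UP; *phase_ticks = BOSS_WINDUP_TICKS;   break;
    case BOSS_WINDING_UP: *state = BOSS_ATTACKING;  *phase_ticks = BOSS_ACTIVE_TICKS;   break;
    case BOSS_ATTACKING:  *state = BOSS_RECOVERING; *phase_ticks = BOSS_RECOVERY_TICKS; break;
    case BOSS_RECOVERING: *state = BOSS_IDLING;     *phase_ticks = BOSS_IDLE_TICKS;     break;
    }
}
```

With the default tick counts (7 + 5 + 5 + 5) the full cycle is 22 ticks, so the AOE goes active roughly every 22 steps.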
+ +### Player mechanics + +- **Move** (only while idling): 4 directional movement at `PLAYER_SPEED_PER_TICK` +- **Attack**: melee hit if within `PLAYER_ATTACK_RADIUS` (locks the player for `PLAYER_ATTACK_TICKS`) +- **Dodge**: + - lasts `PLAYER_DODGE_TICKS` and automatically moves the player directly **away from the boss** at `PLAYER_DODGE_SPEED_PER_TICK` + - the first `PLAYER_IFRAME_TICKS` are i-frames + - the boss AOE lasts longer than the i-frame window, so “dodge in place” isn’t sufficient -- you must **exit the AOE** + - after dodge ends, `PLAYER_DODGE_COOLDOWN` ticks must pass before dodging again + +## Action space + +`Discrete(7)`: + +| id | action | +| --: | ---------- | +| 0 | idle | +| 1 | move up | +| 2 | move down | +| 3 | move left | +| 4 | move right | +| 5 | dodge | +| 6 | attack | + +## Observation space + +`Box(shape=(13,), dtype=float32)` (see `update_observations` in `boss_fight.h`): + +| idx | meaning | +| --: | ------------------------------------------------------ | +| 0 | `boss_x - player_x` | +| 1 | `boss_y - player_y` | +| 2 | `player_x` | +| 3 | `player_y` | +| 4 | `boss_x` | +| 5 | `boss_y` | +| 6 | `player_hp` | +| 7 | `boss_hp` | +| 8 | `player_state` (`0=idle, 1=dodge, 2=attack`) | +| 9 | `player_dodge_cooldown` | +| 10 | `player_state_ticks` (remaining) | +| 11 | `boss_state` (`0=idle, 1=windup, 2=attack, 3=recover`) | +| 12 | `boss_phase_ticks` (remaining) | + +## Rewards (defaults) + +All reward constants are in `boss_fight.h`: + +- **Per-step:** `REWARD_TICK` +- **Shaping:** `REWARD_APPROACH * (prev_distance - distance)` +- **Events:** + - `REWARD_PLAYER_HIT_BOSS` + - `REWARD_BOSS_HIT_PLAYER` + - `REWARD_DODGE_SUCCESS` + - `REWARD_HIT_WALL` +- **Terminal:** `REWARD_KILL_BOSS`, `REWARD_PLAYER_DIED`, `REWARD_TIMEOUT` + +**Dodge success reward** is only paid when: + +1. you **start** a dodge while inside the AOE during the boss danger window (**WINDUP** or **ACTIVE**), and +2. you **exit** the AOE before the danger window ends. + +## Rendering / manual play + +- Rendering uses **Raylib**. `BossFight.render()` opens a window and draws the player/boss circles + hit radii. 
+- A tiny standalone debug harness lives in `boss_fight.c`: + - Hold `Left Shift` for manual controls: `WASD` move, `Space` dodge, `J` attack + - Without `Left Shift` it takes random actions + +## Files + +- `boss_fight.h`: core environment logic (`c_reset`, `c_step`, `c_render`) +- `binding.c`: CPython extension glue (uses `pufferlib/ocean/env_binding.h`) +- `boss_fight.py`: PufferLib wrapper (`PufferEnv`) + vectorized stepping +- `pufferlib/config/boss_fight.ini`: default training config for `puffer train puffer_boss_fight` From ef4822306eae1e175e765aae5565e967ac31e6cd Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 23:18:34 +0500 Subject: [PATCH 26/29] fix manual control --- pufferlib/ocean/boss_fight/boss_fight.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 6df80098c..609c6c07e 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -163,13 +163,29 @@ void c_step(BossFight *env) { env->terminals[0] = 0; int action = env->actions[0]; + if (IsKeyDown(KEY_LEFT_SHIFT)) { + if (IsKeyDown(KEY_W)) + action = 1; + else if (IsKeyDown(KEY_S)) + action = 2; + else if (IsKeyDown(KEY_A)) + action = 3; + else if (IsKeyDown(KEY_D)) + action = 4; + else if (IsKeyDown(KEY_SPACE)) + action = 5; + else if (IsKeyDown(KEY_J)) + action = 6; + else + action = 0; + } float dx = 0; float dy = 0; if (action == 1) { - dy = PLAYER_SPEED_PER_TICK; - } else if (action == 2) { dy = -PLAYER_SPEED_PER_TICK; + } else if (action == 2) { + dy = PLAYER_SPEED_PER_TICK; } else if (action == 3) { dx = -PLAYER_SPEED_PER_TICK; } else if (action == 4) { From d2c4a6c9d4c46a24d14179efd37e09698ea658cc Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 00:17:42 +0500 Subject: [PATCH 27/29] sweep sweep --- pufferlib/config/boss_fight.ini | 55 +++++++++------------------------ 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index bb7f8859d..8013ea38b 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -2,7 +2,6 @@ package = ocean env_name = puffer_boss_fight policy_name = Policy -# rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] num_envs = 1024 @@ -12,59 +11,40 @@ zero_copy = True seed = 42 [env] -# Environment-specific params (passed to env constructor) -# None needed - using defaults from README [policy] -# Policy constructor args (e.g., hidden_size) -# hidden_size = 64 # Experiment: 32, 64, 128 [train] -# Experiment tracking name = boss_fight project = boss_fight_experiments data_dir = experiments checkpoint_interval = 200 - -# Reproducibility seed = 42 -# TODO: disable for sweep or speed torch_deterministic = True device = cpu - -# Optimization -# TODO: try muon with 0.015 lr optimizer = adam precision = float32 compile = False - -# Core PPO hyperparameters total_timesteps = 5_000_000 -learning_rate = 0.0003 +learning_rate = 0.000864 anneal_lr = True -min_lr_ratio = 0.0 -gamma = 0.99 -gae_lambda = 0.95 +min_lr_ratio = 0.437 +gamma = 0.983 +gae_lambda = 0.902 update_epochs = 4 -clip_coef = 0.2 -vf_coef = 0.5 -vf_clip_coef = 0.2 -max_grad_norm = 0.5 -ent_coef = 0.01 - -# Batch sizes -minibatch_size = 2048 +clip_coef = 0.421 +vf_coef = 4.38 +vf_clip_coef = 0.303 +max_grad_norm = 2.28 +ent_coef = 0.00623 +minibatch_size = 2048 max_minibatch_size = 32768 bptt_horizon = 32 - -# Adam parameters (if optimizer = adam) 
-adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-8 - -# V-trace (for off-policy correction) -# vtrace_rho_clip = 1.0 -# vtrace_c_clip = 1.0 +adam_beta1 = 0.991 +adam_beta2 = 0.998 +adam_eps = 1e-14 +vtrace_rho_clip = 2.72 +vtrace_c_clip = 2.13 [sweep] goal = maximize @@ -74,31 +54,26 @@ metric_distribution = linear max_suggestion_cost = 3600 use_gpu = True -# Learning rate sweep [sweep.train.learning_rate] distribution = log_normal min = 0.0001 max = 0.003 -# Entropy coefficient sweep (exploration vs exploitation) [sweep.train.ent_coef] distribution = log_normal min = 0.0001 max = 0.05 -# Discount factor sweep [sweep.train.gamma] distribution = logit_normal min = 0.95 max = 0.999 -# GAE lambda sweep [sweep.train.gae_lambda] distribution = logit_normal min = 0.9 max = 0.99 -# Minibatch size sweep [sweep.train.minibatch_size] distribution = uniform_pow2 min = 1024 From 0896d218aaddf9c9a8692bbfdb40871b74131964 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 04:11:43 +0500 Subject: [PATCH 28/29] normalize observation space data --- pufferlib/ocean/boss_fight/README.md | 35 ++++---- pufferlib/ocean/boss_fight/boss_fight.c | 2 +- pufferlib/ocean/boss_fight/boss_fight.h | 105 ++++++++++++++++------- pufferlib/ocean/boss_fight/boss_fight.py | 2 +- 4 files changed, 91 insertions(+), 53 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index 958cdcd57..fe56d6b30 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -8,7 +8,7 @@ All hitboxes are circles (collision = circles overlap). ## Game rules -- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `5.0`) +- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `500.0`) - **Boss:** stationary at `(0, 0)` - **Episode ends on:** - win: boss HP reaches 0 @@ -49,23 +49,22 @@ During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. 
## Observation space -`Box(shape=(13,), dtype=float32)` (see `update_observations` in `boss_fight.h`): - -| idx | meaning | -| --: | ------------------------------------------------------ | -| 0 | `boss_x - player_x` | -| 1 | `boss_y - player_y` | -| 2 | `player_x` | -| 3 | `player_y` | -| 4 | `boss_x` | -| 5 | `boss_y` | -| 6 | `player_hp` | -| 7 | `boss_hp` | -| 8 | `player_state` (`0=idle, 1=dodge, 2=attack`) | -| 9 | `player_dodge_cooldown` | -| 10 | `player_state_ticks` (remaining) | -| 11 | `boss_state` (`0=idle, 1=windup, 2=attack, 3=recover`) | -| 12 | `boss_phase_ticks` (remaining) | +`Box(shape=(12,), dtype=float32)` — all normalized to [-1, 1] or [0, 1] (see `update_observations` in `boss_fight.h`): + +| idx | meaning | range | +| --: | ---------------------------- | ------- | +| 0 | `player_x` normalized | [-1, 1] | +| 1 | `player_y` normalized | [-1, 1] | +| 2 | `dist_to_boss` normalized | [0, 1] | +| 3 | `player_hp` normalized | [0, 1] | +| 4 | `boss_hp` normalized | [0, 1] | +| 5 | `dodge_cooldown` normalized | [0, 1] | +| 6 | `dodge_remaining` | [0, 1] | +| 7 | `iframe_remaining` | [0, 1] | +| 8 | `attack_remaining` | [0, 1] | +| 9 | `time_until_aoe` | [0, 1] | +| 10 | `aoe_remaining` | [0, 1] | +| 11 | `episode_time_remaining` | [0, 1] | ## Rewards (defaults) diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 49198b733..5d69c4e27 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -2,7 +2,7 @@ #include "raylib.h" int main() { - int num_obs = 13; + int num_obs = 12; int num_actions = 1; int num_agents = 1; diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 609c6c07e..26dbd65a8 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -3,28 +3,29 @@ #include #include -#define ARENA_HALF_SIZE 5.0f -#define MAX_HP 1.0f -#define PLAYER_SPEED_PER_TICK 0.25f -#define PLAYER_SIZE 0.3f -#define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.4f +#define ARENA_HALF_SIZE 500.0f +#define MAX_HP 100.0f +#define PLAYER_SPEED_PER_TICK 25.0f +#define PLAYER_SIZE 30.0f +#define BOSS_SIZE 50.0f +#define PLAYER_ATTACK_RADIUS 40.0f #define PLAYER_ATTACK_TICKS 3 #define PLAYER_DODGE_TICKS 4 #define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_DODGE_SPEED_PER_TICK 0.35f -#define PLAYER_ATTACK_DMG 0.05f -#define BOSS_ATTACK_DMG 0.15f -#define BOSS_AOE_ATTACK_RADIUS 0.8f +#define PLAYER_DODGE_SPEED_PER_TICK 35.0f +#define PLAYER_ATTACK_DMG 5.0f +#define BOSS_ATTACK_DMG 15.0f +#define BOSS_AOE_ATTACK_RADIUS 80.0f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 #define BOSS_ACTIVE_TICKS 5 #define BOSS_RECOVERY_TICKS 5 + #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 -#define REWARD_APPROACH 0.05f +#define REWARD_APPROACH 0.7f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f #define REWARD_BOSS_HIT_PLAYER -0.05f @@ -72,7 +73,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - float prev_distance; + float dist_to_boss; PlayerState player_state; float player_hp; @@ -111,19 +112,59 @@ void add_log(BossFight *env) { void update_observations(BossFight *env) { int obs_idx = 0; - env->observations[obs_idx++] = env->boss_x - env->player_x; - env->observations[obs_idx++] = env->boss_y - env->player_y; - env->observations[obs_idx++] = env->player_x; - env->observations[obs_idx++] = env->player_y; - env->observations[obs_idx++] = env->boss_x; - 
env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = (float)env->player_hp; - env->observations[obs_idx++] = (float)env->boss_hp; - env->observations[obs_idx++] = (float)env->player_state; - env->observations[obs_idx++] = (float)env->player_dodge_cooldown; - env->observations[obs_idx++] = (float)env->player_state_ticks; - env->observations[obs_idx++] = (float)env->boss_state; - env->observations[obs_idx++] = (float)env->boss_phase_ticks; + + env->observations[obs_idx++] = env->player_x / ARENA_HALF_SIZE; + env->observations[obs_idx++] = env->player_y / ARENA_HALF_SIZE; + + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + float max_dist = sqrtf(2.0f) * ARENA_HALF_SIZE; + env->observations[obs_idx++] = dist / max_dist; + + env->observations[obs_idx++] = env->player_hp / MAX_HP; + env->observations[obs_idx++] = env->boss_hp / MAX_HP; + + env->observations[obs_idx++] = + (float)env->player_dodge_cooldown / PLAYER_DODGE_COOLDOWN; + + float dodge_remaining = + (env->player_state == PLAYER_DODGING) + ? (float)env->player_state_ticks / PLAYER_DODGE_TICKS + : 0.0f; + env->observations[obs_idx++] = dodge_remaining; + + int iframe_ticks = + env->player_state_ticks - (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + float iframe_remaining = + (env->player_state == PLAYER_DODGING && iframe_ticks > 0) + ? fminf((float)iframe_ticks / PLAYER_IFRAME_TICKS, 1.0f) + : 0.0f; + env->observations[obs_idx++] = iframe_remaining; + + float attack_remaining = + (env->player_state == PLAYER_ATTACKING) + ? (float)env->player_state_ticks / PLAYER_ATTACK_TICKS + : 0.0f; + env->observations[obs_idx++] = attack_remaining; + + float cycle_len = BOSS_IDLE_TICKS + BOSS_WINDUP_TICKS + BOSS_ACTIVE_TICKS + + BOSS_RECOVERY_TICKS; + float time_until_aoe = 0.0f; + if (env->boss_state == BOSS_IDLING) + time_until_aoe = env->boss_phase_ticks + BOSS_WINDUP_TICKS; + else if (env->boss_state == BOSS_WINDING_UP) + time_until_aoe = env->boss_phase_ticks; + else if (env->boss_state == BOSS_RECOVERING) + time_until_aoe = + env->boss_phase_ticks + BOSS_IDLE_TICKS + BOSS_WINDUP_TICKS; + env->observations[obs_idx++] = time_until_aoe / cycle_len; + + float aoe_remaining = (env->boss_state == BOSS_ATTACKING) + ? 
(float)env->boss_phase_ticks / BOSS_ACTIVE_TICKS + : 0.0f; + env->observations[obs_idx++] = aoe_remaining; + + env->observations[obs_idx++] = + (float)(EPISODE_LENGTH - env->tick) / EPISODE_LENGTH; } void c_reset(BossFight *env) { @@ -152,7 +193,7 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - env->prev_distance = + env->dist_to_boss = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); update_observations(env); @@ -204,11 +245,6 @@ void c_step(BossFight *env) { env->player_state != PLAYER_DODGING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; - if (wanna_attack && can_attack) { - env->player_state_ticks = PLAYER_ATTACK_TICKS; - env->player_state = PLAYER_ATTACKING; - } - float aoe_dist = BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; bool boss_threatening = env->boss_state == BOSS_WINDING_UP || env->boss_state == BOSS_ATTACKING; @@ -249,8 +285,9 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - reward += REWARD_APPROACH * (env->prev_distance - dist); - env->prev_distance = dist; + float max_dist = sqrtf(2.0f) * ARENA_HALF_SIZE; + reward += REWARD_APPROACH * ((env->dist_to_boss - dist) / max_dist); + env->dist_to_boss = dist; // Push player out if clipping into boss if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { @@ -265,6 +302,8 @@ void c_step(BossFight *env) { bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; if (wanna_attack && can_attack && close_enough) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; env->boss_hp -= PLAYER_ATTACK_DMG; reward += REWARD_PLAYER_HIT_BOSS; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index fdb4bfb4f..f966243ab 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -12,7 +12,7 @@ def __init__( self, num_envs=1, render_mode=None, log_interval=1, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( - low=-10, high=110, shape=(13,), dtype=np.float32 + low=-1, high=1, shape=(12,), dtype=np.float32 ) self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = render_mode From 8ed96f304e873c55b236c6984991df231975e1a8 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 05:43:24 +0500 Subject: [PATCH 29/29] better UI --- pufferlib/ocean/boss_fight/README.md | 11 +- pufferlib/ocean/boss_fight/boss_fight.h | 301 ++++++++++++++++++++---- pufferlib/ocean/environment.py | 280 +++++++--------------- 3 files changed, 353 insertions(+), 239 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index fe56d6b30..5df7ada54 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -21,7 +21,7 @@ The boss cycles through: `IDLE (BOSS_IDLE_TICKS) -> WINDUP (BOSS_WINDUP_TICKS) -> ACTIVE (BOSS_ACTIVE_TICKS) -> RECOVERY (BOSS_RECOVERY_TICKS) -> ...` -During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. +During **ACTIVE**, the boss deals `BOSS_ATTACK_DMG` damage **every tick** the player overlaps the AOE circle (unless i-framed). Staying in the AOE for the full 5 ticks = 75 damage. ### Player mechanics @@ -86,7 +86,14 @@ All reward constants are in `boss_fight.h`: ## Rendering / manual play -- Rendering uses **Raylib**. 
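
To put rough numbers on it (defaults from `boss_fight.h`, with the AOE reach that `c_step` computes as `BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS` = 160): dodging away at `PLAYER_DODGE_SPEED_PER_TICK` = 35 from melee range takes 2 to 3 ticks to clear that reach, and the 2 `PLAYER_IFRAME_TICKS` cover the ticks still spent inside the circle, so a well-timed dodge can escape without taking a hit.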
`BossFight.render()` opens a window and draws the player/boss circles + hit radii. +- Rendering uses **Raylib** with enhanced visuals: + - Grid overlay + crosshair axes + - Time remaining bar (steps + seconds) + - Boss AoE telegraph (charging ring during WINDUP, filled during ACTIVE) + - Boss state label (IDLE/WINDUP/ACTIVE/RECOVER) + - Dodge trail particles + i-frame blink effect + - Attack pulse ring effect + - HP bars + dodge cooldown bar in HUD - A tiny standalone debug harness lives in `boss_fight.c`: - Hold `Left Shift` for manual controls: `WASD` move, `Space` dodge, `J` attack - Without `Left Shift` it takes random actions diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 26dbd65a8..2f0c4f488 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -5,16 +5,19 @@ #define ARENA_HALF_SIZE 500.0f #define MAX_HP 100.0f -#define PLAYER_SPEED_PER_TICK 25.0f +#define EPSILON 1e-6f + #define PLAYER_SIZE 30.0f -#define BOSS_SIZE 50.0f +#define PLAYER_SPEED_PER_TICK 25.0f #define PLAYER_ATTACK_RADIUS 40.0f #define PLAYER_ATTACK_TICKS 3 +#define PLAYER_ATTACK_DMG 5.0f #define PLAYER_DODGE_TICKS 4 #define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_DODGE_SPEED_PER_TICK 35.0f -#define PLAYER_ATTACK_DMG 5.0f + +#define BOSS_SIZE 50.0f #define BOSS_ATTACK_DMG 15.0f #define BOSS_AOE_ATTACK_RADIUS 80.0f #define BOSS_IDLE_TICKS 7 @@ -22,9 +25,6 @@ #define BOSS_ACTIVE_TICKS 5 #define BOSS_RECOVERY_TICKS 5 -#define HP_BAR_WIDTH 40 -#define HP_BAR_HEIGHT 5 - #define REWARD_APPROACH 0.7f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f @@ -36,12 +36,30 @@ #define REWARD_TICK -0.01f #define EPISODE_LENGTH 600 -const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; -const Color BOSS_COLOR = (Color){0, 187, 187, 255}; -const Color TEXT_COLOR = (Color){241, 241, 241, 255}; -const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; -const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; -const Color HP_COLOR = (Color){0, 255, 0, 255}; +#define WINDOW_SIZE 720 +#define TARGET_FPS 30 +#define HP_BAR_WIDTH 40 +#define HP_BAR_HEIGHT 5 +#define UI_MARGIN 20 +#define UI_RIGHT_X 580 +#define UI_BOTTOM_Y 680 +#define UI_HP_BAR_Y 700 +#define UI_FONT_SIZE 20 +#define UI_FONT_SIZE_SMALL 16 + +static const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; +static const Color BOSS_COLOR = (Color){0, 187, 187, 255}; +static const Color TEXT_COLOR = (Color){241, 241, 241, 255}; +static const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; +static const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; +static const Color HP_COLOR = (Color){0, 255, 0, 255}; + +static const Color ARENA_BORDER_COLOR = (Color){30, 120, 120, 255}; +static const Color ARENA_GRID_COLOR = (Color){30, 70, 70, 255}; + +static const Color PLAYER_DODGE_COLOR = (Color){255, 215, 90, 255}; +static const Color PLAYER_ATTACK_COLOR = (Color){170, 220, 255, 255}; +static const Color BOSS_DANGER_COLOR = (Color){255, 80, 80, 255}; typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; @@ -220,6 +238,7 @@ void c_step(BossFight *env) { else action = 0; } + float dx = 0; float dy = 0; @@ -260,12 +279,12 @@ void c_step(BossFight *env) { env->player_state = PLAYER_DODGING; } - // Dodge = multi-tick movement out of the AOE (no i-frames) + // Dodge: multi-tick movement away from boss, with i-frames at start if (env->player_state == PLAYER_DODGING) { float away_x = env->player_x - env->boss_x; float away_y 
= env->player_y - env->boss_y; float away_norm = sqrtf(away_x * away_x + away_y * away_y); - if (away_norm > 1e-6f) { + if (away_norm > EPSILON) { env->player_x += (away_x / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; env->player_y += (away_y / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; } @@ -290,7 +309,7 @@ void c_step(BossFight *env) { env->dist_to_boss = dist; // Push player out if clipping into boss - if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { + if (dist < BOSS_SIZE + PLAYER_SIZE && dist > EPSILON) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; @@ -313,8 +332,7 @@ void c_step(BossFight *env) { env->player_state == PLAYER_DODGING && env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); - // AOE persists longer than the i-frame window - // If player is still in the hitbox after i-frames, you get hit. + // Boss deals damage every tick while player in AOE (unless i-framed) bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { @@ -399,17 +417,17 @@ void c_step(BossFight *env) { int world_to_screen(float world_coord) { return (int)((world_coord + ARENA_HALF_SIZE) / (2 * ARENA_HALF_SIZE) * - 720.0f); + (float)WINDOW_SIZE); } float radius_to_screen(float world_radius) { - return world_radius / (2 * ARENA_HALF_SIZE) * 720.0f; + return world_radius / (2 * ARENA_HALF_SIZE) * (float)WINDOW_SIZE; } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(720, 720, "BossFight"); - SetTargetFPS(30); + InitWindow(WINDOW_SIZE, WINDOW_SIZE, "BossFight"); + SetTargetFPS(TARGET_FPS); } if (IsKeyDown(KEY_ESCAPE)) { @@ -419,47 +437,248 @@ void c_render(BossFight *env) { BeginDrawing(); ClearBackground(BACKGROUND_COLOR); - DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + DrawText("Beat the boss!", UI_MARGIN, UI_MARGIN, UI_FONT_SIZE, TEXT_COLOR); + + // Arena (bounds + subtle grid) + { + const float grid_step = 100.0f; + const float axis_step = 250.0f; + const Color grid = Fade(ARENA_GRID_COLOR, 0.28f); + const Color axis = Fade(ARENA_BORDER_COLOR, 0.35f); + + for (float x = -ARENA_HALF_SIZE; x <= ARENA_HALF_SIZE + 0.5f; + x += grid_step) { + int sx = world_to_screen(x); + DrawLine(sx, 0, sx, WINDOW_SIZE, grid); + } + for (float y = -ARENA_HALF_SIZE; y <= ARENA_HALF_SIZE + 0.5f; + y += grid_step) { + int sy = world_to_screen(y); + DrawLine(0, sy, WINDOW_SIZE, sy, grid); + } + + // Crosshair axes + DrawLine(world_to_screen(0.0f), 0, world_to_screen(0.0f), WINDOW_SIZE, + axis); + DrawLine(0, world_to_screen(0.0f), WINDOW_SIZE, world_to_screen(0.0f), + axis); + + // Quadrant markers + for (float t = -ARENA_HALF_SIZE; t <= ARENA_HALF_SIZE + 0.5f; + t += axis_step) { + int s = world_to_screen(t); + DrawLineEx((Vector2){(float)s, 4.0f}, (Vector2){(float)s, 14.0f}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){4.0f, (float)s}, (Vector2){14.0f, (float)s}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){(float)s, (float)WINDOW_SIZE - 4.0f}, + (Vector2){(float)s, (float)WINDOW_SIZE - 14.0f}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){(float)WINDOW_SIZE - 4.0f, (float)s}, + (Vector2){(float)WINDOW_SIZE - 14.0f, (float)s}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + } + + DrawRectangleLinesEx((Rectangle){0, 0, WINDOW_SIZE, WINDOW_SIZE}, 6.0f, + Fade(ARENA_BORDER_COLOR, 0.75f)); + } // Stats top-right char stats[64]; snprintf(stats, sizeof(stats), 
"W:%d L:%d T:%d", env->player_wins, env->boss_wins, env->timeouts); - DrawText(stats, 580, 20, 20, TEXT_COLOR); + DrawText(stats, UI_RIGHT_X, UI_MARGIN, UI_FONT_SIZE, TEXT_COLOR); + + // Time-left HUD (steps + approx seconds) + { + int steps_left = EPISODE_LENGTH - env->tick; + if (steps_left < 0) + steps_left = 0; + float t = (float)steps_left / (float)EPISODE_LENGTH; + + const int bar_w = 260; + const int bar_h = 10; + const int bar_x = (WINDOW_SIZE - bar_w) / 2; + const int bar_y = UI_MARGIN + UI_FONT_SIZE + 8; + + DrawText("TIME", bar_x - 50, bar_y - 4, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.85f)); + DrawRectangle(bar_x, bar_y, bar_w, bar_h, Fade(DARKGRAY, 0.8f)); + DrawRectangle(bar_x, bar_y, (int)((float)bar_w * t), bar_h, + Fade((Color){120, 210, 210, 255}, 0.95f)); + DrawRectangleLinesEx( + (Rectangle){(float)bar_x, (float)bar_y, (float)bar_w, (float)bar_h}, + 2.0f, Fade(ARENA_BORDER_COLOR, 0.7f)); + + char tbuf[64]; + int secs_left = (int)ceilf((float)steps_left / (float)TARGET_FPS); + snprintf(tbuf, sizeof(tbuf), "%d steps (~%ds)", steps_left, secs_left); + DrawText(tbuf, bar_x, bar_y + bar_h + 6, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.85f)); + } // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); - int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; - int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + float player_hp_ratio = fmaxf(0.0f, fminf(1.0f, env->player_hp / MAX_HP)); + int player_hp_width = (int)(player_hp_ratio * HP_BAR_WIDTH); - Color player_color = env->player_hp <= 0 ? RED : PLAYER_COLOR; - DrawCircle(player_sx, player_sy, - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_color); + float player_attack_r = radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS); + bool player_iframed = + env->player_state == PLAYER_DODGING && + env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + + Color player_base = env->player_hp <= 0 ? 
RED : PLAYER_COLOR; + if (env->player_state == PLAYER_DODGING) + player_base = PLAYER_DODGE_COLOR; + DrawCircleLines(player_sx, player_sy, player_attack_r, + Fade(PLAYER_ATTACK_COLOR, 0.18f)); + + // Dodge trail (stateless: inferred from away-from-boss direction) + if (env->player_state == PLAYER_DODGING) { + float away_x = env->player_x - env->boss_x; + float away_y = env->player_y - env->boss_y; + float away_norm = sqrtf(away_x * away_x + away_y * away_y); + if (away_norm > EPSILON) { + float ux = away_x / away_norm; + float uy = away_y / away_norm; + for (int i = 1; i <= 4; i++) { + float w = (float)(5 - i) / 5.0f; + int tx = world_to_screen(env->player_x - ux * (float)i * 40.0f); + int ty = world_to_screen(env->player_y - uy * (float)i * 40.0f); + DrawCircle(tx, ty, radius_to_screen(PLAYER_SIZE) * (0.9f - 0.08f * i), + Fade(PLAYER_DODGE_COLOR, 0.08f + 0.12f * w)); + } + } + } + + // Player body (shadow + fill + outline) + DrawCircle(player_sx + 3, player_sy + 4, radius_to_screen(PLAYER_SIZE), + Fade(BLACK, 0.25f)); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_base); + DrawCircleLines(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), + Fade(WHITE, 0.25f)); + + // Attack effect (duration) + if (env->player_state == PLAYER_ATTACKING) { + float rem = (float)env->player_state_ticks / (float)PLAYER_ATTACK_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + float pulse = 1.0f - rem; + float outer = player_attack_r * (1.0f + 0.10f * pulse); + float inner = player_attack_r * (0.92f + 0.04f * pulse); + BeginBlendMode(BLEND_ADDITIVE); + DrawRing((Vector2){(float)player_sx, (float)player_sy}, inner, outer, 0.0f, + 360.0f, 64, Fade(PLAYER_ATTACK_COLOR, 0.30f + 0.45f * rem)); + EndBlendMode(); + DrawCircleLines(player_sx, player_sy, outer, + Fade(PLAYER_ATTACK_COLOR, 0.25f + 0.35f * rem)); + } + + // I-frame blink + if (player_iframed) { + BeginBlendMode(BLEND_ADDITIVE); + DrawCircleLines(player_sx, player_sy, radius_to_screen(PLAYER_SIZE) * 1.12f, + Fade(WHITE, 0.65f)); + EndBlendMode(); + } // Boss int boss_sx = world_to_screen(env->boss_x); int boss_sy = world_to_screen(env->boss_y); - int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; - int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + float boss_hp_ratio = fmaxf(0.0f, fminf(1.0f, env->boss_hp / MAX_HP)); + int boss_hp_width = (int)(boss_hp_ratio * HP_BAR_WIDTH); + + float boss_aoe_r = + radius_to_screen(BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS); + + // Boss AoE telegraph/active zone + { + float a = 0.10f; + if (env->boss_state == BOSS_WINDING_UP) { + float p = 1.0f - (float)env->boss_phase_ticks / (float)BOSS_WINDUP_TICKS; + p = fmaxf(0.0f, fminf(1.0f, p)); + a = 0.15f + 0.25f * p; + BeginBlendMode(BLEND_ADDITIVE); + DrawRing((Vector2){(float)boss_sx, (float)boss_sy}, boss_aoe_r * 0.93f, + boss_aoe_r, 0.0f, 360.0f * p, 64, Fade(BOSS_DANGER_COLOR, a)); + EndBlendMode(); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.28f + 0.25f * p)); + } else if (env->boss_state == BOSS_ATTACKING) { + float rem = (float)env->boss_phase_ticks / (float)BOSS_ACTIVE_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + DrawCircle(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.22f + 0.08f * (1.0f - rem))); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.95f)); + } else if (env->boss_state == BOSS_RECOVERING) { + float rem = (float)env->boss_phase_ticks / (float)BOSS_RECOVERY_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + 
DrawCircle(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.16f * rem)); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.55f * rem)); + } else { + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.12f)); + } + } Color boss_color = env->boss_hp <= 0 ? RED : BOSS_COLOR; - DrawCircle(boss_sx, boss_sy, - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), - HITBOX_COLOR); + DrawCircleGradient(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE) * 1.25f, + Fade(BOSS_COLOR, 0.10f), Fade(BOSS_COLOR, 0.0f)); + DrawCircle(boss_sx + 4, boss_sy + 5, radius_to_screen(BOSS_SIZE), + Fade(BLACK, 0.22f)); DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), boss_color); + DrawCircleLines(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), + Fade(WHITE, 0.18f)); + + // Boss state label + { + const char *phase = "IDLE"; + if (env->boss_state == BOSS_WINDING_UP) + phase = "WINDUP"; + else if (env->boss_state == BOSS_ATTACKING) + phase = "ACTIVE"; + else if (env->boss_state == BOSS_RECOVERING) + phase = "RECOVER"; + + char pbuf[32]; + snprintf(pbuf, sizeof(pbuf), "%s", phase); + int w = MeasureText(pbuf, UI_FONT_SIZE_SMALL); + DrawText(pbuf, boss_sx - w / 2, + boss_sy - (int)radius_to_screen(BOSS_SIZE) - 22, + UI_FONT_SIZE_SMALL, Fade(TEXT_COLOR, 0.85f)); + } // Player HP bar - bottom left - DrawText("Player", 20, 680, 16, TEXT_COLOR); - DrawRectangle(20, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(20, 700, player_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + const int hud_label_y = UI_HP_BAR_Y - 40; + DrawText("Player", UI_MARGIN, hud_label_y, UI_FONT_SIZE_SMALL, TEXT_COLOR); + DrawRectangle(UI_MARGIN, UI_HP_BAR_Y, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, + DARKGRAY); + DrawRectangle(UI_MARGIN, UI_HP_BAR_Y, player_hp_width * 3, HP_BAR_HEIGHT, + HP_COLOR); + + // Dodge cooldown (under player hp) + { + float cd = + 1.0f - fmaxf(0.0f, fminf(1.0f, (float)env->player_dodge_cooldown / + (float)PLAYER_DODGE_COOLDOWN)); + const int dodge_label_y = UI_HP_BAR_Y - 22; + const int dodge_bar_y = UI_HP_BAR_Y - 18; + DrawText("Dodge", UI_MARGIN, dodge_label_y, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.75f)); + DrawRectangle(UI_MARGIN + 58, dodge_bar_y, 90, 6, Fade(DARKGRAY, 0.8f)); + DrawRectangle(UI_MARGIN + 58, dodge_bar_y, (int)(90.0f * cd), 6, + Fade(PLAYER_DODGE_COLOR, 0.85f)); + } // Boss HP bar - bottom right - DrawText("Boss", 580, 680, 16, TEXT_COLOR); - DrawRectangle(580, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(580, 700, boss_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + DrawText("Boss", UI_RIGHT_X, hud_label_y, UI_FONT_SIZE_SMALL, TEXT_COLOR); + DrawRectangle(UI_RIGHT_X, UI_HP_BAR_Y, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, + DARKGRAY); + DrawRectangle(UI_RIGHT_X, UI_HP_BAR_Y, boss_hp_width * 3, HP_BAR_HEIGHT, + HP_COLOR); EndDrawing(); } diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 51f131ac1..08e505adc 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,290 +1,178 @@ import importlib import pufferlib.emulation - def lazy_import(module_path, attr): """ Returns a callable that, when called with any arguments, will import the module, retrieve the attribute (usually a class or factory) and then call it with the given arguments. 
""" - return lambda *args, **kwargs: getattr( - __import__(module_path, fromlist=[attr]), attr - )(*args, **kwargs) - + return lambda *args, **kwargs: getattr(__import__(module_path, fromlist=[attr]), attr)(*args, **kwargs) -def make_foraging( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_foraging(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_foraging reward_fn = grid.reward_foraging - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, init_fn=init_fn, reward_fn=reward_fn, render_mode=render_mode) -def make_predator_prey( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_predator_prey(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_predator_prey reward_fn = grid.reward_predator_prey - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) -def make_group( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_group(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_group reward_fn = grid.reward_group - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) -def make_puffer( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_puffer(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_puffer reward_fn = grid.reward_puffer - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - - -def make_puffergrid( - render_mode="raylib", - vision_range=5, - num_envs=4096, - num_maps=1000, - max_map_size=9, - report_interval=128, - buf=None, -): - return PufferGrid( - render_mode, - vision_range, - num_envs, - num_maps, - max_map_size, - report_interval, - buf, - ) + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) +def make_puffergrid(render_mode='raylib', vision_range=5, + 
num_envs=4096, num_maps=1000, max_map_size=9, + report_interval=128, buf=None): + return PufferGrid(render_mode, vision_range, num_envs, + num_maps, max_map_size, report_interval, buf) def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity - env = sanity.Continuous(discretize=discretize) if not discretize: env = pufferlib.ClipAction(env) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity - - env = sanity.Squared( - distance_to_target=distance_to_target, num_targets=num_targets, **kwargs - ) + env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) - def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . import sanity - - env = sanity.Bandit( - num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise - ) + env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, + reward_noise=reward_noise) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity - env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_password(password_length=5, buf=None, **kwargs): from . import sanity - env = sanity.Password(password_length=password_length) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - - env = sanity.Performance( - delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth - ) + env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - - env = sanity.PerformanceEmpiric( - count_n=count_n, count_std=count_std, bandwidth=bandwidth - ) + env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity - env = sanity.Stochastic(p=p, horizon=100) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_spaces(buf=None, **kwargs): from . import sanity - env = sanity.Spaces() env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) - def make_multiagent(buf=None, **kwargs): from . 
import sanity - env = sanity.Multiagent() env = pufferlib.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) - MAKE_FUNCTIONS = { - "battle": "Battle", - "breakout": "Breakout", - "blastar": "Blastar", - "boss_fight": "BossFight", - "convert": "Convert", - "convert_circle": "ConvertCircle", - "pong": "Pong", - "freeway": "Freeway", - "enduro": "Enduro", - "tetris": "Tetris", - "cartpole": "Cartpole", - "moba": "Moba", - "matsci": "Matsci", - "memory": "Memory", - "boids": "Boids", - "drone": "Drone", - "nmmo3": "NMMO3", - "snake": "Snake", - "squared": "Squared", - "pysquared": "PySquared", - "connect4": "Connect4", - "g2048": "G2048", - "terraform": "Terraform", - "template": "Template", - "tripletriad": "TripleTriad", - "tactical": "Tactical", - "target": "Target", - "go": "Go", - "rware": "Rware", - "trash_pickup": "TrashPickupEnv", - "tower_climb": "TowerClimb", - "grid": "Grid", - "shared_pool": "PyCPR", - "impulse_wars": "ImpulseWars", - "drive": "Drive", - "pacman": "Pacman", - "tmaze": "TMaze", - "checkers": "Checkers", - "asteroids": "Asteroids", - "whisker_racer": "WhiskerRacer", - "onestateworld": "World", - "onlyfish": "OnlyFish", - "chain_mdp": "Chain", - "spaces": make_spaces, - "multiagent": make_multiagent, - "slimevolley": "SlimeVolley", + 'battle': 'Battle', + 'breakout': 'Breakout', + 'blastar': 'Blastar', + 'convert': 'Convert', + 'convert_circle': 'ConvertCircle', + 'pong': 'Pong', + 'freeway': 'Freeway', + 'enduro': 'Enduro', + 'tetris': 'Tetris', + 'cartpole': 'Cartpole', + 'moba': 'Moba', + 'matsci': 'Matsci', + 'memory': 'Memory', + 'boids': 'Boids', + 'drone': 'Drone', + 'nmmo3': 'NMMO3', + 'snake': 'Snake', + 'squared': 'Squared', + 'pysquared': 'PySquared', + 'connect4': 'Connect4', + 'g2048': 'G2048', + 'terraform': 'Terraform', + 'template': 'Template', + 'tripletriad': 'TripleTriad', + 'tactical': 'Tactical', + 'target': 'Target', + 'go': 'Go', + 'rware': 'Rware', + 'trash_pickup': 'TrashPickupEnv', + 'tower_climb': 'TowerClimb', + 'grid': 'Grid', + 'shared_pool': 'PyCPR', + 'impulse_wars': 'ImpulseWars', + 'drive': 'Drive', + 'pacman': 'Pacman', + 'tmaze': 'TMaze', + 'checkers': 'Checkers', + 'asteroids': 'Asteroids', + 'whisker_racer': 'WhiskerRacer', + 'onestateworld': 'World', + 'onlyfish': 'OnlyFish', + 'chain_mdp': 'Chain', + 'spaces': make_spaces, + 'multiagent': make_multiagent, + 'slimevolley': 'SlimeVolley', + 'boss_fight': 'BossFight', } - -def env_creator(name="squared", *args, **kwargs): - if "puffer_" not in name: - raise pufferlib.APIUsageError(f"Invalid environment name: {name}") +def env_creator(name='squared', *args, **kwargs): + if 'puffer_' not in name: + raise pufferlib.APIUsageError(f'Invalid environment name: {name}') # TODO: Robust sanity / ocean imports - name = name.replace("puffer_", "") + name = name.replace('puffer_', '') try: - module = importlib.import_module(f"pufferlib.ocean.{name}.{name}") + module = importlib.import_module(f'pufferlib.ocean.{name}.{name}') return getattr(module, MAKE_FUNCTIONS[name]) except ModuleNotFoundError: return MAKE_FUNCTIONS[name]
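
For reference, a minimal sketch (not part of the patch) of how the new `'boss_fight': 'BossFight'` registry entry resolves through `env_creator`; the constructor and `reset()` arguments shown are assumptions for illustration, only the prefix handling and module lookup follow from the code above.

```python
from pufferlib.ocean.environment import env_creator

# env_creator requires the "puffer_" prefix, strips it, imports
# pufferlib.ocean.boss_fight.boss_fight, and returns the attribute named by
# MAKE_FUNCTIONS["boss_fight"], i.e. the BossFight class.
BossFight = env_creator("puffer_boss_fight")

# The kwargs and reset() signature below are illustrative assumptions,
# not taken from this patch.
env = BossFight(num_envs=1)
obs, info = env.reset(seed=0)
```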