From 11ebb26a5e51c25a69828d0ada6425399d3e2f16 Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 13 Jan 2026 02:47:44 +0500 Subject: [PATCH 01/29] init boss fight env --- TODO.md | 18 ++ pufferlib/config/boss_fight.ini | 15 ++ pufferlib/ocean/boss_fight/README.md | 218 ++++++++++++++++++ pufferlib/ocean/boss_fight/binding.c | 14 ++ pufferlib/ocean/boss_fight/boss_fight.c | 32 +++ pufferlib/ocean/boss_fight/boss_fight.h | 80 +++++++ pufferlib/ocean/boss_fight/boss_fight.py | 67 ++++++ pufferlib/ocean/environment.py | 279 ++++++++++++++++------- 8 files changed, 640 insertions(+), 83 deletions(-) create mode 100644 TODO.md create mode 100644 pufferlib/config/boss_fight.ini create mode 100644 pufferlib/ocean/boss_fight/README.md create mode 100644 pufferlib/ocean/boss_fight/binding.c create mode 100644 pufferlib/ocean/boss_fight/boss_fight.c create mode 100644 pufferlib/ocean/boss_fight/boss_fight.h create mode 100644 pufferlib/ocean/boss_fight/boss_fight.py diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..938ca7614 --- /dev/null +++ b/TODO.md @@ -0,0 +1,18 @@ +## Notes for for my Boss Fight environment + +### Setup + +1. Fork pufferlib, create new branch + +2. Run these: + ``` + uv venv + uv pip install -e . + ``` + +3. Setup files using templates, update `environment.py` + +4. Not sure what this does yet: + ``` + python setup.py build_boss_fight --inplace + ``` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini new file mode 100644 index 000000000..d7f426abf --- /dev/null +++ b/pufferlib/config/boss_fight.ini @@ -0,0 +1,15 @@ +[base] +package = ocean +env_name = puffer_boss_fight +policy_name = Policy + +[env] +num_envs = 14 + +[train] +total_timesteps = 1_000_000 +minibatch_size=1024 + +[sweep] +goal = maximize +metric = episode_return diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md new file mode 100644 index 000000000..e81c5f874 --- /dev/null +++ b/pufferlib/ocean/boss_fight/README.md @@ -0,0 +1,218 @@ +# SoulsRL Minimal — RL-Focused Boss Fight Environment + +## Goal + +Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. +Focus: **observation design, reward shaping, and training experiments** — not game engine complexity. + +The boss has **1 attack** (AOE burst). All hitboxes are circles. No rendering required. 
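A minimal sketch of the Gym interface implied by the spec below, assuming the 14-float observation and 7 discrete actions described in this document (names and bounds here are illustrative, not the final implementation):

```python
import gymnasium
import numpy as np

# Assumed from the spec: 14 observation features, 7 discrete actions.
# Most features are normalized to [0, 1] or [-1, 1], hence the symmetric bounds.
single_observation_space = gymnasium.spaces.Box(
    low=-1.0, high=1.0, shape=(14,), dtype=np.float32
)
single_action_space = gymnasium.spaces.Discrete(7)
```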
+ +--- + +## Core Mechanics (Simplified) + +### Constants + +``` +Tick rate: 30 ticks/sec (dt = 1/30) +Arena: 10 x 10 units (centered at origin, so bounds are -5 to +5) + +Player: + - radius: 0.3 + - HP: 100 + - speed: 3.0 units/sec (~0.1 units/tick) + +Boss: + - radius: 0.5 + - HP: 100 + - position: fixed at (0, 0) — does not move +``` + +### Player Actions (Discrete, 7 total) + +``` +0: NOOP +1: UP +2: DOWN +3: LEFT +4: RIGHT +5: DODGE +6: ATTACK +``` + +### Player States + +``` +FREE — can move, can act +DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_dir +ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement +``` + +**Cooldowns:** + +- Dodge: 15 ticks after dodge ends +- Attack: No cooldown (but you're locked for 13 ticks) + +**Attack hitbox (during ACTIVE):** + +- Circle at `player_pos + facing * 0.7`, radius `0.4` +- `facing` = direction to boss at attack start +- Damage: 10 + +### Boss Behavior (Single Attack) + +Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` + +``` +IDLE: 12 ticks (0.4s) — does nothing +WINDUP: 18 ticks (0.6s) — telegraphing, no damage +ACTIVE: 3 ticks (0.1s) — AOE hits +RECOVERY: 15 ticks (0.5s) — vulnerable, no damage +``` + +**AOE Attack:** + +- Circle centered on boss, radius `1.5` +- Damage: 20 +- Player takes damage if: in AOE radius AND not in i-frames + +--- + +## Observation Space (14 floats) + +Keep it minimal. You can ablate later. + +``` +Geometry (3): + 0: rel_boss_x = boss_x - player_x (normalized by arena half-size) + 1: rel_boss_y = boss_y - player_y + 2: distance = clamp(dist / 5.0, 0, 1) + +Player (5): + 3: player_hp = hp / 100 + 4: dodge_ready = 1.0 if can dodge, else 0.0 + 5: player_state = {FREE: 0, DODGE: 0.33, ATTACK: 0.66} # scalar encoding + 6: state_progress = ticks_in_state / state_duration + 7: move_dir_x = -1 to 1 + +Boss (6): + 8: boss_hp = hp / 100 + 9: boss_phase = {IDLE: 0, WINDUP: 0.33, ACTIVE: 0.66, RECOVERY: 1.0} + 10: phase_progress = ticks_in_phase / phase_duration + 11: time_to_damage = ticks until ACTIVE starts / 18 (1.0 during IDLE/RECOVERY) + 12: in_aoe_range = 1.0 if distance < 1.5, else 0.0 + 13: boss_attacking = 1.0 if in WINDUP/ACTIVE, else 0.0 +``` + +--- + +## Reward Function (v1 — HP delta) + +```python +# Per step +reward = 0 +reward += (boss_hp_prev - boss_hp_now) * 0.1 # +1.0 per hit landed +reward += (player_hp_prev - player_hp_now) * -0.1 # -2.0 per AOE hit taken +reward += -0.001 # time penalty + +# Terminal +if boss_hp <= 0: reward += 1.0 # win bonus +if player_hp <= 0: reward -= 1.0 # lose penalty +``` + +--- + +## Episode Termination + +- `terminated = True` if player or boss HP <= 0 +- `truncated = True` if ticks >= 900 (30 seconds) + +--- + +## Implementation (Single File) + +Everything in `soulsrl.py` (~250-300 lines): + +```python +class SoulsEnv(pufferlib.PufferEnv): + # Player state machine + # Boss state machine + # Collision detection (circle-circle only) + # Observation building + # Reward calculation +``` + +No separate core.py, no rendering, no curriculum stages. 
+ +--- + +## RL Experiments + +Once v1 is working, run these experiments to learn RL concepts: + +### Experiment 1: Observation Ablations + +| Variant | Change | Hypothesis | +| --------- | --------------------------------------------------------------- | -------------------------------------- | +| no_timing | Remove `time_to_damage`, `phase_progress` | Agent can't learn precise dodge timing | +| no_range | Remove `in_aoe_range`, `distance` | Agent can't learn spacing | +| minimal | Only: `distance`, `time_to_damage`, `dodge_ready`, `boss_phase` | Test minimum viable obs | +| noisy | Add 5 uniform random floats | Network should ignore noise | + +### Experiment 2: Reward Shaping + +| Variant | Change | Hypothesis | +| --------------- | -------------------------------- | -------------------------- | +| sparse | Only win/lose bonus, no HP delta | Much slower learning | +| no_time_penalty | Remove -0.001/step | Agent becomes passive | +| dodge_bonus | +0.2 for dodging during ACTIVE | Might create dodge spam | +| proximity | +0.01 for being close to boss | Might discourage safe play | + +### Experiment 3: Hyperparameters + +| Param | Values | What to observe | +| ------------- | ---------------- | --------------------------- | +| learning_rate | 1e-3, 3e-4, 1e-4 | Learning speed vs stability | +| ent_coef | 0.0, 0.01, 0.05 | Exploration vs exploitation | +| num_envs | 8, 32, 128 | Sample efficiency | +| hidden_size | 32, 64, 128 | Model capacity | + +--- + +## Success Criteria + +1. **Baseline works**: Random agent wins ~0%, trained agent wins >80% +2. **Learned timing**: Agent dodges during WINDUP, not randomly +3. **Learned punish**: Agent attacks during RECOVERY, not during ACTIVE +4. **Experiments complete**: At least 3 ablations run with plotted comparisons + +--- + +## Optional Extensions (After Experiments) + +Only add these if baseline experiments are done: + +1. **Sweep attack**: Cone hitbox, tests directional dodging +2. **Boss movement**: Slow drift toward player +3. **Combo attack**: Multi-hit sequence, tests dodge timing +4. **ASCII rendering**: For debugging/demo +5. **Curriculum**: Start with longer windup, tighten over training + +--- + +## Deliverables + +1. `soulsrl.py` — Environment (PufferEnv) +2. `train.py` — Training script with logging +3. `experiments/` — Saved runs with different configs +4. 
`results.md` — Summary of what you learned from experiments + +--- + +## Timeline Estimate + +- Day 1: Implement `soulsrl.py`, verify with random agent +- Day 2: Train baseline, confirm learning +- Day 3-4: Run observation ablations +- Day 5-6: Run reward experiments +- Day 7: Document findings, optional extensions diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c new file mode 100644 index 000000000..812e31bb7 --- /dev/null +++ b/pufferlib/ocean/boss_fight/binding.c @@ -0,0 +1,14 @@ +#include "boss_fight.h" + +#define Env BossFight +#include "../env_binding.h" + +static int my_init(Env *env, PyObject *args, PyObject *kwargs) { + env->size = unpack(kwargs, "size"); + return 0; +} + +static int my_log(PyObject *dict, Log *log) { + assign_to_dict(dict, "score", log->score); + return 0; +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c new file mode 100644 index 000000000..0e1e152a2 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -0,0 +1,32 @@ +#include "boss_fight.h" + +int main() { + BossFight env = {.size = 5}; + env.observations = (unsigned char *)calloc(1, sizeof(unsigned char)); + env.actions = (int *)calloc(1, sizeof(int)); + env.rewards = (float *)calloc(1, sizeof(float)); + env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); + + c_reset(&env); + c_render(&env); + while (!WindowShouldClose()) { + if (IsKeyDown(KEY_LEFT_SHIFT)) { + if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) { + env.actions[0] = 0; + } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) { + env.actions[0] = 1; + } else { + env.actions[0] = -1; + } + } else { + env.actions[0] = rand() % 2; + } + c_step(&env); + c_render(&env); + } + free(env.observations); + free(env.actions); + free(env.rewards); + free(env.terminals); + c_close(&env); +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h new file mode 100644 index 000000000..75d2932b1 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -0,0 +1,80 @@ +#include "raylib.h" +#include +#include + +const Color PUFF_RED = (Color){187, 0, 0, 255}; +const Color PUFF_CYAN = (Color){0, 187, 187, 255}; +const Color PUFF_WHITE = (Color){241, 241, 241, 241}; +const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; + +// Only use floats! +typedef struct { + float score; + float n; // Required as the last field +} Log; + +typedef struct { + Log log; // Required field + unsigned char + *observations; // Required field. Ensure type matches in .py and .c + int *actions; // Required field. Ensure type matches in .py and .c + float *rewards; // Required field + unsigned char *terminals; // Required field + int size; + int x; + int goal; +} BossFight; + +void c_reset(BossFight *env) { + env->x = 0; + env->goal = (rand() % 2 == 0) ? env->size : -env->size; +} + +void c_step(BossFight *env) { + env->rewards[0] = 0; + env->terminals[0] = 0; + if (env->actions[0] == 0) { + env->x -= 1; + } else if (env->actions[0] == 1) { + env->x += 1; + } + if (env->x == env->goal) { + c_reset(env); + env->rewards[0] = 1; + env->terminals[0] = 1; + env->log.score += 1; + env->log.n += 1; + } else if (env->x == -env->goal) { + c_reset(env); + env->rewards[0] = -1; + env->terminals[0] = 1; + env->log.score -= 1; + env->log.n += 1; + } + env->observations[0] = (env->goal > 0) ? 
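  /* Reviewer note: `observations` is an unsigned char buffer, so the -1 branch
     below wraps to 255, which falls outside the Box(low=0, high=1, dtype=uint8)
     space declared in boss_fight.py; mapping that case to 0 is probably intended. */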
1 : -1; +} + +void c_render(BossFight *env) { + if (!IsWindowReady()) { + InitWindow(1080, 720, "PufferLib Template"); + SetTargetFPS(5); + } + + if (IsKeyDown(KEY_ESCAPE)) { + exit(0); + } + + DrawText("Go to the red square!", 20, 20, 20, PUFF_WHITE); + DrawRectangle(540 - 32 + 64 * env->goal, 360 - 32, 64, 64, PUFF_RED); + DrawRectangle(540 - 32 + 64 * env->x, 360 - 32, 64, 64, PUFF_CYAN); + + BeginDrawing(); + ClearBackground(PUFF_BACKGROUND); + EndDrawing(); +} + +void c_close(BossFight *env) { + if (IsWindowReady()) { + CloseWindow(); + } +} diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py new file mode 100644 index 000000000..4f0bcdbb3 --- /dev/null +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -0,0 +1,67 @@ +"""A minimal template for your own envs.""" + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.template import binding + + +class BossFight(pufferlib.PufferEnv): + def __init__( + self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 + ): + self.single_observation_space = gymnasium.spaces.Box( + low=0, high=1, shape=(1,), dtype=np.uint8 + ) + self.single_action_space = gymnasium.spaces.Discrete(2) + self.render_mode = render_mode + self.num_agents = num_envs + + super().__init__(buf) + self.c_envs = binding.vec_init( + self.observations, + self.actions, + self.rewards, + self.terminals, + self.truncations, + num_envs, + seed, + size=size, + ) + self.size = size + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + return self.observations, [] + + def step(self, actions): + self.actions[:] = actions + binding.vec_step(self.c_envs) + info = [binding.vec_log(self.c_envs)] + return (self.observations, self.rewards, self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + + +if __name__ == "__main__": + N = 4096 + env = BossFight(num_envs=N) + env.reset() + steps = 0 + + CACHE = 1024 + actions = np.random.randint(0, 5, (CACHE, N)) + + import time + + start = time.time() + while time.time() - start < 10: + env.step(actions[steps % CACHE]) + steps += 1 + + print("Squared SPS:", int(env.num_agents * steps / (time.time() - start))) diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 6c56a4ea2..51f131ac1 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,177 +1,290 @@ import importlib import pufferlib.emulation + def lazy_import(module_path, attr): """ Returns a callable that, when called with any arguments, will import the module, retrieve the attribute (usually a class or factory) and then call it with the given arguments. 
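    Illustrative usage (the module path and constructor kwargs below are
    examples only, not a guaranteed API):

        make_pong = lazy_import('pufferlib.ocean.pong.pong', 'Pong')
        env = make_pong(num_envs=1)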
""" - return lambda *args, **kwargs: getattr(__import__(module_path, fromlist=[attr]), attr)(*args, **kwargs) + return lambda *args, **kwargs: getattr( + __import__(module_path, fromlist=[attr]), attr + )(*args, **kwargs) + -def make_foraging(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_foraging( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_foraging reward_fn = grid.reward_foraging - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, init_fn=init_fn, reward_fn=reward_fn, render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_predator_prey(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_predator_prey( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_predator_prey reward_fn = grid.reward_predator_prey - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_group(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_group( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_group reward_fn = grid.reward_group - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + -def make_puffer(width=1080, height=720, num_agents=4096, horizon=512, - discretize=True, food_reward=0.1, render_mode='rgb_array'): +def make_puffer( + width=1080, + height=720, + num_agents=4096, + horizon=512, + discretize=True, + food_reward=0.1, + render_mode="rgb_array", +): from .grid import grid + init_fn = grid.init_puffer reward_fn = grid.reward_puffer - return grid.PufferGrid(width, height, num_agents, - horizon, discretize=discretize, food_reward=food_reward, - init_fn=init_fn, reward_fn=reward_fn, - render_mode=render_mode) + return grid.PufferGrid( + width, + height, + num_agents, + horizon, + discretize=discretize, + food_reward=food_reward, + init_fn=init_fn, + reward_fn=reward_fn, + render_mode=render_mode, + ) + + +def make_puffergrid( + render_mode="raylib", + vision_range=5, + num_envs=4096, + num_maps=1000, + max_map_size=9, + report_interval=128, + buf=None, +): + return PufferGrid( + render_mode, + vision_range, + num_envs, + num_maps, + max_map_size, + report_interval, + buf, + ) -def make_puffergrid(render_mode='raylib', vision_range=5, - 
num_envs=4096, num_maps=1000, max_map_size=9, - report_interval=128, buf=None): - return PufferGrid(render_mode, vision_range, num_envs, - num_maps, max_map_size, report_interval, buf) def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity + env = sanity.Continuous(discretize=discretize) if not discretize: env = pufferlib.ClipAction(env) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity - env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) + + env = sanity.Squared( + distance_to_target=distance_to_target, num_targets=num_targets, **kwargs + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . import sanity - env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, - reward_noise=reward_noise) + + env = sanity.Bandit( + num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity + env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_password(password_length=5, buf=None, **kwargs): from . import sanity + env = sanity.Password(password_length=password_length) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) + + env = sanity.Performance( + delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) + + env = sanity.PerformanceEmpiric( + count_n=count_n, count_std=count_std, bandwidth=bandwidth + ) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity + env = sanity.Stochastic(p=p, horizon=100) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) + def make_spaces(buf=None, **kwargs): from . import sanity + env = sanity.Spaces() env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) + def make_multiagent(buf=None, **kwargs): from . 
import sanity + env = sanity.Multiagent() env = pufferlib.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) + MAKE_FUNCTIONS = { - 'battle': 'Battle', - 'breakout': 'Breakout', - 'blastar': 'Blastar', - 'convert': 'Convert', - 'convert_circle': 'ConvertCircle', - 'pong': 'Pong', - 'freeway': 'Freeway', - 'enduro': 'Enduro', - 'tetris': 'Tetris', - 'cartpole': 'Cartpole', - 'moba': 'Moba', - 'matsci': 'Matsci', - 'memory': 'Memory', - 'boids': 'Boids', - 'drone': 'Drone', - 'nmmo3': 'NMMO3', - 'snake': 'Snake', - 'squared': 'Squared', - 'pysquared': 'PySquared', - 'connect4': 'Connect4', - 'g2048': 'G2048', - 'terraform': 'Terraform', - 'template': 'Template', - 'tripletriad': 'TripleTriad', - 'tactical': 'Tactical', - 'target': 'Target', - 'go': 'Go', - 'rware': 'Rware', - 'trash_pickup': 'TrashPickupEnv', - 'tower_climb': 'TowerClimb', - 'grid': 'Grid', - 'shared_pool': 'PyCPR', - 'impulse_wars': 'ImpulseWars', - 'drive': 'Drive', - 'pacman': 'Pacman', - 'tmaze': 'TMaze', - 'checkers': 'Checkers', - 'asteroids': 'Asteroids', - 'whisker_racer': 'WhiskerRacer', - 'onestateworld': 'World', - 'onlyfish': 'OnlyFish', - 'chain_mdp': 'Chain', - 'spaces': make_spaces, - 'multiagent': make_multiagent, - 'slimevolley': 'SlimeVolley', + "battle": "Battle", + "breakout": "Breakout", + "blastar": "Blastar", + "boss_fight": "BossFight", + "convert": "Convert", + "convert_circle": "ConvertCircle", + "pong": "Pong", + "freeway": "Freeway", + "enduro": "Enduro", + "tetris": "Tetris", + "cartpole": "Cartpole", + "moba": "Moba", + "matsci": "Matsci", + "memory": "Memory", + "boids": "Boids", + "drone": "Drone", + "nmmo3": "NMMO3", + "snake": "Snake", + "squared": "Squared", + "pysquared": "PySquared", + "connect4": "Connect4", + "g2048": "G2048", + "terraform": "Terraform", + "template": "Template", + "tripletriad": "TripleTriad", + "tactical": "Tactical", + "target": "Target", + "go": "Go", + "rware": "Rware", + "trash_pickup": "TrashPickupEnv", + "tower_climb": "TowerClimb", + "grid": "Grid", + "shared_pool": "PyCPR", + "impulse_wars": "ImpulseWars", + "drive": "Drive", + "pacman": "Pacman", + "tmaze": "TMaze", + "checkers": "Checkers", + "asteroids": "Asteroids", + "whisker_racer": "WhiskerRacer", + "onestateworld": "World", + "onlyfish": "OnlyFish", + "chain_mdp": "Chain", + "spaces": make_spaces, + "multiagent": make_multiagent, + "slimevolley": "SlimeVolley", } -def env_creator(name='squared', *args, **kwargs): - if 'puffer_' not in name: - raise pufferlib.APIUsageError(f'Invalid environment name: {name}') + +def env_creator(name="squared", *args, **kwargs): + if "puffer_" not in name: + raise pufferlib.APIUsageError(f"Invalid environment name: {name}") # TODO: Robust sanity / ocean imports - name = name.replace('puffer_', '') + name = name.replace("puffer_", "") try: - module = importlib.import_module(f'pufferlib.ocean.{name}.{name}') + module = importlib.import_module(f"pufferlib.ocean.{name}.{name}") return getattr(module, MAKE_FUNCTIONS[name]) except ModuleNotFoundError: return MAKE_FUNCTIONS[name] From ec5efca9c9d89af192bea700290eb8bbb696ceb4 Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 13 Jan 2026 16:19:17 +0500 Subject: [PATCH 02/29] setup fixes for bossfight env --- LEARN_TODO.md | 259 +++++++++++++++++++++++ TODO.md | 29 ++- pufferlib/config/boss_fight.ini | 96 ++++++++- pufferlib/ocean/boss_fight/__init__.py | 3 + pufferlib/ocean/boss_fight/boss_fight.py | 2 +- 5 files changed, 380 insertions(+), 9 deletions(-) create mode 100644 LEARN_TODO.md 
create mode 100644 pufferlib/ocean/boss_fight/__init__.py diff --git a/LEARN_TODO.md b/LEARN_TODO.md new file mode 100644 index 000000000..35c99eebc --- /dev/null +++ b/LEARN_TODO.md @@ -0,0 +1,259 @@ +# Learning TODO: RL Foundations + +Everything you need to understand `bptt_horizon` and RL training in general. + +--- + +## Level 1: Basic ML Concepts + +### 1.1 What is a Neural Network? +- Function that takes numbers in, spits numbers out +- Has "weights" (parameters) that get adjusted during training +- `input → [neural network] → output` + +### 1.2 What is Training / Learning? +- Adjusting weights so the network gives better outputs +- Done by computing "loss" (how wrong it was) and updating weights to reduce loss + +### 1.3 What is Backpropagation? +- Algorithm to figure out HOW to adjust each weight +- Flows backwards through the network: output → hidden layers → input +- "If the output was wrong, which weights were responsible?" + +### 1.4 What is a Batch? +- Group of training examples processed together +- Instead of: train on example 1, then example 2, then example 3... +- Do: train on [example 1, 2, 3, 4, 5] at once +- Why? Faster (GPU parallelism) + more stable learning + +### 1.5 What is Minibatch? +- When your batch is too big for GPU memory +- Split batch into smaller "minibatches" +- `batch_size = 1024, minibatch_size = 256` → 4 gradient updates per batch + +--- + +## Level 2: RL Basics + +### 2.1 What is a Timestep? +- One tick of the game/simulation +- Agent observes state → takes action → gets reward → new state +- `t=0: see game → press button → get +1 point → game changes` + +### 2.2 What is an Episode? +- One complete playthrough from start to end +- Boss fight: episode = one full fight (win or lose) +- `[spawn] → step → step → step → ... → [death or victory]` + +``` +Episode 1: t0 → t1 → t2 → t3 → DEAD (4 steps) +Episode 2: t0 → t1 → t2 → t3 → t4 → t5 → WIN (6 steps) +``` + +### 2.3 What is an Observation? +- What the agent "sees" at each timestep +- Your boss_fight: 14 numbers (player pos, boss HP, etc.) + +### 2.4 What is a Policy? +- The neural network that decides actions +- `observation (14 floats) → [policy network] → action (0-6)` +- Training = making this network choose better actions + +### 2.5 What is a Value Function? +- Predicts "how good is this situation?" +- "I have full HP, boss is low" → high value +- "I'm almost dead, boss is full HP" → low value +- Helps the agent learn which states to aim for + +--- + +## Level 3: How RL Training Works + +### 3.1 Collect Experience +``` +Run 56 environments in parallel: + Env 1: obs → action → reward → obs → action → reward → ... + Env 2: obs → action → reward → obs → action → reward → ... + ... + Env 56: obs → action → reward → obs → action → reward → ... + +After N steps, you have a "batch" of experience +``` + +### 3.2 Compute Advantages +- "Was this action better or worse than expected?" +- `advantage = actual_reward - predicted_value` +- Positive advantage → reinforce this action +- Negative advantage → discourage this action + +### 3.3 Update the Network +- Use collected experience to adjust policy weights +- Make good actions more likely, bad actions less likely + +### 3.4 Repeat +``` +while not done: + 1. Collect batch of experience (many timesteps) + 2. Compute advantages + 3. Update network with minibatches + 4. 
Go to 1 +``` + +--- + +## Level 4: Sequential Data & Memory + +### 4.1 Why Sequence Matters +In games, the PAST affects what you should do NOW: + +``` +Timestep 1: Boss starts wind-up animation +Timestep 2: Boss still winding up +Timestep 3: Boss about to attack! ← YOU SHOULD DODGE NOW +Timestep 4: Boss attacks + +If you only see timestep 3 in isolation, you might not know to dodge. +But if you saw timesteps 1-2-3 together, you'd see the pattern. +``` + +### 4.2 MLP (Multi-Layer Perceptron) — No Memory +- Standard neural network +- Only sees CURRENT observation +- `obs_t → [MLP] → action` +- No memory of previous timesteps +- Fine if observation contains all needed info + +### 4.3 RNN (Recurrent Neural Network) — Has Memory +- Sees current observation + remembers past +- `obs_t + memory → [RNN] → action + updated_memory` +- Can learn patterns over time +- Types: LSTM, GRU (different memory mechanisms) + +``` +MLP: sees [___] [___] [_X_] ← only current frame +RNN: sees [_X_] [_X_] [_X_] ← current + memory of past +``` + +### 4.4 When Do You Need RNN? +- When current observation is INCOMPLETE +- Example: "Boss is standing still" — is he about to attack or recovering? +- If your observation includes `boss_phase` and `time_to_damage`, MLP might be enough +- If observation only has positions, RNN helps learn timing + +--- + +## Level 5: BPTT (Backpropagation Through Time) + +### 5.1 The Problem +RNN has memory that flows through time: + +``` +t1 → t2 → t3 → t4 → t5 → t6 → ... → t1000 + +To train RNN, backprop must flow backwards through ALL these connections. +1000 timesteps = 1000 layers of backprop = VERY slow, uses tons of memory +``` + +### 5.2 The Solution: Truncated BPTT +Don't backprop through entire episode. Cut it into chunks: + +``` +Episode: [t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12] + +bptt_horizon = 4: + +Chunk 1: [t1 → t2 → t3 → t4] ← backprop only through these 4 +Chunk 2: [t5 → t6 → t7 → t8] ← backprop only through these 4 +Chunk 3: [t9 → t10 → t11 → t12] ← backprop only through these 4 +``` + +### 5.3 What bptt_horizon Controls +``` +bptt_horizon = 16 means: +- RNN sees 16 consecutive timesteps during training +- Gradients flow back through 16 steps max +- RNN can learn patterns up to ~16 steps long +``` + +### 5.4 Trade-offs +``` +Small horizon (8): + ✓ Fast, low memory + ✗ RNN can't learn long patterns (>8 steps) + +Large horizon (128): + ✓ RNN learns longer patterns + ✗ Slow, high memory usage +``` + +--- + +## Level 6: Putting It Together + +### 6.1 The Batch Math +``` +num_envs = 56 (parallel environments) +bptt_horizon = 16 (timesteps per chunk) + +batch_size = num_envs × bptt_horizon + = 56 × 16 + = 896 total samples per training batch +``` + +### 6.2 Why minibatch_size Must Be ≤ batch_size +``` +batch_size = 896 (you collected 896 samples) +minibatch_size = 2048 (you want to train on 2048 at a time) + +ERROR: Can't take 2048 samples from a pile of 896! + +Fix: minibatch_size = 256 or 512 (smaller than 896) +``` + +### 6.3 For Your Boss Fight (No RNN) +You're using MLP, so `bptt_horizon` just affects batch math: + +```ini +[vec] +num_envs = 56 + +[train] +bptt_horizon = 16 # 56 × 16 = 896 batch +minibatch_size = 256 # Must be ≤ 896 +``` + +Or increase horizon if you want bigger batches: + +```ini +bptt_horizon = 64 # 56 × 64 = 3584 batch +minibatch_size = 2048 # Now this works +``` + +--- + +## Summary: What You Actually Need to Know + +1. **batch_size** = total samples collected before training +2. 
**minibatch_size** = chunk size for each gradient update (must be ≤ batch_size) +3. **bptt_horizon** = consecutive timesteps kept together + - For RNN: determines how far back it can learn patterns + - For MLP: just affects batch_size math +4. **Your boss_fight uses MLP** — bptt_horizon is just a number to make the math work + +--- + +## Learning Resources + +### Videos (start here) +- [ ] 3Blue1Brown: "Neural Networks" series (YouTube) +- [ ] Mutual Information: "Reinforcement Learning" series (YouTube) + +### Interactive +- [ ] Andrej Karpathy: "Neural Networks: Zero to Hero" (YouTube + code) + +### Reading +- [ ] Spinning Up in Deep RL (OpenAI) — https://spinningup.openai.com +- [ ] CleanRL documentation — similar to PufferLib + +### Hands-on +- [ ] Train boss_fight, watch the numbers, build intuition diff --git a/TODO.md b/TODO.md index 938ca7614..f98518639 100644 --- a/TODO.md +++ b/TODO.md @@ -5,14 +5,33 @@ 1. Fork pufferlib, create new branch 2. Run these: - ``` - uv venv - uv pip install -e . - ``` + +``` +uv venv +uv pip install -e . +``` 3. Setup files using templates, update `environment.py` 4. Not sure what this does yet: + +``` +python setup.py build_boss_fight --inplace +``` + +### Testing + +- Make sure shit's running: ``` - python setup.py build_boss_fight --inplace + uv pip install -e . && python -c " + from pufferlib.ocean.boss_fight import BossFight + import numpy as np + env = BossFight(num_envs=2) + env.reset() + for _ in range(100): + env.step(np.random.randint(0, 7, size=2)) + print('ok') + env.close() + " ``` +- Train and check scores: `puffer train puffer_boss_fight --train.total-timesteps 50000` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index d7f426abf..f6d97ae12 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -2,14 +2,104 @@ package = ocean env_name = puffer_boss_fight policy_name = Policy +# rnn_name = Recurrent # Uncomment if adding LSTM/GRU + +[vec] +num_envs = 56 +num_workers = 14 +batch_size = auto +zero_copy = True +seed = 42 [env] -num_envs = 14 +# Environment-specific params (passed to env constructor) +# None needed - using defaults from README + +[policy] +# Policy constructor args (e.g., hidden_size) +# hidden_size = 64 # Experiment: 32, 64, 128 [train] -total_timesteps = 1_000_000 -minibatch_size=1024 +# Experiment tracking +name = boss_fight +project = boss_fight_experiments +data_dir = experiments +checkpoint_interval = 200 + +# Reproducibility +seed = 42 +# TODO: disable for sweep or speed +torch_deterministic = True +device = mps + +# Optimization +# TODO: try muon with 0.015 lr +optimizer = adam +precision = float32 +compile = False + +# Core PPO hyperparameters +total_timesteps = 10_000_000 +learning_rate = 0.0003 +anneal_lr = True +min_lr_ratio = 0.0 +gamma = 0.99 +gae_lambda = 0.95 +update_epochs = 4 +clip_coef = 0.2 +vf_coef = 0.5 +vf_clip_coef = 0.2 +max_grad_norm = 0.5 +ent_coef = 0.01 + +# Batch sizes +minibatch_size = 512 +max_minibatch_size = 32768 +bptt_horizon = 16 + +# Adam parameters (if optimizer = adam) +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-8 + +# V-trace (for off-policy correction) +# vtrace_rho_clip = 1.0 +# vtrace_c_clip = 1.0 [sweep] goal = maximize metric = episode_return +method = Protein +metric_distribution = linear +max_suggestion_cost = 3600 +use_gpu = True + +# Learning rate sweep +[sweep.train.learning_rate] +distribution = log_normal +min = 0.0001 +max = 0.003 + +# Entropy coefficient sweep (exploration vs exploitation) 
+[sweep.train.ent_coef] +distribution = log_normal +min = 0.0001 +max = 0.05 + +# Discount factor sweep +[sweep.train.gamma] +distribution = logit_normal +min = 0.95 +max = 0.999 + +# GAE lambda sweep +[sweep.train.gae_lambda] +distribution = logit_normal +min = 0.9 +max = 0.99 + +# Minibatch size sweep +[sweep.train.minibatch_size] +distribution = uniform_pow2 +min = 1024 +max = 8192 diff --git a/pufferlib/ocean/boss_fight/__init__.py b/pufferlib/ocean/boss_fight/__init__.py new file mode 100644 index 000000000..4a93af7f9 --- /dev/null +++ b/pufferlib/ocean/boss_fight/__init__.py @@ -0,0 +1,3 @@ +"""BossFight Ocean Environment.""" + +from .boss_fight import BossFight diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 4f0bcdbb3..a952dbf41 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -4,7 +4,7 @@ import numpy as np import pufferlib -from pufferlib.ocean.template import binding +from pufferlib.ocean.boss_fight import binding class BossFight(pufferlib.PufferEnv): From 094fda8f552ccb04ce4cd5c28fd577be482243fc Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 14 Jan 2026 02:10:08 +0500 Subject: [PATCH 03/29] prep work --- AGENTS.md | 52 ++++++++++++++++++++ TODO.md | 37 -------------- pufferlib/config/boss_fight.ini | 6 +-- pufferlib/ocean/boss_fight/README.md | 4 +- pufferlib/ocean/boss_fight/compile_flags.txt | 1 + 5 files changed, 58 insertions(+), 42 deletions(-) create mode 100644 AGENTS.md delete mode 100644 TODO.md create mode 100644 pufferlib/ocean/boss_fight/compile_flags.txt diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..184f98caa --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,52 @@ +# BossFight Reinforcement Learning project + +I'm implementing a RL environment using PufferLib in C + Python. + +Environment spec file is in `./pufferlib/ocean/boss_fight/README.md`. + +You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. + +The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` + +### Setup + +1. Fork pufferlib, create new branch + +2. Run these: + +``` +uv venv +uv pip install -e . +``` + +3. Setup files using templates, update `environment.py` + +4. Not sure what this does yet: + +``` +python setup.py build_boss_fight --inplace +``` + +### Testing + +Make sure shit's running: + +``` +uv pip install -e . +python -c " +from pufferlib.ocean.boss_fight import BossFight +import numpy as np +env = BossFight(num_envs=2) +env.reset() +for _ in range(100): + env.step(np.random.randint(0, 7, size=2)) +print('ok') +env.close() +" +``` + +Train and check scores: + +``` +puffer train puffer_boss_fight --train.total-timesteps 50000 +``` diff --git a/TODO.md b/TODO.md deleted file mode 100644 index f98518639..000000000 --- a/TODO.md +++ /dev/null @@ -1,37 +0,0 @@ -## Notes for for my Boss Fight environment - -### Setup - -1. Fork pufferlib, create new branch - -2. Run these: - -``` -uv venv -uv pip install -e . -``` - -3. Setup files using templates, update `environment.py` - -4. Not sure what this does yet: - -``` -python setup.py build_boss_fight --inplace -``` - -### Testing - -- Make sure shit's running: - ``` - uv pip install -e . 
&& python -c " - from pufferlib.ocean.boss_fight import BossFight - import numpy as np - env = BossFight(num_envs=2) - env.reset() - for _ in range(100): - env.step(np.random.randint(0, 7, size=2)) - print('ok') - env.close() - " - ``` -- Train and check scores: `puffer train puffer_boss_fight --train.total-timesteps 50000` diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index f6d97ae12..fcfe697cc 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,7 +5,7 @@ policy_name = Policy # rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 56 +num_envs = 112 num_workers = 14 batch_size = auto zero_copy = True @@ -53,9 +53,9 @@ max_grad_norm = 0.5 ent_coef = 0.01 # Batch sizes -minibatch_size = 512 +minibatch_size = 2048 max_minibatch_size = 32768 -bptt_horizon = 16 +bptt_horizon = 32 # Adam parameters (if optimizer = adam) adam_beta1 = 0.9 diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index e81c5f874..a9cbcd8b8 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -3,9 +3,9 @@ ## Goal Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, and training experiments** — not game engine complexity. +Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** -The boss has **1 attack** (AOE burst). All hitboxes are circles. No rendering required. +The boss has **1 attack** (AOE burst). All hitboxes are circles. --- diff --git a/pufferlib/ocean/boss_fight/compile_flags.txt b/pufferlib/ocean/boss_fight/compile_flags.txt new file mode 100644 index 000000000..ea96eb002 --- /dev/null +++ b/pufferlib/ocean/boss_fight/compile_flags.txt @@ -0,0 +1 @@ +-I../../../raylib-5.5_macos/include From 09a79ad159e8fd96847b41546bb70d71227a9066 Mon Sep 17 00:00:00 2001 From: frixaco Date: Fri, 16 Jan 2026 18:14:39 +0500 Subject: [PATCH 04/29] implement reset --- AGENTS.md | 2 +- learn-pufferlib.py | 1175 +++++++++++++++++++++++ pufferlib/ocean/boss_fight/README.md | 148 ++- pufferlib/ocean/boss_fight/binding.c | 2 +- pufferlib/ocean/boss_fight/boss_fight.c | 38 +- pufferlib/ocean/boss_fight/boss_fight.h | 136 ++- 6 files changed, 1372 insertions(+), 129 deletions(-) create mode 100644 learn-pufferlib.py diff --git a/AGENTS.md b/AGENTS.md index 184f98caa..0712dc874 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,7 +24,7 @@ uv pip install -e . 4. Not sure what this does yet: ``` -python setup.py build_boss_fight --inplace +python setup.py build_boss_fight --inplace --force ``` ### Testing diff --git a/learn-pufferlib.py b/learn-pufferlib.py new file mode 100644 index 000000000..4091fb6fb --- /dev/null +++ b/learn-pufferlib.py @@ -0,0 +1,1175 @@ +""" +LEARN_V2.PY - RL with PufferLib (The Right Way) +================================================ + +PURPOSE: Learn reinforcement learning using PufferLib's patterns and infrastructure. + +This is the "full PufferLib" version of learn.py. Instead of implementing PPO +from scratch, we use PufferLib's pufferl.PuffeRL trainer which handles: +- Rollout collection +- GAE advantage computation +- PPO loss calculation +- Gradient updates +- Logging and metrics + +HOW TO USE: +1. Read each section's comments (the WHY and WHAT) +2. Fill in the TODO sections +3. Run and test after each section: python learn_v2.py +4. 
Only move to next section when current one works + +The environment is the same as learn.py: +- 2D arena where an agent must reach a target +- Agent can move UP/DOWN/LEFT/RIGHT or stay still +- Episode ends when: agent reaches target, hits wall, or 200 steps pass + +DEPENDENCIES: + pip install pufferlib torch numpy gymnasium +""" + +import os +import numpy as np +import gymnasium +import torch +import torch.nn as nn +import pufferlib +import pufferlib.vector +import pufferlib.pytorch +from pufferlib import pufferl + + +# ============================================================================= +# SECTION 1: PUFFERLIB ENVIRONMENT +# ============================================================================= +""" +WHY inherit from pufferlib.PufferEnv? +------------------------------------- +PufferLib provides optimized environment vectorization. When you inherit from +PufferEnv, you get: + +1. AUTOMATIC BUFFER MANAGEMENT: PufferLib creates shared memory buffers for + observations, rewards, terminals, truncations. You just write to them. + +2. MULTI-AGENT SUPPORT: The same pattern works for 1 agent or 100 agents. + You define `num_agents` and PufferLib handles the rest. + +3. VECTORIZATION COMPATIBILITY: Your env works with pufferlib.vector.make() + which can run multiple copies in parallel (Serial or Multiprocessing). + +KEY DIFFERENCES from Gymnasium: +------------------------------- +- Define `single_observation_space` and `single_action_space` (not plural) +- Set `self.num_agents` (1 for single-agent) +- Call `super().__init__(buf)` which creates self.observations, self.rewards, etc. +- Update arrays IN-PLACE: `self.observations[:] = ...` not `return obs` +- reset() and step() still return values, but also update internal buffers +""" + + +class MoveToTargetEnv(pufferlib.PufferEnv): + """ + A simple environment where an agent navigates to a target position. + + This is identical to learn.py's MoveToTargetEnv, but adapted to PufferLib's + patterns. The game logic is the same, only the interface changes. + + GAME RULES: + - Agent starts at random position in [-0.8, 0.8] x [-0.8, 0.8] + - Target is at random position (at least 0.3 units away from agent) + - Agent can: NOOP (0), UP (1), DOWN (2), LEFT (3), RIGHT (4) + - Episode ends when: agent reaches target, hits wall (|x|>1 or |y|>1), or 200 steps + - Reward: -0.01/step + distance shaping + terminal bonuses + """ + + # Type hints for attributes created by super().__init__() + observations: np.ndarray + rewards: np.ndarray + terminals: np.ndarray + truncations: np.ndarray + + def __init__(self, buf=None, seed=0): + """ + WHY these parameters? + --------------------- + - buf: Optional shared memory buffer from PufferLib's vectorization. + When running multiple envs, they share memory for efficiency. + If None, PufferLib creates a buffer automatically. + + - seed: Random seed for reproducibility. Essential for debugging! + + WHAT to do in __init__: + 1. Define single_observation_space (what ONE agent sees) + 2. Define single_action_space (what actions ONE agent can take) + 3. Set self.num_agents (1 for single-agent env) + 4. Call super().__init__(buf) - THIS CREATES self.observations, etc. + 5. Initialize game state variables + 6. 
Set up random number generator + """ + # ----------------------------------------------------------------- + # TODO 1.1: Define the observation space + # ----------------------------------------------------------------- + # WHAT the agent sees: [agent_x, agent_y, target_x, target_y, dx, dy] + # - Positions are in [-1, 1] (arena bounds) + # - dx, dy (direction to target) can be in [-2, 2] + # + # WHY "single_observation_space" not "observation_space"? + # PufferLib distinguishes single-agent spaces from joint spaces. + # For multi-agent, observation_space would be (num_agents, obs_dim). + # We define the SINGLE agent's view, PufferLib handles batching. + # + # YOUR CODE: Create self.single_observation_space as gymnasium.spaces.Box + # Hint: Box(low=-2.0, high=2.0, shape=(6,), dtype=np.float32) + + self.single_observation_space = gymnasium.spaces.Box( + low=-2.0, high=2.0, shape=(6,), dtype=np.float32 + ) + + # ----------------------------------------------------------------- + # TODO 1.2: Define the action space + # ----------------------------------------------------------------- + # WHAT actions are available: 0=NOOP, 1=UP, 2=DOWN, 3=LEFT, 4=RIGHT + # + # YOUR CODE: Create self.single_action_space as gymnasium.spaces.Discrete(5) + + self.single_action_space = gymnasium.spaces.Discrete(5) + + # ----------------------------------------------------------------- + # TODO 1.3: Set the number of agents + # ----------------------------------------------------------------- + # For single-agent environments, num_agents = 1. + # PufferLib uses this to allocate the right buffer sizes. + # + # YOUR CODE: Set self.num_agents = 1 + + self.num_agents = 1 + + # ----------------------------------------------------------------- + # CRITICAL: Call super().__init__(buf) + # ----------------------------------------------------------------- + # This MUST come after defining spaces and num_agents! + # It creates: + # - self.observations: array of shape (num_agents, *obs_shape) + # - self.rewards: array of shape (num_agents,) + # - self.terminals: array of shape (num_agents,) + # - self.truncations: array of shape (num_agents,) + # + # These are the buffers you'll update in reset() and step(). + super().__init__(buf) + + # ----------------------------------------------------------------- + # TODO 1.4: Initialize game state variables + # ----------------------------------------------------------------- + # Track the actual game state (not observations, those are derived). + # For single-agent, these are simple arrays of shape (2,) for positions. + # + # WHAT to initialize: + # - self.agent_pos: np.zeros(2, dtype=np.float32) - agent's [x, y] + # - self.target_pos: np.zeros(2, dtype=np.float32) - target's [x, y] + # - self.tick: 0 - step counter within episode + # + # Also initialize constants: + # - self.max_steps = 200 + # - self.target_radius = 0.1 (how close to count as "reached") + # - self.move_speed = 0.05 (movement per action) + # - self.arena_size = 1.0 (arena is [-1, 1] x [-1, 1]) + # + # YOUR CODE: Initialize game state + + self.agent_pos = np.zeros(2, dtype=np.float32) + self.target_pos = np.zeros(2, dtype=np.float32) + self.tick = 0 + + self.max_steps = 200 + self.target_radius = 0.1 + self.move_speed = 0.05 + self.arena_size = 1.0 + + # Set up random number generator for reproducibility + self.rng = np.random.default_rng(seed=seed) + + # Track previous distance for reward shaping + self.prev_dist = 0.0 + + def reset(self, seed=None): + """ + WHY reset()? + ------------ + Start a fresh episode. 
Called at the beginning and after each episode ends. + + WHAT to do: + 1. Randomize agent position + 2. Randomize target position (not too close to agent!) + 3. Reset step counter + 4. Compute initial distance (for reward shaping) + 5. Fill self.observations[:] with initial state + + WHY update self.observations[:] in-place? + PufferLib uses shared memory buffers. By updating in-place, we avoid + copying data. The [:] syntax means "update the existing array contents". + + RETURNS: + - self.observations: the observation buffer (now filled with initial state) + - []: empty list of infos (PufferLib expects a list) + """ + # ----------------------------------------------------------------- + # TODO 2.1: Implement reset() + # ----------------------------------------------------------------- + # Step 1: Randomize agent position + # self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 2: Randomize target position + # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 3: Ensure target is far enough from agent (at least 0.3 units) + # while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: + # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + # + # Step 4: Reset step counter + # self.tick = 0 + # + # Step 5: Compute initial distance + # self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) + # + # Step 6: Fill observations buffer + # self.observations[0, 0] = self.agent_pos[0] # agent_x + # self.observations[0, 1] = self.agent_pos[1] # agent_y + # self.observations[0, 2] = self.target_pos[0] # target_x + # self.observations[0, 3] = self.target_pos[1] # target_y + # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] # dx + # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] # dy + # + # Note: We index [0, :] because num_agents=1, so observations has shape (1, 6) + # + # YOUR CODE: + + self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + + while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: + self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) + + self.tick = 0 + + self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) + + self.observations[0, 0] = self.agent_pos[0] + self.observations[0, 1] = self.agent_pos[1] + self.observations[0, 2] = self.target_pos[0] + self.observations[0, 3] = self.target_pos[1] + self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + + return self.observations, [] + + def step(self, actions): + """ + WHY step()? + ----------- + The core game loop. Called every timestep with the agent's chosen action. + + WHAT to do: + 1. Apply the action (move agent) + 2. Compute reward (time penalty + distance shaping + terminal bonus) + 3. Check terminal conditions (reached target? hit wall? timeout?) + 4. Update buffers (observations, rewards, terminals, truncations) + 5. 
Auto-reset if episode ended + + PARAMETERS: + - actions: numpy array of shape (num_agents,) = (1,) for us + Each value is an integer 0-4 + + RETURNS: + - self.observations: updated observation buffer + - self.rewards: updated reward buffer + - self.terminals: updated terminal buffer + - self.truncations: updated truncation buffer + - infos: list of dicts with episode stats for finished episodes + """ + # ----------------------------------------------------------------- + # TODO 2.2: Implement step() + # ----------------------------------------------------------------- + # Step 1: Get the action (we only have 1 agent) + # action = actions[0] + # + # Step 2: Convert action to movement + # dx, dy = 0.0, 0.0 + # if action == 1: dy = self.move_speed # UP + # elif action == 2: dy = -self.move_speed # DOWN + # elif action == 3: dx = -self.move_speed # LEFT + # elif action == 4: dx = self.move_speed # RIGHT + # + # Step 3: Apply movement + # self.agent_pos[0] += dx + # self.agent_pos[1] += dy + # self.tick += 1 + # + # Step 4: Compute distance and rewards + # distance = np.linalg.norm(self.agent_pos - self.target_pos) + # reward = -0.01 # Time penalty + # reward += 2.0 * (self.prev_dist - distance) # Distance shaping + # self.prev_dist = distance + # + # Step 5: Check terminal conditions + # reached_target = distance < self.target_radius + # hit_wall = (abs(self.agent_pos[0]) > self.arena_size or + # abs(self.agent_pos[1]) > self.arena_size) + # timed_out = self.tick >= self.max_steps + # + # Step 6: Apply terminal rewards + # if reached_target: reward += 1.0 + # if hit_wall: reward -= 0.5 + # + # Step 7: Set terminal and truncation flags + # terminal = reached_target or hit_wall + # truncation = timed_out and not terminal + # + # Step 8: Update buffers + # self.rewards[0] = reward + # self.terminals[0] = terminal + # self.truncations[0] = truncation + # + # Step 9: Build info dict for finished episodes + # infos = [] + # if terminal or truncation: + # infos.append({ + # 'episode_length': self.tick, + # 'reached_target': reached_target, + # 'hit_wall': hit_wall, + # 'reward': reward, + # }) + # # Auto-reset for next episode + # self.reset() + # + # Step 10: Update observations (whether reset or not) + # self.observations[0, 0] = self.agent_pos[0] + # self.observations[0, 1] = self.agent_pos[1] + # self.observations[0, 2] = self.target_pos[0] + # self.observations[0, 3] = self.target_pos[1] + # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + # + # YOUR CODE: + + action = actions[0] + + dx, dy = 0.0, 0.0 + if action == 1: + dy = self.move_speed + elif action == 2: + dy = -self.move_speed # DOWN + elif action == 3: + dx = -self.move_speed # LEFT + elif action == 4: + dx = self.move_speed # RIGHT + + self.agent_pos[0] += dx + self.agent_pos[1] += dy + self.tick += 1 + + distance = np.linalg.norm(self.target_pos - self.agent_pos) + reward = -0.01 + reward += 2 * (self.prev_dist - distance) + self.prev_dist = distance + + reached_target = distance < self.target_radius + hit_wall = ( + abs(self.agent_pos[0]) > self.arena_size + or abs(self.agent_pos[1]) > self.arena_size + ) + timed_out = self.tick >= self.max_steps + + if reached_target: + reward += 1.0 + if hit_wall: + reward -= 0.5 + + terminal = reached_target or hit_wall + truncation = timed_out and not terminal + + self.rewards[0] = reward + self.terminals[0] = terminal + self.truncations[0] = truncation + + infos = [] + if terminal or truncation: + 
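            # An episode just ended: record its stats, then auto-reset in place so
            # the observation written below already belongs to the next episode
            # (the env resets itself here rather than relying on the trainer).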
infos.append( + { + "episode_length": self.tick, + "reached_target": reached_target, + "hit_wall": hit_wall, + "reward": reward, + } + ) + self.reset() + + self.observations[0, 0] = self.agent_pos[0] + self.observations[0, 1] = self.agent_pos[1] + self.observations[0, 2] = self.target_pos[0] + self.observations[0, 3] = self.target_pos[1] + self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] + self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] + + return self.observations, self.rewards, self.terminals, self.truncations, infos + + def render(self): + """ + Simple ASCII rendering for debugging. + Shows a 20x20 grid with agent (A) and target (T). + """ + grid_size = 20 + grid = [["." for _ in range(grid_size)] for _ in range(grid_size)] + + # Convert positions from [-1, 1] to grid indices [0, grid_size-1] + def to_grid(pos): + x = int((pos[0] + 1) / 2 * (grid_size - 1)) + y = int((1 - (pos[1] + 1) / 2) * (grid_size - 1)) # Flip y for display + return max(0, min(grid_size - 1, x)), max(0, min(grid_size - 1, y)) + + tx, ty = to_grid(self.target_pos) + ax, ay = to_grid(self.agent_pos) + + grid[ty][tx] = "T" + grid[ay][ax] = "A" + + print(f"\nStep {self.tick}:") + print("+" + "-" * grid_size + "+") + for row in grid: + print("|" + "".join(row) + "|") + print("+" + "-" * grid_size + "+") + + def close(self): + pass + + +# ============================================================================= +# SECTION 2: TESTING ENVIRONMENT +# ============================================================================= +""" +WHY test before training? +------------------------- +If your environment is broken, RL will silently fail to learn. +You'll waste hours wondering why training doesn't work. + +ALWAYS verify: +1. Environment creates without errors +2. reset() returns correct shapes +3. step() works with valid actions +4. Episodes actually terminate +5. 
A simple heuristic can solve it +""" + + +def test_environment(): + """Run basic sanity checks on the PufferLib environment.""" + print("=" * 60) + print("TESTING MoveToTargetEnv (PufferLib)") + print("=" * 60) + + # Test 1: Creation + print("\n[TEST 1] Creating environment...") + try: + env = MoveToTargetEnv(seed=42) + print(f" OK: Created env") + print(f" Observation space: {env.single_observation_space}") + print(f" Action space: {env.single_action_space}") + print(f" Num agents: {env.num_agents}") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 2: Reset + print("\n[TEST 2] Testing reset()...") + try: + obs, info = env.reset() + print(f" OK: reset() returned observations with shape {obs.shape}") + print(f" Sample observation: {obs[0]}") + assert obs.shape == (1, 6), f"Wrong shape: {obs.shape}, expected (1, 6)" + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 3: Step with random actions + print("\n[TEST 3] Testing step() with random actions...") + try: + for i in range(5): + actions = np.array([np.random.randint(0, 5)]) # Shape (1,) + obs, rewards, terminals, truncations, infos = env.step(actions) + print(f" Step {i + 1}: reward={rewards[0]:.3f}, terminal={terminals[0]}") + print(f" OK: step() works") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 4: Run until episode terminates using heuristic + print("\n[TEST 4] Running until episode terminates...") + try: + obs, _ = env.reset() + total_steps = 0 + episodes_finished = 0 + + while episodes_finished < 2 and total_steps < 500: + # Simple heuristic: move toward target + dx = obs[0, 4] # target_x - agent_x + dy = obs[0, 5] # target_y - agent_y + + if abs(dx) > abs(dy): + action = 4 if dx > 0 else 3 # RIGHT or LEFT + else: + action = 1 if dy > 0 else 2 # UP or DOWN + + actions = np.array([action]) + obs, rewards, terminals, truncations, infos = env.step(actions) + total_steps += 1 + + if infos: + for info in infos: + episodes_finished += 1 + print(f" Episode finished: {info}") + + print(f" OK: Completed {episodes_finished} episodes in {total_steps} steps") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 5: Test with PufferLib vectorization + print("\n[TEST 5] Testing with pufferlib.vector.make()...") + try: + vecenv = pufferlib.vector.make( + MoveToTargetEnv, + num_envs=4, + backend=pufferlib.vector.Serial, + ) + obs, _ = vecenv.reset() + print(f" OK: Created vectorized env with 4 copies") + print(f" Vectorized observation shape: {obs.shape}") + + # Take a few steps + for i in range(3): + actions = np.random.randint(0, 5, size=4) + obs, rewards, terminals, truncations, infos = vecenv.step(actions) + print(f" OK: Vectorized stepping works") + vecenv.close() + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + print("\n" + "=" * 60) + print("ALL ENVIRONMENT TESTS PASSED!") + print("=" * 60) + return True + + +# ============================================================================= +# SECTION 3: POLICY NETWORK +# ============================================================================= +""" +WHY this specific architecture? +------------------------------- +PufferLib expects policies to follow certain conventions: + +1. 
forward_eval(observations, state=None) -> (logits, values) + - This is what the trainer calls during rollout collection + - Returns action LOGITS (not probabilities) and value estimates + - The `state` parameter is for RNNs (we return None for feedforward) + +2. Use pufferlib.pytorch.layer_init() for weight initialization + - Proper initialization is crucial for stable learning + - Different std values for actor vs critic heads + +WHY layer_init? +--------------- +Neural network initialization matters A LOT for RL: +- Too large weights -> exploding gradients, unstable training +- Too small weights -> vanishing gradients, slow learning +- layer_init uses orthogonal initialization which works well for RL + +ARCHITECTURE: +observation (6) -> encoder (64 -> 64) -> actor head (5) + critic head (1) +""" + + +class Policy(nn.Module): + """ + Actor-Critic policy network following PufferLib conventions. + + The network has: + - Shared encoder: processes observations into features + - Actor head: outputs action logits (5 actions) + - Critic head: outputs value estimate (1 value) + """ + + def __init__(self, env, hidden_size=64): + """ + WHY take env as parameter? + -------------------------- + We extract observation and action sizes from the environment. + This is more robust than hardcoding dimensions. + + PufferLib's vectorized envs provide: + - env.single_observation_space: shape of one agent's observation + - env.single_action_space: the action space for one agent + + For regular Gymnasium envs, these would be observation_space/action_space. + """ + super().__init__() + + # Get dimensions from environment + obs_size = env.single_observation_space.shape[0] + action_size = env.single_action_space.n + + # ----------------------------------------------------------------- + # TODO 3.1: Create the encoder (shared backbone) + # ----------------------------------------------------------------- + # The encoder processes observations into a feature vector. + # Both actor and critic will use these features. + # + # Architecture: Linear(obs_size, hidden_size) -> ReLU -> Linear(hidden_size, hidden_size) -> ReLU + # + # Use pufferlib.pytorch.layer_init() for each Linear layer. + # Default std works for hidden layers. + # + # Example: + # self.encoder = nn.Sequential( + # pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), + # nn.ReLU(), + # pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + # nn.ReLU(), + # ) + # + # YOUR CODE: + + self.encoder = nn.Sequential( + pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), + nn.ReLU(), + pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), + nn.ReLU(), + ) + + # ----------------------------------------------------------------- + # TODO 3.2: Create the actor head + # ----------------------------------------------------------------- + # Outputs action logits. Use std=0.01 for small initial outputs. + # WHY small std? We want initial actions to be nearly uniform. + # + # self.actor = pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, action_size), std=0.01 + # ) + # + # YOUR CODE: + + self.actor = pufferlib.pytorch.layer_init( + nn.Linear(hidden_size, action_size), std=0.01 + ) + + # ----------------------------------------------------------------- + # TODO 3.3: Create the critic head + # ----------------------------------------------------------------- + # Outputs value estimate. Use std=1.0 for reasonable initial values. 
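+        # Why the asymmetry between the two heads? The critic only needs
+        # small, roughly unit-scale initial value estimates, so std=1.0 is
+        # fine here, whereas the actor head above uses std=0.01 so the
+        # initial logits sit near zero and the starting policy is close to
+        # uniform.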
+ # + # self.critic = pufferlib.pytorch.layer_init( + # nn.Linear(hidden_size, 1), std=1.0 + # ) + # + # YOUR CODE: + + self.critic = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0) + + def forward_eval(self, observations, state=None): + """ + WHY forward_eval specifically? + ------------------------------ + PufferLib's trainer calls forward_eval() during rollout collection. + It expects (logits, values) as return value. + + The state parameter is for recurrent networks (LSTMs). For feedforward + networks like ours, we ignore it and return None. + + PARAMETERS: + - observations: tensor of shape (batch_size, obs_size) + - state: For RNN/LSTM policies, carries hidden state between steps. + For feedforward networks (like ours), always None. + + RETURNS: + - logits: tensor of shape (batch_size, action_size) - unnormalized action scores + - values: tensor of shape (batch_size, 1) - value estimates + """ + # ----------------------------------------------------------------- + # TODO 3.4: Implement forward_eval + # ----------------------------------------------------------------- + # Step 1: Pass observations through encoder + # hidden = self.encoder(observations) + # + # Step 2: Get action logits from actor head + # logits = self.actor(hidden) + # + # Step 3: Get value estimate from critic head + # values = self.critic(hidden) + # + # Step 4: Return (logits, values) + + hidden = self.encoder(observations) + logits = self.actor(hidden) + values = self.critic(hidden) + + return logits, values + + def forward(self, observations, state=None): + """Standard PyTorch forward - required by PufferLib trainer.""" + return self.forward_eval(observations, state) + + +# ============================================================================= +# SECTION 4: TESTING POLICY +# ============================================================================= +""" +WHY test the policy? +-------------------- +Verify the network architecture is correct before training. +Common bugs: +- Wrong input/output dimensions +- Missing activations +- NaN in outputs +""" + + +def test_policy(): + """Run basic sanity checks on the Policy network.""" + print("\n" + "=" * 60) + print("TESTING Policy Network") + print("=" * 60) + + # Test 1: Creation + print("\n[TEST 1] Creating policy...") + try: + # Create a dummy env to get dimensions + env = MoveToTargetEnv() + env.reset() # Initialize the env + + policy = Policy(env, hidden_size=64) + print(f" OK: Created policy") + + # Count parameters + total_params = sum(p.numel() for p in policy.parameters()) + print(f" Total parameters: {total_params}") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 2: forward_eval + print("\n[TEST 2] Testing forward_eval()...") + try: + # Create batch of observations + obs = torch.randn(4, 6) # batch of 4 + logits, values = policy.forward_eval(obs) + + print(f" Input shape: {obs.shape}") + print(f" Logits shape: {logits.shape} (expected: [4, 5])") + print(f" Values shape: {values.shape} (expected: [4, 1])") + + assert logits.shape == (4, 5), f"Wrong logits shape: {logits.shape}" + assert values.shape == (4, 1), f"Wrong values shape: {values.shape}" + + # Check for NaN + assert not torch.isnan(logits).any(), "NaN in logits!" + assert not torch.isnan(values).any(), "NaN in values!" 
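+        # Extra sanity check: backprop a dummy loss so we know gradients
+        # reach every parameter (a detached or frozen head would fail this
+        # assert long before training silently stalls).
+        (logits.sum() + values.sum()).backward()
+        assert all(p.grad is not None for p in policy.parameters()), "Missing gradients!"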
+ + print(" OK: Shapes correct, no NaN") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + # Test 3: Single observation + print("\n[TEST 3] Testing with single observation...") + try: + obs = torch.randn(1, 6) + logits, values = policy.forward_eval(obs) + + print(f" Logits: {logits}") + print(f" Value: {values}") + print(" OK: Single observation works") + except Exception as e: + print(f" FAILED: {e}") + import traceback + + traceback.print_exc() + return False + + print("\n" + "=" * 60) + print("ALL POLICY TESTS PASSED!") + print("=" * 60) + return True + + +# ============================================================================= +# SECTION 5: TRAINING WITH PUFFERLIB +# ============================================================================= +""" +WHY use pufferl.PuffeRL? +------------------------ +PufferLib's trainer handles ALL the RL internals: +- Rollout collection (running envs, storing experiences) +- GAE advantage computation +- PPO loss calculation (clipped surrogate, value loss, entropy) +- Gradient updates with clipping +- Logging and metrics + +This means our training code is MUCH simpler than learn.py! + +THE TRAINING LOOP: +----------------- +1. Create vectorized environment +2. Create policy +3. Create config dict with hyperparameters +4. Create PuffeRL trainer +5. Loop: trainer.evaluate() -> trainer.train() + +WHAT trainer.evaluate() does: +- Runs the policy in all environments +- Collects experiences into buffers +- Computes advantages and returns + +WHAT trainer.train() does: +- Runs PPO update on collected experiences +- Updates policy weights +- Logs metrics +""" + + +def train(quick_test=False): + """ + Main training function using PufferLib's trainer. + + PARAMETERS: + - quick_test: if True, run short training to verify code works + if False, run full training to see actual learning + """ + # ----------------------------------------------------------------- + # Hyperparameters + # ----------------------------------------------------------------- + if quick_test: + total_timesteps = 10000 + num_envs = 4 + else: + total_timesteps = 100000 + num_envs = 8 + + # Detect device + # device = "mps" if torch.backends.mps.is_available() else "cpu" + device = "cpu" + + print("=" * 60) + print("TRAINING WITH PUFFERLIB") + print("=" * 60) + print(f"MPS available: {torch.backends.mps.is_available()}") + print(f"Using device: {device}") + print(f"Total timesteps: {total_timesteps}") + print(f"Num environments: {num_envs}") + print("=" * 60) + + # ----------------------------------------------------------------- + # TODO 5.1: Create vectorized environment + # ----------------------------------------------------------------- + # PufferLib's vector.make() creates multiple environment copies. + # + # Backend options: + # - Serial: Runs envs sequentially. Good for debugging because errors + # appear in the main process with full stack traces. + # - Multiprocessing: Runs envs in parallel. Much faster for many envs, + # but errors in subprocesses are harder to debug. + # + # Tip: Use Serial until your code works, then switch to Multiprocessing. 
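+    # Related knob: the config dict below sets batch_size = num_envs * 128
+    # with bptt_horizon = 128 (presumably one 128-step rollout segment per
+    # environment), so changing num_envs here also scales the batch collected
+    # per update.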
+ # + # vecenv = pufferlib.vector.make( + # MoveToTargetEnv, + # num_envs=num_envs, + # backend=pufferlib.vector.Serial, + # ) + # + # YOUR CODE: + + vecenv = pufferlib.vector.make( + MoveToTargetEnv, num_envs=num_envs, backend=pufferlib.vector.Multiprocessing + ) + + # ----------------------------------------------------------------- + # TODO 5.2: Create policy + # ----------------------------------------------------------------- + # Use vecenv.driver_env to get a reference to one of the environment copies. + # This lets us access single_observation_space and single_action_space + # for creating the policy with correct input/output dimensions. + # Move policy to device for GPU training. + # + # policy = Policy(vecenv.driver_env, hidden_size=64).to(device) + # + # YOUR CODE: + + policy = Policy(vecenv.driver_env, hidden_size=64).to(device) + next(policy.parameters()).device + + # ----------------------------------------------------------------- + # TODO 5.3: Create config + # ----------------------------------------------------------------- + # PufferLib's trainer uses a Config object for hyperparameters. + # These are standard PPO values that work well. + # + # config = pufferl.Config( + # total_timesteps=total_timesteps, + # learning_rate=3e-4, + # num_steps=128, # Steps per rollout + # num_minibatches=4, # Minibatches per update + # update_epochs=4, # PPO epochs per update + # gamma=0.99, # Discount factor + # gae_lambda=0.95, # GAE parameter + # clip_coef=0.2, # PPO clipping + # vf_coef=0.5, # Value loss coefficient + # ent_coef=0.01, # Entropy bonus coefficient + # max_grad_norm=0.5, # Gradient clipping + # ) + # + # YOUR CODE: + + config = { + "env": "MoveToTarget", + "total_timesteps": total_timesteps, + "learning_rate": 3e-4, + "batch_size": num_envs * 128, + "bptt_horizon": 128, + "minibatch_size": 512, + "max_minibatch_size": 512, + "update_epochs": 4, + "gamma": 0.99, + "gae_lambda": 0.95, + "clip_coef": 0.2, + "vf_coef": 0.5, + "vf_clip_coef": 0.2, + "ent_coef": 0.01, + "max_grad_norm": 0.5, + "device": device, + "seed": 42, + "torch_deterministic": True, + "cpu_offload": False, + "use_rnn": False, + "compile": False, + "optimizer": "adam", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_eps": 1e-8, + "anneal_lr": True, + "vtrace_rho_clip": 1.0, + "vtrace_c_clip": 1.0, + "prio_alpha": 0.8, + "prio_beta0": 0.2, + "checkpoint_interval": 200, + "data_dir": "experiments", + "precision": "float32", + } + + # ----------------------------------------------------------------- + # TODO 5.4: Create trainer + # ----------------------------------------------------------------- + # The PuffeRL trainer handles the entire training loop internals. + # + # trainer = pufferl.PuffeRL( + # config=config, + # vecenv=vecenv, + # policy=policy, + # optimizer=torch.optim.Adam(policy.parameters(), lr=config.learning_rate), + # ) + # + # YOUR CODE: + + trainer = pufferl.PuffeRL(config, vecenv, policy) + + # ----------------------------------------------------------------- + # TODO 5.5: Training loop + # ----------------------------------------------------------------- + # The training loop is very simple with PufferLib: + # 1. trainer.evaluate() - collect experiences + # 2. trainer.train() - run PPO update + # 3. 
Repeat until done + # + # Example: + # while not trainer.done: + # trainer.evaluate() + # trainer.train() + # + # # Print progress every 10 epochs + # if trainer.epoch % 10 == 0: + # # Get metrics from trainer + # metrics = trainer.metrics + # print(f"Epoch {trainer.epoch} | " + # f"reward: {metrics.get('episode_reward', 0):.2f} | " + # f"length: {metrics.get('episode_length', 0):.1f}") + # + # Or use the built-in dashboard: + # while not trainer.done: + # trainer.evaluate() + # trainer.train() + # trainer.print_dashboard() # Pretty-printed metrics + # + # YOUR CODE: + + while trainer.global_step < total_timesteps: + trainer.evaluate() + trainer.train() + + # Cleanup + trainer.close() + vecenv.close() + + print("\n" + "=" * 60) + print("TRAINING COMPLETE!") + print("=" * 60) + + return policy + + +# ============================================================================= +# SECTION 6: EVALUATION WITH ASCII RENDERING +# ============================================================================= + + +def eval_policy(num_episodes=3, delay=0.1): + """ + Run the trained policy and watch it play with ASCII rendering. + + PARAMETERS: + - num_episodes: number of episodes to run + - delay: seconds between frames (for watchability) + """ + import time + import glob + + print("=" * 60) + print("EVALUATING TRAINED POLICY") + print("=" * 60) + + # Find latest checkpoint + checkpoints = glob.glob("experiments/**/model.pt", recursive=True) + if not checkpoints: + print( + "No checkpoint found in experiments/. Train first with 'python learn_v2.py train'" + ) + return + + latest_checkpoint = max(checkpoints, key=lambda x: os.path.getmtime(x)) + print(f"Loading checkpoint: {latest_checkpoint}") + + # Create environment (single, not vectorized) + env = MoveToTargetEnv(seed=int(time.time())) + + # Create and load policy + policy = Policy(env, hidden_size=64) + checkpoint = torch.load(latest_checkpoint, map_location="cpu", weights_only=True) + policy.load_state_dict(checkpoint) + policy.eval() + + print(f"Running {num_episodes} episodes...\n") + + for ep in range(num_episodes): + print(f"\n{'=' * 60}") + print(f"EPISODE {ep + 1}") + print(f"{'=' * 60}") + + obs, _ = env.reset() + env.render() + time.sleep(delay) + + done = False + total_reward = 0.0 + + while not done: + # Get action from policy + with torch.no_grad(): + obs_tensor = torch.from_numpy(obs).float() + logits, _ = policy(obs_tensor) + action = torch.argmax(logits, dim=-1).item() + + # Step environment + obs, rewards, terminals, truncations, infos = env.step(np.array([action])) + total_reward += rewards[0] + done = terminals[0] or truncations[0] + + # Render + env.render() + action_names = ["NOOP", "UP", "DOWN", "LEFT", "RIGHT"] + print(f"Action: {action_names[action]}, Reward: {rewards[0]:.3f}") + time.sleep(delay) + + # Episode summary + if infos: + info = infos[0] + result = ( + "REACHED TARGET!" 
+ if info.get("reached_target") + else "Failed (wall/timeout)" + ) + print(f"\nResult: {result}") + print(f"Episode length: {info.get('episode_length', 'N/A')}") + print(f"Total reward: {total_reward:.3f}") + + env.close() + print("\n" + "=" * 60) + print("EVALUATION COMPLETE!") + print("=" * 60) + + +# ============================================================================= +# MAIN EXECUTION +# ============================================================================= + +if __name__ == "__main__": + import sys + + # Parse command line arguments + if len(sys.argv) > 1: + command = sys.argv[1] + if command == "test": + # Run all tests + env_ok = test_environment() + if env_ok: + test_policy() + elif command == "train": + # Run full training + test_environment() + test_policy() + train(quick_test=False) + elif command == "quick": + # Quick training test + # test_environment() + # test_policy() + train(quick_test=True) + elif command == "eval": + # Evaluate trained policy with ASCII rendering + eval_policy(num_episodes=3, delay=0.1) + else: + print(f"Unknown command: {command}") + print("Usage: python learn_v2.py [test|train|quick|eval]") + else: + # Default: run tests only + print("Running tests... (use 'python learn_v2.py train' for full training)") + print() + env_ok = test_environment() + if env_ok: + test_policy() diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index a9cbcd8b8..48999f6de 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -5,7 +5,7 @@ Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** -The boss has **1 attack** (AOE burst). All hitboxes are circles. +The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). --- @@ -57,6 +57,8 @@ ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement - Circle at `player_pos + facing * 0.7`, radius `0.4` - `facing` = direction to boss at attack start +- Hits boss if circles overlap: `dist(attack, boss) < 0.4 + 0.5` +- **Effective range: 1.6 units from boss center** - Damage: 10 ### Boss Behavior (Single Attack) @@ -73,109 +75,102 @@ RECOVERY: 15 ticks (0.5s) — vulnerable, no damage **AOE Attack:** - Circle centered on boss, radius `1.5` +- Hits player if circles overlap: `dist(player, boss) < 1.5 + 0.3` +- **Effective range: 1.8 units from boss center** - Damage: 20 -- Player takes damage if: in AOE radius AND not in i-frames +- Player avoids damage if: outside range OR in i-frames --- -## Observation Space (14 floats) +## Observation Space (13 floats) -Keep it minimal. You can ablate later. +Raw game state values — let the network learn its own representations. 
``` -Geometry (3): - 0: rel_boss_x = boss_x - player_x (normalized by arena half-size) - 1: rel_boss_y = boss_y - player_y - 2: distance = clamp(dist / 5.0, 0, 1) +Geometry (6): + 0: dx = boss_x - player_x (relative position) + 1: dy = boss_y - player_y + 2: player_x = absolute position [-5, 5] + 3: player_y = absolute position [-5, 5] + 4: boss_x = absolute position (fixed at 0) + 5: boss_y = absolute position (fixed at 0) Player (5): - 3: player_hp = hp / 100 - 4: dodge_ready = 1.0 if can dodge, else 0.0 - 5: player_state = {FREE: 0, DODGE: 0.33, ATTACK: 0.66} # scalar encoding - 6: state_progress = ticks_in_state / state_duration - 7: move_dir_x = -1 to 1 - -Boss (6): - 8: boss_hp = hp / 100 - 9: boss_phase = {IDLE: 0, WINDUP: 0.33, ACTIVE: 0.66, RECOVERY: 1.0} - 10: phase_progress = ticks_in_phase / phase_duration - 11: time_to_damage = ticks until ACTIVE starts / 18 (1.0 during IDLE/RECOVERY) - 12: in_aoe_range = 1.0 if distance < 1.5, else 0.0 - 13: boss_attacking = 1.0 if in WINDUP/ACTIVE, else 0.0 + 6: player_hp = raw HP [0, 100] + 7: boss_hp = raw HP [0, 100] + 8: player_state = enum {IDLING: 0, DODGING: 1, ATTACKING: 2} + 9: player_dodge_cooldown = ticks remaining [0, 15] + 10: player_state_ticks = ticks in current state + +Boss (2): + 11: boss_state = enum {IDLING: 0, WINDING_UP: 1, ATTACKING: 2, RECOVERING: 3} + 12: boss_phase_ticks = ticks in current phase ``` --- -## Reward Function (v1 — HP delta) +## Reward Function -```python -# Per step -reward = 0 -reward += (boss_hp_prev - boss_hp_now) * 0.1 # +1.0 per hit landed -reward += (player_hp_prev - player_hp_now) * -0.1 # -2.0 per AOE hit taken -reward += -0.001 # time penalty +Design your own! Consider these questions: -# Terminal -if boss_hp <= 0: reward += 1.0 # win bonus -if player_hp <= 0: reward -= 1.0 # lose penalty -``` +- **What behaviors do you want to encourage?** (dealing damage, staying alive, winning) +- **What behaviors do you want to discourage?** (taking hits, timing out, being passive) +- **Dense vs sparse?** Should the agent get feedback every step, or only at episode end? +- **Scaling?** How do you balance different reward components so one doesn't dominate? + +Hint: Track HP changes between steps. Think about terminal bonuses. --- ## Episode Termination -- `terminated = True` if player or boss HP <= 0 -- `truncated = True` if ticks >= 900 (30 seconds) +Episodes end when: + +- Someone wins (HP reaches 0) +- Time runs out (prevent infinite episodes) --- -## Implementation (Single File) +## Implementation (C + Python) -Everything in `soulsrl.py` (~250-300 lines): +Core game logic in C with Python bindings: -```python -class SoulsEnv(pufferlib.PufferEnv): - # Player state machine - # Boss state machine - # Collision detection (circle-circle only) - # Observation building - # Reward calculation +``` +boss_fight.h — Game state struct, enums, c_reset(), c_step(), c_render() +boss_fight.c — Standalone test with keyboard input (Shift+WASD/Space/J) +boss_fight.py — PufferLib environment wrapper ``` -No separate core.py, no rendering, no curriculum stages. +Uses Raylib for rendering (1080x720 window @ 30 FPS). --- ## RL Experiments -Once v1 is working, run these experiments to learn RL concepts: +Once v1 is working, design experiments to understand RL concepts: + +### Experiment Ideas -### Experiment 1: Observation Ablations +**Observation Ablations** — Which observations actually matter? 
-| Variant | Change | Hypothesis | -| --------- | --------------------------------------------------------------- | -------------------------------------- | -| no_timing | Remove `time_to_damage`, `phase_progress` | Agent can't learn precise dodge timing | -| no_range | Remove `in_aoe_range`, `distance` | Agent can't learn spacing | -| minimal | Only: `distance`, `time_to_damage`, `dodge_ready`, `boss_phase` | Test minimum viable obs | -| noisy | Add 5 uniform random floats | Network should ignore noise | +- What happens if the agent can't see timing information? +- Does it need absolute position, or is relative enough? +- What's the minimum viable observation space? +- Can the network learn to ignore irrelevant/noisy inputs? -### Experiment 2: Reward Shaping +**Reward Shaping** — How does reward design affect behavior? -| Variant | Change | Hypothesis | -| --------------- | -------------------------------- | -------------------------- | -| sparse | Only win/lose bonus, no HP delta | Much slower learning | -| no_time_penalty | Remove -0.001/step | Agent becomes passive | -| dodge_bonus | +0.2 for dodging during ACTIVE | Might create dodge spam | -| proximity | +0.01 for being close to boss | Might discourage safe play | +- What if you only reward winning/losing (sparse)? +- What happens without a time penalty? +- Can you incentivize specific behaviors (dodging at the right time)? +- What unintended behaviors might reward bonuses create? -### Experiment 3: Hyperparameters +**Hyperparameters** — See `boss_fight.ini` for the sweep config -| Param | Values | What to observe | -| ------------- | ---------------- | --------------------------- | -| learning_rate | 1e-3, 3e-4, 1e-4 | Learning speed vs stability | -| ent_coef | 0.0, 0.01, 0.05 | Exploration vs exploitation | -| num_envs | 8, 32, 128 | Sample efficiency | -| hidden_size | 32, 64, 128 | Model capacity | +- Learning rate: stability vs speed +- Entropy coefficient: exploration vs exploitation +- Batch size / num_envs: sample efficiency +- Network size: capacity vs overfitting --- @@ -202,17 +197,18 @@ Only add these if baseline experiments are done: ## Deliverables -1. `soulsrl.py` — Environment (PufferEnv) -2. `train.py` — Training script with logging -3. `experiments/` — Saved runs with different configs -4. `results.md` — Summary of what you learned from experiments +1. `boss_fight.h` — Core game logic in C +2. `boss_fight.c` — Standalone test binary +3. `boss_fight.py` — PufferLib environment wrapper +4. `experiments/` — Saved runs with different configs +5. `results.md` — Summary of what you learned from experiments --- -## Timeline Estimate +## Milestones -- Day 1: Implement `soulsrl.py`, verify with random agent -- Day 2: Train baseline, confirm learning -- Day 3-4: Run observation ablations -- Day 5-6: Run reward experiments -- Day 7: Document findings, optional extensions +1. **Environment works**: `c_step()` implemented, can play manually with keyboard +2. **Random baseline**: Random agent wins ~0%, confirms game is non-trivial +3. **Learning signal**: Trained agent shows improvement over random +4. **Competent agent**: Win rate >80% +5. 
**Experiments**: At least 3 ablations with documented findings diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index 812e31bb7..45731694b 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -4,7 +4,7 @@ #include "../env_binding.h" static int my_init(Env *env, PyObject *args, PyObject *kwargs) { - env->size = unpack(kwargs, "size"); + // No special init needed for now return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 0e1e152a2..67e6e8d8b 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -1,25 +1,39 @@ #include "boss_fight.h" +#include "raylib.h" int main() { - BossFight env = {.size = 5}; - env.observations = (unsigned char *)calloc(1, sizeof(unsigned char)); - env.actions = (int *)calloc(1, sizeof(int)); - env.rewards = (float *)calloc(1, sizeof(float)); - env.terminals = (unsigned char *)calloc(1, sizeof(unsigned char)); + int num_obs = 13; + int num_actions = 1; + int num_agents = 1; + BossFight env = {}; + env.observations = (float *)calloc(num_obs, sizeof(unsigned char)); + env.actions = (float *)calloc(num_actions, sizeof(int)); + env.rewards = (float *)calloc(num_agents, sizeof(float)); + env.terminals = (unsigned char *)calloc(num_agents, sizeof(unsigned char)); + + // Always call reset and render first c_reset(&env); c_render(&env); + while (!WindowShouldClose()) { if (IsKeyDown(KEY_LEFT_SHIFT)) { - if (IsKeyDown(KEY_A) || IsKeyDown(KEY_LEFT)) { - env.actions[0] = 0; - } else if (IsKeyDown(KEY_D) || IsKeyDown(KEY_RIGHT)) { + if (IsKeyDown(KEY_W)) env.actions[0] = 1; - } else { - env.actions[0] = -1; - } + else if (IsKeyDown(KEY_S)) + env.actions[0] = 2; + else if (IsKeyDown(KEY_A)) + env.actions[0] = 3; + else if (IsKeyDown(KEY_D)) + env.actions[0] = 4; + else if (IsKeyDown(KEY_SPACE)) + env.actions[0] = 5; + else if (IsKeyDown(KEY_J)) + env.actions[0] = 6; + else + env.actions[0] = 0; } else { - env.actions[0] = rand() % 2; + env.actions[0] = rand() % 7; } c_step(&env); c_render(&env); diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 75d2932b1..2c37cb275 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -1,11 +1,29 @@ #include "raylib.h" +#include +#include #include #include -const Color PUFF_RED = (Color){187, 0, 0, 255}; -const Color PUFF_CYAN = (Color){0, 187, 187, 255}; -const Color PUFF_WHITE = (Color){241, 241, 241, 241}; -const Color PUFF_BACKGROUND = (Color){6, 24, 24, 255}; +#define ARENA_HALF_SIZE 5.0f +#define MAX_HP 100 +#define PLAYER_SPEED_PER_TICK 0.1f +#define PLAYER_SIZE 0.3f +#define BOSS_SIZE 0.5f + +const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; +const Color BOSS_COLOR = (Color){0, 187, 187, 255}; +const Color TEXT_COLOR = (Color){241, 241, 241, 241}; +const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; +const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; + +typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; + +typedef enum { + BOSS_IDLING, + BOSS_WINDING_UP, + BOSS_ATTACKING, + BOSS_RECOVERING, +} BossState; // Only use floats! typedef struct { @@ -14,62 +32,102 @@ typedef struct { } Log; typedef struct { - Log log; // Required field - unsigned char - *observations; // Required field. Ensure type matches in .py and .c - int *actions; // Required field. 
Ensure type matches in .py and .c + Log log; // Required field + float *observations; // Required field. Ensure type matches in .py and .c + float *actions; // Required field. Ensure type matches in .py and .c float *rewards; // Required field unsigned char *terminals; // Required field - int size; - int x; - int goal; + + int tick; + float player_x; + float player_y; + float boss_x; + float boss_y; + float distance; + + PlayerState player_state; + int player_hp; + int player_dodge_cooldown; + int player_state_ticks; + + BossState boss_state; + int boss_hp; + int boss_phase_ticks; + } BossFight; +float rand_uniform(float low, float high) { + return low + (high - low) * ((float)rand() / ((float)RAND_MAX + 1.0f)); +} + +float distance(float x1, float y1, float x2, float y2) { + float dx = x1 - x2; + float dy = y1 - y2; + return sqrtf(dx * dx + dy * dy); +} + void c_reset(BossFight *env) { - env->x = 0; - env->goal = (rand() % 2 == 0) ? env->size : -env->size; + env->tick = 0; + env->player_x = 0; + env->player_y = 0; + env->boss_x = 0; + env->boss_y = 0; + env->player_hp = 100; + env->boss_hp = 100; + env->player_state = PLAYER_IDLING; + env->player_dodge_cooldown = 0; + env->player_state_ticks = 0; + env->boss_state = BOSS_IDLING; + env->boss_phase_ticks = 0; + env->distance = 0; + + env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + + while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) < + 0.1) { + env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); + } + + env->distance = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + int obs_idx = 0; + + env->observations[obs_idx++] = 0; // dx + env->observations[obs_idx++] = 0; // dy + env->observations[obs_idx++] = env->player_x; + env->observations[obs_idx++] = env->player_y; + env->observations[obs_idx++] = env->boss_x; + env->observations[obs_idx++] = env->boss_y; + env->observations[obs_idx++] = 100; + env->observations[obs_idx++] = 100; + env->observations[obs_idx++] = PLAYER_IDLING; + env->observations[obs_idx++] = 0; // player_dodge_cooldown + env->observations[obs_idx++] = 0; // player_state_ticks + env->observations[obs_idx++] = BOSS_IDLING; + env->observations[obs_idx++] = 0; // boss_phase_ticks } void c_step(BossFight *env) { env->rewards[0] = 0; env->terminals[0] = 0; - if (env->actions[0] == 0) { - env->x -= 1; - } else if (env->actions[0] == 1) { - env->x += 1; - } - if (env->x == env->goal) { - c_reset(env); - env->rewards[0] = 1; - env->terminals[0] = 1; - env->log.score += 1; - env->log.n += 1; - } else if (env->x == -env->goal) { - c_reset(env); - env->rewards[0] = -1; - env->terminals[0] = 1; - env->log.score -= 1; - env->log.n += 1; - } - env->observations[0] = (env->goal > 0) ? 
1 : -1; } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(1080, 720, "PufferLib Template"); - SetTargetFPS(5); + InitWindow(1080, 720, "BossFight"); + SetTargetFPS(30); } if (IsKeyDown(KEY_ESCAPE)) { exit(0); } - DrawText("Go to the red square!", 20, 20, 20, PUFF_WHITE); - DrawRectangle(540 - 32 + 64 * env->goal, 360 - 32, 64, 64, PUFF_RED); - DrawRectangle(540 - 32 + 64 * env->x, 360 - 32, 64, 64, PUFF_CYAN); - BeginDrawing(); - ClearBackground(PUFF_BACKGROUND); + ClearBackground(BACKGROUND_COLOR); + DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); EndDrawing(); } From 1a67976d811bc6ee9e64607ce9d2688691a6581a Mon Sep 17 00:00:00 2001 From: frixaco Date: Tue, 20 Jan 2026 02:11:24 +0500 Subject: [PATCH 05/29] wip: step function --- pufferlib/ocean/boss_fight/boss_fight.c | 4 +- pufferlib/ocean/boss_fight/boss_fight.h | 72 +++++++++++++++++++----- pufferlib/ocean/boss_fight/boss_fight.py | 4 +- 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 67e6e8d8b..49198b733 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -7,8 +7,8 @@ int main() { int num_agents = 1; BossFight env = {}; - env.observations = (float *)calloc(num_obs, sizeof(unsigned char)); - env.actions = (float *)calloc(num_actions, sizeof(int)); + env.observations = (float *)calloc(num_obs, sizeof(float)); + env.actions = (int *)calloc(num_actions, sizeof(float)); env.rewards = (float *)calloc(num_agents, sizeof(float)); env.terminals = (unsigned char *)calloc(num_agents, sizeof(unsigned char)); diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 2c37cb275..353e98db3 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -34,7 +34,7 @@ typedef struct { typedef struct { Log log; // Required field float *observations; // Required field. Ensure type matches in .py and .c - float *actions; // Required field. Ensure type matches in .py and .c + int *actions; // Required field. 
Ensure type matches in .py and .c float *rewards; // Required field unsigned char *terminals; // Required field @@ -72,8 +72,8 @@ void c_reset(BossFight *env) { env->player_y = 0; env->boss_x = 0; env->boss_y = 0; - env->player_hp = 100; - env->boss_hp = 100; + env->player_hp = MAX_HP; + env->boss_hp = MAX_HP; env->player_state = PLAYER_IDLING; env->player_dodge_cooldown = 0; env->player_state_ticks = 0; @@ -95,24 +95,70 @@ void c_reset(BossFight *env) { int obs_idx = 0; - env->observations[obs_idx++] = 0; // dx - env->observations[obs_idx++] = 0; // dy + env->observations[obs_idx++] = env->boss_x - env->player_x; + env->observations[obs_idx++] = env->boss_y - env->player_y; env->observations[obs_idx++] = env->player_x; env->observations[obs_idx++] = env->player_y; env->observations[obs_idx++] = env->boss_x; env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = 100; - env->observations[obs_idx++] = 100; - env->observations[obs_idx++] = PLAYER_IDLING; - env->observations[obs_idx++] = 0; // player_dodge_cooldown - env->observations[obs_idx++] = 0; // player_state_ticks - env->observations[obs_idx++] = BOSS_IDLING; - env->observations[obs_idx++] = 0; // boss_phase_ticks + env->observations[obs_idx++] = (float)env->player_hp; + env->observations[obs_idx++] = (float)env->boss_hp; + env->observations[obs_idx++] = (float)env->player_state; + env->observations[obs_idx++] = (float)env->player_dodge_cooldown; + env->observations[obs_idx++] = (float)env->player_state_ticks; + env->observations[obs_idx++] = (float)env->boss_state; + env->observations[obs_idx++] = (float)env->boss_phase_ticks; } void c_step(BossFight *env) { - env->rewards[0] = 0; + float reward = -0.01; env->terminals[0] = 0; + + int action = env->actions[0]; + float dx = 0; + float dy = 0; + + if (action == 1) { + dy = PLAYER_SPEED_PER_TICK; + } else if (action == 2) { + dy = -PLAYER_SPEED_PER_TICK; + } else if (action == 3) { + dx = -PLAYER_SPEED_PER_TICK; + } else if (action == 4) { + dx = PLAYER_SPEED_PER_TICK; + } + + env->player_x += dx; + env->player_y += dy; + + bool wanna_idle = action == 0; + bool wanna_dodge = action == 5; + bool wanna_attack = action == 6; + bool can_dodge = + env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; + bool can_attack = env->player_state == PLAYER_IDLING; + + bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE && + fabsf(env->player_y) > ARENA_HALF_SIZE; + if (hit_wall) { + reward -= 0.5; + } + + // TODO: here i should handle "player attacks and reduces boss hp" case + + bool killed_boss = env->boss_hp == 0; + if (killed_boss) { + reward += 2; + } + + env->rewards[0] = reward; + + bool player_died = env->player_hp == 0; + if (player_died) { + env->terminals[0] = 1; + } + + env->tick++; } void c_render(BossFight *env) { diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index a952dbf41..2e6ce6681 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -12,9 +12,9 @@ def __init__( self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( - low=0, high=1, shape=(1,), dtype=np.uint8 + low=-10, high=110, shape=(13,), dtype=np.float32 ) - self.single_action_space = gymnasium.spaces.Discrete(2) + self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = render_mode self.num_agents = num_envs From b884f4b0d0af71b3949df72201930b5bcf52a09a Mon Sep 17 00:00:00 2001 From: 
frixaco Date: Tue, 20 Jan 2026 23:55:29 +0500 Subject: [PATCH 06/29] finish step function --- pufferlib/ocean/boss_fight/README.md | 86 ---------------- pufferlib/ocean/boss_fight/boss_fight.h | 127 +++++++++++++++++++----- 2 files changed, 103 insertions(+), 110 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index 48999f6de..3d5d189e6 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -48,19 +48,6 @@ DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_di ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement ``` -**Cooldowns:** - -- Dodge: 15 ticks after dodge ends -- Attack: No cooldown (but you're locked for 13 ticks) - -**Attack hitbox (during ACTIVE):** - -- Circle at `player_pos + facing * 0.7`, radius `0.4` -- `facing` = direction to boss at attack start -- Hits boss if circles overlap: `dist(attack, boss) < 0.4 + 0.5` -- **Effective range: 1.6 units from boss center** -- Damage: 10 - ### Boss Behavior (Single Attack) Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` @@ -72,79 +59,6 @@ ACTIVE: 3 ticks (0.1s) — AOE hits RECOVERY: 15 ticks (0.5s) — vulnerable, no damage ``` -**AOE Attack:** - -- Circle centered on boss, radius `1.5` -- Hits player if circles overlap: `dist(player, boss) < 1.5 + 0.3` -- **Effective range: 1.8 units from boss center** -- Damage: 20 -- Player avoids damage if: outside range OR in i-frames - ---- - -## Observation Space (13 floats) - -Raw game state values — let the network learn its own representations. - -``` -Geometry (6): - 0: dx = boss_x - player_x (relative position) - 1: dy = boss_y - player_y - 2: player_x = absolute position [-5, 5] - 3: player_y = absolute position [-5, 5] - 4: boss_x = absolute position (fixed at 0) - 5: boss_y = absolute position (fixed at 0) - -Player (5): - 6: player_hp = raw HP [0, 100] - 7: boss_hp = raw HP [0, 100] - 8: player_state = enum {IDLING: 0, DODGING: 1, ATTACKING: 2} - 9: player_dodge_cooldown = ticks remaining [0, 15] - 10: player_state_ticks = ticks in current state - -Boss (2): - 11: boss_state = enum {IDLING: 0, WINDING_UP: 1, ATTACKING: 2, RECOVERING: 3} - 12: boss_phase_ticks = ticks in current phase -``` - ---- - -## Reward Function - -Design your own! Consider these questions: - -- **What behaviors do you want to encourage?** (dealing damage, staying alive, winning) -- **What behaviors do you want to discourage?** (taking hits, timing out, being passive) -- **Dense vs sparse?** Should the agent get feedback every step, or only at episode end? -- **Scaling?** How do you balance different reward components so one doesn't dominate? - -Hint: Track HP changes between steps. Think about terminal bonuses. - ---- - -## Episode Termination - -Episodes end when: - -- Someone wins (HP reaches 0) -- Time runs out (prevent infinite episodes) - ---- - -## Implementation (C + Python) - -Core game logic in C with Python bindings: - -``` -boss_fight.h — Game state struct, enums, c_reset(), c_step(), c_render() -boss_fight.c — Standalone test with keyboard input (Shift+WASD/Space/J) -boss_fight.py — PufferLib environment wrapper -``` - -Uses Raylib for rendering (1080x720 window @ 30 FPS). 
- ---- - ## RL Experiments Once v1 is working, design experiments to understand RL concepts: diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 353e98db3..ec8357067 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -9,6 +9,17 @@ #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f +#define PLAYER_ATTACK_RADIUS 0.1f +#define PLAYER_ATTACK_TICKS 3 +#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_COOLDOWN 15 +#define PLAYER_ATTACK_DMG 3 +#define BOSS_ATTACK_DMG 3 +#define BOSS_AOE_ATTACK_RADIUS 0.7f +#define BOSS_IDLE_TICKS 12 +#define BOSS_WINDUP_TICKS 18 +#define BOSS_ACTIVE_TICKS 3 +#define BOSS_RECOVERY_TICKS 12 const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -43,7 +54,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - float distance; + // float distance; PlayerState player_state; int player_hp; @@ -66,6 +77,23 @@ float distance(float x1, float y1, float x2, float y2) { return sqrtf(dx * dx + dy * dy); } +void update_observations(BossFight *env) { + int obs_idx = 0; + env->observations[obs_idx++] = env->boss_x - env->player_x; + env->observations[obs_idx++] = env->boss_y - env->player_y; + env->observations[obs_idx++] = env->player_x; + env->observations[obs_idx++] = env->player_y; + env->observations[obs_idx++] = env->boss_x; + env->observations[obs_idx++] = env->boss_y; + env->observations[obs_idx++] = (float)env->player_hp; + env->observations[obs_idx++] = (float)env->boss_hp; + env->observations[obs_idx++] = (float)env->player_state; + env->observations[obs_idx++] = (float)env->player_dodge_cooldown; + env->observations[obs_idx++] = (float)env->player_state_ticks; + env->observations[obs_idx++] = (float)env->boss_state; + env->observations[obs_idx++] = (float)env->boss_phase_ticks; +} + void c_reset(BossFight *env) { env->tick = 0; env->player_x = 0; @@ -79,7 +107,7 @@ void c_reset(BossFight *env) { env->player_state_ticks = 0; env->boss_state = BOSS_IDLING; env->boss_phase_ticks = 0; - env->distance = 0; + // env->distance = 0; env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); @@ -90,24 +118,10 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - env->distance = - distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - - int obs_idx = 0; + // env->distance = + // distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - env->observations[obs_idx++] = env->boss_x - env->player_x; - env->observations[obs_idx++] = env->boss_y - env->player_y; - env->observations[obs_idx++] = env->player_x; - env->observations[obs_idx++] = env->player_y; - env->observations[obs_idx++] = env->boss_x; - env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = (float)env->player_hp; - env->observations[obs_idx++] = (float)env->boss_hp; - env->observations[obs_idx++] = (float)env->player_state; - env->observations[obs_idx++] = (float)env->player_dodge_cooldown; - env->observations[obs_idx++] = (float)env->player_state_ticks; - env->observations[obs_idx++] = (float)env->boss_state; - env->observations[obs_idx++] = (float)env->boss_phase_ticks; + update_observations(env); } void c_step(BossFight *env) { @@ -137,33 +151,98 @@ void c_step(BossFight *env) { bool can_dodge = env->player_state == PLAYER_IDLING && env->player_dodge_cooldown 
== 0; bool can_attack = env->player_state == PLAYER_IDLING; + bool close_enough = distance(env->player_x, env->player_y, env->boss_x, + env->boss_y) < PLAYER_ATTACK_RADIUS; - bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE && + bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { reward -= 0.5; } + // can't walk out of bounds + env->player_x = + fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); + env->player_y = + fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + + if (wanna_attack && can_attack && close_enough) { + env->boss_hp -= PLAYER_ATTACK_DMG; + } - // TODO: here i should handle "player attacks and reduces boss hp" case + bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, + env->boss_y) <= BOSS_AOE_ATTACK_RADIUS; + bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; + bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; + if (boss_can_damage) { + env->player_hp -= BOSS_ATTACK_DMG; + } - bool killed_boss = env->boss_hp == 0; + bool killed_boss = env->boss_hp <= 0; if (killed_boss) { reward += 2; + env->terminals[0] = 1; } env->rewards[0] = reward; - bool player_died = env->player_hp == 0; + bool player_died = env->player_hp <= 0; if (player_died) { env->terminals[0] = 1; } + if (wanna_attack && can_attack) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; + } + if (wanna_dodge && can_dodge) { + env->player_state_ticks = PLAYER_DODGE_TICKS; + env->player_state = PLAYER_DODGING; + } + if (env->player_state == PLAYER_DODGING && env->player_state_ticks == 0) { + env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; + env->player_state = PLAYER_IDLING; + } + if (env->player_state == PLAYER_ATTACKING && env->player_state_ticks == 0) { + env->player_state = PLAYER_IDLING; + } + + if (env->boss_phase_ticks == 0) { + if (env->boss_state == BOSS_IDLING) { + env->boss_state = BOSS_WINDING_UP; + env->boss_phase_ticks = BOSS_WINDUP_TICKS; + } else if (env->boss_state == BOSS_WINDING_UP) { + env->boss_state = BOSS_ATTACKING; + env->boss_phase_ticks = BOSS_ACTIVE_TICKS; + } else if (env->boss_state == BOSS_ATTACKING) { + env->boss_state = BOSS_RECOVERING; + env->boss_phase_ticks = BOSS_RECOVERY_TICKS; + } else if (env->boss_state == BOSS_RECOVERING) { + env->boss_state = BOSS_IDLING; + env->boss_phase_ticks = BOSS_IDLE_TICKS; + } + } + env->tick++; + if (env->boss_phase_ticks > 0) { + env->boss_phase_ticks--; + } + if (env->player_state_ticks > 0) { + env->player_state_ticks--; + } + if (env->player_dodge_cooldown > 0) { + env->player_dodge_cooldown--; + } + + if (env->tick >= 1500) { + env->terminals[0] = 1; + } + + update_observations(env); } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(1080, 720, "BossFight"); + InitWindow(720, 720, "BossFight"); SetTargetFPS(30); } From e80ecdb446983b60872d38db96b8257ca2a98818 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 00:28:02 +0500 Subject: [PATCH 07/29] attemps to fix agent not learning --- pufferlib/ocean/boss_fight/binding.c | 2 ++ pufferlib/ocean/boss_fight/boss_fight.h | 42 +++++++++++++++++++----- pufferlib/ocean/boss_fight/boss_fight.py | 7 +++- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index 45731694b..b037d8c14 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -10,5 
+10,7 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { static int my_log(PyObject *dict, Log *log) { assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index ec8357067..fbc9a4767 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -9,7 +9,7 @@ #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.1f +#define PLAYER_ATTACK_RADIUS 0.5f #define PLAYER_ATTACK_TICKS 3 #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 @@ -38,8 +38,11 @@ typedef enum { // Only use floats! typedef struct { - float score; - float n; // Required as the last field + float perf; // 0-1 normalized metric + float score; // unnormalized metric + float episode_return; // sum of rewards + float episode_length; // steps per episode + float n; // Required as last field } Log; typedef struct { @@ -65,6 +68,8 @@ typedef struct { int boss_hp; int boss_phase_ticks; + float episode_return; // track within episode + } BossFight; float rand_uniform(float low, float high) { @@ -77,6 +82,13 @@ float distance(float x1, float y1, float x2, float y2) { return sqrtf(dx * dx + dy * dy); } +void add_log(BossFight *env) { + env->log.episode_return += env->episode_return; + env->log.episode_length += env->tick; + env->log.score += env->episode_return; + env->log.n++; +} + void update_observations(BossFight *env) { int obs_idx = 0; env->observations[obs_idx++] = env->boss_x - env->player_x; @@ -106,8 +118,8 @@ void c_reset(BossFight *env) { env->player_dodge_cooldown = 0; env->player_state_ticks = 0; env->boss_state = BOSS_IDLING; - env->boss_phase_ticks = 0; - // env->distance = 0; + env->boss_phase_ticks = BOSS_IDLE_TICKS; + env->episode_return = 0; env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); @@ -167,6 +179,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; + reward += 0.5; } bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, @@ -184,12 +197,23 @@ void c_step(BossFight *env) { } env->rewards[0] = reward; + env->episode_return += reward; bool player_died = env->player_hp <= 0; if (player_died) { env->terminals[0] = 1; } + if (env->tick >= 300) { + env->terminals[0] = 1; + } + + if (env->terminals[0] == 1) { + add_log(env); + c_reset(env); + return; + } + if (wanna_attack && can_attack) { env->player_state_ticks = PLAYER_ATTACK_TICKS; env->player_state = PLAYER_ATTACKING; @@ -233,10 +257,6 @@ void c_step(BossFight *env) { env->player_dodge_cooldown--; } - if (env->tick >= 1500) { - env->terminals[0] = 1; - } - update_observations(env); } @@ -251,8 +271,12 @@ void c_render(BossFight *env) { } BeginDrawing(); + ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + + // DrawCircle(int centerX, int centerY, float radius, Color color) + EndDrawing(); } diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 2e6ce6681..7f6b51667 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -17,6 +17,8 @@ def __init__( self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = 
render_mode self.num_agents = num_envs + self.log_interval = log_interval + self.tick = 0 super().__init__(buf) self.c_envs = binding.vec_init( @@ -38,7 +40,10 @@ def reset(self, seed=0): def step(self, actions): self.actions[:] = actions binding.vec_step(self.c_envs) - info = [binding.vec_log(self.c_envs)] + self.tick += 1 + info = [] + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): From 8d9a6c0555301ec70eb34493584e867a6dd164de Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:06:52 +0500 Subject: [PATCH 08/29] adjust distance calcs --- pufferlib/ocean/boss_fight/boss_fight.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index fbc9a4767..1d2be24b5 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -124,8 +124,9 @@ void c_reset(BossFight *env) { env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); - while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) < - 0.1) { + while (distance(env->player_x, env->player_y, env->boss_x, env->boss_y) <= + PLAYER_SIZE + PLAYER_ATTACK_RADIUS + BOSS_SIZE + + BOSS_AOE_ATTACK_RADIUS) { env->player_x = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } @@ -163,8 +164,11 @@ void c_step(BossFight *env) { bool can_dodge = env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; - bool close_enough = distance(env->player_x, env->player_y, env->boss_x, - env->boss_y) < PLAYER_ATTACK_RADIUS; + + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + bool close_enough = dist < BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE && + dist > BOSS_SIZE + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; @@ -182,8 +186,9 @@ void c_step(BossFight *env) { reward += 0.5; } - bool in_aoe_attack = distance(env->player_x, env->player_y, env->boss_x, - env->boss_y) <= BOSS_AOE_ATTACK_RADIUS; + bool in_aoe_attack = + dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS && + dist > BOSS_SIZE + PLAYER_SIZE; bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { From 8edc6055d92d95d9893b728e5b933e908845f48f Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:07:00 +0500 Subject: [PATCH 09/29] add raylib ui --- pufferlib/ocean/boss_fight/boss_fight.h | 28 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 1d2be24b5..95e6806ce 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -167,8 +167,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - bool close_enough = dist < BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE && - dist > BOSS_SIZE + PLAYER_SIZE; + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || 
fabsf(env->player_y) > ARENA_HALF_SIZE; @@ -186,9 +185,7 @@ void c_step(BossFight *env) { reward += 0.5; } - bool in_aoe_attack = - dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS && - dist > BOSS_SIZE + PLAYER_SIZE; + bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { @@ -265,6 +262,15 @@ void c_step(BossFight *env) { update_observations(env); } +int world_to_screen(float world_coord) { + return (int)((world_coord + ARENA_HALF_SIZE) / (2 * ARENA_HALF_SIZE) * + 720.0f); +} + +float radius_to_screen(float world_radius) { + return world_radius / (2 * ARENA_HALF_SIZE) * 720.0f; +} + void c_render(BossFight *env) { if (!IsWindowReady()) { InitWindow(720, 720, "BossFight"); @@ -280,7 +286,17 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - // DrawCircle(int centerX, int centerY, float radius, Color color) + DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), + radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); + + DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), + radius_to_screen(BOSS_SIZE), BOSS_COLOR); EndDrawing(); } From 8c78b11aeaff7d2c62c06b483bc66cb9597a28e3 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:10:50 +0500 Subject: [PATCH 10/29] reward for getting closer --- pufferlib/ocean/boss_fight/boss_fight.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 95e6806ce..0c62149e4 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -57,7 +57,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - // float distance; + float prev_distance; PlayerState player_state; int player_hp; @@ -131,8 +131,8 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - // env->distance = - // distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + env->prev_distance = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); update_observations(env); } @@ -167,6 +167,11 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + if (dist < env->prev_distance) { + reward += 0.5; + } + env->prev_distance = dist; + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || From 9f273df2d73fee376eba7105077ac58a139e9dcd Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 01:33:29 +0500 Subject: [PATCH 11/29] adjust rewards; speed up training --- pufferlib/config/boss_fight.ini | 4 ++-- pufferlib/ocean/boss_fight/boss_fight.h | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index fcfe697cc..c0913bb62 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,7 +5,7 @@ policy_name = Policy # 
rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 112 +num_envs = 448 num_workers = 14 batch_size = auto zero_copy = True @@ -30,7 +30,7 @@ checkpoint_interval = 200 seed = 42 # TODO: disable for sweep or speed torch_deterministic = True -device = mps +device = cpu # Optimization # TODO: try muon with 0.015 lr diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 0c62149e4..271c72676 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -42,6 +42,7 @@ typedef struct { float score; // unnormalized metric float episode_return; // sum of rewards float episode_length; // steps per episode + float wins; // episodes where boss died float n; // Required as last field } Log; @@ -168,7 +169,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.5; + reward += 0.3; } env->prev_distance = dist; @@ -177,7 +178,7 @@ void c_step(BossFight *env) { bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { - reward -= 0.5; + reward -= 1; } // can't walk out of bounds env->player_x = @@ -187,7 +188,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; - reward += 0.5; + reward += 1; } bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; @@ -195,6 +196,7 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; + reward -= 0.5; } bool killed_boss = env->boss_hp <= 0; From 828073d938960c225f2a97682a4ae410279dccf3 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:25:20 +0500 Subject: [PATCH 12/29] actually fix agent not learning --- pufferlib/ocean/boss_fight/binding.c | 1 + pufferlib/ocean/boss_fight/boss_fight.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c index b037d8c14..011e50928 100644 --- a/pufferlib/ocean/boss_fight/binding.c +++ b/pufferlib/ocean/boss_fight/binding.c @@ -12,5 +12,6 @@ static int my_log(PyObject *dict, Log *log) { assign_to_dict(dict, "score", log->score); assign_to_dict(dict, "episode_return", log->episode_return); assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "wins", log->wins); return 0; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 271c72676..f225f2272 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -87,6 +87,7 @@ void add_log(BossFight *env) { env->log.episode_return += env->episode_return; env->log.episode_length += env->tick; env->log.score += env->episode_return; + env->log.wins += (env->boss_hp <= 0) ? 
1.0f : 0.0f; env->log.n++; } @@ -169,7 +170,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.3; + reward += 0.01; // small hint, not main reward } env->prev_distance = dist; @@ -201,7 +202,7 @@ void c_step(BossFight *env) { bool killed_boss = env->boss_hp <= 0; if (killed_boss) { - reward += 2; + reward += 10; // main goal - make it big env->terminals[0] = 1; } From f3c132d5f8a543b517d7b3974491c5c11712c805 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:25:45 +0500 Subject: [PATCH 13/29] add collision; add hp bars --- AGENTS.md | 6 +++ pufferlib/ocean/boss_fight/boss_fight.h | 53 ++++++++++++++++++++----- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0712dc874..52f2b5d9a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,3 +50,9 @@ Train and check scores: ``` puffer train puffer_boss_fight --train.total-timesteps 50000 ``` + +## Eval + +``` +puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) +``` diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index f225f2272..1cf5d8c76 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -26,6 +26,7 @@ const Color BOSS_COLOR = (Color){0, 187, 187, 255}; const Color TEXT_COLOR = (Color){241, 241, 241, 241}; const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; +const Color HP_COLOR = (Color){0, 255, 0, 255}; typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; @@ -187,6 +188,17 @@ void c_step(BossFight *env) { env->player_y = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + // push player out if clipping into boss + if (dist < BOSS_SIZE + PLAYER_SIZE) { + float overlap = BOSS_SIZE + PLAYER_SIZE - dist; + float dx = env->player_x - env->boss_x; + float dy = env->player_y - env->boss_y; + env->player_x += (dx / dist) * overlap; + env->player_y += (dy / dist) * overlap; + // recalculate distance after push + dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + } + if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; reward += 1; @@ -294,17 +306,36 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(world_to_screen(env->player_x), world_to_screen(env->player_y), - radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); - - DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(world_to_screen(env->boss_x), world_to_screen(env->boss_y), - radius_to_screen(BOSS_SIZE), BOSS_COLOR); + #define HP_BAR_WIDTH 40 + #define HP_BAR_HEIGHT 5 + + // Player + int player_sx = world_to_screen(env->player_x); + int player_sy = world_to_screen(env->player_y); + int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; + int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + + DrawCircle(player_sx, player_sy, + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), HITBOX_COLOR); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), 
PLAYER_COLOR); + DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, + HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, + player_hp_width, HP_BAR_HEIGHT, HP_COLOR); + + // Boss + int boss_sx = world_to_screen(env->boss_x); + int boss_sy = world_to_screen(env->boss_y); + int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; + int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + + DrawCircle(boss_sx, boss_sy, + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), HITBOX_COLOR); + DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), BOSS_COLOR); + DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, + HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, + boss_hp_width, HP_BAR_HEIGHT, HP_COLOR); EndDrawing(); } From 20ce3fad36029f37e19adba2a8a33a5179027a40 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 02:53:16 +0500 Subject: [PATCH 14/29] add reward for dodging --- pufferlib/ocean/boss_fight/boss_fight.h | 57 +++++++++++++++---------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 1cf5d8c76..9698ef558 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,17 +14,19 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 3 +#define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 -#define BOSS_WINDUP_TICKS 18 +#define BOSS_WINDUP_TICKS 10 #define BOSS_ACTIVE_TICKS 3 -#define BOSS_RECOVERY_TICKS 12 +#define BOSS_RECOVERY_TICKS 10 +#define HP_BAR_WIDTH 40 +#define HP_BAR_HEIGHT 5 -const Color PLAYER_COLOR = (Color){187, 0, 0, 255}; +const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; -const Color TEXT_COLOR = (Color){241, 241, 241, 241}; -const Color HITBOX_COLOR = (Color){241, 241, 241, 241}; +const Color TEXT_COLOR = (Color){241, 241, 241, 255}; +const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; const Color HP_COLOR = (Color){0, 255, 0, 255}; @@ -209,7 +211,15 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 0.5; + reward -= 5; // make tanking hurt more + } + + // reward for successfully dodging an attack + bool dodged_attack = env->player_state == PLAYER_DODGING && + env->boss_state == BOSS_ATTACKING && + in_aoe_attack; + if (dodged_attack) { + reward += 2; // incentivize dodge timing } bool killed_boss = env->boss_hp <= 0; @@ -306,22 +316,17 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); - #define HP_BAR_WIDTH 40 - #define HP_BAR_HEIGHT 5 - // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + Color player_color = env->player_hp <= 0 ? 
RED : PLAYER_COLOR; DrawCircle(player_sx, player_sy, - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), HITBOX_COLOR); - DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), PLAYER_COLOR); - DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, - HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(player_sx - HP_BAR_WIDTH / 2, player_hp_bar_y, - player_hp_width, HP_BAR_HEIGHT, HP_COLOR); + radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_color); // Boss int boss_sx = world_to_screen(env->boss_x); @@ -329,13 +334,21 @@ void c_render(BossFight *env) { int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + Color boss_color = env->boss_hp <= 0 ? RED : BOSS_COLOR; DrawCircle(boss_sx, boss_sy, - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), HITBOX_COLOR); - DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), BOSS_COLOR); - DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, - HP_BAR_WIDTH, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(boss_sx - HP_BAR_WIDTH / 2, boss_hp_bar_y, - boss_hp_width, HP_BAR_HEIGHT, HP_COLOR); + radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), + HITBOX_COLOR); + DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), boss_color); + + // Player HP bar - bottom left + DrawText("Player", 20, 680, 16, TEXT_COLOR); + DrawRectangle(20, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(20, 700, player_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + + // Boss HP bar - bottom right + DrawText("Boss", 580, 680, 16, TEXT_COLOR); + DrawRectangle(580, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); + DrawRectangle(580, 700, boss_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); EndDrawing(); } From 125e492acf489841f032e7942e0e461d6b8a0529 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 03:02:39 +0500 Subject: [PATCH 15/29] show wins/losses/timeouts in ui --- pufferlib/ocean/boss_fight/boss_fight.h | 31 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 9698ef558..2255c72ed 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -74,6 +74,10 @@ typedef struct { float episode_return; // track within episode + // stats + int player_wins; + int boss_wins; + int timeouts; } BossFight; float rand_uniform(float low, float high) { @@ -223,23 +227,24 @@ void c_step(BossFight *env) { } bool killed_boss = env->boss_hp <= 0; + bool player_died = env->player_hp <= 0; + bool timed_out = env->tick >= 300; + if (killed_boss) { - reward += 10; // main goal - make it big + reward += 10; + env->terminals[0] = 1; + env->player_wins++; + } else if (player_died) { env->terminals[0] = 1; + env->boss_wins++; + } else if (timed_out) { + env->terminals[0] = 1; + env->timeouts++; } env->rewards[0] = reward; env->episode_return += reward; - bool player_died = env->player_hp <= 0; - if (player_died) { - env->terminals[0] = 1; - } - - if (env->tick >= 300) { - env->terminals[0] = 1; - } - if (env->terminals[0] == 1) { add_log(env); c_reset(env); @@ -316,6 +321,12 @@ void c_render(BossFight *env) { ClearBackground(BACKGROUND_COLOR); DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + // Stats top-right + char stats[64]; + snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", + env->player_wins, 
env->boss_wins, env->timeouts); + DrawText(stats, 580, 20, 20, TEXT_COLOR); + // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); From 5669bea91e4cd49f262d625d2b4f996b2d8646bb Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 04:23:38 +0500 Subject: [PATCH 16/29] wip: reward shaping --- pufferlib/ocean/boss_fight/boss_fight.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 2255c72ed..67caa7af7 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,7 +14,7 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 10 +#define BOSS_ATTACK_DMG 30 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 #define BOSS_WINDUP_TICKS 10 @@ -215,15 +215,14 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 5; // make tanking hurt more + reward -= 5; // make tanking hurt more } // reward for successfully dodging an attack - bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && - in_aoe_attack; + bool dodged_attack = env->player_state == PLAYER_DODGING && + env->boss_state == BOSS_ATTACKING && in_aoe_attack; if (dodged_attack) { - reward += 2; // incentivize dodge timing + reward += 5; // incentivize dodge timing } bool killed_boss = env->boss_hp <= 0; @@ -323,8 +322,8 @@ void c_render(BossFight *env) { // Stats top-right char stats[64]; - snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", - env->player_wins, env->boss_wins, env->timeouts); + snprintf(stats, sizeof(stats), "W:%d L:%d T:%d", env->player_wins, + env->boss_wins, env->timeouts); DrawText(stats, 580, 20, 20, TEXT_COLOR); // Player From 7a2438718d3be210bf70eb28e00b52a8f9ec78f4 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 10:33:55 +0500 Subject: [PATCH 17/29] runpod instructions; more live logging --- pufferlib/ocean/boss_fight/boss_fight.py | 2 +- runpod.md | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 runpod.md diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index 7f6b51667..fdb4bfb4f 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -9,7 +9,7 @@ class BossFight(pufferlib.PufferEnv): def __init__( - self, num_envs=1, render_mode=None, log_interval=128, size=5, buf=None, seed=0 + self, num_envs=1, render_mode=None, log_interval=1, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( low=-10, high=110, shape=(13,), dtype=np.float32 diff --git a/runpod.md b/runpod.md new file mode 100644 index 000000000..2ce8814ec --- /dev/null +++ b/runpod.md @@ -0,0 +1,12 @@ +curl -LsSf https://astral.sh/uv/install.sh | sh +source ~/.bashrc +git clone https://github.com/frixaco/PufferLib +cd PufferLib +git switch boss-fight +uv venv +source .venv/bin/activate +uv pip install -e . 
+python setup.py build_boss_fight --inplace --force +puffer train puffer_boss_fight --train.total-timestamps 5000000 --train.device cuda --vec.num-envs 8192 --vec.workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 + +puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 46077143ac79b5f1ab14a53d2ecad2f3f4f5a68a Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 10:41:37 +0500 Subject: [PATCH 18/29] force player to dodge more --- pufferlib/ocean/boss_fight/boss_fight.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 67caa7af7..4ed14d83f 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -14,7 +14,7 @@ #define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 30 +#define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 12 #define BOSS_WINDUP_TICKS 10 @@ -215,7 +215,7 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 5; // make tanking hurt more + reward -= 8; // make tanking expensive but survivable } // reward for successfully dodging an attack From 615afdb1e00f8d09cd911dc383ab10ea41905fc1 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 12:05:54 +0500 Subject: [PATCH 19/29] wip: agent not learning --- pufferlib/ocean/boss_fight/boss_fight.h | 80 +++++++++++++++---------- runpod.md | 2 +- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 4ed14d83f..37348fe9c 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -16,13 +16,24 @@ #define PLAYER_ATTACK_DMG 3 #define BOSS_ATTACK_DMG 10 #define BOSS_AOE_ATTACK_RADIUS 0.7f -#define BOSS_IDLE_TICKS 12 -#define BOSS_WINDUP_TICKS 10 -#define BOSS_ACTIVE_TICKS 3 -#define BOSS_RECOVERY_TICKS 10 +#define BOSS_IDLE_TICKS 7 +#define BOSS_WINDUP_TICKS 5 +#define BOSS_ACTIVE_TICKS 5 +#define BOSS_RECOVERY_TICKS 5 #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 +// Rewards +#define REWARD_APPROACH 0.01f +#define REWARD_HIT_WALL -1.0f +#define REWARD_PLAYER_HIT_BOSS 1.0f +#define REWARD_BOSS_HIT_PLAYER -2.0f +#define REWARD_DODGE_SUCCESS 1.0f +#define REWARD_KILL_BOSS 10.0f +#define REWARD_PLAYER_DIED -10.0f +#define REWARD_TIMEOUT -10.0f +#define EPISODE_LENGTH 500 + const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; const Color TEXT_COLOR = (Color){241, 241, 241, 255}; @@ -174,10 +185,19 @@ void c_step(BossFight *env) { env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; + if (wanna_attack && can_attack) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; + } + if (wanna_dodge && can_dodge) { + env->player_state_ticks = PLAYER_DODGE_TICKS; + env->player_state = PLAYER_DODGING; + } + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); if (dist < env->prev_distance) { - reward += 0.01; // small hint, not main reward + reward += REWARD_APPROACH; } env->prev_distance = dist; @@ -186,7 +206,7 @@ void c_step(BossFight *env) { bool hit_wall = 
fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { - reward -= 1; + reward += REWARD_HIT_WALL; } // can't walk out of bounds env->player_x = @@ -207,7 +227,7 @@ void c_step(BossFight *env) { if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; - reward += 1; + reward += REWARD_PLAYER_HIT_BOSS; } bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; @@ -215,28 +235,31 @@ void c_step(BossFight *env) { bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; - reward -= 8; // make tanking expensive but survivable + reward += REWARD_BOSS_HIT_PLAYER; } // reward for successfully dodging an attack bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && in_aoe_attack; + env->boss_state == BOSS_ATTACKING && in_aoe_attack && + env->boss_phase_ticks == BOSS_ACTIVE_TICKS; if (dodged_attack) { - reward += 5; // incentivize dodge timing + reward += REWARD_DODGE_SUCCESS; } bool killed_boss = env->boss_hp <= 0; bool player_died = env->player_hp <= 0; - bool timed_out = env->tick >= 300; + bool timed_out = env->tick >= EPISODE_LENGTH; if (killed_boss) { - reward += 10; + reward += REWARD_KILL_BOSS; env->terminals[0] = 1; env->player_wins++; } else if (player_died) { + reward += REWARD_PLAYER_DIED; env->terminals[0] = 1; env->boss_wins++; } else if (timed_out) { + reward += REWARD_TIMEOUT; env->terminals[0] = 1; env->timeouts++; } @@ -250,22 +273,13 @@ void c_step(BossFight *env) { return; } - if (wanna_attack && can_attack) { - env->player_state_ticks = PLAYER_ATTACK_TICKS; - env->player_state = PLAYER_ATTACKING; - } - if (wanna_dodge && can_dodge) { - env->player_state_ticks = PLAYER_DODGE_TICKS; - env->player_state = PLAYER_DODGING; - } - if (env->player_state == PLAYER_DODGING && env->player_state_ticks == 0) { - env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; - env->player_state = PLAYER_IDLING; + env->tick++; + if (env->boss_phase_ticks > 0) { + env->boss_phase_ticks--; } - if (env->player_state == PLAYER_ATTACKING && env->player_state_ticks == 0) { - env->player_state = PLAYER_IDLING; + if (env->player_state_ticks > 0) { + env->player_state_ticks--; } - if (env->boss_phase_ticks == 0) { if (env->boss_state == BOSS_IDLING) { env->boss_state = BOSS_WINDING_UP; @@ -281,13 +295,13 @@ void c_step(BossFight *env) { env->boss_phase_ticks = BOSS_IDLE_TICKS; } } - - env->tick++; - if (env->boss_phase_ticks > 0) { - env->boss_phase_ticks--; - } - if (env->player_state_ticks > 0) { - env->player_state_ticks--; + if (env->player_state_ticks == 0) { + if (env->player_state == PLAYER_DODGING) { + env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; + env->player_state = PLAYER_IDLING; + } else if (env->player_state == PLAYER_ATTACKING) { + env->player_state = PLAYER_IDLING; + } } if (env->player_dodge_cooldown > 0) { env->player_dodge_cooldown--; diff --git a/runpod.md b/runpod.md index 2ce8814ec..81859f0f9 100644 --- a/runpod.md +++ b/runpod.md @@ -7,6 +7,6 @@ uv venv source .venv/bin/activate uv pip install -e . 
python setup.py build_boss_fight --inplace --force -puffer train puffer_boss_fight --train.total-timestamps 5000000 --train.device cuda --vec.num-envs 8192 --vec.workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 +puffer train puffer_boss_fight --train.total-timesteps 5000000 --train.device cuda --vec.num-envs 8192 --vec.num-workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 8a50f7ed58eaf236862f6861a7dc42d376769595 Mon Sep 17 00:00:00 2001 From: frixaco Date: Wed, 21 Jan 2026 21:14:23 +0500 Subject: [PATCH 20/29] wip: agent not learning --- AGENTS.md | 55 +++++++++++++++++++ pufferlib/ocean/boss_fight/boss_fight.h | 57 ++++++++++---------- pufferlib/ocean/boss_fight/compile_flags.txt | 1 + 3 files changed, 85 insertions(+), 28 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 52f2b5d9a..9a339eb5c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -56,3 +56,58 @@ puffer train puffer_boss_fight --train.total-timesteps 50000 ``` puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) ``` + +## Environment + +**Gameplay**: 2D boss fight. Player moves around a 10x10 arena (-5 to +5), dodges boss AOE attacks, and attacks back. Boss is stationary at center (0,0). Tick rate: 30/sec. + +**Actions** (7 discrete): NOOP, UP, DOWN, LEFT, RIGHT, DODGE, ATTACK + +**Player States**: +- `IDLING` — can move, dodge, or attack +- `DODGING` — 6 ticks, invincible, can't act +- `ATTACKING` — 3 ticks, stationary, can't act + +**Boss Behavior** (cycles continuously): +- `IDLE` (7 ticks) — does nothing +- `WINDUP` (5 ticks) — telegraph, player should prepare to dodge +- `ACTIVE` (5 ticks) — AOE damage zone active, dodge or get hit +- `RECOVERY` (5 ticks) — safe window to attack boss + +**Rewards**: +- `+10.0` — kill boss +- `+0.5` — hit boss with attack +- `+0.5` — successfully dodge during boss attack +- `+0.05` — approach boss (distance shaping) +- `-0.01` — per-step penalty +- `-0.5` — get hit by boss (10 dmg) +- `-1.0` — hit arena wall +- `-10.0` — die +- `-10.0` — timeout + +**Episode termination**: +- Boss HP ≤ 0 (player wins) +- Player HP ≤ 0 (player dies) +- 300 ticks timeout (~10 sec) + +**Parameters**: +- Player/Boss HP: 100 +- Player attack dmg: 3, Boss AOE dmg: 10 +- Player speed: 0.1 units/tick +- Dodge: 6 ticks duration, 15 tick cooldown +- Boss cycle: IDLE(7) → WINDUP(5) → ACTIVE(5) → RECOVERY(5) = 22 ticks/cycle + +**Observations** (13 floats): +1. `boss_x - player_x` — relative X to boss +2. `boss_y - player_y` — relative Y to boss +3. `player_x` — absolute X position +4. `player_y` — absolute Y position +5. `boss_x` — boss X (always 0) +6. `boss_y` — boss Y (always 0) +7. `player_hp` — player health +8. `boss_hp` — boss health +9. `player_state` — 0=IDLING, 1=DODGING, 2=ATTACKING +10. `player_dodge_cooldown` — ticks until dodge available +11. `player_state_ticks` — ticks remaining in current state +12. `boss_state` — 0=IDLE, 1=WINDUP, 2=ATTACKING, 3=RECOVERY +13. 
`boss_phase_ticks` — ticks remaining in current boss phase diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 37348fe9c..abb0af890 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -2,19 +2,18 @@ #include #include #include -#include #define ARENA_HALF_SIZE 5.0f -#define MAX_HP 100 +#define MAX_HP 1.0f #define PLAYER_SPEED_PER_TICK 0.1f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.5f +#define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_TICKS 4 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 3 -#define BOSS_ATTACK_DMG 10 +#define PLAYER_ATTACK_DMG 0.1f +#define BOSS_ATTACK_DMG 0.05f #define BOSS_AOE_ATTACK_RADIUS 0.7f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 @@ -24,15 +23,16 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.01f -#define REWARD_HIT_WALL -1.0f -#define REWARD_PLAYER_HIT_BOSS 1.0f -#define REWARD_BOSS_HIT_PLAYER -2.0f -#define REWARD_DODGE_SUCCESS 1.0f -#define REWARD_KILL_BOSS 10.0f -#define REWARD_PLAYER_DIED -10.0f -#define REWARD_TIMEOUT -10.0f -#define EPISODE_LENGTH 500 +#define REWARD_APPROACH 0.5f +#define REWARD_HIT_WALL -0.1f +#define REWARD_PLAYER_HIT_BOSS 5.0f +#define REWARD_BOSS_HIT_PLAYER -0.5f +#define REWARD_DODGE_SUCCESS 2.0f +#define REWARD_KILL_BOSS 50.0f +#define REWARD_PLAYER_DIED -5.0f +#define REWARD_TIMEOUT -20.0f +#define REWARD_TICK -0.001f +#define EPISODE_LENGTH 300 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -75,12 +75,12 @@ typedef struct { float prev_distance; PlayerState player_state; - int player_hp; + float player_hp; int player_dodge_cooldown; int player_state_ticks; BossState boss_state; - int boss_hp; + float boss_hp; int boss_phase_ticks; float episode_return; // track within episode @@ -158,7 +158,7 @@ void c_reset(BossFight *env) { } void c_step(BossFight *env) { - float reward = -0.01; + float reward = REWARD_TICK; env->terminals[0] = 0; int action = env->actions[0]; @@ -175,8 +175,10 @@ void c_step(BossFight *env) { dx = PLAYER_SPEED_PER_TICK; } - env->player_x += dx; - env->player_y += dy; + if (env->player_state == PLAYER_IDLING) { + env->player_x += dx; + env->player_y += dy; + } bool wanna_idle = action == 0; bool wanna_dodge = action == 5; @@ -196,9 +198,7 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - if (dist < env->prev_distance) { - reward += REWARD_APPROACH; - } + reward += REWARD_APPROACH * (env->prev_distance - dist); env->prev_distance = dist; bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; @@ -238,11 +238,12 @@ void c_step(BossFight *env) { reward += REWARD_BOSS_HIT_PLAYER; } - // reward for successfully dodging an attack - bool dodged_attack = env->player_state == PLAYER_DODGING && - env->boss_state == BOSS_ATTACKING && in_aoe_attack && - env->boss_phase_ticks == BOSS_ACTIVE_TICKS; - if (dodged_attack) { + bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; + + bool successfully_dodging = + would_be_hit && env->player_state == PLAYER_DODGING; + + if (successfully_dodging) { reward += REWARD_DODGE_SUCCESS; } diff --git a/pufferlib/ocean/boss_fight/compile_flags.txt b/pufferlib/ocean/boss_fight/compile_flags.txt index ea96eb002..c6fecbb72 100644 --- a/pufferlib/ocean/boss_fight/compile_flags.txt +++ 
b/pufferlib/ocean/boss_fight/compile_flags.txt @@ -1 +1,2 @@ -I../../../raylib-5.5_macos/include +-I../../../raylib-5.5_linux_amd64/include From 286d3788a67f7646548549ee9502c9b64384c581 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 17:22:07 +0500 Subject: [PATCH 21/29] fix agent not learning --- AGENTS.md | 107 ++------------------ pufferlib/ocean/boss_fight/README.md | 128 ------------------------ pufferlib/ocean/boss_fight/boss_fight.h | 23 +++-- 3 files changed, 18 insertions(+), 240 deletions(-) delete mode 100644 pufferlib/ocean/boss_fight/README.md diff --git a/AGENTS.md b/AGENTS.md index 9a339eb5c..321f95f8d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,112 +2,17 @@ I'm implementing a RL environment using PufferLib in C + Python. -Environment spec file is in `./pufferlib/ocean/boss_fight/README.md`. +It's a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. +Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** + +The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` -### Setup - -1. Fork pufferlib, create new branch - -2. Run these: - -``` -uv venv -uv pip install -e . -``` - -3. Setup files using templates, update `environment.py` - -4. Not sure what this does yet: - -``` -python setup.py build_boss_fight --inplace --force -``` - -### Testing - -Make sure shit's running: - -``` -uv pip install -e . -python -c " -from pufferlib.ocean.boss_fight import BossFight -import numpy as np -env = BossFight(num_envs=2) -env.reset() -for _ in range(100): - env.step(np.random.randint(0, 7, size=2)) -print('ok') -env.close() -" -``` - -Train and check scores: - -``` -puffer train puffer_boss_fight --train.total-timesteps 50000 -``` - -## Eval +After modifying C files, to test you can run: ``` -puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) +python setup.py build_boss_fight --inplace --force && puffer train puffer_boss_fight --train.device cpu --vec.num-workers 8 --vec.num-envs 1024 --train.total-timesteps 5000000 && puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) ``` - -## Environment - -**Gameplay**: 2D boss fight. Player moves around a 10x10 arena (-5 to +5), dodges boss AOE attacks, and attacks back. Boss is stationary at center (0,0). Tick rate: 30/sec. 
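Since every hitbox is a circle, "collision = circles overlap" reduces to comparing the center distance against the summed radii. A minimal C sketch of that check — the `circles_overlap` helper is illustrative only; boss_fight.h inlines the same comparison via its `distance()` function:

```c
#include <stdbool.h>

// Two circles overlap when the squared center distance is no larger than the
// squared sum of their radii. Squaring both sides avoids a sqrtf call.
static bool circles_overlap(float x1, float y1, float r1,
                            float x2, float y2, float r2) {
    float dx = x2 - x1;
    float dy = y2 - y1;
    float r = r1 + r2;
    return dx * dx + dy * dy <= r * r;
}

// Example uses, mirroring the checks in c_step():
//   player body vs. boss AOE:
//     circles_overlap(px, py, PLAYER_SIZE, bx, by, BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS)
//   player melee reach vs. boss body:
//     circles_overlap(px, py, PLAYER_SIZE + PLAYER_ATTACK_RADIUS, bx, by, BOSS_SIZE)
```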
- -**Actions** (7 discrete): NOOP, UP, DOWN, LEFT, RIGHT, DODGE, ATTACK - -**Player States**: -- `IDLING` — can move, dodge, or attack -- `DODGING` — 6 ticks, invincible, can't act -- `ATTACKING` — 3 ticks, stationary, can't act - -**Boss Behavior** (cycles continuously): -- `IDLE` (7 ticks) — does nothing -- `WINDUP` (5 ticks) — telegraph, player should prepare to dodge -- `ACTIVE` (5 ticks) — AOE damage zone active, dodge or get hit -- `RECOVERY` (5 ticks) — safe window to attack boss - -**Rewards**: -- `+10.0` — kill boss -- `+0.5` — hit boss with attack -- `+0.5` — successfully dodge during boss attack -- `+0.05` — approach boss (distance shaping) -- `-0.01` — per-step penalty -- `-0.5` — get hit by boss (10 dmg) -- `-1.0` — hit arena wall -- `-10.0` — die -- `-10.0` — timeout - -**Episode termination**: -- Boss HP ≤ 0 (player wins) -- Player HP ≤ 0 (player dies) -- 300 ticks timeout (~10 sec) - -**Parameters**: -- Player/Boss HP: 100 -- Player attack dmg: 3, Boss AOE dmg: 10 -- Player speed: 0.1 units/tick -- Dodge: 6 ticks duration, 15 tick cooldown -- Boss cycle: IDLE(7) → WINDUP(5) → ACTIVE(5) → RECOVERY(5) = 22 ticks/cycle - -**Observations** (13 floats): -1. `boss_x - player_x` — relative X to boss -2. `boss_y - player_y` — relative Y to boss -3. `player_x` — absolute X position -4. `player_y` — absolute Y position -5. `boss_x` — boss X (always 0) -6. `boss_y` — boss Y (always 0) -7. `player_hp` — player health -8. `boss_hp` — boss health -9. `player_state` — 0=IDLING, 1=DODGING, 2=ATTACKING -10. `player_dodge_cooldown` — ticks until dodge available -11. `player_state_ticks` — ticks remaining in current state -12. `boss_state` — 0=IDLE, 1=WINDUP, 2=ATTACKING, 3=RECOVERY -13. `boss_phase_ticks` — ticks remaining in current boss phase diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md deleted file mode 100644 index 3d5d189e6..000000000 --- a/pufferlib/ocean/boss_fight/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# SoulsRL Minimal — RL-Focused Boss Fight Environment - -## Goal - -Build a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** - -The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). - ---- - -## Core Mechanics (Simplified) - -### Constants - -``` -Tick rate: 30 ticks/sec (dt = 1/30) -Arena: 10 x 10 units (centered at origin, so bounds are -5 to +5) - -Player: - - radius: 0.3 - - HP: 100 - - speed: 3.0 units/sec (~0.1 units/tick) - -Boss: - - radius: 0.5 - - HP: 100 - - position: fixed at (0, 0) — does not move -``` - -### Player Actions (Discrete, 7 total) - -``` -0: NOOP -1: UP -2: DOWN -3: LEFT -4: RIGHT -5: DODGE -6: ATTACK -``` - -### Player States - -``` -FREE — can move, can act -DODGE — 6 ticks, i-frames on ticks 1-5, moves at 2.5x speed in last move_dir -ATTACK — windup(4) + active(3) + recovery(6) = 13 ticks total, no movement -``` - -### Boss Behavior (Single Attack) - -Boss cycles: `IDLE → WINDUP → ACTIVE → RECOVERY → IDLE` - -``` -IDLE: 12 ticks (0.4s) — does nothing -WINDUP: 18 ticks (0.6s) — telegraphing, no damage -ACTIVE: 3 ticks (0.1s) — AOE hits -RECOVERY: 15 ticks (0.5s) — vulnerable, no damage -``` - -## RL Experiments - -Once v1 is working, design experiments to understand RL concepts: - -### Experiment Ideas - -**Observation Ablations** — Which observations actually matter? - -- What happens if the agent can't see timing information? 
-- Does it need absolute position, or is relative enough? -- What's the minimum viable observation space? -- Can the network learn to ignore irrelevant/noisy inputs? - -**Reward Shaping** — How does reward design affect behavior? - -- What if you only reward winning/losing (sparse)? -- What happens without a time penalty? -- Can you incentivize specific behaviors (dodging at the right time)? -- What unintended behaviors might reward bonuses create? - -**Hyperparameters** — See `boss_fight.ini` for the sweep config - -- Learning rate: stability vs speed -- Entropy coefficient: exploration vs exploitation -- Batch size / num_envs: sample efficiency -- Network size: capacity vs overfitting - ---- - -## Success Criteria - -1. **Baseline works**: Random agent wins ~0%, trained agent wins >80% -2. **Learned timing**: Agent dodges during WINDUP, not randomly -3. **Learned punish**: Agent attacks during RECOVERY, not during ACTIVE -4. **Experiments complete**: At least 3 ablations run with plotted comparisons - ---- - -## Optional Extensions (After Experiments) - -Only add these if baseline experiments are done: - -1. **Sweep attack**: Cone hitbox, tests directional dodging -2. **Boss movement**: Slow drift toward player -3. **Combo attack**: Multi-hit sequence, tests dodge timing -4. **ASCII rendering**: For debugging/demo -5. **Curriculum**: Start with longer windup, tighten over training - ---- - -## Deliverables - -1. `boss_fight.h` — Core game logic in C -2. `boss_fight.c` — Standalone test binary -3. `boss_fight.py` — PufferLib environment wrapper -4. `experiments/` — Saved runs with different configs -5. `results.md` — Summary of what you learned from experiments - ---- - -## Milestones - -1. **Environment works**: `c_step()` implemented, can play manually with keyboard -2. **Random baseline**: Random agent wins ~0%, confirms game is non-trivial -3. **Learning signal**: Trained agent shows improvement over random -4. **Competent agent**: Win rate >80% -5. 
**Experiments**: At least 3 ablations with documented findings diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index abb0af890..3a3f0fd5e 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -23,15 +23,15 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.5f +#define REWARD_APPROACH 0.1f #define REWARD_HIT_WALL -0.1f -#define REWARD_PLAYER_HIT_BOSS 5.0f -#define REWARD_BOSS_HIT_PLAYER -0.5f -#define REWARD_DODGE_SUCCESS 2.0f -#define REWARD_KILL_BOSS 50.0f -#define REWARD_PLAYER_DIED -5.0f -#define REWARD_TIMEOUT -20.0f -#define REWARD_TICK -0.001f +#define REWARD_PLAYER_HIT_BOSS 0.4f +#define REWARD_BOSS_HIT_PLAYER -0.35f +#define REWARD_DODGE_SUCCESS 0.0f +#define REWARD_KILL_BOSS 1.0f +#define REWARD_PLAYER_DIED -1.0f +#define REWARD_TIMEOUT -1.0f +#define REWARD_TICK -0.01f #define EPISODE_LENGTH 300 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; @@ -240,10 +240,11 @@ void c_step(BossFight *env) { bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; - bool successfully_dodging = - would_be_hit && env->player_state == PLAYER_DODGING; + bool started_successful_dodge = would_be_hit && + env->player_state == PLAYER_DODGING && + env->player_state_ticks == PLAYER_DODGE_TICKS; - if (successfully_dodging) { + if (started_successful_dodge) { reward += REWARD_DODGE_SUCCESS; } From 3fcaf2ceef83f7d635db4a414f2af69361804d15 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 19:09:16 +0500 Subject: [PATCH 22/29] tune numbers --- pufferlib/ocean/boss_fight/boss_fight.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 3a3f0fd5e..6667810e3 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -5,16 +5,16 @@ #define ARENA_HALF_SIZE 5.0f #define MAX_HP 1.0f -#define PLAYER_SPEED_PER_TICK 0.1f +#define PLAYER_SPEED_PER_TICK 0.25f #define PLAYER_SIZE 0.3f #define BOSS_SIZE 0.5f #define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 4 +#define PLAYER_DODGE_TICKS 6 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 0.1f -#define BOSS_ATTACK_DMG 0.05f -#define BOSS_AOE_ATTACK_RADIUS 0.7f +#define PLAYER_ATTACK_DMG 0.02f +#define BOSS_ATTACK_DMG 0.15f +#define BOSS_AOE_ATTACK_RADIUS 0.8f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 #define BOSS_ACTIVE_TICKS 5 @@ -23,11 +23,11 @@ #define HP_BAR_HEIGHT 5 // Rewards -#define REWARD_APPROACH 0.1f -#define REWARD_HIT_WALL -0.1f -#define REWARD_PLAYER_HIT_BOSS 0.4f -#define REWARD_BOSS_HIT_PLAYER -0.35f -#define REWARD_DODGE_SUCCESS 0.0f +#define REWARD_APPROACH 0.05f +#define REWARD_HIT_WALL -0.05f +#define REWARD_PLAYER_HIT_BOSS 0.07f +#define REWARD_BOSS_HIT_PLAYER -0.05f +#define REWARD_DODGE_SUCCESS 0.07f #define REWARD_KILL_BOSS 1.0f #define REWARD_PLAYER_DIED -1.0f #define REWARD_TIMEOUT -1.0f From 04ca78e404c7ff9fefc77208173cff208c2b2a09 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 22:32:06 +0500 Subject: [PATCH 23/29] tune numbers even more; good results --- pufferlib/ocean/boss_fight/boss_fight.h | 76 ++++++++++++++++++------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 6667810e3..55799cf94 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ 
b/pufferlib/ocean/boss_fight/boss_fight.h @@ -10,9 +10,11 @@ #define BOSS_SIZE 0.5f #define PLAYER_ATTACK_RADIUS 0.4f #define PLAYER_ATTACK_TICKS 3 -#define PLAYER_DODGE_TICKS 6 +#define PLAYER_DODGE_TICKS 4 +#define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_ATTACK_DMG 0.02f +#define PLAYER_DODGE_SPEED_PER_TICK 0.35f +#define PLAYER_ATTACK_DMG 0.05f #define BOSS_ATTACK_DMG 0.15f #define BOSS_AOE_ATTACK_RADIUS 0.8f #define BOSS_IDLE_TICKS 7 @@ -32,7 +34,7 @@ #define REWARD_PLAYER_DIED -1.0f #define REWARD_TIMEOUT -1.0f #define REWARD_TICK -0.01f -#define EPISODE_LENGTH 300 +#define EPISODE_LENGTH 600 const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; const Color BOSS_COLOR = (Color){0, 187, 187, 255}; @@ -78,6 +80,7 @@ typedef struct { float player_hp; int player_dodge_cooldown; int player_state_ticks; + int dodge_escape_pending; BossState boss_state; float boss_hp; @@ -137,6 +140,7 @@ void c_reset(BossFight *env) { env->player_state = PLAYER_IDLING; env->player_dodge_cooldown = 0; env->player_state_ticks = 0; + env->dodge_escape_pending = 0; env->boss_state = BOSS_IDLING; env->boss_phase_ticks = BOSS_IDLE_TICKS; env->episode_return = 0; @@ -184,38 +188,59 @@ void c_step(BossFight *env) { bool wanna_dodge = action == 5; bool wanna_attack = action == 6; bool can_dodge = - env->player_state == PLAYER_IDLING && env->player_dodge_cooldown == 0; + env->player_state != PLAYER_DODGING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; if (wanna_attack && can_attack) { env->player_state_ticks = PLAYER_ATTACK_TICKS; env->player_state = PLAYER_ATTACKING; } + + float aoe_dist = BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; + bool boss_threatening = + env->boss_state == BOSS_WINDING_UP || env->boss_state == BOSS_ATTACKING; + + float pre_dodge_dist = 0.0f; if (wanna_dodge && can_dodge) { + pre_dodge_dist = + distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + env->dodge_escape_pending = + boss_threatening && pre_dodge_dist <= aoe_dist ? 
1 : 0; + env->player_state_ticks = PLAYER_DODGE_TICKS; env->player_state = PLAYER_DODGING; } - float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - - reward += REWARD_APPROACH * (env->prev_distance - dist); - env->prev_distance = dist; - - bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; + // Dodge = multi-tick movement out of the AOE (no i-frames) + if (env->player_state == PLAYER_DODGING) { + float away_x = env->player_x - env->boss_x; + float away_y = env->player_y - env->boss_y; + float away_norm = sqrtf(away_x * away_x + away_y * away_y); + if (away_norm > 1e-6f) { + env->player_x += (away_x / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; + env->player_y += (away_y / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; + } + } bool hit_wall = fabsf(env->player_x) > ARENA_HALF_SIZE || fabsf(env->player_y) > ARENA_HALF_SIZE; if (hit_wall) { reward += REWARD_HIT_WALL; } + // can't walk out of bounds env->player_x = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); env->player_y = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_y)); + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + + reward += REWARD_APPROACH * (env->prev_distance - dist); + env->prev_distance = dist; + // push player out if clipping into boss - if (dist < BOSS_SIZE + PLAYER_SIZE) { + if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; @@ -225,27 +250,35 @@ void c_step(BossFight *env) { dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); } + bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; + if (wanna_attack && can_attack && close_enough) { env->boss_hp -= PLAYER_ATTACK_DMG; reward += REWARD_PLAYER_HIT_BOSS; } - bool in_aoe_attack = dist <= BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; - bool boss_can_hit = env->player_state != PLAYER_DODGING && in_aoe_attack; + bool in_aoe_attack = dist <= aoe_dist; + bool player_iframed = + env->player_state == PLAYER_DODGING && + env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + + // Souls-like: you can i-frame briefly, but the AOE persists longer than the + // i-frame window; if you're still in the hitbox after i-frames, you get hit. 
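  // Worked example of the i-frame window (values taken from the defines above,
  // not new behavior): a dodge starts with player_state_ticks == PLAYER_DODGE_TICKS (4),
  // and player_iframed requires ticks > PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS (2),
  // so only the first two dodge ticks (4, then 3) are invulnerable. The AOE stays
  // ACTIVE for BOSS_ACTIVE_TICKS (5), so a dodge that never leaves the radius is
  // still hit once the i-frames expire.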
+ bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { env->player_hp -= BOSS_ATTACK_DMG; reward += REWARD_BOSS_HIT_PLAYER; } - bool would_be_hit = env->boss_state == BOSS_ATTACKING && in_aoe_attack; - - bool started_successful_dodge = would_be_hit && - env->player_state == PLAYER_DODGING && - env->player_state_ticks == PLAYER_DODGE_TICKS; - - if (started_successful_dodge) { - reward += REWARD_DODGE_SUCCESS; + // Reward dodges that actually exit the AOE during the danger window + if (env->dodge_escape_pending) { + if (!boss_threatening) { + env->dodge_escape_pending = 0; + } else if (dist > aoe_dist) { + reward += REWARD_DODGE_SUCCESS; + env->dodge_escape_pending = 0; + } } bool killed_boss = env->boss_hp <= 0; @@ -301,6 +334,7 @@ void c_step(BossFight *env) { if (env->player_state == PLAYER_DODGING) { env->player_dodge_cooldown = PLAYER_DODGE_COOLDOWN; env->player_state = PLAYER_IDLING; + env->dodge_escape_pending = 0; } else if (env->player_state == PLAYER_ATTACKING) { env->player_state = PLAYER_IDLING; } From 6ca0be54adcc68503b361d50962c46dbf86fe0b0 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 22:37:26 +0500 Subject: [PATCH 24/29] cleanup --- AGENTS.md | 18 - LEARN_TODO.md | 259 ----- learn-pufferlib.py | 1175 ----------------------- pufferlib/ocean/boss_fight/boss_fight.h | 22 +- runpod.md | 12 - 5 files changed, 9 insertions(+), 1477 deletions(-) delete mode 100644 AGENTS.md delete mode 100644 LEARN_TODO.md delete mode 100644 learn-pufferlib.py delete mode 100644 runpod.md diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 321f95f8d..000000000 --- a/AGENTS.md +++ /dev/null @@ -1,18 +0,0 @@ -# BossFight Reinforcement Learning project - -I'm implementing a RL environment using PufferLib in C + Python. - -It's a **minimal** 2D boss fight environment to learn RL concepts with PufferLib. -Focus: **observation design, reward shaping, training experiments and a bit of game dev using Raylib** - -The boss has **1 attack** (AOE burst). All hitboxes are circles (collision = circles overlap). - -You are in PufferLib's (puffer.ai) source repository which contains "Ocean" - a collection of environments. - -The environment code I'm working on is located in `./pufferlib/ocean/boss_fight/`. Environment configuration is in `./pufferlib/config/boss_fight.ini` - -After modifying C files, to test you can run: - -``` -python setup.py build_boss_fight --inplace --force && puffer train puffer_boss_fight --train.device cpu --vec.num-workers 8 --vec.num-envs 1024 --train.total-timesteps 5000000 && puffer eval puffer_boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight_*/model_*.pt | head -1) -``` diff --git a/LEARN_TODO.md b/LEARN_TODO.md deleted file mode 100644 index 35c99eebc..000000000 --- a/LEARN_TODO.md +++ /dev/null @@ -1,259 +0,0 @@ -# Learning TODO: RL Foundations - -Everything you need to understand `bptt_horizon` and RL training in general. - ---- - -## Level 1: Basic ML Concepts - -### 1.1 What is a Neural Network? -- Function that takes numbers in, spits numbers out -- Has "weights" (parameters) that get adjusted during training -- `input → [neural network] → output` - -### 1.2 What is Training / Learning? -- Adjusting weights so the network gives better outputs -- Done by computing "loss" (how wrong it was) and updating weights to reduce loss - -### 1.3 What is Backpropagation? 
-- Algorithm to figure out HOW to adjust each weight -- Flows backwards through the network: output → hidden layers → input -- "If the output was wrong, which weights were responsible?" - -### 1.4 What is a Batch? -- Group of training examples processed together -- Instead of: train on example 1, then example 2, then example 3... -- Do: train on [example 1, 2, 3, 4, 5] at once -- Why? Faster (GPU parallelism) + more stable learning - -### 1.5 What is Minibatch? -- When your batch is too big for GPU memory -- Split batch into smaller "minibatches" -- `batch_size = 1024, minibatch_size = 256` → 4 gradient updates per batch - ---- - -## Level 2: RL Basics - -### 2.1 What is a Timestep? -- One tick of the game/simulation -- Agent observes state → takes action → gets reward → new state -- `t=0: see game → press button → get +1 point → game changes` - -### 2.2 What is an Episode? -- One complete playthrough from start to end -- Boss fight: episode = one full fight (win or lose) -- `[spawn] → step → step → step → ... → [death or victory]` - -``` -Episode 1: t0 → t1 → t2 → t3 → DEAD (4 steps) -Episode 2: t0 → t1 → t2 → t3 → t4 → t5 → WIN (6 steps) -``` - -### 2.3 What is an Observation? -- What the agent "sees" at each timestep -- Your boss_fight: 14 numbers (player pos, boss HP, etc.) - -### 2.4 What is a Policy? -- The neural network that decides actions -- `observation (14 floats) → [policy network] → action (0-6)` -- Training = making this network choose better actions - -### 2.5 What is a Value Function? -- Predicts "how good is this situation?" -- "I have full HP, boss is low" → high value -- "I'm almost dead, boss is full HP" → low value -- Helps the agent learn which states to aim for - ---- - -## Level 3: How RL Training Works - -### 3.1 Collect Experience -``` -Run 56 environments in parallel: - Env 1: obs → action → reward → obs → action → reward → ... - Env 2: obs → action → reward → obs → action → reward → ... - ... - Env 56: obs → action → reward → obs → action → reward → ... - -After N steps, you have a "batch" of experience -``` - -### 3.2 Compute Advantages -- "Was this action better or worse than expected?" -- `advantage = actual_reward - predicted_value` -- Positive advantage → reinforce this action -- Negative advantage → discourage this action - -### 3.3 Update the Network -- Use collected experience to adjust policy weights -- Make good actions more likely, bad actions less likely - -### 3.4 Repeat -``` -while not done: - 1. Collect batch of experience (many timesteps) - 2. Compute advantages - 3. Update network with minibatches - 4. Go to 1 -``` - ---- - -## Level 4: Sequential Data & Memory - -### 4.1 Why Sequence Matters -In games, the PAST affects what you should do NOW: - -``` -Timestep 1: Boss starts wind-up animation -Timestep 2: Boss still winding up -Timestep 3: Boss about to attack! ← YOU SHOULD DODGE NOW -Timestep 4: Boss attacks - -If you only see timestep 3 in isolation, you might not know to dodge. -But if you saw timesteps 1-2-3 together, you'd see the pattern. 
-``` - -### 4.2 MLP (Multi-Layer Perceptron) — No Memory -- Standard neural network -- Only sees CURRENT observation -- `obs_t → [MLP] → action` -- No memory of previous timesteps -- Fine if observation contains all needed info - -### 4.3 RNN (Recurrent Neural Network) — Has Memory -- Sees current observation + remembers past -- `obs_t + memory → [RNN] → action + updated_memory` -- Can learn patterns over time -- Types: LSTM, GRU (different memory mechanisms) - -``` -MLP: sees [___] [___] [_X_] ← only current frame -RNN: sees [_X_] [_X_] [_X_] ← current + memory of past -``` - -### 4.4 When Do You Need RNN? -- When current observation is INCOMPLETE -- Example: "Boss is standing still" — is he about to attack or recovering? -- If your observation includes `boss_phase` and `time_to_damage`, MLP might be enough -- If observation only has positions, RNN helps learn timing - ---- - -## Level 5: BPTT (Backpropagation Through Time) - -### 5.1 The Problem -RNN has memory that flows through time: - -``` -t1 → t2 → t3 → t4 → t5 → t6 → ... → t1000 - -To train RNN, backprop must flow backwards through ALL these connections. -1000 timesteps = 1000 layers of backprop = VERY slow, uses tons of memory -``` - -### 5.2 The Solution: Truncated BPTT -Don't backprop through entire episode. Cut it into chunks: - -``` -Episode: [t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12] - -bptt_horizon = 4: - -Chunk 1: [t1 → t2 → t3 → t4] ← backprop only through these 4 -Chunk 2: [t5 → t6 → t7 → t8] ← backprop only through these 4 -Chunk 3: [t9 → t10 → t11 → t12] ← backprop only through these 4 -``` - -### 5.3 What bptt_horizon Controls -``` -bptt_horizon = 16 means: -- RNN sees 16 consecutive timesteps during training -- Gradients flow back through 16 steps max -- RNN can learn patterns up to ~16 steps long -``` - -### 5.4 Trade-offs -``` -Small horizon (8): - ✓ Fast, low memory - ✗ RNN can't learn long patterns (>8 steps) - -Large horizon (128): - ✓ RNN learns longer patterns - ✗ Slow, high memory usage -``` - ---- - -## Level 6: Putting It Together - -### 6.1 The Batch Math -``` -num_envs = 56 (parallel environments) -bptt_horizon = 16 (timesteps per chunk) - -batch_size = num_envs × bptt_horizon - = 56 × 16 - = 896 total samples per training batch -``` - -### 6.2 Why minibatch_size Must Be ≤ batch_size -``` -batch_size = 896 (you collected 896 samples) -minibatch_size = 2048 (you want to train on 2048 at a time) - -ERROR: Can't take 2048 samples from a pile of 896! - -Fix: minibatch_size = 256 or 512 (smaller than 896) -``` - -### 6.3 For Your Boss Fight (No RNN) -You're using MLP, so `bptt_horizon` just affects batch math: - -```ini -[vec] -num_envs = 56 - -[train] -bptt_horizon = 16 # 56 × 16 = 896 batch -minibatch_size = 256 # Must be ≤ 896 -``` - -Or increase horizon if you want bigger batches: - -```ini -bptt_horizon = 64 # 56 × 64 = 3584 batch -minibatch_size = 2048 # Now this works -``` - ---- - -## Summary: What You Actually Need to Know - -1. **batch_size** = total samples collected before training -2. **minibatch_size** = chunk size for each gradient update (must be ≤ batch_size) -3. **bptt_horizon** = consecutive timesteps kept together - - For RNN: determines how far back it can learn patterns - - For MLP: just affects batch_size math -4. 
**Your boss_fight uses MLP** — bptt_horizon is just a number to make the math work - ---- - -## Learning Resources - -### Videos (start here) -- [ ] 3Blue1Brown: "Neural Networks" series (YouTube) -- [ ] Mutual Information: "Reinforcement Learning" series (YouTube) - -### Interactive -- [ ] Andrej Karpathy: "Neural Networks: Zero to Hero" (YouTube + code) - -### Reading -- [ ] Spinning Up in Deep RL (OpenAI) — https://spinningup.openai.com -- [ ] CleanRL documentation — similar to PufferLib - -### Hands-on -- [ ] Train boss_fight, watch the numbers, build intuition diff --git a/learn-pufferlib.py b/learn-pufferlib.py deleted file mode 100644 index 4091fb6fb..000000000 --- a/learn-pufferlib.py +++ /dev/null @@ -1,1175 +0,0 @@ -""" -LEARN_V2.PY - RL with PufferLib (The Right Way) -================================================ - -PURPOSE: Learn reinforcement learning using PufferLib's patterns and infrastructure. - -This is the "full PufferLib" version of learn.py. Instead of implementing PPO -from scratch, we use PufferLib's pufferl.PuffeRL trainer which handles: -- Rollout collection -- GAE advantage computation -- PPO loss calculation -- Gradient updates -- Logging and metrics - -HOW TO USE: -1. Read each section's comments (the WHY and WHAT) -2. Fill in the TODO sections -3. Run and test after each section: python learn_v2.py -4. Only move to next section when current one works - -The environment is the same as learn.py: -- 2D arena where an agent must reach a target -- Agent can move UP/DOWN/LEFT/RIGHT or stay still -- Episode ends when: agent reaches target, hits wall, or 200 steps pass - -DEPENDENCIES: - pip install pufferlib torch numpy gymnasium -""" - -import os -import numpy as np -import gymnasium -import torch -import torch.nn as nn -import pufferlib -import pufferlib.vector -import pufferlib.pytorch -from pufferlib import pufferl - - -# ============================================================================= -# SECTION 1: PUFFERLIB ENVIRONMENT -# ============================================================================= -""" -WHY inherit from pufferlib.PufferEnv? -------------------------------------- -PufferLib provides optimized environment vectorization. When you inherit from -PufferEnv, you get: - -1. AUTOMATIC BUFFER MANAGEMENT: PufferLib creates shared memory buffers for - observations, rewards, terminals, truncations. You just write to them. - -2. MULTI-AGENT SUPPORT: The same pattern works for 1 agent or 100 agents. - You define `num_agents` and PufferLib handles the rest. - -3. VECTORIZATION COMPATIBILITY: Your env works with pufferlib.vector.make() - which can run multiple copies in parallel (Serial or Multiprocessing). - -KEY DIFFERENCES from Gymnasium: -------------------------------- -- Define `single_observation_space` and `single_action_space` (not plural) -- Set `self.num_agents` (1 for single-agent) -- Call `super().__init__(buf)` which creates self.observations, self.rewards, etc. -- Update arrays IN-PLACE: `self.observations[:] = ...` not `return obs` -- reset() and step() still return values, but also update internal buffers -""" - - -class MoveToTargetEnv(pufferlib.PufferEnv): - """ - A simple environment where an agent navigates to a target position. - - This is identical to learn.py's MoveToTargetEnv, but adapted to PufferLib's - patterns. The game logic is the same, only the interface changes. 
- - GAME RULES: - - Agent starts at random position in [-0.8, 0.8] x [-0.8, 0.8] - - Target is at random position (at least 0.3 units away from agent) - - Agent can: NOOP (0), UP (1), DOWN (2), LEFT (3), RIGHT (4) - - Episode ends when: agent reaches target, hits wall (|x|>1 or |y|>1), or 200 steps - - Reward: -0.01/step + distance shaping + terminal bonuses - """ - - # Type hints for attributes created by super().__init__() - observations: np.ndarray - rewards: np.ndarray - terminals: np.ndarray - truncations: np.ndarray - - def __init__(self, buf=None, seed=0): - """ - WHY these parameters? - --------------------- - - buf: Optional shared memory buffer from PufferLib's vectorization. - When running multiple envs, they share memory for efficiency. - If None, PufferLib creates a buffer automatically. - - - seed: Random seed for reproducibility. Essential for debugging! - - WHAT to do in __init__: - 1. Define single_observation_space (what ONE agent sees) - 2. Define single_action_space (what actions ONE agent can take) - 3. Set self.num_agents (1 for single-agent env) - 4. Call super().__init__(buf) - THIS CREATES self.observations, etc. - 5. Initialize game state variables - 6. Set up random number generator - """ - # ----------------------------------------------------------------- - # TODO 1.1: Define the observation space - # ----------------------------------------------------------------- - # WHAT the agent sees: [agent_x, agent_y, target_x, target_y, dx, dy] - # - Positions are in [-1, 1] (arena bounds) - # - dx, dy (direction to target) can be in [-2, 2] - # - # WHY "single_observation_space" not "observation_space"? - # PufferLib distinguishes single-agent spaces from joint spaces. - # For multi-agent, observation_space would be (num_agents, obs_dim). - # We define the SINGLE agent's view, PufferLib handles batching. - # - # YOUR CODE: Create self.single_observation_space as gymnasium.spaces.Box - # Hint: Box(low=-2.0, high=2.0, shape=(6,), dtype=np.float32) - - self.single_observation_space = gymnasium.spaces.Box( - low=-2.0, high=2.0, shape=(6,), dtype=np.float32 - ) - - # ----------------------------------------------------------------- - # TODO 1.2: Define the action space - # ----------------------------------------------------------------- - # WHAT actions are available: 0=NOOP, 1=UP, 2=DOWN, 3=LEFT, 4=RIGHT - # - # YOUR CODE: Create self.single_action_space as gymnasium.spaces.Discrete(5) - - self.single_action_space = gymnasium.spaces.Discrete(5) - - # ----------------------------------------------------------------- - # TODO 1.3: Set the number of agents - # ----------------------------------------------------------------- - # For single-agent environments, num_agents = 1. - # PufferLib uses this to allocate the right buffer sizes. - # - # YOUR CODE: Set self.num_agents = 1 - - self.num_agents = 1 - - # ----------------------------------------------------------------- - # CRITICAL: Call super().__init__(buf) - # ----------------------------------------------------------------- - # This MUST come after defining spaces and num_agents! - # It creates: - # - self.observations: array of shape (num_agents, *obs_shape) - # - self.rewards: array of shape (num_agents,) - # - self.terminals: array of shape (num_agents,) - # - self.truncations: array of shape (num_agents,) - # - # These are the buffers you'll update in reset() and step(). 
- super().__init__(buf) - - # ----------------------------------------------------------------- - # TODO 1.4: Initialize game state variables - # ----------------------------------------------------------------- - # Track the actual game state (not observations, those are derived). - # For single-agent, these are simple arrays of shape (2,) for positions. - # - # WHAT to initialize: - # - self.agent_pos: np.zeros(2, dtype=np.float32) - agent's [x, y] - # - self.target_pos: np.zeros(2, dtype=np.float32) - target's [x, y] - # - self.tick: 0 - step counter within episode - # - # Also initialize constants: - # - self.max_steps = 200 - # - self.target_radius = 0.1 (how close to count as "reached") - # - self.move_speed = 0.05 (movement per action) - # - self.arena_size = 1.0 (arena is [-1, 1] x [-1, 1]) - # - # YOUR CODE: Initialize game state - - self.agent_pos = np.zeros(2, dtype=np.float32) - self.target_pos = np.zeros(2, dtype=np.float32) - self.tick = 0 - - self.max_steps = 200 - self.target_radius = 0.1 - self.move_speed = 0.05 - self.arena_size = 1.0 - - # Set up random number generator for reproducibility - self.rng = np.random.default_rng(seed=seed) - - # Track previous distance for reward shaping - self.prev_dist = 0.0 - - def reset(self, seed=None): - """ - WHY reset()? - ------------ - Start a fresh episode. Called at the beginning and after each episode ends. - - WHAT to do: - 1. Randomize agent position - 2. Randomize target position (not too close to agent!) - 3. Reset step counter - 4. Compute initial distance (for reward shaping) - 5. Fill self.observations[:] with initial state - - WHY update self.observations[:] in-place? - PufferLib uses shared memory buffers. By updating in-place, we avoid - copying data. The [:] syntax means "update the existing array contents". 
- - RETURNS: - - self.observations: the observation buffer (now filled with initial state) - - []: empty list of infos (PufferLib expects a list) - """ - # ----------------------------------------------------------------- - # TODO 2.1: Implement reset() - # ----------------------------------------------------------------- - # Step 1: Randomize agent position - # self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 2: Randomize target position - # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 3: Ensure target is far enough from agent (at least 0.3 units) - # while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: - # self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - # - # Step 4: Reset step counter - # self.tick = 0 - # - # Step 5: Compute initial distance - # self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) - # - # Step 6: Fill observations buffer - # self.observations[0, 0] = self.agent_pos[0] # agent_x - # self.observations[0, 1] = self.agent_pos[1] # agent_y - # self.observations[0, 2] = self.target_pos[0] # target_x - # self.observations[0, 3] = self.target_pos[1] # target_y - # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] # dx - # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] # dy - # - # Note: We index [0, :] because num_agents=1, so observations has shape (1, 6) - # - # YOUR CODE: - - self.agent_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - - while np.linalg.norm(self.agent_pos - self.target_pos) < 0.3: - self.target_pos[:] = self.rng.uniform(-0.8, 0.8, size=2) - - self.tick = 0 - - self.prev_dist = np.linalg.norm(self.agent_pos - self.target_pos) - - self.observations[0, 0] = self.agent_pos[0] - self.observations[0, 1] = self.agent_pos[1] - self.observations[0, 2] = self.target_pos[0] - self.observations[0, 3] = self.target_pos[1] - self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - - return self.observations, [] - - def step(self, actions): - """ - WHY step()? - ----------- - The core game loop. Called every timestep with the agent's chosen action. - - WHAT to do: - 1. Apply the action (move agent) - 2. Compute reward (time penalty + distance shaping + terminal bonus) - 3. Check terminal conditions (reached target? hit wall? timeout?) - 4. Update buffers (observations, rewards, terminals, truncations) - 5. 
Auto-reset if episode ended - - PARAMETERS: - - actions: numpy array of shape (num_agents,) = (1,) for us - Each value is an integer 0-4 - - RETURNS: - - self.observations: updated observation buffer - - self.rewards: updated reward buffer - - self.terminals: updated terminal buffer - - self.truncations: updated truncation buffer - - infos: list of dicts with episode stats for finished episodes - """ - # ----------------------------------------------------------------- - # TODO 2.2: Implement step() - # ----------------------------------------------------------------- - # Step 1: Get the action (we only have 1 agent) - # action = actions[0] - # - # Step 2: Convert action to movement - # dx, dy = 0.0, 0.0 - # if action == 1: dy = self.move_speed # UP - # elif action == 2: dy = -self.move_speed # DOWN - # elif action == 3: dx = -self.move_speed # LEFT - # elif action == 4: dx = self.move_speed # RIGHT - # - # Step 3: Apply movement - # self.agent_pos[0] += dx - # self.agent_pos[1] += dy - # self.tick += 1 - # - # Step 4: Compute distance and rewards - # distance = np.linalg.norm(self.agent_pos - self.target_pos) - # reward = -0.01 # Time penalty - # reward += 2.0 * (self.prev_dist - distance) # Distance shaping - # self.prev_dist = distance - # - # Step 5: Check terminal conditions - # reached_target = distance < self.target_radius - # hit_wall = (abs(self.agent_pos[0]) > self.arena_size or - # abs(self.agent_pos[1]) > self.arena_size) - # timed_out = self.tick >= self.max_steps - # - # Step 6: Apply terminal rewards - # if reached_target: reward += 1.0 - # if hit_wall: reward -= 0.5 - # - # Step 7: Set terminal and truncation flags - # terminal = reached_target or hit_wall - # truncation = timed_out and not terminal - # - # Step 8: Update buffers - # self.rewards[0] = reward - # self.terminals[0] = terminal - # self.truncations[0] = truncation - # - # Step 9: Build info dict for finished episodes - # infos = [] - # if terminal or truncation: - # infos.append({ - # 'episode_length': self.tick, - # 'reached_target': reached_target, - # 'hit_wall': hit_wall, - # 'reward': reward, - # }) - # # Auto-reset for next episode - # self.reset() - # - # Step 10: Update observations (whether reset or not) - # self.observations[0, 0] = self.agent_pos[0] - # self.observations[0, 1] = self.agent_pos[1] - # self.observations[0, 2] = self.target_pos[0] - # self.observations[0, 3] = self.target_pos[1] - # self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - # self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - # - # YOUR CODE: - - action = actions[0] - - dx, dy = 0.0, 0.0 - if action == 1: - dy = self.move_speed - elif action == 2: - dy = -self.move_speed # DOWN - elif action == 3: - dx = -self.move_speed # LEFT - elif action == 4: - dx = self.move_speed # RIGHT - - self.agent_pos[0] += dx - self.agent_pos[1] += dy - self.tick += 1 - - distance = np.linalg.norm(self.target_pos - self.agent_pos) - reward = -0.01 - reward += 2 * (self.prev_dist - distance) - self.prev_dist = distance - - reached_target = distance < self.target_radius - hit_wall = ( - abs(self.agent_pos[0]) > self.arena_size - or abs(self.agent_pos[1]) > self.arena_size - ) - timed_out = self.tick >= self.max_steps - - if reached_target: - reward += 1.0 - if hit_wall: - reward -= 0.5 - - terminal = reached_target or hit_wall - truncation = timed_out and not terminal - - self.rewards[0] = reward - self.terminals[0] = terminal - self.truncations[0] = truncation - - infos = [] - if terminal or truncation: - 
infos.append( - { - "episode_length": self.tick, - "reached_target": reached_target, - "hit_wall": hit_wall, - "reward": reward, - } - ) - self.reset() - - self.observations[0, 0] = self.agent_pos[0] - self.observations[0, 1] = self.agent_pos[1] - self.observations[0, 2] = self.target_pos[0] - self.observations[0, 3] = self.target_pos[1] - self.observations[0, 4] = self.target_pos[0] - self.agent_pos[0] - self.observations[0, 5] = self.target_pos[1] - self.agent_pos[1] - - return self.observations, self.rewards, self.terminals, self.truncations, infos - - def render(self): - """ - Simple ASCII rendering for debugging. - Shows a 20x20 grid with agent (A) and target (T). - """ - grid_size = 20 - grid = [["." for _ in range(grid_size)] for _ in range(grid_size)] - - # Convert positions from [-1, 1] to grid indices [0, grid_size-1] - def to_grid(pos): - x = int((pos[0] + 1) / 2 * (grid_size - 1)) - y = int((1 - (pos[1] + 1) / 2) * (grid_size - 1)) # Flip y for display - return max(0, min(grid_size - 1, x)), max(0, min(grid_size - 1, y)) - - tx, ty = to_grid(self.target_pos) - ax, ay = to_grid(self.agent_pos) - - grid[ty][tx] = "T" - grid[ay][ax] = "A" - - print(f"\nStep {self.tick}:") - print("+" + "-" * grid_size + "+") - for row in grid: - print("|" + "".join(row) + "|") - print("+" + "-" * grid_size + "+") - - def close(self): - pass - - -# ============================================================================= -# SECTION 2: TESTING ENVIRONMENT -# ============================================================================= -""" -WHY test before training? -------------------------- -If your environment is broken, RL will silently fail to learn. -You'll waste hours wondering why training doesn't work. - -ALWAYS verify: -1. Environment creates without errors -2. reset() returns correct shapes -3. step() works with valid actions -4. Episodes actually terminate -5. 
A simple heuristic can solve it -""" - - -def test_environment(): - """Run basic sanity checks on the PufferLib environment.""" - print("=" * 60) - print("TESTING MoveToTargetEnv (PufferLib)") - print("=" * 60) - - # Test 1: Creation - print("\n[TEST 1] Creating environment...") - try: - env = MoveToTargetEnv(seed=42) - print(f" OK: Created env") - print(f" Observation space: {env.single_observation_space}") - print(f" Action space: {env.single_action_space}") - print(f" Num agents: {env.num_agents}") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 2: Reset - print("\n[TEST 2] Testing reset()...") - try: - obs, info = env.reset() - print(f" OK: reset() returned observations with shape {obs.shape}") - print(f" Sample observation: {obs[0]}") - assert obs.shape == (1, 6), f"Wrong shape: {obs.shape}, expected (1, 6)" - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 3: Step with random actions - print("\n[TEST 3] Testing step() with random actions...") - try: - for i in range(5): - actions = np.array([np.random.randint(0, 5)]) # Shape (1,) - obs, rewards, terminals, truncations, infos = env.step(actions) - print(f" Step {i + 1}: reward={rewards[0]:.3f}, terminal={terminals[0]}") - print(f" OK: step() works") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 4: Run until episode terminates using heuristic - print("\n[TEST 4] Running until episode terminates...") - try: - obs, _ = env.reset() - total_steps = 0 - episodes_finished = 0 - - while episodes_finished < 2 and total_steps < 500: - # Simple heuristic: move toward target - dx = obs[0, 4] # target_x - agent_x - dy = obs[0, 5] # target_y - agent_y - - if abs(dx) > abs(dy): - action = 4 if dx > 0 else 3 # RIGHT or LEFT - else: - action = 1 if dy > 0 else 2 # UP or DOWN - - actions = np.array([action]) - obs, rewards, terminals, truncations, infos = env.step(actions) - total_steps += 1 - - if infos: - for info in infos: - episodes_finished += 1 - print(f" Episode finished: {info}") - - print(f" OK: Completed {episodes_finished} episodes in {total_steps} steps") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 5: Test with PufferLib vectorization - print("\n[TEST 5] Testing with pufferlib.vector.make()...") - try: - vecenv = pufferlib.vector.make( - MoveToTargetEnv, - num_envs=4, - backend=pufferlib.vector.Serial, - ) - obs, _ = vecenv.reset() - print(f" OK: Created vectorized env with 4 copies") - print(f" Vectorized observation shape: {obs.shape}") - - # Take a few steps - for i in range(3): - actions = np.random.randint(0, 5, size=4) - obs, rewards, terminals, truncations, infos = vecenv.step(actions) - print(f" OK: Vectorized stepping works") - vecenv.close() - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - print("\n" + "=" * 60) - print("ALL ENVIRONMENT TESTS PASSED!") - print("=" * 60) - return True - - -# ============================================================================= -# SECTION 3: POLICY NETWORK -# ============================================================================= -""" -WHY this specific architecture? -------------------------------- -PufferLib expects policies to follow certain conventions: - -1. 
forward_eval(observations, state=None) -> (logits, values) - - This is what the trainer calls during rollout collection - - Returns action LOGITS (not probabilities) and value estimates - - The `state` parameter is for RNNs (we return None for feedforward) - -2. Use pufferlib.pytorch.layer_init() for weight initialization - - Proper initialization is crucial for stable learning - - Different std values for actor vs critic heads - -WHY layer_init? ---------------- -Neural network initialization matters A LOT for RL: -- Too large weights -> exploding gradients, unstable training -- Too small weights -> vanishing gradients, slow learning -- layer_init uses orthogonal initialization which works well for RL - -ARCHITECTURE: -observation (6) -> encoder (64 -> 64) -> actor head (5) + critic head (1) -""" - - -class Policy(nn.Module): - """ - Actor-Critic policy network following PufferLib conventions. - - The network has: - - Shared encoder: processes observations into features - - Actor head: outputs action logits (5 actions) - - Critic head: outputs value estimate (1 value) - """ - - def __init__(self, env, hidden_size=64): - """ - WHY take env as parameter? - -------------------------- - We extract observation and action sizes from the environment. - This is more robust than hardcoding dimensions. - - PufferLib's vectorized envs provide: - - env.single_observation_space: shape of one agent's observation - - env.single_action_space: the action space for one agent - - For regular Gymnasium envs, these would be observation_space/action_space. - """ - super().__init__() - - # Get dimensions from environment - obs_size = env.single_observation_space.shape[0] - action_size = env.single_action_space.n - - # ----------------------------------------------------------------- - # TODO 3.1: Create the encoder (shared backbone) - # ----------------------------------------------------------------- - # The encoder processes observations into a feature vector. - # Both actor and critic will use these features. - # - # Architecture: Linear(obs_size, hidden_size) -> ReLU -> Linear(hidden_size, hidden_size) -> ReLU - # - # Use pufferlib.pytorch.layer_init() for each Linear layer. - # Default std works for hidden layers. - # - # Example: - # self.encoder = nn.Sequential( - # pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), - # nn.ReLU(), - # pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - # nn.ReLU(), - # ) - # - # YOUR CODE: - - self.encoder = nn.Sequential( - pufferlib.pytorch.layer_init(nn.Linear(obs_size, hidden_size)), - nn.ReLU(), - pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)), - nn.ReLU(), - ) - - # ----------------------------------------------------------------- - # TODO 3.2: Create the actor head - # ----------------------------------------------------------------- - # Outputs action logits. Use std=0.01 for small initial outputs. - # WHY small std? We want initial actions to be nearly uniform. - # - # self.actor = pufferlib.pytorch.layer_init( - # nn.Linear(hidden_size, action_size), std=0.01 - # ) - # - # YOUR CODE: - - self.actor = pufferlib.pytorch.layer_init( - nn.Linear(hidden_size, action_size), std=0.01 - ) - - # ----------------------------------------------------------------- - # TODO 3.3: Create the critic head - # ----------------------------------------------------------------- - # Outputs value estimate. Use std=1.0 for reasonable initial values. 
- # - # self.critic = pufferlib.pytorch.layer_init( - # nn.Linear(hidden_size, 1), std=1.0 - # ) - # - # YOUR CODE: - - self.critic = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0) - - def forward_eval(self, observations, state=None): - """ - WHY forward_eval specifically? - ------------------------------ - PufferLib's trainer calls forward_eval() during rollout collection. - It expects (logits, values) as return value. - - The state parameter is for recurrent networks (LSTMs). For feedforward - networks like ours, we ignore it and return None. - - PARAMETERS: - - observations: tensor of shape (batch_size, obs_size) - - state: For RNN/LSTM policies, carries hidden state between steps. - For feedforward networks (like ours), always None. - - RETURNS: - - logits: tensor of shape (batch_size, action_size) - unnormalized action scores - - values: tensor of shape (batch_size, 1) - value estimates - """ - # ----------------------------------------------------------------- - # TODO 3.4: Implement forward_eval - # ----------------------------------------------------------------- - # Step 1: Pass observations through encoder - # hidden = self.encoder(observations) - # - # Step 2: Get action logits from actor head - # logits = self.actor(hidden) - # - # Step 3: Get value estimate from critic head - # values = self.critic(hidden) - # - # Step 4: Return (logits, values) - - hidden = self.encoder(observations) - logits = self.actor(hidden) - values = self.critic(hidden) - - return logits, values - - def forward(self, observations, state=None): - """Standard PyTorch forward - required by PufferLib trainer.""" - return self.forward_eval(observations, state) - - -# ============================================================================= -# SECTION 4: TESTING POLICY -# ============================================================================= -""" -WHY test the policy? --------------------- -Verify the network architecture is correct before training. -Common bugs: -- Wrong input/output dimensions -- Missing activations -- NaN in outputs -""" - - -def test_policy(): - """Run basic sanity checks on the Policy network.""" - print("\n" + "=" * 60) - print("TESTING Policy Network") - print("=" * 60) - - # Test 1: Creation - print("\n[TEST 1] Creating policy...") - try: - # Create a dummy env to get dimensions - env = MoveToTargetEnv() - env.reset() # Initialize the env - - policy = Policy(env, hidden_size=64) - print(f" OK: Created policy") - - # Count parameters - total_params = sum(p.numel() for p in policy.parameters()) - print(f" Total parameters: {total_params}") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 2: forward_eval - print("\n[TEST 2] Testing forward_eval()...") - try: - # Create batch of observations - obs = torch.randn(4, 6) # batch of 4 - logits, values = policy.forward_eval(obs) - - print(f" Input shape: {obs.shape}") - print(f" Logits shape: {logits.shape} (expected: [4, 5])") - print(f" Values shape: {values.shape} (expected: [4, 1])") - - assert logits.shape == (4, 5), f"Wrong logits shape: {logits.shape}" - assert values.shape == (4, 1), f"Wrong values shape: {values.shape}" - - # Check for NaN - assert not torch.isnan(logits).any(), "NaN in logits!" - assert not torch.isnan(values).any(), "NaN in values!" 
- - print(" OK: Shapes correct, no NaN") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - # Test 3: Single observation - print("\n[TEST 3] Testing with single observation...") - try: - obs = torch.randn(1, 6) - logits, values = policy.forward_eval(obs) - - print(f" Logits: {logits}") - print(f" Value: {values}") - print(" OK: Single observation works") - except Exception as e: - print(f" FAILED: {e}") - import traceback - - traceback.print_exc() - return False - - print("\n" + "=" * 60) - print("ALL POLICY TESTS PASSED!") - print("=" * 60) - return True - - -# ============================================================================= -# SECTION 5: TRAINING WITH PUFFERLIB -# ============================================================================= -""" -WHY use pufferl.PuffeRL? ------------------------- -PufferLib's trainer handles ALL the RL internals: -- Rollout collection (running envs, storing experiences) -- GAE advantage computation -- PPO loss calculation (clipped surrogate, value loss, entropy) -- Gradient updates with clipping -- Logging and metrics - -This means our training code is MUCH simpler than learn.py! - -THE TRAINING LOOP: ------------------ -1. Create vectorized environment -2. Create policy -3. Create config dict with hyperparameters -4. Create PuffeRL trainer -5. Loop: trainer.evaluate() -> trainer.train() - -WHAT trainer.evaluate() does: -- Runs the policy in all environments -- Collects experiences into buffers -- Computes advantages and returns - -WHAT trainer.train() does: -- Runs PPO update on collected experiences -- Updates policy weights -- Logs metrics -""" - - -def train(quick_test=False): - """ - Main training function using PufferLib's trainer. - - PARAMETERS: - - quick_test: if True, run short training to verify code works - if False, run full training to see actual learning - """ - # ----------------------------------------------------------------- - # Hyperparameters - # ----------------------------------------------------------------- - if quick_test: - total_timesteps = 10000 - num_envs = 4 - else: - total_timesteps = 100000 - num_envs = 8 - - # Detect device - # device = "mps" if torch.backends.mps.is_available() else "cpu" - device = "cpu" - - print("=" * 60) - print("TRAINING WITH PUFFERLIB") - print("=" * 60) - print(f"MPS available: {torch.backends.mps.is_available()}") - print(f"Using device: {device}") - print(f"Total timesteps: {total_timesteps}") - print(f"Num environments: {num_envs}") - print("=" * 60) - - # ----------------------------------------------------------------- - # TODO 5.1: Create vectorized environment - # ----------------------------------------------------------------- - # PufferLib's vector.make() creates multiple environment copies. - # - # Backend options: - # - Serial: Runs envs sequentially. Good for debugging because errors - # appear in the main process with full stack traces. - # - Multiprocessing: Runs envs in parallel. Much faster for many envs, - # but errors in subprocesses are harder to debug. - # - # Tip: Use Serial until your code works, then switch to Multiprocessing. 
- # - # vecenv = pufferlib.vector.make( - # MoveToTargetEnv, - # num_envs=num_envs, - # backend=pufferlib.vector.Serial, - # ) - # - # YOUR CODE: - - vecenv = pufferlib.vector.make( - MoveToTargetEnv, num_envs=num_envs, backend=pufferlib.vector.Multiprocessing - ) - - # ----------------------------------------------------------------- - # TODO 5.2: Create policy - # ----------------------------------------------------------------- - # Use vecenv.driver_env to get a reference to one of the environment copies. - # This lets us access single_observation_space and single_action_space - # for creating the policy with correct input/output dimensions. - # Move policy to device for GPU training. - # - # policy = Policy(vecenv.driver_env, hidden_size=64).to(device) - # - # YOUR CODE: - - policy = Policy(vecenv.driver_env, hidden_size=64).to(device) - next(policy.parameters()).device - - # ----------------------------------------------------------------- - # TODO 5.3: Create config - # ----------------------------------------------------------------- - # PufferLib's trainer uses a Config object for hyperparameters. - # These are standard PPO values that work well. - # - # config = pufferl.Config( - # total_timesteps=total_timesteps, - # learning_rate=3e-4, - # num_steps=128, # Steps per rollout - # num_minibatches=4, # Minibatches per update - # update_epochs=4, # PPO epochs per update - # gamma=0.99, # Discount factor - # gae_lambda=0.95, # GAE parameter - # clip_coef=0.2, # PPO clipping - # vf_coef=0.5, # Value loss coefficient - # ent_coef=0.01, # Entropy bonus coefficient - # max_grad_norm=0.5, # Gradient clipping - # ) - # - # YOUR CODE: - - config = { - "env": "MoveToTarget", - "total_timesteps": total_timesteps, - "learning_rate": 3e-4, - "batch_size": num_envs * 128, - "bptt_horizon": 128, - "minibatch_size": 512, - "max_minibatch_size": 512, - "update_epochs": 4, - "gamma": 0.99, - "gae_lambda": 0.95, - "clip_coef": 0.2, - "vf_coef": 0.5, - "vf_clip_coef": 0.2, - "ent_coef": 0.01, - "max_grad_norm": 0.5, - "device": device, - "seed": 42, - "torch_deterministic": True, - "cpu_offload": False, - "use_rnn": False, - "compile": False, - "optimizer": "adam", - "adam_beta1": 0.9, - "adam_beta2": 0.999, - "adam_eps": 1e-8, - "anneal_lr": True, - "vtrace_rho_clip": 1.0, - "vtrace_c_clip": 1.0, - "prio_alpha": 0.8, - "prio_beta0": 0.2, - "checkpoint_interval": 200, - "data_dir": "experiments", - "precision": "float32", - } - - # ----------------------------------------------------------------- - # TODO 5.4: Create trainer - # ----------------------------------------------------------------- - # The PuffeRL trainer handles the entire training loop internals. - # - # trainer = pufferl.PuffeRL( - # config=config, - # vecenv=vecenv, - # policy=policy, - # optimizer=torch.optim.Adam(policy.parameters(), lr=config.learning_rate), - # ) - # - # YOUR CODE: - - trainer = pufferl.PuffeRL(config, vecenv, policy) - - # ----------------------------------------------------------------- - # TODO 5.5: Training loop - # ----------------------------------------------------------------- - # The training loop is very simple with PufferLib: - # 1. trainer.evaluate() - collect experiences - # 2. trainer.train() - run PPO update - # 3. 
Repeat until done - # - # Example: - # while not trainer.done: - # trainer.evaluate() - # trainer.train() - # - # # Print progress every 10 epochs - # if trainer.epoch % 10 == 0: - # # Get metrics from trainer - # metrics = trainer.metrics - # print(f"Epoch {trainer.epoch} | " - # f"reward: {metrics.get('episode_reward', 0):.2f} | " - # f"length: {metrics.get('episode_length', 0):.1f}") - # - # Or use the built-in dashboard: - # while not trainer.done: - # trainer.evaluate() - # trainer.train() - # trainer.print_dashboard() # Pretty-printed metrics - # - # YOUR CODE: - - while trainer.global_step < total_timesteps: - trainer.evaluate() - trainer.train() - - # Cleanup - trainer.close() - vecenv.close() - - print("\n" + "=" * 60) - print("TRAINING COMPLETE!") - print("=" * 60) - - return policy - - -# ============================================================================= -# SECTION 6: EVALUATION WITH ASCII RENDERING -# ============================================================================= - - -def eval_policy(num_episodes=3, delay=0.1): - """ - Run the trained policy and watch it play with ASCII rendering. - - PARAMETERS: - - num_episodes: number of episodes to run - - delay: seconds between frames (for watchability) - """ - import time - import glob - - print("=" * 60) - print("EVALUATING TRAINED POLICY") - print("=" * 60) - - # Find latest checkpoint - checkpoints = glob.glob("experiments/**/model.pt", recursive=True) - if not checkpoints: - print( - "No checkpoint found in experiments/. Train first with 'python learn_v2.py train'" - ) - return - - latest_checkpoint = max(checkpoints, key=lambda x: os.path.getmtime(x)) - print(f"Loading checkpoint: {latest_checkpoint}") - - # Create environment (single, not vectorized) - env = MoveToTargetEnv(seed=int(time.time())) - - # Create and load policy - policy = Policy(env, hidden_size=64) - checkpoint = torch.load(latest_checkpoint, map_location="cpu", weights_only=True) - policy.load_state_dict(checkpoint) - policy.eval() - - print(f"Running {num_episodes} episodes...\n") - - for ep in range(num_episodes): - print(f"\n{'=' * 60}") - print(f"EPISODE {ep + 1}") - print(f"{'=' * 60}") - - obs, _ = env.reset() - env.render() - time.sleep(delay) - - done = False - total_reward = 0.0 - - while not done: - # Get action from policy - with torch.no_grad(): - obs_tensor = torch.from_numpy(obs).float() - logits, _ = policy(obs_tensor) - action = torch.argmax(logits, dim=-1).item() - - # Step environment - obs, rewards, terminals, truncations, infos = env.step(np.array([action])) - total_reward += rewards[0] - done = terminals[0] or truncations[0] - - # Render - env.render() - action_names = ["NOOP", "UP", "DOWN", "LEFT", "RIGHT"] - print(f"Action: {action_names[action]}, Reward: {rewards[0]:.3f}") - time.sleep(delay) - - # Episode summary - if infos: - info = infos[0] - result = ( - "REACHED TARGET!" 
- if info.get("reached_target") - else "Failed (wall/timeout)" - ) - print(f"\nResult: {result}") - print(f"Episode length: {info.get('episode_length', 'N/A')}") - print(f"Total reward: {total_reward:.3f}") - - env.close() - print("\n" + "=" * 60) - print("EVALUATION COMPLETE!") - print("=" * 60) - - -# ============================================================================= -# MAIN EXECUTION -# ============================================================================= - -if __name__ == "__main__": - import sys - - # Parse command line arguments - if len(sys.argv) > 1: - command = sys.argv[1] - if command == "test": - # Run all tests - env_ok = test_environment() - if env_ok: - test_policy() - elif command == "train": - # Run full training - test_environment() - test_policy() - train(quick_test=False) - elif command == "quick": - # Quick training test - # test_environment() - # test_policy() - train(quick_test=True) - elif command == "eval": - # Evaluate trained policy with ASCII rendering - eval_policy(num_episodes=3, delay=0.1) - else: - print(f"Unknown command: {command}") - print("Usage: python learn_v2.py [test|train|quick|eval]") - else: - # Default: run tests only - print("Running tests... (use 'python learn_v2.py train' for full training)") - print() - env_ok = test_environment() - if env_ok: - test_policy() diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 55799cf94..6df80098c 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -24,7 +24,6 @@ #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 -// Rewards #define REWARD_APPROACH 0.05f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f @@ -52,13 +51,12 @@ typedef enum { BOSS_RECOVERING, } BossState; -// Only use floats! 
typedef struct { float perf; // 0-1 normalized metric - float score; // unnormalized metric - float episode_return; // sum of rewards - float episode_length; // steps per episode - float wins; // episodes where boss died + float score; // Unnormalized metric + float episode_return; // Sum of rewards + float episode_length; // Steps per episode + float wins; // Episodes where boss died float n; // Required as last field } Log; @@ -86,9 +84,8 @@ typedef struct { float boss_hp; int boss_phase_ticks; - float episode_return; // track within episode + float episode_return; - // stats int player_wins; int boss_wins; int timeouts; @@ -228,7 +225,7 @@ void c_step(BossFight *env) { reward += REWARD_HIT_WALL; } - // can't walk out of bounds + // Can't walk out of bounds env->player_x = fmaxf(-ARENA_HALF_SIZE, fminf(ARENA_HALF_SIZE, env->player_x)); env->player_y = @@ -239,14 +236,13 @@ void c_step(BossFight *env) { reward += REWARD_APPROACH * (env->prev_distance - dist); env->prev_distance = dist; - // push player out if clipping into boss + // Push player out if clipping into boss if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; env->player_x += (dx / dist) * overlap; env->player_y += (dy / dist) * overlap; - // recalculate distance after push dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); } @@ -262,8 +258,8 @@ void c_step(BossFight *env) { env->player_state == PLAYER_DODGING && env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); - // Souls-like: you can i-frame briefly, but the AOE persists longer than the - // i-frame window; if you're still in the hitbox after i-frames, you get hit. + // AOE persists longer than the i-frame window + // If player is still in the hitbox after i-frames, you get hit. bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { diff --git a/runpod.md b/runpod.md deleted file mode 100644 index 81859f0f9..000000000 --- a/runpod.md +++ /dev/null @@ -1,12 +0,0 @@ -curl -LsSf https://astral.sh/uv/install.sh | sh -source ~/.bashrc -git clone https://github.com/frixaco/PufferLib -cd PufferLib -git switch boss-fight -uv venv -source .venv/bin/activate -uv pip install -e . 
-python setup.py build_boss_fight --inplace --force -puffer train puffer_boss_fight --train.total-timesteps 5000000 --train.device cuda --vec.num-envs 8192 --vec.num-workers 16 --train.minibatch-size 8192 --train.max-minibatch-size 65536 - -puffer eval puffer*boss_fight --load-model-path $(ls -t experiments/puffer_boss_fight*\_/model\_\_.pt | head -1) From 27ede234995413f863bdeda97780471208a6ec87 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 23:06:22 +0500 Subject: [PATCH 25/29] add readme --- pufferlib/config/boss_fight.ini | 6 +- pufferlib/ocean/boss_fight/README.md | 100 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 pufferlib/ocean/boss_fight/README.md diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index c0913bb62..bb7f8859d 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -5,8 +5,8 @@ policy_name = Policy # rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] -num_envs = 448 -num_workers = 14 +num_envs = 1024 +num_workers = 8 batch_size = auto zero_copy = True seed = 42 @@ -39,7 +39,7 @@ precision = float32 compile = False # Core PPO hyperparameters -total_timesteps = 10_000_000 +total_timesteps = 5_000_000 learning_rate = 0.0003 anneal_lr = True min_lr_ratio = 0.0 diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md new file mode 100644 index 000000000..958cdcd57 --- /dev/null +++ b/pufferlib/ocean/boss_fight/README.md @@ -0,0 +1,100 @@ +# BossFight (PufferLib Ocean) + +BossFight is a simple 2D boss-fight reinforcement learning environment. + +The boss currently has **one attack**: a circular **AOE burst** and cycles between 4 states. +Player (agent) has to defeat the boss by attacking and avoiding AoE attacks by dodging (has i-frames). +All hitboxes are circles (collision = circles overlap). + +## Game rules + +- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `5.0`) +- **Boss:** stationary at `(0, 0)` +- **Episode ends on:** + - win: boss HP reaches 0 + - loss: player HP reaches 0 + - timeout: `EPISODE_LENGTH` steps + +### Boss attack cycle + +The boss cycles through: + +`IDLE (BOSS_IDLE_TICKS) -> WINDUP (BOSS_WINDUP_TICKS) -> ACTIVE (BOSS_ACTIVE_TICKS) -> RECOVERY (BOSS_RECOVERY_TICKS) -> ...` + +During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. 
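
The cycle is driven by a per-tick counter (`boss_phase_ticks`) in `c_step`. A minimal sketch of the idea (the helper name below is invented for illustration; only the enum values and tick constants come from `boss_fight.h`):

```c
// Illustrative only: decrement the current phase counter each tick; when it
// hits zero, move to the next state in the cycle and reload its duration.
static void boss_advance_phase(BossState *state, int *phase_ticks) {
    if (--(*phase_ticks) > 0) {
        return; // still inside the current phase
    }
    switch (*state) {
    case BOSS_IDLING:     *state = BOSS_WINDING_UP; *phase_ticks = BOSS_WINDUP_TICKS;   break;
    case BOSS_WINDING_UP: *state = BOSS_ATTACKING;  *phase_ticks = BOSS_ACTIVE_TICKS;   break;
    case BOSS_ATTACKING:  *state = BOSS_RECOVERING; *phase_ticks = BOSS_RECOVERY_TICKS; break;
    case BOSS_RECOVERING: *state = BOSS_IDLING;     *phase_ticks = BOSS_IDLE_TICKS;     break;
    }
}
```

With the default tick counts (7 + 5 + 5 + 5) the full cycle is 22 ticks, so the AOE goes active roughly every 22 steps.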
+ +### Player mechanics + +- **Move** (only while idling): 4 directional movement at `PLAYER_SPEED_PER_TICK` +- **Attack**: melee hit if within `PLAYER_ATTACK_RADIUS` (locks the player for `PLAYER_ATTACK_TICKS`) +- **Dodge**: + - lasts `PLAYER_DODGE_TICKS` and automatically moves the player directly **away from the boss** at `PLAYER_DODGE_SPEED_PER_TICK` + - the first `PLAYER_IFRAME_TICKS` are i-frames + - the boss AOE lasts longer than the i-frame window, so “dodge in place” isn’t sufficient -- you must **exit the AOE** + - after dodge ends, `PLAYER_DODGE_COOLDOWN` ticks must pass before dodging again + +## Action space + +`Discrete(7)`: + +| id | action | +| --: | ---------- | +| 0 | idle | +| 1 | move up | +| 2 | move down | +| 3 | move left | +| 4 | move right | +| 5 | dodge | +| 6 | attack | + +## Observation space + +`Box(shape=(13,), dtype=float32)` (see `update_observations` in `boss_fight.h`): + +| idx | meaning | +| --: | ------------------------------------------------------ | +| 0 | `boss_x - player_x` | +| 1 | `boss_y - player_y` | +| 2 | `player_x` | +| 3 | `player_y` | +| 4 | `boss_x` | +| 5 | `boss_y` | +| 6 | `player_hp` | +| 7 | `boss_hp` | +| 8 | `player_state` (`0=idle, 1=dodge, 2=attack`) | +| 9 | `player_dodge_cooldown` | +| 10 | `player_state_ticks` (remaining) | +| 11 | `boss_state` (`0=idle, 1=windup, 2=attack, 3=recover`) | +| 12 | `boss_phase_ticks` (remaining) | + +## Rewards (defaults) + +All reward constants are in `boss_fight.h`: + +- **Per-step:** `REWARD_TICK` +- **Shaping:** `REWARD_APPROACH * (prev_distance - distance)` +- **Events:** + - `REWARD_PLAYER_HIT_BOSS` + - `REWARD_BOSS_HIT_PLAYER` + - `REWARD_DODGE_SUCCESS` + - `REWARD_HIT_WALL` +- **Terminal:** `REWARD_KILL_BOSS`, `REWARD_PLAYER_DIED`, `REWARD_TIMEOUT` + +**Dodge success reward** is only paid when: + +1. you **start** a dodge while inside the AOE during the boss danger window (**WINDUP** or **ACTIVE**), and +2. you **exit** the AOE before the danger window ends. + +## Rendering / manual play + +- Rendering uses **Raylib**. `BossFight.render()` opens a window and draws the player/boss circles + hit radii. 
+- A tiny standalone debug harness lives in `boss_fight.c`: + - Hold `Left Shift` for manual controls: `WASD` move, `Space` dodge, `J` attack + - Without `Left Shift` it takes random actions + +## Files + +- `boss_fight.h`: core environment logic (`c_reset`, `c_step`, `c_render`) +- `binding.c`: CPython extension glue (uses `pufferlib/ocean/env_binding.h`) +- `boss_fight.py`: PufferLib wrapper (`PufferEnv`) + vectorized stepping +- `pufferlib/config/boss_fight.ini`: default training config for `puffer train puffer_boss_fight` From ef4822306eae1e175e765aae5565e967ac31e6cd Mon Sep 17 00:00:00 2001 From: frixaco Date: Sat, 24 Jan 2026 23:18:34 +0500 Subject: [PATCH 26/29] fix manual control --- pufferlib/ocean/boss_fight/boss_fight.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 6df80098c..609c6c07e 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -163,13 +163,29 @@ void c_step(BossFight *env) { env->terminals[0] = 0; int action = env->actions[0]; + if (IsKeyDown(KEY_LEFT_SHIFT)) { + if (IsKeyDown(KEY_W)) + action = 1; + else if (IsKeyDown(KEY_S)) + action = 2; + else if (IsKeyDown(KEY_A)) + action = 3; + else if (IsKeyDown(KEY_D)) + action = 4; + else if (IsKeyDown(KEY_SPACE)) + action = 5; + else if (IsKeyDown(KEY_J)) + action = 6; + else + action = 0; + } float dx = 0; float dy = 0; if (action == 1) { - dy = PLAYER_SPEED_PER_TICK; - } else if (action == 2) { dy = -PLAYER_SPEED_PER_TICK; + } else if (action == 2) { + dy = PLAYER_SPEED_PER_TICK; } else if (action == 3) { dx = -PLAYER_SPEED_PER_TICK; } else if (action == 4) { From d2c4a6c9d4c46a24d14179efd37e09698ea658cc Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 00:17:42 +0500 Subject: [PATCH 27/29] sweep sweep --- pufferlib/config/boss_fight.ini | 55 +++++++++------------------------ 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini index bb7f8859d..8013ea38b 100644 --- a/pufferlib/config/boss_fight.ini +++ b/pufferlib/config/boss_fight.ini @@ -2,7 +2,6 @@ package = ocean env_name = puffer_boss_fight policy_name = Policy -# rnn_name = Recurrent # Uncomment if adding LSTM/GRU [vec] num_envs = 1024 @@ -12,59 +11,40 @@ zero_copy = True seed = 42 [env] -# Environment-specific params (passed to env constructor) -# None needed - using defaults from README [policy] -# Policy constructor args (e.g., hidden_size) -# hidden_size = 64 # Experiment: 32, 64, 128 [train] -# Experiment tracking name = boss_fight project = boss_fight_experiments data_dir = experiments checkpoint_interval = 200 - -# Reproducibility seed = 42 -# TODO: disable for sweep or speed torch_deterministic = True device = cpu - -# Optimization -# TODO: try muon with 0.015 lr optimizer = adam precision = float32 compile = False - -# Core PPO hyperparameters total_timesteps = 5_000_000 -learning_rate = 0.0003 +learning_rate = 0.000864 anneal_lr = True -min_lr_ratio = 0.0 -gamma = 0.99 -gae_lambda = 0.95 +min_lr_ratio = 0.437 +gamma = 0.983 +gae_lambda = 0.902 update_epochs = 4 -clip_coef = 0.2 -vf_coef = 0.5 -vf_clip_coef = 0.2 -max_grad_norm = 0.5 -ent_coef = 0.01 - -# Batch sizes -minibatch_size = 2048 +clip_coef = 0.421 +vf_coef = 4.38 +vf_clip_coef = 0.303 +max_grad_norm = 2.28 +ent_coef = 0.00623 +minibatch_size = 2048 max_minibatch_size = 32768 bptt_horizon = 32 - -# Adam parameters (if optimizer = adam) 
-adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-8 - -# V-trace (for off-policy correction) -# vtrace_rho_clip = 1.0 -# vtrace_c_clip = 1.0 +adam_beta1 = 0.991 +adam_beta2 = 0.998 +adam_eps = 1e-14 +vtrace_rho_clip = 2.72 +vtrace_c_clip = 2.13 [sweep] goal = maximize @@ -74,31 +54,26 @@ metric_distribution = linear max_suggestion_cost = 3600 use_gpu = True -# Learning rate sweep [sweep.train.learning_rate] distribution = log_normal min = 0.0001 max = 0.003 -# Entropy coefficient sweep (exploration vs exploitation) [sweep.train.ent_coef] distribution = log_normal min = 0.0001 max = 0.05 -# Discount factor sweep [sweep.train.gamma] distribution = logit_normal min = 0.95 max = 0.999 -# GAE lambda sweep [sweep.train.gae_lambda] distribution = logit_normal min = 0.9 max = 0.99 -# Minibatch size sweep [sweep.train.minibatch_size] distribution = uniform_pow2 min = 1024 From 0896d218aaddf9c9a8692bbfdb40871b74131964 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 04:11:43 +0500 Subject: [PATCH 28/29] normalize observation space data --- pufferlib/ocean/boss_fight/README.md | 35 ++++---- pufferlib/ocean/boss_fight/boss_fight.c | 2 +- pufferlib/ocean/boss_fight/boss_fight.h | 105 ++++++++++++++++------- pufferlib/ocean/boss_fight/boss_fight.py | 2 +- 4 files changed, 91 insertions(+), 53 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index 958cdcd57..fe56d6b30 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -8,7 +8,7 @@ All hitboxes are circles (collision = circles overlap). ## Game rules -- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `5.0`) +- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `500.0`) - **Boss:** stationary at `(0, 0)` - **Episode ends on:** - win: boss HP reaches 0 @@ -49,23 +49,22 @@ During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. 
## Observation space -`Box(shape=(13,), dtype=float32)` (see `update_observations` in `boss_fight.h`): - -| idx | meaning | -| --: | ------------------------------------------------------ | -| 0 | `boss_x - player_x` | -| 1 | `boss_y - player_y` | -| 2 | `player_x` | -| 3 | `player_y` | -| 4 | `boss_x` | -| 5 | `boss_y` | -| 6 | `player_hp` | -| 7 | `boss_hp` | -| 8 | `player_state` (`0=idle, 1=dodge, 2=attack`) | -| 9 | `player_dodge_cooldown` | -| 10 | `player_state_ticks` (remaining) | -| 11 | `boss_state` (`0=idle, 1=windup, 2=attack, 3=recover`) | -| 12 | `boss_phase_ticks` (remaining) | +`Box(shape=(12,), dtype=float32)` — all normalized to [-1, 1] or [0, 1] (see `update_observations` in `boss_fight.h`): + +| idx | meaning | range | +| --: | ---------------------------- | ------- | +| 0 | `player_x` normalized | [-1, 1] | +| 1 | `player_y` normalized | [-1, 1] | +| 2 | `dist_to_boss` normalized | [0, 1] | +| 3 | `player_hp` normalized | [0, 1] | +| 4 | `boss_hp` normalized | [0, 1] | +| 5 | `dodge_cooldown` normalized | [0, 1] | +| 6 | `dodge_remaining` | [0, 1] | +| 7 | `iframe_remaining` | [0, 1] | +| 8 | `attack_remaining` | [0, 1] | +| 9 | `time_until_aoe` | [0, 1] | +| 10 | `aoe_remaining` | [0, 1] | +| 11 | `episode_time_remaining` | [0, 1] | ## Rewards (defaults) diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c index 49198b733..5d69c4e27 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.c +++ b/pufferlib/ocean/boss_fight/boss_fight.c @@ -2,7 +2,7 @@ #include "raylib.h" int main() { - int num_obs = 13; + int num_obs = 12; int num_actions = 1; int num_agents = 1; diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 609c6c07e..26dbd65a8 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -3,28 +3,29 @@ #include #include -#define ARENA_HALF_SIZE 5.0f -#define MAX_HP 1.0f -#define PLAYER_SPEED_PER_TICK 0.25f -#define PLAYER_SIZE 0.3f -#define BOSS_SIZE 0.5f -#define PLAYER_ATTACK_RADIUS 0.4f +#define ARENA_HALF_SIZE 500.0f +#define MAX_HP 100.0f +#define PLAYER_SPEED_PER_TICK 25.0f +#define PLAYER_SIZE 30.0f +#define BOSS_SIZE 50.0f +#define PLAYER_ATTACK_RADIUS 40.0f #define PLAYER_ATTACK_TICKS 3 #define PLAYER_DODGE_TICKS 4 #define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 -#define PLAYER_DODGE_SPEED_PER_TICK 0.35f -#define PLAYER_ATTACK_DMG 0.05f -#define BOSS_ATTACK_DMG 0.15f -#define BOSS_AOE_ATTACK_RADIUS 0.8f +#define PLAYER_DODGE_SPEED_PER_TICK 35.0f +#define PLAYER_ATTACK_DMG 5.0f +#define BOSS_ATTACK_DMG 15.0f +#define BOSS_AOE_ATTACK_RADIUS 80.0f #define BOSS_IDLE_TICKS 7 #define BOSS_WINDUP_TICKS 5 #define BOSS_ACTIVE_TICKS 5 #define BOSS_RECOVERY_TICKS 5 + #define HP_BAR_WIDTH 40 #define HP_BAR_HEIGHT 5 -#define REWARD_APPROACH 0.05f +#define REWARD_APPROACH 0.7f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f #define REWARD_BOSS_HIT_PLAYER -0.05f @@ -72,7 +73,7 @@ typedef struct { float player_y; float boss_x; float boss_y; - float prev_distance; + float dist_to_boss; PlayerState player_state; float player_hp; @@ -111,19 +112,59 @@ void add_log(BossFight *env) { void update_observations(BossFight *env) { int obs_idx = 0; - env->observations[obs_idx++] = env->boss_x - env->player_x; - env->observations[obs_idx++] = env->boss_y - env->player_y; - env->observations[obs_idx++] = env->player_x; - env->observations[obs_idx++] = env->player_y; - env->observations[obs_idx++] = env->boss_x; - 
env->observations[obs_idx++] = env->boss_y; - env->observations[obs_idx++] = (float)env->player_hp; - env->observations[obs_idx++] = (float)env->boss_hp; - env->observations[obs_idx++] = (float)env->player_state; - env->observations[obs_idx++] = (float)env->player_dodge_cooldown; - env->observations[obs_idx++] = (float)env->player_state_ticks; - env->observations[obs_idx++] = (float)env->boss_state; - env->observations[obs_idx++] = (float)env->boss_phase_ticks; + + env->observations[obs_idx++] = env->player_x / ARENA_HALF_SIZE; + env->observations[obs_idx++] = env->player_y / ARENA_HALF_SIZE; + + float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); + float max_dist = sqrtf(2.0f) * ARENA_HALF_SIZE; + env->observations[obs_idx++] = dist / max_dist; + + env->observations[obs_idx++] = env->player_hp / MAX_HP; + env->observations[obs_idx++] = env->boss_hp / MAX_HP; + + env->observations[obs_idx++] = + (float)env->player_dodge_cooldown / PLAYER_DODGE_COOLDOWN; + + float dodge_remaining = + (env->player_state == PLAYER_DODGING) + ? (float)env->player_state_ticks / PLAYER_DODGE_TICKS + : 0.0f; + env->observations[obs_idx++] = dodge_remaining; + + int iframe_ticks = + env->player_state_ticks - (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + float iframe_remaining = + (env->player_state == PLAYER_DODGING && iframe_ticks > 0) + ? fminf((float)iframe_ticks / PLAYER_IFRAME_TICKS, 1.0f) + : 0.0f; + env->observations[obs_idx++] = iframe_remaining; + + float attack_remaining = + (env->player_state == PLAYER_ATTACKING) + ? (float)env->player_state_ticks / PLAYER_ATTACK_TICKS + : 0.0f; + env->observations[obs_idx++] = attack_remaining; + + float cycle_len = BOSS_IDLE_TICKS + BOSS_WINDUP_TICKS + BOSS_ACTIVE_TICKS + + BOSS_RECOVERY_TICKS; + float time_until_aoe = 0.0f; + if (env->boss_state == BOSS_IDLING) + time_until_aoe = env->boss_phase_ticks + BOSS_WINDUP_TICKS; + else if (env->boss_state == BOSS_WINDING_UP) + time_until_aoe = env->boss_phase_ticks; + else if (env->boss_state == BOSS_RECOVERING) + time_until_aoe = + env->boss_phase_ticks + BOSS_IDLE_TICKS + BOSS_WINDUP_TICKS; + env->observations[obs_idx++] = time_until_aoe / cycle_len; + + float aoe_remaining = (env->boss_state == BOSS_ATTACKING) + ? 
(float)env->boss_phase_ticks / BOSS_ACTIVE_TICKS + : 0.0f; + env->observations[obs_idx++] = aoe_remaining; + + env->observations[obs_idx++] = + (float)(EPISODE_LENGTH - env->tick) / EPISODE_LENGTH; } void c_reset(BossFight *env) { @@ -152,7 +193,7 @@ void c_reset(BossFight *env) { env->player_y = rand_uniform(-ARENA_HALF_SIZE, ARENA_HALF_SIZE); } - env->prev_distance = + env->dist_to_boss = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); update_observations(env); @@ -204,11 +245,6 @@ void c_step(BossFight *env) { env->player_state != PLAYER_DODGING && env->player_dodge_cooldown == 0; bool can_attack = env->player_state == PLAYER_IDLING; - if (wanna_attack && can_attack) { - env->player_state_ticks = PLAYER_ATTACK_TICKS; - env->player_state = PLAYER_ATTACKING; - } - float aoe_dist = BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS; bool boss_threatening = env->boss_state == BOSS_WINDING_UP || env->boss_state == BOSS_ATTACKING; @@ -249,8 +285,9 @@ void c_step(BossFight *env) { float dist = distance(env->player_x, env->player_y, env->boss_x, env->boss_y); - reward += REWARD_APPROACH * (env->prev_distance - dist); - env->prev_distance = dist; + float max_dist = sqrtf(2.0f) * ARENA_HALF_SIZE; + reward += REWARD_APPROACH * ((env->dist_to_boss - dist) / max_dist); + env->dist_to_boss = dist; // Push player out if clipping into boss if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { @@ -265,6 +302,8 @@ void c_step(BossFight *env) { bool close_enough = dist <= BOSS_SIZE + PLAYER_ATTACK_RADIUS + PLAYER_SIZE; if (wanna_attack && can_attack && close_enough) { + env->player_state_ticks = PLAYER_ATTACK_TICKS; + env->player_state = PLAYER_ATTACKING; env->boss_hp -= PLAYER_ATTACK_DMG; reward += REWARD_PLAYER_HIT_BOSS; } diff --git a/pufferlib/ocean/boss_fight/boss_fight.py b/pufferlib/ocean/boss_fight/boss_fight.py index fdb4bfb4f..f966243ab 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.py +++ b/pufferlib/ocean/boss_fight/boss_fight.py @@ -12,7 +12,7 @@ def __init__( self, num_envs=1, render_mode=None, log_interval=1, size=5, buf=None, seed=0 ): self.single_observation_space = gymnasium.spaces.Box( - low=-10, high=110, shape=(13,), dtype=np.float32 + low=-1, high=1, shape=(12,), dtype=np.float32 ) self.single_action_space = gymnasium.spaces.Discrete(7) self.render_mode = render_mode From 8ed96f304e873c55b236c6984991df231975e1a8 Mon Sep 17 00:00:00 2001 From: frixaco Date: Sun, 25 Jan 2026 05:43:24 +0500 Subject: [PATCH 29/29] better UI --- pufferlib/ocean/boss_fight/README.md | 11 +- pufferlib/ocean/boss_fight/boss_fight.h | 301 ++++++++++++++++++++---- pufferlib/ocean/environment.py | 280 +++++++--------------- 3 files changed, 353 insertions(+), 239 deletions(-) diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md index fe56d6b30..5df7ada54 100644 --- a/pufferlib/ocean/boss_fight/README.md +++ b/pufferlib/ocean/boss_fight/README.md @@ -21,7 +21,7 @@ The boss cycles through: `IDLE (BOSS_IDLE_TICKS) -> WINDUP (BOSS_WINDUP_TICKS) -> ACTIVE (BOSS_ACTIVE_TICKS) -> RECOVERY (BOSS_RECOVERY_TICKS) -> ...` -During **ACTIVE**, the boss deals damage if the player overlaps the AOE circle. +During **ACTIVE**, the boss deals `BOSS_ATTACK_DMG` damage **every tick** the player overlaps the AOE circle (unless i-framed). Staying in the AOE for the full 5 ticks = 75 damage. ### Player mechanics @@ -86,7 +86,14 @@ All reward constants are in `boss_fight.h`: ## Rendering / manual play -- Rendering uses **Raylib**. 
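
To put rough numbers on it (defaults from `boss_fight.h`, with the AOE reach that `c_step` computes as `BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS` = 160): dodging away at `PLAYER_DODGE_SPEED_PER_TICK` = 35 from melee range takes 2 to 3 ticks to clear that reach, and the 2 `PLAYER_IFRAME_TICKS` cover the ticks still spent inside the circle, so a well-timed dodge can escape without taking a hit.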
`BossFight.render()` opens a window and draws the player/boss circles + hit radii. +- Rendering uses **Raylib** with enhanced visuals: + - Grid overlay + crosshair axes + - Time remaining bar (steps + seconds) + - Boss AoE telegraph (charging ring during WINDUP, filled during ACTIVE) + - Boss state label (IDLE/WINDUP/ACTIVE/RECOVER) + - Dodge trail particles + i-frame blink effect + - Attack pulse ring effect + - HP bars + dodge cooldown bar in HUD - A tiny standalone debug harness lives in `boss_fight.c`: - Hold `Left Shift` for manual controls: `WASD` move, `Space` dodge, `J` attack - Without `Left Shift` it takes random actions diff --git a/pufferlib/ocean/boss_fight/boss_fight.h b/pufferlib/ocean/boss_fight/boss_fight.h index 26dbd65a8..2f0c4f488 100644 --- a/pufferlib/ocean/boss_fight/boss_fight.h +++ b/pufferlib/ocean/boss_fight/boss_fight.h @@ -5,16 +5,19 @@ #define ARENA_HALF_SIZE 500.0f #define MAX_HP 100.0f -#define PLAYER_SPEED_PER_TICK 25.0f +#define EPSILON 1e-6f + #define PLAYER_SIZE 30.0f -#define BOSS_SIZE 50.0f +#define PLAYER_SPEED_PER_TICK 25.0f #define PLAYER_ATTACK_RADIUS 40.0f #define PLAYER_ATTACK_TICKS 3 +#define PLAYER_ATTACK_DMG 5.0f #define PLAYER_DODGE_TICKS 4 #define PLAYER_IFRAME_TICKS 2 #define PLAYER_DODGE_COOLDOWN 15 #define PLAYER_DODGE_SPEED_PER_TICK 35.0f -#define PLAYER_ATTACK_DMG 5.0f + +#define BOSS_SIZE 50.0f #define BOSS_ATTACK_DMG 15.0f #define BOSS_AOE_ATTACK_RADIUS 80.0f #define BOSS_IDLE_TICKS 7 @@ -22,9 +25,6 @@ #define BOSS_ACTIVE_TICKS 5 #define BOSS_RECOVERY_TICKS 5 -#define HP_BAR_WIDTH 40 -#define HP_BAR_HEIGHT 5 - #define REWARD_APPROACH 0.7f #define REWARD_HIT_WALL -0.05f #define REWARD_PLAYER_HIT_BOSS 0.07f @@ -36,12 +36,30 @@ #define REWARD_TICK -0.01f #define EPISODE_LENGTH 600 -const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; -const Color BOSS_COLOR = (Color){0, 187, 187, 255}; -const Color TEXT_COLOR = (Color){241, 241, 241, 255}; -const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; -const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; -const Color HP_COLOR = (Color){0, 255, 0, 255}; +#define WINDOW_SIZE 720 +#define TARGET_FPS 30 +#define HP_BAR_WIDTH 40 +#define HP_BAR_HEIGHT 5 +#define UI_MARGIN 20 +#define UI_RIGHT_X 580 +#define UI_BOTTOM_Y 680 +#define UI_HP_BAR_Y 700 +#define UI_FONT_SIZE 20 +#define UI_FONT_SIZE_SMALL 16 + +static const Color PLAYER_COLOR = (Color){50, 100, 255, 255}; +static const Color BOSS_COLOR = (Color){0, 187, 187, 255}; +static const Color TEXT_COLOR = (Color){241, 241, 241, 255}; +static const Color HITBOX_COLOR = (Color){241, 241, 241, 50}; +static const Color BACKGROUND_COLOR = (Color){6, 24, 24, 255}; +static const Color HP_COLOR = (Color){0, 255, 0, 255}; + +static const Color ARENA_BORDER_COLOR = (Color){30, 120, 120, 255}; +static const Color ARENA_GRID_COLOR = (Color){30, 70, 70, 255}; + +static const Color PLAYER_DODGE_COLOR = (Color){255, 215, 90, 255}; +static const Color PLAYER_ATTACK_COLOR = (Color){170, 220, 255, 255}; +static const Color BOSS_DANGER_COLOR = (Color){255, 80, 80, 255}; typedef enum { PLAYER_IDLING, PLAYER_DODGING, PLAYER_ATTACKING } PlayerState; @@ -220,6 +238,7 @@ void c_step(BossFight *env) { else action = 0; } + float dx = 0; float dy = 0; @@ -260,12 +279,12 @@ void c_step(BossFight *env) { env->player_state = PLAYER_DODGING; } - // Dodge = multi-tick movement out of the AOE (no i-frames) + // Dodge: multi-tick movement away from boss, with i-frames at start if (env->player_state == PLAYER_DODGING) { float away_x = env->player_x - env->boss_x; float away_y 
= env->player_y - env->boss_y; float away_norm = sqrtf(away_x * away_x + away_y * away_y); - if (away_norm > 1e-6f) { + if (away_norm > EPSILON) { env->player_x += (away_x / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; env->player_y += (away_y / away_norm) * PLAYER_DODGE_SPEED_PER_TICK; } @@ -290,7 +309,7 @@ void c_step(BossFight *env) { env->dist_to_boss = dist; // Push player out if clipping into boss - if (dist < BOSS_SIZE + PLAYER_SIZE && dist > 1e-6f) { + if (dist < BOSS_SIZE + PLAYER_SIZE && dist > EPSILON) { float overlap = BOSS_SIZE + PLAYER_SIZE - dist; float dx = env->player_x - env->boss_x; float dy = env->player_y - env->boss_y; @@ -313,8 +332,7 @@ void c_step(BossFight *env) { env->player_state == PLAYER_DODGING && env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); - // AOE persists longer than the i-frame window - // If player is still in the hitbox after i-frames, you get hit. + // Boss deals damage every tick while player in AOE (unless i-framed) bool boss_can_hit = in_aoe_attack && !player_iframed; bool boss_can_damage = env->boss_state == BOSS_ATTACKING && boss_can_hit; if (boss_can_damage) { @@ -399,17 +417,17 @@ void c_step(BossFight *env) { int world_to_screen(float world_coord) { return (int)((world_coord + ARENA_HALF_SIZE) / (2 * ARENA_HALF_SIZE) * - 720.0f); + (float)WINDOW_SIZE); } float radius_to_screen(float world_radius) { - return world_radius / (2 * ARENA_HALF_SIZE) * 720.0f; + return world_radius / (2 * ARENA_HALF_SIZE) * (float)WINDOW_SIZE; } void c_render(BossFight *env) { if (!IsWindowReady()) { - InitWindow(720, 720, "BossFight"); - SetTargetFPS(30); + InitWindow(WINDOW_SIZE, WINDOW_SIZE, "BossFight"); + SetTargetFPS(TARGET_FPS); } if (IsKeyDown(KEY_ESCAPE)) { @@ -419,47 +437,248 @@ void c_render(BossFight *env) { BeginDrawing(); ClearBackground(BACKGROUND_COLOR); - DrawText("Beat the boss!", 20, 20, 20, TEXT_COLOR); + DrawText("Beat the boss!", UI_MARGIN, UI_MARGIN, UI_FONT_SIZE, TEXT_COLOR); + + // Arena (bounds + subtle grid) + { + const float grid_step = 100.0f; + const float axis_step = 250.0f; + const Color grid = Fade(ARENA_GRID_COLOR, 0.28f); + const Color axis = Fade(ARENA_BORDER_COLOR, 0.35f); + + for (float x = -ARENA_HALF_SIZE; x <= ARENA_HALF_SIZE + 0.5f; + x += grid_step) { + int sx = world_to_screen(x); + DrawLine(sx, 0, sx, WINDOW_SIZE, grid); + } + for (float y = -ARENA_HALF_SIZE; y <= ARENA_HALF_SIZE + 0.5f; + y += grid_step) { + int sy = world_to_screen(y); + DrawLine(0, sy, WINDOW_SIZE, sy, grid); + } + + // Crosshair axes + DrawLine(world_to_screen(0.0f), 0, world_to_screen(0.0f), WINDOW_SIZE, + axis); + DrawLine(0, world_to_screen(0.0f), WINDOW_SIZE, world_to_screen(0.0f), + axis); + + // Quadrant markers + for (float t = -ARENA_HALF_SIZE; t <= ARENA_HALF_SIZE + 0.5f; + t += axis_step) { + int s = world_to_screen(t); + DrawLineEx((Vector2){(float)s, 4.0f}, (Vector2){(float)s, 14.0f}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){4.0f, (float)s}, (Vector2){14.0f, (float)s}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){(float)s, (float)WINDOW_SIZE - 4.0f}, + (Vector2){(float)s, (float)WINDOW_SIZE - 14.0f}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + DrawLineEx((Vector2){(float)WINDOW_SIZE - 4.0f, (float)s}, + (Vector2){(float)WINDOW_SIZE - 14.0f, (float)s}, 2.0f, + Fade(ARENA_BORDER_COLOR, 0.45f)); + } + + DrawRectangleLinesEx((Rectangle){0, 0, WINDOW_SIZE, WINDOW_SIZE}, 6.0f, + Fade(ARENA_BORDER_COLOR, 0.75f)); + } // Stats top-right char stats[64]; snprintf(stats, sizeof(stats), 
"W:%d L:%d T:%d", env->player_wins, env->boss_wins, env->timeouts); - DrawText(stats, 580, 20, 20, TEXT_COLOR); + DrawText(stats, UI_RIGHT_X, UI_MARGIN, UI_FONT_SIZE, TEXT_COLOR); + + // Time-left HUD (steps + approx seconds) + { + int steps_left = EPISODE_LENGTH - env->tick; + if (steps_left < 0) + steps_left = 0; + float t = (float)steps_left / (float)EPISODE_LENGTH; + + const int bar_w = 260; + const int bar_h = 10; + const int bar_x = (WINDOW_SIZE - bar_w) / 2; + const int bar_y = UI_MARGIN + UI_FONT_SIZE + 8; + + DrawText("TIME", bar_x - 50, bar_y - 4, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.85f)); + DrawRectangle(bar_x, bar_y, bar_w, bar_h, Fade(DARKGRAY, 0.8f)); + DrawRectangle(bar_x, bar_y, (int)((float)bar_w * t), bar_h, + Fade((Color){120, 210, 210, 255}, 0.95f)); + DrawRectangleLinesEx( + (Rectangle){(float)bar_x, (float)bar_y, (float)bar_w, (float)bar_h}, + 2.0f, Fade(ARENA_BORDER_COLOR, 0.7f)); + + char tbuf[64]; + int secs_left = (int)ceilf((float)steps_left / (float)TARGET_FPS); + snprintf(tbuf, sizeof(tbuf), "%d steps (~%ds)", steps_left, secs_left); + DrawText(tbuf, bar_x, bar_y + bar_h + 6, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.85f)); + } // Player int player_sx = world_to_screen(env->player_x); int player_sy = world_to_screen(env->player_y); - int player_hp_bar_y = player_sy + (int)radius_to_screen(PLAYER_SIZE) + 5; - int player_hp_width = (int)((float)env->player_hp / MAX_HP * HP_BAR_WIDTH); + float player_hp_ratio = fmaxf(0.0f, fminf(1.0f, env->player_hp / MAX_HP)); + int player_hp_width = (int)(player_hp_ratio * HP_BAR_WIDTH); - Color player_color = env->player_hp <= 0 ? RED : PLAYER_COLOR; - DrawCircle(player_sx, player_sy, - radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS), - HITBOX_COLOR); - DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_color); + float player_attack_r = radius_to_screen(PLAYER_SIZE + PLAYER_ATTACK_RADIUS); + bool player_iframed = + env->player_state == PLAYER_DODGING && + env->player_state_ticks > (PLAYER_DODGE_TICKS - PLAYER_IFRAME_TICKS); + + Color player_base = env->player_hp <= 0 ? 
RED : PLAYER_COLOR; + if (env->player_state == PLAYER_DODGING) + player_base = PLAYER_DODGE_COLOR; + DrawCircleLines(player_sx, player_sy, player_attack_r, + Fade(PLAYER_ATTACK_COLOR, 0.18f)); + + // Dodge trail (stateless: inferred from away-from-boss direction) + if (env->player_state == PLAYER_DODGING) { + float away_x = env->player_x - env->boss_x; + float away_y = env->player_y - env->boss_y; + float away_norm = sqrtf(away_x * away_x + away_y * away_y); + if (away_norm > EPSILON) { + float ux = away_x / away_norm; + float uy = away_y / away_norm; + for (int i = 1; i <= 4; i++) { + float w = (float)(5 - i) / 5.0f; + int tx = world_to_screen(env->player_x - ux * (float)i * 40.0f); + int ty = world_to_screen(env->player_y - uy * (float)i * 40.0f); + DrawCircle(tx, ty, radius_to_screen(PLAYER_SIZE) * (0.9f - 0.08f * i), + Fade(PLAYER_DODGE_COLOR, 0.08f + 0.12f * w)); + } + } + } + + // Player body (shadow + fill + outline) + DrawCircle(player_sx + 3, player_sy + 4, radius_to_screen(PLAYER_SIZE), + Fade(BLACK, 0.25f)); + DrawCircle(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), player_base); + DrawCircleLines(player_sx, player_sy, radius_to_screen(PLAYER_SIZE), + Fade(WHITE, 0.25f)); + + // Attack effect (duration) + if (env->player_state == PLAYER_ATTACKING) { + float rem = (float)env->player_state_ticks / (float)PLAYER_ATTACK_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + float pulse = 1.0f - rem; + float outer = player_attack_r * (1.0f + 0.10f * pulse); + float inner = player_attack_r * (0.92f + 0.04f * pulse); + BeginBlendMode(BLEND_ADDITIVE); + DrawRing((Vector2){(float)player_sx, (float)player_sy}, inner, outer, 0.0f, + 360.0f, 64, Fade(PLAYER_ATTACK_COLOR, 0.30f + 0.45f * rem)); + EndBlendMode(); + DrawCircleLines(player_sx, player_sy, outer, + Fade(PLAYER_ATTACK_COLOR, 0.25f + 0.35f * rem)); + } + + // I-frame blink + if (player_iframed) { + BeginBlendMode(BLEND_ADDITIVE); + DrawCircleLines(player_sx, player_sy, radius_to_screen(PLAYER_SIZE) * 1.12f, + Fade(WHITE, 0.65f)); + EndBlendMode(); + } // Boss int boss_sx = world_to_screen(env->boss_x); int boss_sy = world_to_screen(env->boss_y); - int boss_hp_bar_y = boss_sy + (int)radius_to_screen(BOSS_SIZE) + 5; - int boss_hp_width = (int)((float)env->boss_hp / MAX_HP * HP_BAR_WIDTH); + float boss_hp_ratio = fmaxf(0.0f, fminf(1.0f, env->boss_hp / MAX_HP)); + int boss_hp_width = (int)(boss_hp_ratio * HP_BAR_WIDTH); + + float boss_aoe_r = + radius_to_screen(BOSS_SIZE + PLAYER_SIZE + BOSS_AOE_ATTACK_RADIUS); + + // Boss AoE telegraph/active zone + { + float a = 0.10f; + if (env->boss_state == BOSS_WINDING_UP) { + float p = 1.0f - (float)env->boss_phase_ticks / (float)BOSS_WINDUP_TICKS; + p = fmaxf(0.0f, fminf(1.0f, p)); + a = 0.15f + 0.25f * p; + BeginBlendMode(BLEND_ADDITIVE); + DrawRing((Vector2){(float)boss_sx, (float)boss_sy}, boss_aoe_r * 0.93f, + boss_aoe_r, 0.0f, 360.0f * p, 64, Fade(BOSS_DANGER_COLOR, a)); + EndBlendMode(); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.28f + 0.25f * p)); + } else if (env->boss_state == BOSS_ATTACKING) { + float rem = (float)env->boss_phase_ticks / (float)BOSS_ACTIVE_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + DrawCircle(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.22f + 0.08f * (1.0f - rem))); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.95f)); + } else if (env->boss_state == BOSS_RECOVERING) { + float rem = (float)env->boss_phase_ticks / (float)BOSS_RECOVERY_TICKS; + rem = fmaxf(0.0f, fminf(1.0f, rem)); + 
DrawCircle(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.16f * rem)); + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.55f * rem)); + } else { + DrawCircleLines(boss_sx, boss_sy, boss_aoe_r, + Fade(BOSS_DANGER_COLOR, 0.12f)); + } + } Color boss_color = env->boss_hp <= 0 ? RED : BOSS_COLOR; - DrawCircle(boss_sx, boss_sy, - radius_to_screen(BOSS_SIZE + BOSS_AOE_ATTACK_RADIUS), - HITBOX_COLOR); + DrawCircleGradient(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE) * 1.25f, + Fade(BOSS_COLOR, 0.10f), Fade(BOSS_COLOR, 0.0f)); + DrawCircle(boss_sx + 4, boss_sy + 5, radius_to_screen(BOSS_SIZE), + Fade(BLACK, 0.22f)); DrawCircle(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), boss_color); + DrawCircleLines(boss_sx, boss_sy, radius_to_screen(BOSS_SIZE), + Fade(WHITE, 0.18f)); + + // Boss state label + { + const char *phase = "IDLE"; + if (env->boss_state == BOSS_WINDING_UP) + phase = "WINDUP"; + else if (env->boss_state == BOSS_ATTACKING) + phase = "ACTIVE"; + else if (env->boss_state == BOSS_RECOVERING) + phase = "RECOVER"; + + char pbuf[32]; + snprintf(pbuf, sizeof(pbuf), "%s", phase); + int w = MeasureText(pbuf, UI_FONT_SIZE_SMALL); + DrawText(pbuf, boss_sx - w / 2, + boss_sy - (int)radius_to_screen(BOSS_SIZE) - 22, + UI_FONT_SIZE_SMALL, Fade(TEXT_COLOR, 0.85f)); + } // Player HP bar - bottom left - DrawText("Player", 20, 680, 16, TEXT_COLOR); - DrawRectangle(20, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(20, 700, player_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + const int hud_label_y = UI_HP_BAR_Y - 40; + DrawText("Player", UI_MARGIN, hud_label_y, UI_FONT_SIZE_SMALL, TEXT_COLOR); + DrawRectangle(UI_MARGIN, UI_HP_BAR_Y, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, + DARKGRAY); + DrawRectangle(UI_MARGIN, UI_HP_BAR_Y, player_hp_width * 3, HP_BAR_HEIGHT, + HP_COLOR); + + // Dodge cooldown (under player hp) + { + float cd = + 1.0f - fmaxf(0.0f, fminf(1.0f, (float)env->player_dodge_cooldown / + (float)PLAYER_DODGE_COOLDOWN)); + const int dodge_label_y = UI_HP_BAR_Y - 22; + const int dodge_bar_y = UI_HP_BAR_Y - 18; + DrawText("Dodge", UI_MARGIN, dodge_label_y, UI_FONT_SIZE_SMALL, + Fade(TEXT_COLOR, 0.75f)); + DrawRectangle(UI_MARGIN + 58, dodge_bar_y, 90, 6, Fade(DARKGRAY, 0.8f)); + DrawRectangle(UI_MARGIN + 58, dodge_bar_y, (int)(90.0f * cd), 6, + Fade(PLAYER_DODGE_COLOR, 0.85f)); + } // Boss HP bar - bottom right - DrawText("Boss", 580, 680, 16, TEXT_COLOR); - DrawRectangle(580, 700, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, DARKGRAY); - DrawRectangle(580, 700, boss_hp_width * 3, HP_BAR_HEIGHT, HP_COLOR); + DrawText("Boss", UI_RIGHT_X, hud_label_y, UI_FONT_SIZE_SMALL, TEXT_COLOR); + DrawRectangle(UI_RIGHT_X, UI_HP_BAR_Y, HP_BAR_WIDTH * 3, HP_BAR_HEIGHT, + DARKGRAY); + DrawRectangle(UI_RIGHT_X, UI_HP_BAR_Y, boss_hp_width * 3, HP_BAR_HEIGHT, + HP_COLOR); EndDrawing(); } diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 51f131ac1..08e505adc 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -1,290 +1,178 @@ import importlib import pufferlib.emulation - def lazy_import(module_path, attr): """ Returns a callable that, when called with any arguments, will import the module, retrieve the attribute (usually a class or factory) and then call it with the given arguments. 
""" - return lambda *args, **kwargs: getattr( - __import__(module_path, fromlist=[attr]), attr - )(*args, **kwargs) - + return lambda *args, **kwargs: getattr(__import__(module_path, fromlist=[attr]), attr)(*args, **kwargs) -def make_foraging( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_foraging(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_foraging reward_fn = grid.reward_foraging - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, init_fn=init_fn, reward_fn=reward_fn, render_mode=render_mode) -def make_predator_prey( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_predator_prey(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_predator_prey reward_fn = grid.reward_predator_prey - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) -def make_group( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_group(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_group reward_fn = grid.reward_group - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) -def make_puffer( - width=1080, - height=720, - num_agents=4096, - horizon=512, - discretize=True, - food_reward=0.1, - render_mode="rgb_array", -): +def make_puffer(width=1080, height=720, num_agents=4096, horizon=512, + discretize=True, food_reward=0.1, render_mode='rgb_array'): from .grid import grid - init_fn = grid.init_puffer reward_fn = grid.reward_puffer - return grid.PufferGrid( - width, - height, - num_agents, - horizon, - discretize=discretize, - food_reward=food_reward, - init_fn=init_fn, - reward_fn=reward_fn, - render_mode=render_mode, - ) - - -def make_puffergrid( - render_mode="raylib", - vision_range=5, - num_envs=4096, - num_maps=1000, - max_map_size=9, - report_interval=128, - buf=None, -): - return PufferGrid( - render_mode, - vision_range, - num_envs, - num_maps, - max_map_size, - report_interval, - buf, - ) + return grid.PufferGrid(width, height, num_agents, + horizon, discretize=discretize, food_reward=food_reward, + init_fn=init_fn, reward_fn=reward_fn, + render_mode=render_mode) +def make_puffergrid(render_mode='raylib', vision_range=5, + 
num_envs=4096, num_maps=1000, max_map_size=9, + report_interval=128, buf=None): + return PufferGrid(render_mode, vision_range, num_envs, + num_maps, max_map_size, report_interval, buf) def make_continuous(discretize=False, buf=None, **kwargs): from . import sanity - env = sanity.Continuous(discretize=discretize) if not discretize: env = pufferlib.ClipAction(env) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_squared(distance_to_target=3, num_targets=1, buf=None, **kwargs): from . import sanity - - env = sanity.Squared( - distance_to_target=distance_to_target, num_targets=num_targets, **kwargs - ) + env = sanity.Squared(distance_to_target=distance_to_target, num_targets=num_targets, **kwargs) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) - def make_bandit(num_actions=10, reward_scale=1, reward_noise=1, buf=None): from . import sanity - - env = sanity.Bandit( - num_actions=num_actions, reward_scale=reward_scale, reward_noise=reward_noise - ) + env = sanity.Bandit(num_actions=num_actions, reward_scale=reward_scale, + reward_noise=reward_noise) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_memory(mem_length=2, mem_delay=2, buf=None, **kwargs): from . import sanity - env = sanity.Memory(mem_length=mem_length, mem_delay=mem_delay) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_password(password_length=5, buf=None, **kwargs): from . import sanity - env = sanity.Password(password_length=password_length) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_performance(delay_mean=0, delay_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - - env = sanity.Performance( - delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth - ) + env = sanity.Performance(delay_mean=delay_mean, delay_std=delay_std, bandwidth=bandwidth) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_performance_empiric(count_n=0, count_std=0, bandwidth=1, buf=None, **kwargs): from . import sanity - - env = sanity.PerformanceEmpiric( - count_n=count_n, count_std=count_std, bandwidth=bandwidth - ) + env = sanity.PerformanceEmpiric(count_n=count_n, count_std=count_std, bandwidth=bandwidth) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_stochastic(p=0.7, horizon=100, buf=None, **kwargs): from . import sanity - env = sanity.Stochastic(p=p, horizon=100) env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf) - def make_spaces(buf=None, **kwargs): from . import sanity - env = sanity.Spaces() env = pufferlib.EpisodeStats(env) return pufferlib.emulation.GymnasiumPufferEnv(env=env, buf=buf, **kwargs) - def make_multiagent(buf=None, **kwargs): from . 
import sanity - env = sanity.Multiagent() env = pufferlib.MultiagentEpisodeStats(env) return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) - MAKE_FUNCTIONS = { - "battle": "Battle", - "breakout": "Breakout", - "blastar": "Blastar", - "boss_fight": "BossFight", - "convert": "Convert", - "convert_circle": "ConvertCircle", - "pong": "Pong", - "freeway": "Freeway", - "enduro": "Enduro", - "tetris": "Tetris", - "cartpole": "Cartpole", - "moba": "Moba", - "matsci": "Matsci", - "memory": "Memory", - "boids": "Boids", - "drone": "Drone", - "nmmo3": "NMMO3", - "snake": "Snake", - "squared": "Squared", - "pysquared": "PySquared", - "connect4": "Connect4", - "g2048": "G2048", - "terraform": "Terraform", - "template": "Template", - "tripletriad": "TripleTriad", - "tactical": "Tactical", - "target": "Target", - "go": "Go", - "rware": "Rware", - "trash_pickup": "TrashPickupEnv", - "tower_climb": "TowerClimb", - "grid": "Grid", - "shared_pool": "PyCPR", - "impulse_wars": "ImpulseWars", - "drive": "Drive", - "pacman": "Pacman", - "tmaze": "TMaze", - "checkers": "Checkers", - "asteroids": "Asteroids", - "whisker_racer": "WhiskerRacer", - "onestateworld": "World", - "onlyfish": "OnlyFish", - "chain_mdp": "Chain", - "spaces": make_spaces, - "multiagent": make_multiagent, - "slimevolley": "SlimeVolley", + 'battle': 'Battle', + 'breakout': 'Breakout', + 'blastar': 'Blastar', + 'convert': 'Convert', + 'convert_circle': 'ConvertCircle', + 'pong': 'Pong', + 'freeway': 'Freeway', + 'enduro': 'Enduro', + 'tetris': 'Tetris', + 'cartpole': 'Cartpole', + 'moba': 'Moba', + 'matsci': 'Matsci', + 'memory': 'Memory', + 'boids': 'Boids', + 'drone': 'Drone', + 'nmmo3': 'NMMO3', + 'snake': 'Snake', + 'squared': 'Squared', + 'pysquared': 'PySquared', + 'connect4': 'Connect4', + 'g2048': 'G2048', + 'terraform': 'Terraform', + 'template': 'Template', + 'tripletriad': 'TripleTriad', + 'tactical': 'Tactical', + 'target': 'Target', + 'go': 'Go', + 'rware': 'Rware', + 'trash_pickup': 'TrashPickupEnv', + 'tower_climb': 'TowerClimb', + 'grid': 'Grid', + 'shared_pool': 'PyCPR', + 'impulse_wars': 'ImpulseWars', + 'drive': 'Drive', + 'pacman': 'Pacman', + 'tmaze': 'TMaze', + 'checkers': 'Checkers', + 'asteroids': 'Asteroids', + 'whisker_racer': 'WhiskerRacer', + 'onestateworld': 'World', + 'onlyfish': 'OnlyFish', + 'chain_mdp': 'Chain', + 'spaces': make_spaces, + 'multiagent': make_multiagent, + 'slimevolley': 'SlimeVolley', + 'boss_fight': 'BossFight', } - -def env_creator(name="squared", *args, **kwargs): - if "puffer_" not in name: - raise pufferlib.APIUsageError(f"Invalid environment name: {name}") +def env_creator(name='squared', *args, **kwargs): + if 'puffer_' not in name: + raise pufferlib.APIUsageError(f'Invalid environment name: {name}') # TODO: Robust sanity / ocean imports - name = name.replace("puffer_", "") + name = name.replace('puffer_', '') try: - module = importlib.import_module(f"pufferlib.ocean.{name}.{name}") + module = importlib.import_module(f'pufferlib.ocean.{name}.{name}') return getattr(module, MAKE_FUNCTIONS[name]) except ModuleNotFoundError: return MAKE_FUNCTIONS[name]
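
For reference, a minimal sketch (not part of the patch) of how the new `'boss_fight': 'BossFight'` registry entry resolves through `env_creator`; the constructor and `reset()` arguments shown are assumptions for illustration, only the prefix handling and module lookup follow from the code above.

```python
from pufferlib.ocean.environment import env_creator

# env_creator requires the "puffer_" prefix, strips it, imports
# pufferlib.ocean.boss_fight.boss_fight, and returns the attribute named by
# MAKE_FUNCTIONS["boss_fight"], i.e. the BossFight class.
BossFight = env_creator("puffer_boss_fight")

# The kwargs and reset() signature below are illustrative assumptions,
# not taken from this patch.
env = BossFight(num_envs=1)
obs, info = env.reset(seed=0)
```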