PufferAI · frixaco · Jan 12, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 16, 2026
diff --git a/pufferlib/config/boss_fight.ini b/pufferlib/config/boss_fight.ini
@@ -0,0 +1,80 @@
+[base]
+package = ocean
+env_name = puffer_boss_fight
+policy_name = Policy
+
+[vec]
+num_envs = 1024
+num_workers = 8
+batch_size = auto
+zero_copy = True
+seed = 42
+
+[env]
+
+[policy]
+
+[train]
+name = boss_fight
+project = boss_fight_experiments
+data_dir = experiments
+checkpoint_interval = 200
+seed = 42
+torch_deterministic = True
+device = cpu
+optimizer = adam
+precision = float32
+compile = False
+total_timesteps = 5_000_000
+learning_rate = 0.000864
+anneal_lr = True
+min_lr_ratio = 0.437
+gamma = 0.983
+gae_lambda = 0.902
+update_epochs = 4
+clip_coef = 0.421
+vf_coef = 4.38
+vf_clip_coef = 0.303
+max_grad_norm = 2.28
+ent_coef = 0.00623
+minibatch_size = 2048
+max_minibatch_size = 32768
+bptt_horizon = 32
+adam_beta1 = 0.991
+adam_beta2 = 0.998
+adam_eps = 1e-14
+vtrace_rho_clip = 2.72
+vtrace_c_clip = 2.13
+
+[sweep]
+goal = maximize
+metric = episode_return
+method = Protein
+metric_distribution = linear
+max_suggestion_cost = 3600
+use_gpu = True
+
+[sweep.train.learning_rate]
+distribution = log_normal
+min = 0.0001
+max = 0.003
+
+[sweep.train.ent_coef]
+distribution = log_normal
+min = 0.0001
+max = 0.05
+
+[sweep.train.gamma]
+distribution = logit_normal
+min = 0.95
+max = 0.999
+
+[sweep.train.gae_lambda]
+distribution = logit_normal
+min = 0.9
+max = 0.99
+
+[sweep.train.minibatch_size]
+distribution = uniform_pow2
+min = 1024
+max = 8192
diff --git a/pufferlib/ocean/boss_fight/README.md b/pufferlib/ocean/boss_fight/README.md
@@ -0,0 +1,106 @@
+# BossFight (PufferLib Ocean)
+
+BossFight is a simple 2D boss-fight reinforcement learning environment.
+
+The boss currently has **one attack**, a circular **AOE burst**, and cycles between 4 states.
+The player (agent) must defeat the boss by attacking it while avoiding its AOE attacks by dodging (dodges grant i-frames).
+All hitboxes are circles (collision = circles overlap).
+
+## Game rules
+
+- **Arena:** square `[-ARENA_HALF_SIZE, ARENA_HALF_SIZE]^2` (default `500.0`)
+- **Boss:** stationary at `(0, 0)`
+- **Episode ends on:**
+  - win: boss HP reaches 0
+  - loss: player HP reaches 0
+  - timeout: `EPISODE_LENGTH` steps
+
+### Boss attack cycle
+
+The boss cycles through:
+
+`IDLE (BOSS_IDLE_TICKS) -> WINDUP (BOSS_WINDUP_TICKS) -> ACTIVE (BOSS_ACTIVE_TICKS) -> RECOVERY (BOSS_RECOVERY_TICKS) -> ...`
+
+During **ACTIVE**, the boss deals `BOSS_ATTACK_DMG` damage **every tick** the player overlaps the AOE circle (unless i-framed). Staying in the AOE for the full 5 ticks = 75 damage.
+
+### Player mechanics
+
+- **Move** (only while idling): 4-directional movement at `PLAYER_SPEED_PER_TICK`
+- **Attack**: melee hit if within `PLAYER_ATTACK_RADIUS` (locks the player for `PLAYER_ATTACK_TICKS`)
+- **Dodge**:
+  - lasts `PLAYER_DODGE_TICKS` and automatically moves the player directly **away from the boss** at `PLAYER_DODGE_SPEED_PER_TICK`
+  - the first `PLAYER_IFRAME_TICKS` are i-frames
+  - the boss AOE lasts longer than the i-frame window, so “dodge in place” isn’t sufficient -- you must **exit the AOE**
+  - after dodge ends, `PLAYER_DODGE_COOLDOWN` ticks must pass before dodging again
+
+## Action space
+
+`Discrete(7)`:
+
+|  id | action     |
+| --: | ---------- |
+|   0 | idle       |
+|   1 | move up    |
+|   2 | move down  |
+|   3 | move left  |
+|   4 | move right |
+|   5 | dodge      |
+|   6 | attack     |
+
+## Observation space
+
+`Box(shape=(12,), dtype=float32)` — all normalized to [-1, 1] or [0, 1] (see `update_observations` in `boss_fight.h`):
+
+| idx | meaning                      | range   |
+| --: | ---------------------------- | ------- |
+|   0 | `player_x` normalized        | [-1, 1] |
+|   1 | `player_y` normalized        | [-1, 1] |
+|   2 | `dist_to_boss` normalized    | [0, 1]  |
+|   3 | `player_hp` normalized       | [0, 1]  |
+|   4 | `boss_hp` normalized         | [0, 1]  |
+|   5 | `dodge_cooldown` normalized  | [0, 1]  |
+|   6 | `dodge_remaining`            | [0, 1]  |
+|   7 | `iframe_remaining`           | [0, 1]  |
+|   8 | `attack_remaining`           | [0, 1]  |
+|   9 | `time_until_aoe`             | [0, 1]  |
+|  10 | `aoe_remaining`              | [0, 1]  |
+|  11 | `episode_time_remaining`     | [0, 1]  |
+
+## Rewards (defaults)
+
+All reward constants are in `boss_fight.h`:
+
+- **Per-step:** `REWARD_TICK`
+- **Shaping:** `REWARD_APPROACH * (prev_distance - distance)`
+- **Events:**
+  - `REWARD_PLAYER_HIT_BOSS`
+  - `REWARD_BOSS_HIT_PLAYER`
+  - `REWARD_DODGE_SUCCESS`
+  - `REWARD_HIT_WALL`
+- **Terminal:** `REWARD_KILL_BOSS`, `REWARD_PLAYER_DIED`, `REWARD_TIMEOUT`
+
+**Dodge success reward** is only paid when:
+
+1. you **start** a dodge while inside the AOE during the boss danger window (**WINDUP** or **ACTIVE**), and
+2. you **exit** the AOE before the danger window ends.
+
+## Rendering / manual play
+
+- Rendering uses **Raylib** with enhanced visuals:
+  - Grid overlay + crosshair axes
+  - Time remaining bar (steps + seconds)
+  - Boss AOE telegraph (charging ring during WINDUP, filled during ACTIVE)
+  - Boss state label (IDLE/WINDUP/ACTIVE/RECOVER)
+  - Dodge trail particles + i-frame blink effect
+  - Attack pulse ring effect
+  - HP bars + dodge cooldown bar in HUD
+- A tiny standalone debug harness lives in `boss_fight.c`:
+  - Hold `Left Shift` for manual controls: `WASD` move, `Space` dodge, `J` attack
+  - Without `Left Shift` it takes random actions
+
+## Files
+
+- `boss_fight.h`: core environment logic (`c_reset`, `c_step`, `c_render`)
+- `binding.c`: CPython extension glue (uses `pufferlib/ocean/env_binding.h`)
+- `boss_fight.py`: PufferLib wrapper (`PufferEnv`) + vectorized stepping
+- `pufferlib/config/boss_fight.ini`: default training config for `puffer train puffer_boss_fight`
diff --git a/pufferlib/ocean/boss_fight/__init__.py b/pufferlib/ocean/boss_fight/__init__.py
@@ -0,0 +1,3 @@
+"""BossFight Ocean Environment."""
+
+from .boss_fight import BossFight
diff --git a/pufferlib/ocean/boss_fight/binding.c b/pufferlib/ocean/boss_fight/binding.c
@@ -0,0 +1,17 @@
+#include "boss_fight.h"
+
+#define Env BossFight
+#include "../env_binding.h"
+
+static int my_init(Env *env, PyObject *args, PyObject *kwargs) {
+  // BossFight reads nothing from args/kwargs and sets nothing on env here,
+  // so the parameters are discarded explicitly and 0 is returned as-is.
+  (void)env;
+  (void)args;
+  (void)kwargs;
+  return 0;
+}
+
+static int my_log(PyObject *dict, Log *log) { // Passes each Log field below to assign_to_dict under a same-named key.
+  assign_to_dict(dict, "score", log->score);
+  assign_to_dict(dict, "episode_return", log->episode_return);
+  assign_to_dict(dict, "episode_length", log->episode_length);
+  assign_to_dict(dict, "wins", log->wins);
+  return 0; // Unconditional 0: the results of assign_to_dict are not inspected.
+}
diff --git a/pufferlib/ocean/boss_fight/boss_fight.c b/pufferlib/ocean/boss_fight/boss_fight.c
@@ -0,0 +1,46 @@
+#include "boss_fight.h"
+#include "raylib.h"
+
+// Standalone debug harness: steps and renders a single BossFight env until
+// the window is closed. While Left Shift is held the player is driven from
+// the keyboard (W/S/A/D = actions 1-4, Space = 5 dodge, J = 6 attack, no key
+// = 0 idle); otherwise a uniformly random action in [0, 6] is taken each step.
+// Returns 0 on normal exit, 1 if a buffer allocation fails.
+int main(void) {
+  int num_obs = 12;
+  int num_actions = 1;
+  int num_agents = 1;
+
+  BossFight env = {0};
+  // Size every buffer from the pointer it is stored in, so the element size
+  // cannot drift from the field's type (actions used to be sized as float).
+  env.observations = (float *)calloc(num_obs, sizeof *env.observations);
+  env.actions = (int *)calloc(num_actions, sizeof *env.actions);
+  env.rewards = (float *)calloc(num_agents, sizeof *env.rewards);
+  env.terminals =
+      (unsigned char *)calloc(num_agents, sizeof *env.terminals);
+  if (!env.observations || !env.actions || !env.rewards || !env.terminals) {
+    // free(NULL) is a no-op, so releasing all four is safe on partial failure.
+    free(env.observations);
+    free(env.actions);
+    free(env.rewards);
+    free(env.terminals);
+    return 1;
+  }
+
+  // Always call reset and render first
+  c_reset(&env);
+  c_render(&env);
+
+  while (!WindowShouldClose()) {
+    if (IsKeyDown(KEY_LEFT_SHIFT)) {
+      // Manual control; the first matching key in this order wins.
+      if (IsKeyDown(KEY_W)) {
+        env.actions[0] = 1;
+      } else if (IsKeyDown(KEY_S)) {
+        env.actions[0] = 2;
+      } else if (IsKeyDown(KEY_A)) {
+        env.actions[0] = 3;
+      } else if (IsKeyDown(KEY_D)) {
+        env.actions[0] = 4;
+      } else if (IsKeyDown(KEY_SPACE)) {
+        env.actions[0] = 5;
+      } else if (IsKeyDown(KEY_J)) {
+        env.actions[0] = 6;
+      } else {
+        env.actions[0] = 0;
+      }
+    } else {
+      env.actions[0] = rand() % 7;
+    }
+    c_step(&env);
+    c_render(&env);
+  }
+
+  // Close the env while its buffers are still valid, then release them.
+  c_close(&env);
+  free(env.observations);
+  free(env.actions);
+  free(env.rewards);
+  free(env.terminals);
+  return 0;
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		"""BossFight Ocean Environment."""

		from .boss_fight import BossFight