From 2dae299b04c20b02f80a6741ad2db495107f3fe2 Mon Sep 17 00:00:00 2001 From: jonah Date: Thu, 8 Jan 2026 08:37:06 -0800 Subject: [PATCH 1/8] starting backgammon env --- pufferlib/config/ocean/backgammon.ini | 0 pufferlib/ocean/backgammon/backgammon.c | 0 pufferlib/ocean/backgammon/backgammon.h | 362 +++++++++++++++++++++++ pufferlib/ocean/backgammon/backgammon.py | 0 pufferlib/ocean/backgammon/binding.c | 18 ++ pufferlib/ocean/environment.py | 1 + 6 files changed, 381 insertions(+) create mode 100644 pufferlib/config/ocean/backgammon.ini create mode 100644 pufferlib/ocean/backgammon/backgammon.c create mode 100644 pufferlib/ocean/backgammon/backgammon.h create mode 100644 pufferlib/ocean/backgammon/backgammon.py create mode 100644 pufferlib/ocean/backgammon/binding.c diff --git a/pufferlib/config/ocean/backgammon.ini b/pufferlib/config/ocean/backgammon.ini new file mode 100644 index 000000000..e69de29bb diff --git a/pufferlib/ocean/backgammon/backgammon.c b/pufferlib/ocean/backgammon/backgammon.c new file mode 100644 index 000000000..e69de29bb diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h new file mode 100644 index 000000000..e78fb4eec --- /dev/null +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -0,0 +1,362 @@ +/* + * Backgammon: A two-player board game environment for PufferLib + * +*/ + +#include +#include +#include +#include +#include +#include + +#define NUM_POINTS 24 // Number of points on the board +#define NUM_CHECKERS 15 // Each player starts with 15 checkers +#define MAX_CHECKERS_PER_POINT 15 + +// Players +#define WHITE 0 +#define BLACK 1 + +// Movement directions (white moves negative, black moves positive) +#define WHITE_DIRECTION -1 +#define BLACK_DIRECTION 1 + +#define BAR_POSITION 0 // Action from bar +#define BEAR_OFF_POSITION 25 // Action to bear off + +// Action space: source (0-25) * 4 dice options = 104 actions +// source 0 = bar, 1-24 = points, actions encode which die to use +#define NUM_ACTIONS 104 + +// Observation space size +// 24 points + 2 bar + 2 bear-off + 4 dice + 1 current_player + 2 can_bear_off = 35 +#define OBSERVATION_SIZE 35 + +#define MAX_STEPS 1000 +#define DEFAULT_LOG_INTERVAL 128 + + +typedef struct Log { + float episode_return; + float episode_length; // Steps in episode + float win_rate; // Fraction of games won by white + float avg_moves_per_turn; + float hit_rate; // Rate of hitting opponent blots + float n; // Number of episodes +} Log; + + +typedef struct Client { + int width; + int height; + int point_width; + int point_height; + bool initialized; + // Textures would go here for full rendering +} Client; + +// ============================================================================ +// CBackgammon struct - main environment state +// ============================================================================ + +typedef struct CBackgammon { + // ======================================================================== + // PufferLib I/O - pointers to shared memory buffers + // ======================================================================== + float* observations; // Neural network input [OBSERVATION_SIZE] + int* actions; // Agent's chosen action [1] + float* rewards; // Reward signal [1] + unsigned char* terminals; // Episode done flag [1] + + // ======================================================================== + // Logging and rendering + // ======================================================================== + Log log; + Client* client; + + // ======================================================================== + // Board state + // ======================================================================== + + // Points 1-24 (index 0 unused for clarity, use indices 1-24) + // Positive values = white checkers, Negative values = black checkers + int8_t board[NUM_POINTS + 1]; // board[1] to board[24] + + // Bar - checkers that have been hit and must re-enter + // bar[WHITE] = white checkers on bar, bar[BLACK] = black checkers on bar + int8_t bar[2]; + + // Borne off - checkers that have been removed from the board + // off[WHITE] = white checkers borne off, off[BLACK] = black checkers borne off + int8_t off[2]; + + // ======================================================================== + // Dice state + // ======================================================================== + + // Dice values (1-6 each) + // For doubles, all 4 entries have the same value + int8_t dice[4]; + + // Number of dice available (2 normally, 4 for doubles) + int8_t num_dice; + + // Number of dice already used this turn + int8_t dice_used; + + // Which specific dice are still available (for tracking after partial moves) + bool dice_available[4]; + + // ======================================================================== + // Turn state + // ======================================================================== + + // Current player (WHITE or BLACK) + int8_t current_player; + + // Whether the current player must move from the bar first + bool must_enter_from_bar; + + // ======================================================================== + // Episode tracking + // ======================================================================== + + // Current step count within episode + int tick; + + // Cumulative return for current episode + float episode_return; + + // Statistics for current episode + int moves_this_episode; + int hits_this_episode; + int turns_this_episode; + +} CBackgammon; + +// ============================================================================ +// Function declarations (to be implemented in backgammon.h below) +// ============================================================================ + +// Core PufferLib interface +void init(CBackgammon* env); +void c_reset(CBackgammon* env); +void c_step(CBackgammon* env); +void c_render(CBackgammon* env); +void c_close(CBackgammon* env); + +// Game logic helpers +void roll_dice(CBackgammon* env); +int get_direction(int player); +bool in_home_board(int player, int point); +bool can_bear_off(CBackgammon* env, int player); +bool is_dst_available(CBackgammon* env, int player); +bool is_legal_move(CBackgammon* env, int from, int die_index); +bool has_legal_moves(CBackgammon* env); +void make_move(CBackgammon* env, int from, int die_index); +bool check_win(CBackgammon* env, int player); +void compute_observations(CBackgammon* env); +void opponent_move(CBackgammon* env); + +// Logging +void add_log(CBackgammon* env); + +// Rendering (optional) +Client* make_client(CBackgammon* env); +void close_client(Client* client); + + +// Implementations + +float randf(float min, float max) { + return min + (max - min)*(float)rand()/(float)RAND_MAX; +} + +float randi(int min, int max) { + return min + rand() % (max - min + 1); +} + +void init(CBackgammon *env) { + env->log = (Log){0}; + env->tick = 0; + env->client = NULL; +} + +void c_reset(CBackgammon* env) { + env->log = (Log){0}; + for (int i = 0; i <= NUM_POINTS; i++) env->board[i] = 0; + env->bar[WHITE] = 0; env->bar[BLACK] = 0; + env->off[WHITE] = 0; env->off[BLACK] = 0; + env->tick = 0; + env->episode_return = 0.0; + env->moves_this_episode = 0; + env->hits_this_episode = 0; + env->turns_this_episode = 0; + env->current_player = rand() % 2; + env->must_enter_from_bar = false; + + // white + env->board[24] = 2; + env->board[13] = 5; + env->board[8] = 3; + env->board[6] = 5; + // black + env->board[1] = -2; + env->board[12] = -5; + env->board[17] = -3; + env->board[19] = -5; + + env->rewards[0] = 0.0; + env->terminals[0] = 0; + + roll_dice(env); + compute_observations(env); + +} + +void roll_dice(CBackgammon *env) { + env->dice[0] = 1 + (rand() % 6); + env->dice[1] = 1 + (rand() % 6); + if (env->dice[0] == env->dice[1]) { + env->dice[2] = env->dice[0]; + env->dice[3] = env->dice[0]; + env->num_dice = 4; + } else { + env->dice[2] = 0; + env->dice[3] = 0; + env->num_dice = 2; + } + env->dice_used = 0; + for (int i = 0; i < 4; i++) { + env->dice_available[i] = i < env->num_dice; + } +} + + +int get_direction(int player) { + return player == WHITE ? WHITE_DIRECTION: BLACK_DIRECTION; +} + + +bool in_home_board(int player, int point) { + if (player == WHITE) { + return point >= 1 && point <= 6; + } else { + return point >= 19 && point <= 24; + } +} + + +bool can_bear_off(CBackgammon *env, int player) { + for (int i = 1; i <= NUM_POINTS; i++) { + if (player == WHITE && env->board[i] > 0 && !in_home_board(WHITE, i)) { + return false; + } + if (player == BLACK && env->board[i] < 0 && !in_home_board(BLACK, i)) { + return false; + } + } + return env->bar[player] == 0; +} + +bool is_dst_available(CBackgammon *env, int position, int player) { + if (!env->board[position] || env->board[position] == 1 || env->board[position] == -1) return true; + if (player == WHITE) { + return env->board[position] > 0; + } else { + return env->board[position] < 0; + } + +} + +bool is_legal_move(CBackgammon *env, int from, int die_index) { + int8_t cp = env->current_player; + int8_t die_value = env->dice[die_index]; + int direction = get_direction(cp); + + if (!env->dice_available[die_index]) return false; + + if (env->bar[cp] > 0) { + if (from != 0) return false; + + int entry = cp == WHITE ? NUM_POINTS + 1 - die_value : die_value; + return is_dst_available(env, entry, cp); + } + + if (from < 1 || from > NUM_POINTS) return false; + + int8_t fvalue = env->board[from]; + if (cp == WHITE && fvalue <= 0) return false; + if (cp == BLACK && fvalue >= 0) return false; + + int dst = from + (die_value * direction); + + if (cp == WHITE && dst < 1) { + return can_bear_off(env, cp); + } + if (cp == BLACK && dst > NUM_POINTS) { + return can_bear_off(env, cp); + } + + if (dst < 1 || dst > NUM_POINTS) return false; + return is_dst_available(env, dst, cp); +} + + +bool has_legal_moves(CBackgammon *env) { + for (int i = 0; i <= NUM_POINTS; i++) { + for (int d = 0; d < env->num_dice; d++) { + if (is_legal_move(env, i, d)) return true; + } + } + return false; +} + + +void make_move(CBackgammon* env, int from, int die_index) { + int8_t cp = env->current_player; + int8_t die_value = env->dice[die_index]; + int direction = get_direction(cp); + + env->dice_available[die_index] = false; + env->dice_used++; + env->moves_this_episode++; + + if (from == 0) { + env->bar[cp]--; + int dst = (cp == WHITE) ? (NUM_POINTS + 1 - die_value) : die_value; + + if ((cp == WHITE && env->board[dst] == -1) || + (cp == BLACK && env->board[dst] == 1)) { + env->bar[cp ^ 1]++; + env->board[dst] = 0; + env->hits_this_episode++; + } + + env->board[dst] += (cp == WHITE) ? 1 : -1; + } else { + env->board[from] += (cp == WHITE) ? -1 : 1; // remove from board + int dst = from + (die_value * direction); + // check bear off + if ((cp == WHITE && dst < 1) || (cp == BLACK && dst > NUM_POINTS)) { + env->off[cp]++; + return; + } + + // check hit + if ((cp == WHITE && env->board[dst] == -1) || + (cp == BLACK && env->board[dst] == 1)) { + env->bar[cp ^ 1]++; + env->board[dst] = 0; + env->hits_this_episode++; + } + + env->board[dst] += (cp == WHITE) ? 1 : -1; + } +} + + +bool check_win(CBackgammon* env, int player) { + return env->off[player] == NUM_CHECKERS; +} \ No newline at end of file diff --git a/pufferlib/ocean/backgammon/backgammon.py b/pufferlib/ocean/backgammon/backgammon.py new file mode 100644 index 000000000..e69de29bb diff --git a/pufferlib/ocean/backgammon/binding.c b/pufferlib/ocean/backgammon/binding.c new file mode 100644 index 000000000..586ef5e23 --- /dev/null +++ b/pufferlib/ocean/backgammon/binding.c @@ -0,0 +1,18 @@ +#include "backgammon.h" + +#define Env CBackgammon +#include "../env_binding.h" + +static int my_init(Env* env, PyObject *args, PyObject* kwargs) { + init(env); + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "win_rate", log->win_rate); + assign_to_dict(dict, "avg_moves_per_turn", log->avg_moves_per_turn); + assign_to_dict(dict, "hit_rate", log->hit_rate); + return 0; +} \ No newline at end of file diff --git a/pufferlib/ocean/environment.py b/pufferlib/ocean/environment.py index 6c56a4ea2..e34f9d230 100644 --- a/pufferlib/ocean/environment.py +++ b/pufferlib/ocean/environment.py @@ -117,6 +117,7 @@ def make_multiagent(buf=None, **kwargs): return pufferlib.emulation.PettingZooPufferEnv(env=env, buf=buf) MAKE_FUNCTIONS = { + 'backgammon': 'Backgammon', 'battle': 'Battle', 'breakout': 'Breakout', 'blastar': 'Blastar', From 1d8190a3bd5c56f8181ad52a311c2941c6261fe3 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 06:33:20 -0800 Subject: [PATCH 2/8] done game logic --- pufferlib/ocean/backgammon/backgammon.h | 186 ++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h index e78fb4eec..443268258 100644 --- a/pufferlib/ocean/backgammon/backgammon.h +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -359,4 +359,190 @@ void make_move(CBackgammon* env, int from, int die_index) { bool check_win(CBackgammon* env, int player) { return env->off[player] == NUM_CHECKERS; +} + + +void compute_observations(CBackgammon* env) { + float *obs = env->observations; + int cur = 0; + + for (int i = 1; i <= NUM_POINTS; i++) { + obs[cur++] = env->board[i] / (float)NUM_CHECKERS; + } + + obs[cur++] = (float)env->bar[WHITE] / NUM_CHECKERS; + obs[cur++] = (float)env->bar[BLACK] / NUM_CHECKERS; + + obs[cur++] = (float)env->off[WHITE] / NUM_CHECKERS; + obs[cur++] = (float)env->off[BLACK] / NUM_CHECKERS; + + for (int i = 0; i < 4; i++) { + obs[cur++] = env->dice_available[i] ? (float)env->dice[i] / 6.0f : 0.0f; + } + + obs[cur++] = (float)env->current_player; + + obs[cur++] = can_bear_off(env, WHITE) ? 1.0f : 0.0f; + obs[cur++] = can_bear_off(env, BLACK) ? 1.0f : 0.0f; +} + + +int score_move(CBackgammon* env, int from, int die_index) { + int die_value = env->dice[die_index]; + int score = 0; + + if (from == 0) { + score += 100; + int dst = die_value; // Black enters at die_value + // Bonus for hitting white + if (env->board[dst] == 1) score += 50; + return score; + } + + int dst = from + die_value; + + // Bearing off is highest priority + if (dst > NUM_POINTS && can_bear_off(env, BLACK)) { + score += 200; + return score; + } + + // Can't bear off and destination is off board + if (dst > NUM_POINTS) return -1000; + + // Hitting a white checker is very good + if (env->board[dst] == 1) { + score += 80; + } + + // Making a point (having 2+ checkers) is good for safety + if (env->board[dst] == -1) { + score += 30; + } + + // Advancing toward home board is good + score += dst; + + // Moving a lone checker (blot) to safety is good + if (env->board[from] == -1) { + score += 20; + } + + // Prefer not to leave blots in opponent's home board + if (env->board[from] == -2 && from >= 1 && from <= 6) { + score -= 10; + } + + return score; +} + +void opponent_move(CBackgammon* env) { + env->current_player = BLACK; + roll_dice(env); + env->turns_this_episode++; + + // find best pre-programmaed move + while (has_legal_moves(env)) { + int best_from = -1; + int best_die = -1; + int best_score = -10000; + + // find the best scoring legal move + for (int from = 0; from <= NUM_POINTS; from++) { + for (int d = 0; d < env->num_dice; d++) { + if (is_legal_move(env, from, d)) { + int score = score_move(env, from, d); + if (score > best_score) { + best_score = score; + best_from = from; + best_die = d; + } + } + } + } + + if (best_from < 0) break; // no move found + + make_move(env, best_from, best_die); + + // did opponent won + if (check_win(env, BLACK)) { + return; + } + } + + env->current_player = WHITE; + roll_dice(env); +} + + +void add_log(CBackgammon* env) { + env->log.episode_return += env->episode_return; + env->log.episode_length += env->tick; + + env->log.win_rate += check_win(env, WHITE) ? 1.0f : 0.0f; + + if (env->turns_this_episode > 0) { + env->log.avg_moves_per_turn += (float)env->moves_this_episode / env->turns_this_episode; + } + + if (env->moves_this_episode > 0) { + env->log.hit_rate += (float)env->hits_this_episode / env->moves_this_episode; + } + + env->log.n += 1; +} + + +void c_step(CBackgammon* env) { + if (env->terminals[0]) { + c_reset(env); + return; + } + + int action = env->actions[0]; + int from = action / 4; + int die_index = action % 4; + + float reward = 0.0f; + + if (is_legal_move(env, from, die_index)) { + make_move(env, from, die_index); + } else { + reward -= 0.1f; + } + + if (check_win(env, WHITE)) { + reward = 1.0f; + env->terminals[0] = 1; + env->episode_return += reward; + add_log(env); + goto end; + } + + if (env->dice_used >= env->num_dice || !has_legal_moves(env)) { + opponent_move(env); + } + + if (check_win(env, BLACK)) { + reward = -1.0f; + env->terminals[0] = 1; + env->episode_return += reward; + add_log(env); + goto end; + } + + if (env->tick >= MAX_STEPS) { + env->terminals[0] = 1; + env->episode_return += reward; + add_log(env); + goto end; + } + + env->episode_return += reward; + +end: + env->rewards[0] = reward; + compute_observations(env); + env->tick++; } \ No newline at end of file From ebd2f3682a1864c97a8b32491f35c70bae041a7c Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 08:02:08 -0800 Subject: [PATCH 3/8] trains on cpu --- pufferlib/config/ocean/backgammon.ini | 21 ++++++++ pufferlib/ocean/backgammon/backgammon.h | 21 +++++++- pufferlib/ocean/backgammon/backgammon.py | 69 ++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/backgammon.ini b/pufferlib/config/ocean/backgammon.ini index e69de29bb..e2f47f6ec 100644 --- a/pufferlib/config/ocean/backgammon.ini +++ b/pufferlib/config/ocean/backgammon.ini @@ -0,0 +1,21 @@ +[base] +package = ocean +env_name = puffer_backgammon +policy_name = Policy + +[policy] +hidden_size = 256 + +[vec] +num_envs = 8 +num_workers = 8 + +[env] +num_envs = 4 + +[train] +total_timesteps = 10_000_000 +device = cpu + +batch_size = 8192 +mini_batch_size = 2048 \ No newline at end of file diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h index 443268258..8c6c74f45 100644 --- a/pufferlib/ocean/backgammon/backgammon.h +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -151,7 +151,7 @@ void roll_dice(CBackgammon* env); int get_direction(int player); bool in_home_board(int player, int point); bool can_bear_off(CBackgammon* env, int player); -bool is_dst_available(CBackgammon* env, int player); +bool is_dst_available(CBackgammon *env, int position, int player); bool is_legal_move(CBackgammon* env, int from, int die_index); bool has_legal_moves(CBackgammon* env); void make_move(CBackgammon* env, int from, int die_index); @@ -545,4 +545,23 @@ void c_step(CBackgammon* env) { env->rewards[0] = reward; compute_observations(env); env->tick++; +} + + +// Rendering logic + +Client* make_client(CBackgammon* env) { + return NULL; +} + +void close_client(Client* client) { +} + +void c_render(CBackgammon* env) { +} + +void c_close(CBackgammon* env) { + if (env->client) { + close_client(env->client); + } } \ No newline at end of file diff --git a/pufferlib/ocean/backgammon/backgammon.py b/pufferlib/ocean/backgammon/backgammon.py index e69de29bb..fcb1b1f74 100644 --- a/pufferlib/ocean/backgammon/backgammon.py +++ b/pufferlib/ocean/backgammon/backgammon.py @@ -0,0 +1,69 @@ +'''Backgammon environment for PufferLib''' + +import gymnasium +import numpy as np + +import pufferlib +from pufferlib.ocean.backgammon import binding + +OBSERVATION_SIZE = 35 +NUM_ACTIONS = 104 + +class Backgammon(pufferlib.PufferEnv): + def __init__(self, num_envs=1, render_mode=None, log_interval=128, buf=None, seed=0): + self.single_observation_space = gymnasium.spaces.Box(low=-1, high=1, + shape=(OBSERVATION_SIZE,), dtype=np.float32) + self.single_action_space = gymnasium.spaces.Discrete(NUM_ACTIONS) + self.render_mode = render_mode + self.log_interval = log_interval + self.num_agents = num_envs + + super().__init__(buf) + self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, + self.terminals, self.truncations, num_envs, seed) + + def reset(self, seed=0): + binding.vec_reset(self.c_envs, seed) + self.tick = 0 + return self.observations, [] + + def step(self, actions): + self.tick += 1 + self.actions[:] = actions + binding.vec_step(self.c_envs) + + info = [] + if self.tick % self.log_interval == 0: + log = binding.vec_log(self.c_envs) + if log: + info.append(log) + + return (self.observations, self.rewards, + self.terminals, self.truncations, info) + + def render(self): + binding.vec_render(self.c_envs, 0) + + def close(self): + binding.vec_close(self.c_envs) + + +if __name__ == '__main__': + import time + + N = 512 + env = Backgammon(num_envs=N) + env.reset() + + CACHE = 1024 + actions = np.random.randint(0, NUM_ACTIONS, size=(CACHE, N)) + + steps = 0 + start = time.time() + i = 0 + while time.time() - start < 10: + env.step(actions[i % CACHE]) + steps += N + i += 1 + + print('Backgammon SPS:', int(steps / (time.time() - start))) \ No newline at end of file From 621e87b4e34d37d6aa91acd6865a2473e59ce9b1 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 08:47:32 -0800 Subject: [PATCH 4/8] add gpu --- pufferlib/ocean/backgammon/backgammon.py | 5 ++++- pufferlib/ocean/torch.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pufferlib/ocean/backgammon/backgammon.py b/pufferlib/ocean/backgammon/backgammon.py index fcb1b1f74..e8b2a4bcb 100644 --- a/pufferlib/ocean/backgammon/backgammon.py +++ b/pufferlib/ocean/backgammon/backgammon.py @@ -1,4 +1,7 @@ -'''Backgammon environment for PufferLib''' +'''Backgammon environment for PufferLib + +Test with: python -m pufferlib.pufferl train puffer_backgammon --vec.num-workers 8 +''' import gymnasium import numpy as np diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c7663c5f5..83f0637cb 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -963,3 +963,23 @@ def decode_actions(self, hidden): logits = self.decoder(hidden) values = self.value(hidden) return logits, values + + +class Backgammon(nn.Module): + def __init__(self, env, hidden_size=256): + super().__init__() + obs_size = env.single_observation_space.shape[0] # 35 + act_size = env.single_action_space.n # 104 + + self.encoder = nn.Sequential( + nn.Linear(obs_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + ) + self.policy_head = nn.Linear(hidden_size, act_size) + self.value_head = nn.Linear(hidden_size, 1) + + def forward(self, obs): + x = self.encoder(obs) + return self.policy_head(x), self.value_head(x) From 010f644dc705fcc82299d42a0c67ef0f15c88398 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 09:14:11 -0800 Subject: [PATCH 5/8] training broken --- pufferlib/config/ocean/backgammon.ini | 2 +- pufferlib/ocean/backgammon/backgammon.h | 22 +++++++++++++++------- pufferlib/ocean/backgammon/backgammon.py | 2 +- pufferlib/ocean/torch.py | 4 +++- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pufferlib/config/ocean/backgammon.ini b/pufferlib/config/ocean/backgammon.ini index e2f47f6ec..df5c040ed 100644 --- a/pufferlib/config/ocean/backgammon.ini +++ b/pufferlib/config/ocean/backgammon.ini @@ -17,5 +17,5 @@ num_envs = 4 total_timesteps = 10_000_000 device = cpu -batch_size = 8192 +batch_size = auto mini_batch_size = 2048 \ No newline at end of file diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h index 8c6c74f45..9b256d632 100644 --- a/pufferlib/ocean/backgammon/backgammon.h +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -261,13 +261,17 @@ bool can_bear_off(CBackgammon *env, int player) { } bool is_dst_available(CBackgammon *env, int position, int player) { - if (!env->board[position] || env->board[position] == 1 || env->board[position] == -1) return true; - if (player == WHITE) { - return env->board[position] > 0; - } else { - return env->board[position] < 0; - } - + int8_t val = env->board[position]; + + if (val == 0) return true; + + if (player == WHITE && val > 0) return true; + if (player == BLACK && val < 0) return true; + + if (player == WHITE && val == -1) return true; + if (player == BLACK && val == 1) return true; + + return false; } bool is_legal_move(CBackgammon *env, int from, int die_index) { @@ -545,6 +549,10 @@ void c_step(CBackgammon* env) { env->rewards[0] = reward; compute_observations(env); env->tick++; + + if (env->tick % 100 == 0) { + printf("tick=%d white_off=%d black_off=%d\n", env->tick, env->off[WHITE], env->off[BLACK]); + } } diff --git a/pufferlib/ocean/backgammon/backgammon.py b/pufferlib/ocean/backgammon/backgammon.py index e8b2a4bcb..d28dee151 100644 --- a/pufferlib/ocean/backgammon/backgammon.py +++ b/pufferlib/ocean/backgammon/backgammon.py @@ -1,6 +1,6 @@ '''Backgammon environment for PufferLib -Test with: python -m pufferlib.pufferl train puffer_backgammon --vec.num-workers 8 +python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 ''' import gymnasium diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index 83f0637cb..53203f4c7 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -966,7 +966,7 @@ def decode_actions(self, hidden): class Backgammon(nn.Module): - def __init__(self, env, hidden_size=256): + def __init__(self, env, hidden_size=1024): super().__init__() obs_size = env.single_observation_space.shape[0] # 35 act_size = env.single_action_space.n # 104 @@ -976,6 +976,8 @@ def __init__(self, env, hidden_size=256): nn.ReLU(), nn.Linear(hidden_size, hidden_size), nn.ReLU(), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), ) self.policy_head = nn.Linear(hidden_size, act_size) self.value_head = nn.Linear(hidden_size, 1) From eb440865a99ab5bca040898da963ab91a3551ce4 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 10:02:30 -0800 Subject: [PATCH 6/8] training works --- pufferlib/config/ocean/backgammon.ini | 2 +- pufferlib/ocean/backgammon/backgammon.h | 91 +++++++++++++++++------- pufferlib/ocean/backgammon/backgammon.py | 2 +- pufferlib/ocean/backgammon/binding.c | 3 + 4 files changed, 70 insertions(+), 28 deletions(-) diff --git a/pufferlib/config/ocean/backgammon.ini b/pufferlib/config/ocean/backgammon.ini index df5c040ed..2f8f077a7 100644 --- a/pufferlib/config/ocean/backgammon.ini +++ b/pufferlib/config/ocean/backgammon.ini @@ -15,7 +15,7 @@ num_envs = 4 [train] total_timesteps = 10_000_000 -device = cpu +device = cuda batch_size = auto mini_batch_size = 2048 \ No newline at end of file diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h index 9b256d632..878a05752 100644 --- a/pufferlib/ocean/backgammon/backgammon.h +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -33,16 +33,23 @@ // 24 points + 2 bar + 2 bear-off + 4 dice + 1 current_player + 2 can_bear_off = 35 #define OBSERVATION_SIZE 35 -#define MAX_STEPS 1000 +#define MAX_STEPS 5000 #define DEFAULT_LOG_INTERVAL 128 +// Opponent difficulty: probability of making a random move instead of greedy +// 0.0 = fully greedy, 1.0 = fully random +#define OPPONENT_RANDOM_PROB 0.9f + typedef struct Log { float episode_return; float episode_length; // Steps in episode float win_rate; // Fraction of games won by white + float black_win_rate; // Fraction of games won by black (opponent) float avg_moves_per_turn; float hit_rate; // Rate of hitting opponent blots + float checkers_home; // Avg checkers in home board at episode end + float checkers_off; // Avg checkers borne off at episode end float n; // Number of episodes } Log; @@ -95,17 +102,11 @@ typedef struct CBackgammon { // Dice state // ======================================================================== - // Dice values (1-6 each) - // For doubles, all 4 entries have the same value int8_t dice[4]; - // Number of dice available (2 normally, 4 for doubles) int8_t num_dice; - // Number of dice already used this turn int8_t dice_used; - - // Which specific dice are still available (for tracking after partial moves) bool dice_available[4]; // ======================================================================== @@ -445,31 +446,52 @@ void opponent_move(CBackgammon* env) { roll_dice(env); env->turns_this_episode++; - // find best pre-programmaed move while (has_legal_moves(env)) { - int best_from = -1; - int best_die = -1; - int best_score = -10000; + int chosen_from = -1; + int chosen_die = -1; - // find the best scoring legal move - for (int from = 0; from <= NUM_POINTS; from++) { - for (int d = 0; d < env->num_dice; d++) { - if (is_legal_move(env, from, d)) { - int score = score_move(env, from, d); - if (score > best_score) { - best_score = score; - best_from = from; - best_die = d; + float r = (float)rand() / (float)RAND_MAX; + if (r < OPPONENT_RANDOM_PROB) { + int legal_moves[NUM_ACTIONS][2]; // [from, die_index] + int num_legal = 0; + + for (int from = 0; from <= NUM_POINTS; from++) { + for (int d = 0; d < env->num_dice; d++) { + if (is_legal_move(env, from, d)) { + legal_moves[num_legal][0] = from; + legal_moves[num_legal][1] = d; + num_legal++; + } + } + } + + if (num_legal > 0) { + int pick = rand() % num_legal; + chosen_from = legal_moves[pick][0]; + chosen_die = legal_moves[pick][1]; + } + } else { + // find best scoring legal move + int best_score = -10000; + + for (int from = 0; from <= NUM_POINTS; from++) { + for (int d = 0; d < env->num_dice; d++) { + if (is_legal_move(env, from, d)) { + int score = score_move(env, from, d); + if (score > best_score) { + best_score = score; + chosen_from = from; + chosen_die = d; + } } } } } - if (best_from < 0) break; // no move found + if (chosen_from < 0) break; // no move found - make_move(env, best_from, best_die); + make_move(env, chosen_from, chosen_die); - // did opponent won if (check_win(env, BLACK)) { return; } @@ -485,6 +507,7 @@ void add_log(CBackgammon* env) { env->log.episode_length += env->tick; env->log.win_rate += check_win(env, WHITE) ? 1.0f : 0.0f; + env->log.black_win_rate += check_win(env, BLACK) ? 1.0f : 0.0f; if (env->turns_this_episode > 0) { env->log.avg_moves_per_turn += (float)env->moves_this_episode / env->turns_this_episode; @@ -494,6 +517,14 @@ void add_log(CBackgammon* env) { env->log.hit_rate += (float)env->hits_this_episode / env->moves_this_episode; } + // Count checkers in home board and borne off for white + int home_count = 0; + for (int i = 1; i <= 6; i++) { + if (env->board[i] > 0) home_count += env->board[i]; + } + env->log.checkers_home += home_count; + env->log.checkers_off += env->off[WHITE]; + env->log.n += 1; } @@ -510,8 +541,19 @@ void c_step(CBackgammon* env) { float reward = 0.0f; + int old_off = env->off[WHITE]; + int old_bar_opponent = env->bar[BLACK]; + if (is_legal_move(env, from, die_index)) { make_move(env, from, die_index); + + // Reward shaping + if (env->off[WHITE] > old_off) { + reward += 0.05f; + } + if (env->bar[BLACK] > old_bar_opponent) { + reward += 0.02f; + } } else { reward -= 0.1f; } @@ -550,9 +592,6 @@ void c_step(CBackgammon* env) { compute_observations(env); env->tick++; - if (env->tick % 100 == 0) { - printf("tick=%d white_off=%d black_off=%d\n", env->tick, env->off[WHITE], env->off[BLACK]); - } } diff --git a/pufferlib/ocean/backgammon/backgammon.py b/pufferlib/ocean/backgammon/backgammon.py index d28dee151..56f487a3d 100644 --- a/pufferlib/ocean/backgammon/backgammon.py +++ b/pufferlib/ocean/backgammon/backgammon.py @@ -1,6 +1,6 @@ '''Backgammon environment for PufferLib -python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 +python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 --train.total-timesteps 500_000_000 ''' import gymnasium diff --git a/pufferlib/ocean/backgammon/binding.c b/pufferlib/ocean/backgammon/binding.c index 586ef5e23..f011d6ba1 100644 --- a/pufferlib/ocean/backgammon/binding.c +++ b/pufferlib/ocean/backgammon/binding.c @@ -12,7 +12,10 @@ static int my_log(PyObject* dict, Log* log) { assign_to_dict(dict, "episode_return", log->episode_return); assign_to_dict(dict, "episode_length", log->episode_length); assign_to_dict(dict, "win_rate", log->win_rate); + assign_to_dict(dict, "black_win_rate", log->black_win_rate); assign_to_dict(dict, "avg_moves_per_turn", log->avg_moves_per_turn); assign_to_dict(dict, "hit_rate", log->hit_rate); + assign_to_dict(dict, "checkers_home", log->checkers_home); + assign_to_dict(dict, "checkers_off", log->checkers_off); return 0; } \ No newline at end of file From 95399d4cfce68d7b34f111dd6ee9f98b6d84c8a2 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 10:18:31 -0800 Subject: [PATCH 7/8] readme --- pufferlib/ocean/backgammon/README.md | 53 +++++++++++++++++++++++++ pufferlib/ocean/backgammon/backgammon.h | 4 +- 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 pufferlib/ocean/backgammon/README.md diff --git a/pufferlib/ocean/backgammon/README.md b/pufferlib/ocean/backgammon/README.md new file mode 100644 index 000000000..f87f31992 --- /dev/null +++ b/pufferlib/ocean/backgammon/README.md @@ -0,0 +1,53 @@ +# Backgammon + +A backgammon environment for PufferLib. The agent plays as White against a configurable opponent (Black). + +## Rules + +Standard backgammon rules: 15 checkers per player, roll two dice, move checkers toward your home board, bear them off to win. You can hit opponent blots (single checkers) to send them to the bar. First to bear off all 15 checkers wins. + +White moves from point 24 → 1 (home board is 1-6). +Black moves from point 1 → 24 (home board is 19-24). + +## Observations (35 floats) + +- Points 1-24: checker counts normalized by 15 (positive = white, negative = black) +- Bar counts for white and black +- Borne-off counts for white and black +- 4 dice values (0 if used) +- Current player +- Can-bear-off flags for both players + +## Actions (104 discrete) + +`action = source * 4 + die_index` + +- `source`: 0 = entering from bar, 1-24 = board points, 25 = bear off +- `die_index`: which die to use (0-3, since doubles give 4 moves) + +Invalid moves get a small penalty (-0.1) and are skipped. + +## Rewards + +- Win: +1.0 +- Lose: -1.0 +- Bear off a checker: +0.05 +- Hit opponent: +0.02 +- Invalid move: -0.1 + +## Opponent + +The opponent difficulty is controlled by `OPPONENT_RANDOM_PROB` in `backgammon.h`: +- 1.0 = fully random (easiest) +- 0.0 = greedy heuristic (harder) + +For training, start with a weak opponent and gradually decrease randomness as the agent improves. + +## Training + +```bash +python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 --train.total-timesteps 500_000_000 --train.learning-rate 0.0001 +``` + +Against a random opponent, expect >50% win rate after a few minutes of training. + diff --git a/pufferlib/ocean/backgammon/backgammon.h b/pufferlib/ocean/backgammon/backgammon.h index 878a05752..4d602f1e1 100644 --- a/pufferlib/ocean/backgammon/backgammon.h +++ b/pufferlib/ocean/backgammon/backgammon.h @@ -36,9 +36,9 @@ #define MAX_STEPS 5000 #define DEFAULT_LOG_INTERVAL 128 -// Opponent difficulty: probability of making a random move instead of greedy +// Opponent difficulty // 0.0 = fully greedy, 1.0 = fully random -#define OPPONENT_RANDOM_PROB 0.9f +#define OPPONENT_RANDOM_PROB 1.0f typedef struct Log { From 120d1edf755dbb5798a576e3d168e10f6b128ad7 Mon Sep 17 00:00:00 2001 From: jonah Date: Fri, 9 Jan 2026 10:25:09 -0800 Subject: [PATCH 8/8] update readme --- pufferlib/ocean/backgammon/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/ocean/backgammon/README.md b/pufferlib/ocean/backgammon/README.md index f87f31992..b3c8d92d4 100644 --- a/pufferlib/ocean/backgammon/README.md +++ b/pufferlib/ocean/backgammon/README.md @@ -46,7 +46,7 @@ For training, start with a weak opponent and gradually decrease randomness as th ## Training ```bash -python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 --train.total-timesteps 500_000_000 --train.learning-rate 0.0001 +python -m pufferlib.pufferl train puffer_backgammon --vec.num-envs 64 --env.num-envs 256 --train.batch-size 1048576 --train.bptt-horizon 64 --train.total-timesteps 500_000_000 --train.learning-rate 0.001 ``` Against a random opponent, expect >50% win rate after a few minutes of training.