Boxoban #448

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

TBBristol wants to merge 43 commits into PufferAI:3.0 from TBBristol:boxoban

.gitignore

-Original file line number
+Diff line change
@@ -1,3 +1,6 @@
+    *_maps.bin
+    *_valid.bin
     # Annoying temp files generated by Cython
     c_*.c
     pufferlib/extensions.c
@@ Expand Down Expand Up / @@ -162,3 +165,5 @@ pufferlib/ocean/impulse_wars/*-release/ @@
     pufferlib/ocean/impulse_wars/debug-*/
     pufferlib/ocean/impulse_wars/release-*/
     pufferlib/ocean/impulse_wars/benchmark/
+    *.dSYM/

.gitmodules

Empty file.

pufferlib/config/default.ini

-Original file line number
+Diff line change
@@ Expand Up / @@ -93,7 +93,7 @@ scale = auto @@
     [sweep.train.minibatch_size]
     distribution = uniform_pow2
     min = 8192
-    max = 65536
+    max = 16384
     scale = auto
     [sweep.train.learning_rate]
@@ Expand Down @@

pufferlib/config/ocean/boxoban.ini

-Original file line number
+Diff line change
@@ -0,0 +1,77 @@
+    [base]
+    package = ocean
+    env_name = puffer_boxoban
+    policy_name = Boxoban
+    rnn_name = Recurrent
+    [vec]
+    num_envs = 2
+    [env]
+    num_envs = 1024
+    difficulty = "easy"
+    #reward per intermediate target (once per episode)
+    int_r_coeff = 0.25
+    #moving box off target
+    target_loss_pen_coeff = 0.1
+    #neg reward per step
+    length_reward_coeff = 0.0
+    [policy]
+    [train]
+    #BASIC
+    #adam_beta1 = 0.9398378409770966
+    #adam_beta2 = 0.9989332259552188
+    #adam_eps = 0.00000000000206071635
+    #anneal_lr = true
+    #batch_size = "auto"
+    #bptt_horizon = 64
+    #checkpoint_interval = 200
+    #clip_coef = 0.11478794743865613
+    #ent_coef = 0.0029962808388471485
+    #gae_lambda = 0.8493271024211292
+    #gamma = 0.9993401324579252
+    #learning_rate = 0.014686393387259022
+    #max_grad_norm = 0.9813762605915642
+    #min_lr_ratio = 0.0919479673291089
+    #minibatch_size = 16384
+    #optimizer = "muon"
+    #prio_alpha = 0.9306424191723168
+    #prio_beta0 = 0.6438373386977116
+    #update_epochs = 1
+    #total_timesteps = 30000000
+    #vf_clip_coef = 0.3663806329531388
+    #vf_coef = 2.528717985356681
+    #vtrace_c_clip = 1.2791176791333148
+    #vtrace_rho_clip = 1.1263937056422595
+    #
+    #EASY
+    adam_beta1 = 0.9401745430570272
+    adam_beta2 = 0.9131850488636376
+    adam_eps = 0.00000003606344842944
+    anneal_lr = "true"
+    batch_size = "auto"
+    bptt_horizon = 64
+    clip_coef = 0.03332279377492652
+    ent_coef = 0.052842630147383426
+    gae_lambda = 0.7936070081802409
+    gamma = 0.9589112076898656
+    learning_rate = 0.012534394901687526
+    max_grad_norm = 2.096905570892092
+    max_minibatch_size = 32768
+    min_lr_ratio = 0.28390691472987917
+    minibatch_size = 16384
+    optimizer = "muon"
+    precision = "float32"
+    prio_alpha = 0.974402356259871
+    prio_beta0 = 0.9402320261892596
+    total_timesteps = 74257668
+    update_epochs = 1
+    use_rnn = true
+    vf_clip_coef = 1.5271841942808977
+    vf_coef = 5
+    vtrace_c_clip = 2.7424047105884948
+    vtrace_rho_clip = 2.5409738450112447

pufferlib/models.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,106 @@ @@
     import pufferlib.pytorch
     import pufferlib.spaces
+    import numpy as np
+    import torch
+    import torch.nn as nn
+    import pufferlib
+    class Boxoban(nn.Module):
+        """
+        Observations: always (B, 400) = 4 * (10*10), planes concatenated:
+          [agent_plane(100), target_plane(100), box_plane(100), wall_plane(100)]
+        Each plane is binary/float occupancy. Target+box can co-locate naturally.
+        Embedding per cell:
+          cell_vec = pos_embed[cell] + sum_{type present} type_embed[type]
+        """
+        def __init__(self, env, hidden_size=128, embed_dim=8):
+            super().__init__()
+            self.hidden_size = hidden_size
+            self.embed_dim = embed_dim
+            self.is_multidiscrete = isinstance(env.single_action_space, pufferlib.spaces.MultiDiscrete)
+            self.is_continuous = isinstance(env.single_action_space, pufferlib.spaces.Box)
+            # Fixed layout
+            self.num_types = 4
+            self.num_cells = 100
+            self.obs_n = 400
+            self.type_embed = nn.Embedding(self.num_types, self.embed_dim)
+            self.pos_embed = nn.Embedding(self.num_cells, self.embed_dim)
+            self.encoder = nn.Sequential(
+                pufferlib.pytorch.layer_init(nn.Linear(self.num_cells * self.embed_dim, 2 * hidden_size)),
+                nn.GELU(),
+                pufferlib.pytorch.layer_init(nn.Linear(2 * hidden_size, hidden_size)),
+                nn.GELU(),
+                pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)),
+                nn.GELU(),
+            )
+            if self.is_multidiscrete:
+                self.action_nvec = tuple(env.single_action_space.nvec)
+                num_atns = sum(self.action_nvec)
+                self.decoder = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, num_atns), std=0.01)
+            elif not self.is_continuous:
+                num_atns = env.single_action_space.n
+                self.decoder = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, num_atns), std=0.01)
+            else:
+                self.decoder_mean = pufferlib.pytorch.layer_init(
+                    nn.Linear(hidden_size, env.single_action_space.shape[0]), std=0.01
+                )
+                self.decoder_logstd = nn.Parameter(torch.zeros(1, env.single_action_space.shape[0]))
+            self.value = pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1.0)
+        def forward_eval(self, observations, state=None):
+            hidden = self.encode_observations(observations, state=state)
+            logits, values = self.decode_actions(hidden)
+            return logits, values
+        def forward(self, observations, state=None):
+            return self.forward_eval(observations, state)
+        def encode_observations(self, observations, state=None):
+            # observations: (B, 400)
+            B = observations.shape[0]
+            x = observations
+            if x.shape[1] != self.obs_n:
+                raise ValueError(f"Expected observations shape (B, {self.obs_n}), got {tuple(x.shape)}")
+            if x.dtype not in (torch.float16, torch.float32, torch.bfloat16):
+                x = x.float()
+            # (B, 400) -> (B, 4, 100) -> (B, 100, 4)
+            x = x.view(B, self.num_types, self.num_cells).permute(0, 2, 1).contiguous()
+            # Sum entity-type embeddings for present types
+            type_vec = x @ self.type_embed.weight  # (B, 100, embed_dim)
+            # Add position embedding
+            pos_vec = self.pos_embed.weight.unsqueeze(0).expand(B, -1, -1)  # (B, 100, embed_dim)
+            cell_vec = type_vec + pos_vec
+            flat = cell_vec.view(B, self.num_cells * self.embed_dim)
+            return self.encoder(flat)
+        def decode_actions(self, hidden):
+            if self.is_multidiscrete:
+                logits = self.decoder(hidden).split(self.action_nvec, dim=1)
+            elif self.is_continuous:
+                mean = self.decoder_mean(hidden)
+                logstd = self.decoder_logstd.expand_as(mean)
+                std = torch.exp(logstd)
+                logits = torch.distributions.Normal(mean, std)
+            else:
+                logits = self.decoder(hidden)
+            values = self.value(hidden)
+            return logits, values
     class Default(nn.Module):
         '''Default PyTorch policy. Flattens obs and applies a linear layer.
@@ Expand Down Expand Up / @@ -79,6 +179,7 @@ def encode_observations(self, observations, state=None): @@
                 observations = torch.cat([v.view(batch_size, -1) for v in observations.values()], dim=1)
             else:
                 observations = observations.view(batch_size, -1)
+            breakpoint()
             return self.encoder(observations.float())
         def decode_actions(self, hidden):
@@ Expand Down @@

pufferlib/ocean/boxoban/README.md

-Original file line number
+Diff line change
@@ -0,0 +1,30 @@
+    #### BOXOBAN
+    A simple game based on Sokoban in which the player must push every box onto a target.
+    The boxoban-levels directory contains the levels for the game as .txt files. The difficulty is selected with the environment's 'difficulty' option, which can be 'basic', 'easy', 'medium', 'hard', or 'unfiltered'.
+    Basic - only external walls and one box
+    Easy - only external walls and up to 4 boxes
+    Both of these can be generated with the generate_easy_maps.py script by setting its internal options and output string as required.
+    The hard, medium and unfiltered levels are taken from Google's Boxoban dataset, and the license info is included in the file.
+    These maps are not easy to generate, since they need to be solvable but also interesting; however, those folders contain a very large number of maps (~1M).
+    Medium and unfiltered also have validation sets, though these aren't used.
+    ## The first time each difficulty is used a .bin is generated
+    Play manually using the .c compiled with bash scripts/build_ocean boxoban.
+    You can play different difficulties by adding the argument, e.g. ./boxoban easy. HOWEVER, the .bin needs to have been built first.
+    <img width="315" height="342" alt="image" src="https://github.com/user-attachments/assets/f5ea4eac-ec64-4444-b54a-b06c9ef2d252" />

pufferlib/ocean/boxoban/binding.c

-Original file line number
+Diff line change
@@ -0,0 +1,48 @@
+    #define BOXOBAN_MAPS_IMPLEMENTATION //enables mmap
+    #include "boxoban.h"
+    #define Env Boxoban
+    #include "../env_binding.h"
+    //Map stuff
+    // Reads the required string kwarg 'map_path' and passes it to
+    // boxoban_set_map_path(). Returns 0 on success; on failure returns -1 with
+    // a Python exception set.
+    static int update_map_path(PyObject* kwargs) {
+        // Guard against a NULL kwargs (possible when the caller supplies no
+        // keyword arguments) so it is reported as a missing 'map_path' rather
+        // than dereferenced inside PyDict_GetItemString.
+        PyObject* map_path_obj = NULL;
+        if (kwargs != NULL) {
+            map_path_obj = PyDict_GetItemString(kwargs, "map_path");  // borrowed reference
+        }
+        if (map_path_obj == NULL || !PyUnicode_Check(map_path_obj)) {
+            PyErr_SetString(PyExc_TypeError, "Boxoban requires a string 'map_path' kwarg");
+            return -1;
+        }
+        // PyUnicode_AsUTF8 sets its own exception when it returns NULL.
+        const char* new_path = PyUnicode_AsUTF8(map_path_obj);
+        if (new_path == NULL) {
+            return -1;
+        }
+        if (boxoban_set_map_path(new_path) != 0) {
+            PyErr_SetString(PyExc_RuntimeError, "Failed to set Boxoban map path");
+            return -1;
+        }
+        return 0;
+    }
+    // Binding init hook: sets the map path from kwargs, copies the numeric
+    // kwargs into the env struct, then calls init(env).
+    // Returns 0 on success, or -1 if update_map_path() failed.
+    static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
+        // The map path is set first; nothing is written to env if it fails.
+        if (update_map_path(kwargs) != 0) {
+            return -1;
+        }
+        // NOTE(review): unpack() is defined outside this view and its result is
+        // not checked here — confirm how it reports a missing key.
+        env->size = (int)unpack(kwargs, "size");
+        env->max_steps = (int)unpack(kwargs, "max_steps");
+        env->int_r_coeff = (float)unpack(kwargs, "int_r_coeff");
+        env->target_loss_pen_coeff = (float)unpack(kwargs, "target_loss_pen_coeff");
+        // The kwarg is named "length_reward_coeff" but the struct field is len_reward_coeff.
+        env->len_reward_coeff = (float)unpack(kwargs, "length_reward_coeff");
+        init(env);
+        return 0;
+    }
+    // Binding log hook: writes fields of `log` into the Python dict `dict`
+    // under the keys given below. Always returns 0.
+    static int my_log(PyObject* dict, Log* log) {
+        assign_to_dict(dict, "perf", log->perf);
+        assign_to_dict(dict, "score", log->score);
+        assign_to_dict(dict, "episode_return", log->episode_return);
+        assign_to_dict(dict, "episode_length", log->episode_length);
+        // The Log field n_targets is exported under the key "targets_hit".
+        assign_to_dict(dict, "targets_hit", log->n_targets);
+        return 0;
+    }

pufferlib/ocean/boxoban/boxoban-levels/CONTRIBUTING.md

-Original file line number
+Diff line change
@@ -0,0 +1,23 @@
+    # How to Contribute
+    We'd love to accept your patches and contributions to this project. There are
+    just a few small guidelines you need to follow.
+    ## Contributor License Agreement
+    Contributions to this project must be accompanied by a Contributor License
+    Agreement. You (or your employer) retain the copyright to your contribution,
+    this simply gives us permission to use and redistribute your contributions as
+    part of the project. Head over to <https://cla.developers.google.com/> to see
+    your current agreements on file or to sign a new one.
+    You generally only need to submit a CLA once, so if you've already submitted one
+    (even if it was for a different project), you probably don't need to do it
+    again.
+    ## Code reviews
+    All submissions, including submissions by project members, require review. We
+    use GitHub pull requests for this purpose. Consult
+    [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+    information on using pull requests.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Boxoban #448

Uh oh!

Diff view

Diff view

Uh oh!

There are no files selected for viewing

Uh oh!

Uh oh!

Boxoban #448

Are you sure you want to change the base?

Uh oh!

Boxoban #448

Uh oh!

Diff view

Diff view

Uh oh!

There are no files selected for viewing

Uh oh!

Uh oh!