diff --git a/profile_kernels.cu b/profile_kernels.cu index 6261660e8..cac6d7455 100644 --- a/profile_kernels.cu +++ b/profile_kernels.cu @@ -3,35 +3,39 @@ // // Without torch: nvcc -O3 -arch=sm_80 profile_kernels.cu -o profile_kernels -I. // With torch: Build with cmake/pytorch and -DUSE_TORCH +// With env: ./scripts/build_profile_kernels.sh // -// Run: ./profile_kernels +// Run: ./profile_kernels +// kernels - Individual kernel profiling +// envspeed - Environment step throughput (requires -DUSE_STATIC_ENV) #include #include -#include +#include #include #include #include -#include #include +#include #ifdef USE_TORCH -#include "pufferlib/extensions/pufferlib.cpp" +#include +#include +#include +#include #include "pufferlib/extensions/cuda/kernels.cu" -// #include "pufferlib/extensions/modules.cpp" +#include "pufferlib/extensions/pufferlib.cpp" using namespace pufferlib; #endif -#include "pufferlib/extensions/vecenv.h" - #ifndef USE_TORCH #include "pufferlib/extensions/cuda/kernels.cu" #endif -const int WARMUP_ITERS = 1000; -const int TIMING_ITERS = 10000; -const float TIMEOUT_SEC = 5.0f; +const int WARMUP_ITERS = 100; +const int TIMING_ITERS = 1000; +const float TIMEOUT_SEC = 3.0f; const int BUF = 2; const int BR = 4096; // Rollout batch (no T dim) @@ -47,6 +51,13 @@ void print_timing(const char* name, float ms, int N) { printf(" %-18s %6.1f us %6.2f M elem/s\n", name, ms * 1000, N / ms / 1e3); } +// Wall-clock time for timeout checks (only checked every BATCH_SIZE iters) +float get_time_sec() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec + ts.tv_nsec / 1e9f; +} + void warmup_gpu() { // Warm up GPU clocks with some busy work float* dummy; @@ -58,22 +69,41 @@ void warmup_gpu() { cudaFree(dummy); } +const int BATCH_SIZE = 100; // Check timeout every BATCH_SIZE iterations + float profile_kernel(kernel_fn fn, void* args, const char* name = nullptr) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); + // Warmup with timeout + float warmup_start = get_time_sec(); for (int i = 0; i < WARMUP_ITERS; ++i) { fn(args); - cudaDeviceSynchronize(); + if (i % BATCH_SIZE == 0) { + cudaDeviceSynchronize(); + if (get_time_sec() - warmup_start > TIMEOUT_SEC) break; + } } + cudaDeviceSynchronize(); + // Timed runs with timeout - check wall clock every BATCH_SIZE iters cudaProfilerStart(); if (name) nvtxRangePushA(name); cudaEventRecord(start); - for (int i = 0; i < TIMING_ITERS; ++i) { - fn(args); + + float timing_start = get_time_sec(); + long iters = 0; + float elapsed = 0; + while (elapsed < TIMEOUT_SEC) { + for (int i = 0; i < BATCH_SIZE; ++i) { + fn(args); + } + iters += BATCH_SIZE; + cudaDeviceSynchronize(); + elapsed = get_time_sec() - timing_start; } + cudaEventRecord(stop); cudaEventSynchronize(stop); if (name) nvtxRangePop(); @@ -85,8 +115,10 @@ float profile_kernel(kernel_fn fn, void* args, const char* name = nullptr) { cudaEventDestroy(stop); cudaDeviceSynchronize(); +#ifdef USE_TORCH c10::cuda::CUDACachingAllocator::emptyCache(); - return ms / TIMING_ITERS; +#endif + return ms / iters; } #ifdef USE_TORCH @@ -96,13 +128,20 @@ float profile_graph(kernel_fn fn, void* args, const char* name = nullptr) { at::cuda::CUDAGraph cuda_graph; at::cuda::CUDAStream current_stream = at::cuda::getCurrentCUDAStream(); + // Warmup with timeout at::cuda::CUDAStream warmup_stream = at::cuda::getStreamFromPool(); at::cuda::setCurrentCUDAStream(warmup_stream); + float warmup_start = get_time_sec(); for (int i = 0; i < WARMUP_ITERS; ++i) { fn(args); + if (i % 
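// ---------------------------------------------------------------------------
// Sketch (illustration only, not part of the diff): the timeout-bounded timing
// pattern used by profile_kernel above, in isolation. Assumes only the CUDA
// runtime API plus the get_time_sec/BATCH_SIZE/TIMEOUT_SEC definitions in this
// file. GPU time comes from cudaEvents bracketing the whole run; the wall
// clock only bounds total duration, so the reported cost is event_ms / iters.
static float time_kernel_sketch(void (*fn)(void*), void* args) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    float t0 = get_time_sec();
    long iters = 0;
    while (get_time_sec() - t0 < TIMEOUT_SEC) {
        for (int i = 0; i < BATCH_SIZE; ++i) fn(args);  // enqueue a batch of launches
        iters += BATCH_SIZE;
        cudaDeviceSynchronize();                         // drain before checking the clock
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;  // average milliseconds per call
}
// ---------------------------------------------------------------------------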
BATCH_SIZE == 0) { + warmup_stream.synchronize(); + if (get_time_sec() - warmup_start > TIMEOUT_SEC) break; + } } warmup_stream.synchronize(); + // Capture graph at::cuda::CUDAStream cap_stream = at::cuda::getStreamFromPool(); at::cuda::setCurrentCUDAStream(cap_stream); cuda_graph.capture_begin(); @@ -117,12 +156,23 @@ float profile_graph(kernel_fn fn, void* args, const char* name = nullptr) { cudaEventCreate(&start); cudaEventCreate(&stop); + // Timed runs with timeout cudaProfilerStart(); if (name) nvtxRangePushA(name); cudaEventRecord(start); - for (int i = 0; i < TIMING_ITERS; ++i) { - cuda_graph.replay(); + + float timing_start = get_time_sec(); + long iters = 0; + float elapsed = 0; + while (elapsed < TIMEOUT_SEC) { + for (int i = 0; i < BATCH_SIZE; ++i) { + cuda_graph.replay(); + } + iters += BATCH_SIZE; + cudaDeviceSynchronize(); + elapsed = get_time_sec() - timing_start; } + cudaEventRecord(stop); cudaEventSynchronize(stop); if (name) nvtxRangePop(); @@ -133,7 +183,7 @@ float profile_graph(kernel_fn fn, void* args, const char* name = nullptr) { cudaEventDestroy(start); cudaEventDestroy(stop); - return ms / TIMING_ITERS; + return ms / iters; } #endif @@ -899,6 +949,489 @@ void profile_fusedscan(int batch, int seq, int hidden) { free_fusedscanargs(args); } +// ============================================================================= +// FCMax: Simple FC -> Max kernel (no intermediate ReLU layer) +// Input: x (B, N, D_in), W (D_out, D_in), b (D_out) +// Output: (B, D_out) = max_over_N(x @ W.T + b) +// ============================================================================= + +typedef struct { + float* x; // (B, N, D_in) + float* W; // (D_out, D_in) + float* b; // (D_out) + float* out; // (B, D_out) + int* argmax_indices; // (B, D_out) + float* grad_x; // (B, N, D_in) + float* grad_W; // (D_out, D_in) + float* grad_b; // (D_out) + float* grad_out; // (B, D_out) + int B; + int N; + int D_in; + int D_out; +} FCMaxArgs; + +FCMaxArgs* create_fcmaxargs(int batch, int num_points, int d_in, int d_out) { + FCMaxArgs* args = (FCMaxArgs*)calloc(1, sizeof(FCMaxArgs)); + args->B = batch; + args->N = num_points; + args->D_in = d_in; + args->D_out = d_out; + + int N_x = batch * num_points * d_in; + int N_W = d_out * d_in; + int N_out = batch * d_out; + + cudaMalloc(&args->x, N_x * sizeof(float)); + cudaMalloc(&args->W, N_W * sizeof(float)); + cudaMalloc(&args->b, d_out * sizeof(float)); + cudaMalloc(&args->out, N_out * sizeof(float)); + cudaMalloc(&args->argmax_indices, N_out * sizeof(int)); + cudaMalloc(&args->grad_x, N_x * sizeof(float)); + cudaMalloc(&args->grad_W, N_W * sizeof(float)); + cudaMalloc(&args->grad_b, d_out * sizeof(float)); + cudaMalloc(&args->grad_out, N_out * sizeof(float)); + + float* x_buf = (float*)malloc(N_x * sizeof(float)); + float* W_buf = (float*)malloc(N_W * sizeof(float)); + float* b_buf = (float*)malloc(d_out * sizeof(float)); + float* grad_out_buf = (float*)malloc(N_out * sizeof(float)); + + for (int i = 0; i < N_x; ++i) x_buf[i] = rand1(); + for (int i = 0; i < N_W; ++i) W_buf[i] = rand1() * 0.1f; + for (int i = 0; i < d_out; ++i) b_buf[i] = 0.0f; + for (int i = 0; i < N_out; ++i) grad_out_buf[i] = rand1(); + + cudaMemcpy(args->x, x_buf, N_x * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->W, W_buf, N_W * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->b, b_buf, d_out * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_out, grad_out_buf, N_out * sizeof(float), cudaMemcpyHostToDevice); + + free(x_buf); + free(W_buf); + 
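// ---------------------------------------------------------------------------
// Sketch (illustration only): a minimal capture/replay cycle with
// at::cuda::CUDAGraph, mirroring profile_graph above. Assumes the libtorch
// CUDA Graph API (capture_begin/capture_end/replay), that fn enqueues only
// stream-capturable work onto the current stream, and that capture happens on
// a non-default stream (hence getStreamFromPool).
static void graph_replay_sketch(void (*fn)(void*), void* args, int replays) {
    at::cuda::CUDAGraph graph;
    at::cuda::CUDAStream stream = at::cuda::getStreamFromPool();
    at::cuda::setCurrentCUDAStream(stream);
    fn(args);                       // warm up allocations outside capture
    stream.synchronize();
    graph.capture_begin();
    fn(args);                       // work recorded into the graph
    graph.capture_end();
    for (int i = 0; i < replays; ++i) graph.replay();
    cudaDeviceSynchronize();
}
// ---------------------------------------------------------------------------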
free(b_buf); + free(grad_out_buf); + return args; +} + +void free_fcmaxargs(FCMaxArgs* args) { + cudaFree(args->x); + cudaFree(args->W); + cudaFree(args->b); + cudaFree(args->out); + cudaFree(args->argmax_indices); + cudaFree(args->grad_x); + cudaFree(args->grad_W); + cudaFree(args->grad_b); + cudaFree(args->grad_out); + free(args); +} + +void run_fcmax_forward(FCMaxArgs* args) { + launch_fc_max_forward_float( + args->out, args->argmax_indices, + args->x, args->W, args->b, + args->B, args->N, args->D_in, args->D_out, 0); +} + +void run_fcmax_backward(FCMaxArgs* args) { + cudaMemset(args->grad_x, 0, args->B * args->N * args->D_in * sizeof(float)); + cudaMemset(args->grad_W, 0, args->D_out * args->D_in * sizeof(float)); + cudaMemset(args->grad_b, 0, args->D_out * sizeof(float)); + + launch_fc_max_backward_float( + args->grad_x, args->grad_W, args->grad_b, + args->grad_out, args->x, args->W, + args->argmax_indices, + args->B, args->N, args->D_in, args->D_out, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor x; // (B, N, D_in) + torch::Tensor W; // (D_out, D_in) + torch::Tensor b; // (D_out) + torch::Tensor out; // (B, D_out) + torch::Tensor grad_out;// (B, D_out) + int B, N, D_in, D_out; +} FCMaxArgsTorch; + +FCMaxArgsTorch* create_fcmaxargs_torch(FCMaxArgs* raw) { + FCMaxArgsTorch* args = new FCMaxArgsTorch(); + args->B = raw->B; + args->N = raw->N; + args->D_in = raw->D_in; + args->D_out = raw->D_out; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->x = torch::from_blob(raw->x, {raw->B, raw->N, raw->D_in}, opts).clone().requires_grad_(true); + args->W = torch::from_blob(raw->W, {raw->D_out, raw->D_in}, opts).clone().requires_grad_(true); + args->b = torch::from_blob(raw->b, {raw->D_out}, opts).clone().requires_grad_(true); + args->grad_out = torch::from_blob(raw->grad_out, {raw->B, raw->D_out}, opts).clone(); + + return args; +} + +void run_fcmax_forward_torch(FCMaxArgsTorch* args) { + torch::NoGradGuard no_grad; + fc_max(args->x, args->W, args->b); +} + +void run_fcmax_backward_torch(FCMaxArgsTorch* args) { + // Recompute forward each time since backward frees the graph + auto out = fc_max(args->x, args->W, args->b); + if (args->x.grad().defined()) args->x.grad().zero_(); + if (args->W.grad().defined()) args->W.grad().zero_(); + if (args->b.grad().defined()) args->b.grad().zero_(); + out.backward(args->grad_out); +} + +void run_fcmax_forward_cpp(FCMaxArgsTorch* args) { + torch::NoGradGuard no_grad; + fc_max_cpp(args->x, args->W, args->b); +} + +void test_fcmax_correct(FCMaxArgsTorch* args) { + auto x_fused = args->x.detach().clone().requires_grad_(true); + auto W_fused = args->W.detach().clone().requires_grad_(true); + auto b_fused = args->b.detach().clone().requires_grad_(true); + auto fused_out = fc_max(x_fused, W_fused, b_fused); + + auto x_ref = args->x.detach().clone().requires_grad_(true); + auto W_ref = args->W.detach().clone().requires_grad_(true); + auto b_ref = args->b.detach().clone().requires_grad_(true); + auto ref_out = fc_max_cpp(x_ref, W_ref, b_ref); + + float rtol = 1e-3f, atol = 1e-4f; + bool out_match = torch::allclose(fused_out, ref_out, rtol, atol); + float out_max_diff = (fused_out - ref_out).abs().max().item(); + + printf(" forward correctness: out=%s(%.2e)\n", + out_match ? 
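// ---------------------------------------------------------------------------
// Sketch (illustration only): CPU reference for the fc_max forward semantics,
// useful for spot-checking launch_fc_max_forward_float on small shapes.
// Follows the layout documented above: x (B, N, D_in), W (D_out, D_in),
// b (D_out), out (B, D_out) = max over N of (x @ W.T + b).
static void fc_max_forward_cpu_ref(float* out, int* argmax,
                                   const float* x, const float* W, const float* b,
                                   int B, int N, int D_in, int D_out) {
    for (int bi = 0; bi < B; ++bi) {
        for (int o = 0; o < D_out; ++o) {
            float best = -INFINITY;
            int best_n = 0;
            for (int n = 0; n < N; ++n) {
                float v = b[o];
                for (int d = 0; d < D_in; ++d)
                    v += x[(bi * N + n) * D_in + d] * W[o * D_in + d];
                if (v > best) { best = v; best_n = n; }
            }
            out[bi * D_out + o] = best;
            argmax[bi * D_out + o] = best_n;
        }
    }
}
// ---------------------------------------------------------------------------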
"\033[32mok\033[0m" : "\033[31mFAIL\033[0m", out_max_diff); + + // Backward + fused_out.backward(args->grad_out); + ref_out.backward(args->grad_out); + + bool grad_x_match = torch::allclose(x_fused.grad(), x_ref.grad(), rtol, atol); + float grad_x_max_diff = (x_fused.grad() - x_ref.grad()).abs().max().item(); + bool grad_W_match = torch::allclose(W_fused.grad(), W_ref.grad(), rtol, atol); + float grad_W_max_diff = (W_fused.grad() - W_ref.grad()).abs().max().item(); + + printf(" backward correctness: grad_x=%s(%.2e) grad_W=%s(%.2e)\n", + grad_x_match ? "\033[32mok\033[0m" : "\033[31mFAIL\033[0m", grad_x_max_diff, + grad_W_match ? "\033[32mok\033[0m" : "\033[31mFAIL\033[0m", grad_W_max_diff); +} + +#endif + +void profile_fcmax(int batch, int num_points, int d_in, int d_out) { + FCMaxArgs* args = create_fcmaxargs(batch, num_points, d_in, d_out); + + printf("fc_max (B=%d, N=%d, D_in=%d, D_out=%d)\n", batch, num_points, d_in, d_out); + + float fwd_ms = profile_kernel((kernel_fn)run_fcmax_forward, args); + print_timing("\tforward", fwd_ms, batch); + + float bwd_ms = profile_kernel((kernel_fn)run_fcmax_backward, args); + print_timing("\tbackward", bwd_ms, batch); + +#ifdef USE_TORCH + FCMaxArgsTorch* args_torch = create_fcmaxargs_torch(args); + + test_fcmax_correct(args_torch); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_fcmax_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, batch); + + args_torch->out = fc_max(args_torch->x, args_torch->W, args_torch->b); + + float bwd_torch_ms = profile_kernel((kernel_fn)run_fcmax_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, batch); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_fcmax_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch); + + float fwd_graph_ms = profile_graph((kernel_fn)run_fcmax_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch); + + delete args_torch; +#endif + printf("\n"); + + free_fcmaxargs(args); +} + +// FCReluFCMax: Fused FC -> ReLU -> FC -> Max kernel +// Input: x (B, N, D_in), W1 (D_mid, D_in), b1 (D_mid), W2 (D_out, D_mid), b2 (D_out) +// Output: (B, D_out) = max_over_N(FC2(ReLU(FC1(x)))) +typedef struct { + float* x; // (B, N, D_in) + float* W1; // (D_mid, D_in) + float* b1; // (D_mid) + float* W2; // (D_out, D_mid) + float* b2; // (D_out) + float* out; // (B, D_out) + int* argmax_indices; // (B, D_out) + float* fc1_at_argmax; // (B, D_out, D_mid) + float* grad_x; // (B, N, D_in) + float* grad_W1; // (D_mid, D_in) + float* grad_b1; // (D_mid) + float* grad_W2; // (D_out, D_mid) + float* grad_b2; // (D_out) + float* grad_out; // (B, D_out) + int B; + int N; + int D_in; + int D_mid; + int D_out; +} FCReluFCMaxArgs; + +FCReluFCMaxArgs* create_fcrelufcmaxargs(int batch, int num_points, int d_in, int d_mid, int d_out) { + FCReluFCMaxArgs* args = (FCReluFCMaxArgs*)calloc(1, sizeof(FCReluFCMaxArgs)); + args->B = batch; + args->N = num_points; + args->D_in = d_in; + args->D_mid = d_mid; + args->D_out = d_out; + + int N_x = batch * num_points * d_in; + int N_W1 = d_mid * d_in; + int N_W2 = d_out * d_mid; + int N_out = batch * d_out; + int N_fc1_at_argmax = batch * d_out * d_mid; + + cudaMalloc(&args->x, N_x * sizeof(float)); + cudaMalloc(&args->W1, N_W1 * sizeof(float)); + cudaMalloc(&args->b1, d_mid * sizeof(float)); + cudaMalloc(&args->W2, N_W2 * sizeof(float)); + cudaMalloc(&args->b2, d_out * sizeof(float)); + cudaMalloc(&args->out, N_out * sizeof(float)); + cudaMalloc(&args->argmax_indices, N_out * 
sizeof(int)); + cudaMalloc(&args->fc1_at_argmax, N_fc1_at_argmax * sizeof(float)); + cudaMalloc(&args->grad_x, N_x * sizeof(float)); + cudaMalloc(&args->grad_W1, N_W1 * sizeof(float)); + cudaMalloc(&args->grad_b1, d_mid * sizeof(float)); + cudaMalloc(&args->grad_W2, N_W2 * sizeof(float)); + cudaMalloc(&args->grad_b2, d_out * sizeof(float)); + cudaMalloc(&args->grad_out, N_out * sizeof(float)); + + // Allocate and initialize host buffers + float* x_buf = (float*)malloc(N_x * sizeof(float)); + float* W1_buf = (float*)malloc(N_W1 * sizeof(float)); + float* b1_buf = (float*)malloc(d_mid * sizeof(float)); + float* W2_buf = (float*)malloc(N_W2 * sizeof(float)); + float* b2_buf = (float*)malloc(d_out * sizeof(float)); + float* grad_out_buf = (float*)malloc(N_out * sizeof(float)); + + for (int i = 0; i < N_x; ++i) x_buf[i] = rand1(); + for (int i = 0; i < N_W1; ++i) W1_buf[i] = rand1() * 0.1f; + for (int i = 0; i < d_mid; ++i) b1_buf[i] = 0.0f; + for (int i = 0; i < N_W2; ++i) W2_buf[i] = rand1() * 0.1f; + for (int i = 0; i < d_out; ++i) b2_buf[i] = 0.0f; + for (int i = 0; i < N_out; ++i) grad_out_buf[i] = rand1(); + + cudaMemcpy(args->x, x_buf, N_x * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->W1, W1_buf, N_W1 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->b1, b1_buf, d_mid * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->W2, W2_buf, N_W2 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->b2, b2_buf, d_out * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(args->grad_out, grad_out_buf, N_out * sizeof(float), cudaMemcpyHostToDevice); + + free(x_buf); + free(W1_buf); + free(b1_buf); + free(W2_buf); + free(b2_buf); + free(grad_out_buf); + return args; +} + +void free_fcrelufcmaxargs(FCReluFCMaxArgs* args) { + cudaFree(args->x); + cudaFree(args->W1); + cudaFree(args->b1); + cudaFree(args->W2); + cudaFree(args->b2); + cudaFree(args->out); + cudaFree(args->argmax_indices); + cudaFree(args->fc1_at_argmax); + cudaFree(args->grad_x); + cudaFree(args->grad_W1); + cudaFree(args->grad_b1); + cudaFree(args->grad_W2); + cudaFree(args->grad_b2); + cudaFree(args->grad_out); + free(args); +} + +void run_fcrelufcmax_forward(FCReluFCMaxArgs* args) { + launch_fc_relu_fc_max_forward( + args->out, args->argmax_indices, args->fc1_at_argmax, + args->x, args->W1, args->b1, args->W2, args->b2, + args->B, args->N, args->D_in, args->D_mid, args->D_out, 0); +} + +void run_fcrelufcmax_backward(FCReluFCMaxArgs* args) { + // Zero gradients before backward (kernel uses atomicAdd) + cudaMemset(args->grad_x, 0, args->B * args->N * args->D_in * sizeof(float)); + cudaMemset(args->grad_W1, 0, args->D_mid * args->D_in * sizeof(float)); + cudaMemset(args->grad_b1, 0, args->D_mid * sizeof(float)); + cudaMemset(args->grad_W2, 0, args->D_out * args->D_mid * sizeof(float)); + cudaMemset(args->grad_b2, 0, args->D_out * sizeof(float)); + + launch_fc_relu_fc_max_backward( + args->grad_x, args->grad_W1, args->grad_b1, args->grad_W2, args->grad_b2, + args->grad_out, args->x, args->W1, args->W2, + args->argmax_indices, args->fc1_at_argmax, + args->B, args->N, args->D_in, args->D_mid, args->D_out, 0); +} + +#ifdef USE_TORCH + +typedef struct { + torch::Tensor x; // (B, N, D_in) + torch::Tensor W1; // (D_mid, D_in) + torch::Tensor b1; // (D_mid) + torch::Tensor W2; // (D_out, D_mid) + torch::Tensor b2; // (D_out) + torch::Tensor out; // (B, D_out) + torch::Tensor grad_out;// (B, D_out) + int B; + int N; + int D_in; + int D_mid; + int D_out; +} FCReluFCMaxArgsTorch; + +FCReluFCMaxArgsTorch* 
create_fcrelufcmaxargs_torch(FCReluFCMaxArgs* raw) { + FCReluFCMaxArgsTorch* args = new FCReluFCMaxArgsTorch(); + args->B = raw->B; + args->N = raw->N; + args->D_in = raw->D_in; + args->D_mid = raw->D_mid; + args->D_out = raw->D_out; + + auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + args->x = torch::from_blob(raw->x, {raw->B, raw->N, raw->D_in}, opts).requires_grad_(true); + args->W1 = torch::from_blob(raw->W1, {raw->D_mid, raw->D_in}, opts).requires_grad_(true); + args->b1 = torch::from_blob(raw->b1, {raw->D_mid}, opts).requires_grad_(true); + args->W2 = torch::from_blob(raw->W2, {raw->D_out, raw->D_mid}, opts).requires_grad_(true); + args->b2 = torch::from_blob(raw->b2, {raw->D_out}, opts).requires_grad_(true); + args->grad_out = torch::from_blob(raw->grad_out, {raw->B, raw->D_out}, opts); + + return args; +} + +void run_fcrelufcmax_forward_torch(FCReluFCMaxArgsTorch* args) { + torch::NoGradGuard no_grad; + fc_relu_fc_max(args->x, args->W1, args->b1, args->W2, args->b2); +} + +void run_fcrelufcmax_backward_torch(FCReluFCMaxArgsTorch* args) { + // Recompute forward each time since backward frees the graph + auto out = fc_relu_fc_max(args->x, args->W1, args->b1, args->W2, args->b2); + args->x.mutable_grad() = torch::Tensor(); + args->W1.mutable_grad() = torch::Tensor(); + args->b1.mutable_grad() = torch::Tensor(); + args->W2.mutable_grad() = torch::Tensor(); + args->b2.mutable_grad() = torch::Tensor(); + out.backward(args->grad_out); +} + +void run_fcrelufcmax_forward_cpp(FCReluFCMaxArgsTorch* args) { + torch::NoGradGuard no_grad; + fc_relu_fc_max_cpp(args->x, args->W1, args->b1, args->W2, args->b2); +} + +void test_fcrelufcmax_correct(FCReluFCMaxArgsTorch* args) { + // Run fused kernel forward + auto x_fused = args->x.detach().clone().requires_grad_(true); + auto W1_fused = args->W1.detach().clone().requires_grad_(true); + auto b1_fused = args->b1.detach().clone().requires_grad_(true); + auto W2_fused = args->W2.detach().clone().requires_grad_(true); + auto b2_fused = args->b2.detach().clone().requires_grad_(true); + auto fused_out = fc_relu_fc_max(x_fused, W1_fused, b1_fused, W2_fused, b2_fused); + + // Run reference (unfused) forward + auto x_ref = args->x.detach().clone().requires_grad_(true); + auto W1_ref = args->W1.detach().clone().requires_grad_(true); + auto b1_ref = args->b1.detach().clone().requires_grad_(true); + auto W2_ref = args->W2.detach().clone().requires_grad_(true); + auto b2_ref = args->b2.detach().clone().requires_grad_(true); + auto ref_out = fc_relu_fc_max_cpp(x_ref, W1_ref, b1_ref, W2_ref, b2_ref); + + // Numerical comparison + float rtol = 1e-3f, atol = 1e-4f; + bool out_match = torch::allclose(fused_out, ref_out, rtol, atol); + float out_max_diff = (fused_out - ref_out).abs().max().item(); + + printf(" forward correctness: out=%s(%.2e)\n", + out_match ? 
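// Note (illustration only): the correctness check above compares the fused op
// against the reference on fresh leaf tensors built with
// detach().clone().requires_grad_(true), so each path accumulates gradients
// into its own .grad buffers and the two backward passes cannot contaminate
// each other.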
"\033[32mok\033[0m" : "\033[31mFAIL\033[0m", out_max_diff); + + // Test backward pass + torch::autograd::backward({fused_out}, {args->grad_out}); + torch::autograd::backward({ref_out}, {args->grad_out}); + + bool grad_x_match = torch::allclose(x_fused.grad(), x_ref.grad(), rtol, atol); + float grad_x_max_diff = (x_fused.grad() - x_ref.grad()).abs().max().item(); + bool grad_W1_match = torch::allclose(W1_fused.grad(), W1_ref.grad(), rtol, atol); + float grad_W1_max_diff = (W1_fused.grad() - W1_ref.grad()).abs().max().item(); + bool grad_W2_match = torch::allclose(W2_fused.grad(), W2_ref.grad(), rtol, atol); + float grad_W2_max_diff = (W2_fused.grad() - W2_ref.grad()).abs().max().item(); + + printf(" backward correctness: grad_x=%s(%.2e) grad_W1=%s(%.2e) grad_W2=%s(%.2e)\n", + grad_x_match ? "\033[32mok\033[0m" : "\033[31mFAIL\033[0m", grad_x_max_diff, + grad_W1_match ? "\033[32mok\033[0m" : "\033[31mFAIL\033[0m", grad_W1_max_diff, + grad_W2_match ? "\033[32mok\033[0m" : "\033[31mFAIL\033[0m", grad_W2_max_diff); +} + +#endif + +void profile_fcrelufcmax(int batch, int num_points, int d_in, int d_mid, int d_out) { + FCReluFCMaxArgs* args = create_fcrelufcmaxargs(batch, num_points, d_in, d_mid, d_out); + + printf("fc_relu_fc_max (B=%d, N=%d, D_in=%d, D_mid=%d, D_out=%d)\n", + batch, num_points, d_in, d_mid, d_out); + + float fwd_ms = profile_kernel((kernel_fn)run_fcrelufcmax_forward, args); + print_timing("\tforward", fwd_ms, batch); + + float bwd_ms = profile_kernel((kernel_fn)run_fcrelufcmax_backward, args); + print_timing("\tbackward", bwd_ms, batch); + +#ifdef USE_TORCH + FCReluFCMaxArgsTorch* args_torch = create_fcrelufcmaxargs_torch(args); + + test_fcrelufcmax_correct(args_torch); + + float fwd_torch_ms = profile_kernel((kernel_fn)run_fcrelufcmax_forward_torch, args_torch); + print_timing("\tforward (torch)", fwd_torch_ms, batch); + + args_torch->out = fc_relu_fc_max(args_torch->x, args_torch->W1, args_torch->b1, args_torch->W2, args_torch->b2); + + float bwd_torch_ms = profile_kernel((kernel_fn)run_fcrelufcmax_backward_torch, args_torch); + print_timing("\tbackward (torch)", bwd_torch_ms, batch); + + float fwd_cpp_ms = profile_kernel((kernel_fn)run_fcrelufcmax_forward_cpp, args_torch); + print_timing("\tforward (cpp)", fwd_cpp_ms, batch); + + args_torch->out = fc_relu_fc_max_cpp(args_torch->x, args_torch->W1, args_torch->b1, args_torch->W2, args_torch->b2); + + float bwd_cpp_ms = profile_kernel((kernel_fn)run_fcrelufcmax_backward_torch, args_torch); + print_timing("\tbackward (cpp)", bwd_cpp_ms, batch); + + float fwd_graph_ms = profile_graph((kernel_fn)run_fcrelufcmax_forward_cpp, args_torch); + print_timing("\tforward (graph)", fwd_graph_ms, batch); + + delete args_torch; +#endif + printf("\n"); + + free_fcrelufcmaxargs(args); +} + typedef struct { float* logits; float* values_pred; @@ -1461,7 +1994,6 @@ void profile_ppoloss(int batch, int seq, int actions) { free_ppolossargs(args); } -/* // ============================================================================ // sample_logits profiling // ============================================================================ @@ -1469,10 +2001,11 @@ void profile_ppoloss(int batch, int seq, int actions) { typedef struct { float* logits; // (B, A) float* value; // (B, 1) - double* actions; // (B,) - float64 for discrete/continuous compatibility + double* actions; // (B, 1) - float64 for discrete/continuous compatibility float* logprobs; // (B,) float* value_out; // (B,) int64_t* offset; // RNG offset (on device for CUDA graph support) + int* 
act_sizes; // (1,) - single action head uint64_t seed; int B; int A; @@ -1488,17 +2021,18 @@ SampleLogitsArgs* create_samplelogitsargs(int batch, int num_actions) { int N_batch = batch; cudaMalloc(&args->logits, N_logits * sizeof(float)); - cudaMalloc(&args->value, N_batch * sizeof(float)); // (B, 1) flattened + cudaMalloc(&args->value, N_batch * sizeof(float)); cudaMalloc(&args->actions, N_batch * sizeof(double)); cudaMalloc(&args->logprobs, N_batch * sizeof(float)); cudaMalloc(&args->value_out, N_batch * sizeof(float)); cudaMalloc(&args->offset, sizeof(int64_t)); - cudaMemset(args->offset, 0, sizeof(int64_t)); // Initialize offset to 0 + cudaMalloc(&args->act_sizes, sizeof(int)); + cudaMemset(args->offset, 0, sizeof(int64_t)); + cudaMemcpy(args->act_sizes, &num_actions, sizeof(int), cudaMemcpyHostToDevice); float* logits_buf = (float*)malloc(N_logits * sizeof(float)); float* value_buf = (float*)malloc(N_batch * sizeof(float)); - // Initialize logits and value with random values for (int i = 0; i < N_logits; ++i) { logits_buf[i] = rand1() * 5.0f; } @@ -1521,6 +2055,7 @@ void free_samplelogitsargs(SampleLogitsArgs* args) { cudaFree(args->logprobs); cudaFree(args->value_out); cudaFree(args->offset); + cudaFree(args->act_sizes); free(args); } @@ -1528,12 +2063,10 @@ void run_samplelogits_forward(SampleLogitsArgs* args) { launch_sample_logits( args->actions, args->logprobs, args->value_out, args->logits, args->value, - args->seed, args->offset, - args->A, args->B, - args->A, // logits_stride = A (contiguous) - 1, // value_stride = 1 (contiguous, 1D) - 0); - // Note: not incrementing offset here since this is just for profiling + args->act_sizes, args->seed, args->offset, + 1, // num_atns + args->B, args->A, 1, // B, logits_stride, value_stride + 0); // stream } #ifdef USE_TORCH @@ -1541,10 +2074,11 @@ void run_samplelogits_forward(SampleLogitsArgs* args) { typedef struct { torch::Tensor logits; // (B, A) torch::Tensor value; // (B, 1) - input - torch::Tensor actions; // (B,) float64 - output + torch::Tensor actions; // (B, 1) float64 - output torch::Tensor logprobs; // (B,) - output torch::Tensor value_out; // (B,) - output torch::Tensor offset; // (1,) int64 - RNG offset tensor + torch::Tensor act_sizes; // (1,) int32 - action sizes uint64_t seed; int B; int A; @@ -1559,18 +2093,19 @@ SampleLogitsArgsTorch* create_samplelogitsargs_torch(SampleLogitsArgs* raw) { auto opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); args->logits = torch::from_blob(raw->logits, {raw->B, raw->A}, opts); args->value = torch::from_blob(raw->value, {raw->B, 1}, opts); - args->actions = torch::empty({raw->B}, opts.dtype(torch::kFloat64)); + args->actions = torch::empty({raw->B, 1}, opts.dtype(torch::kFloat64)); args->logprobs = torch::empty({raw->B}, opts); args->value_out = torch::empty({raw->B}, opts); args->offset = torch::zeros({1}, opts.dtype(torch::kInt64)); + args->act_sizes = torch::tensor({raw->A}, torch::dtype(torch::kInt32).device(torch::kCUDA)); return args; } void run_samplelogits_forward_torch(SampleLogitsArgsTorch* args) { torch::NoGradGuard no_grad; - sample_logits(args->logits, args->value, args->actions, args->logprobs, args->value_out, args->seed, args->offset); - args->offset.add_(1); // Increment with CUDA op + sample_logits(args->logits, args->value, args->actions, args->logprobs, + args->value_out, args->act_sizes, args->seed, args->offset); } void run_samplelogits_forward_cpp(SampleLogitsArgsTorch* args) { @@ -1606,45 +2141,57 @@ void profile_samplelogits(int batch, 
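// ---------------------------------------------------------------------------
// Sketch (illustration only): condensed host-side usage of the sampler
// profiled above, for a single discrete head of A actions over a batch of B.
// Argument order follows the launch_sample_logits_float wrapper declared in
// kernels.cu; logits/value are left uninitialized here and must be filled
// before a real call.
static void sample_logits_usage_sketch(int B, int A, uint64_t seed) {
    float *logits, *value, *logprobs, *value_out;
    double* actions;
    int* act_sizes;
    int64_t* offset;
    cudaMalloc(&logits, B * A * sizeof(float));
    cudaMalloc(&value, B * sizeof(float));
    cudaMalloc(&logprobs, B * sizeof(float));
    cudaMalloc(&value_out, B * sizeof(float));
    cudaMalloc(&actions, B * sizeof(double));
    cudaMalloc(&act_sizes, sizeof(int));
    cudaMalloc(&offset, sizeof(int64_t));
    cudaMemcpy(act_sizes, &A, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(offset, 0, sizeof(int64_t));
    launch_sample_logits_float(actions, logprobs, value_out, logits, value,
                               act_sizes, seed, offset,
                               /*num_atns=*/1, B, /*logits_stride=*/A,
                               /*value_stride=*/1, /*stream=*/0);
    cudaDeviceSynchronize();
    // ... read back actions/logprobs, then cudaFree everything ...
}
// ---------------------------------------------------------------------------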
int num_actions) { free_samplelogitsargs(args); } -*/ // ============================================================================ -// forward_call profiling (inference forward pass) - using GraphBuf +// OUTDATED TESTS BELOW - GraphBuf and DLL loading no longer used +// Uncomment and update when needed +// ============================================================================ + +// ============================================================================ +// forward_call profiling (inference forward pass) // ============================================================================ #ifdef USE_TORCH typedef struct { std::shared_ptr policy; - GraphBuf graph; + Tensor obs; + Tensor state; + Tensor actions; + Tensor logprobs; + Tensor values; Tensor rng_offset; Tensor act_sizes; Tensor act_sizes_cpu; uint64_t seed; bool use_kernels; + int batch; + int num_atns; } ForwardCallArgs; ForwardCallArgs* create_forwardcallargs(int batch, int input_size, int hidden_size, int act_n, int num_layers, bool use_kernels) { - // act_n = total action space size (decoder output size) - // For discrete: num_action_heads=1, act_sizes=[act_n] - // For multidiscrete: num_action_heads=len(act_sizes), act_n=sum(act_sizes) int num_action_heads = 1; // Using discrete for profiling + int expansion_factor = 1; ForwardCallArgs* args = new ForwardCallArgs(); args->use_kernels = use_kernels; args->seed = 42; + args->batch = batch; + args->num_atns = num_action_heads; // Create policy with default encoder/decoder auto enc = std::make_shared(input_size, hidden_size); auto dec = std::make_shared(hidden_size, act_n); - args->policy = std::make_shared(enc, dec, input_size, act_n, hidden_size, 1, num_layers, use_kernels); + args->policy = std::make_shared(enc, dec, input_size, act_n, hidden_size, expansion_factor, num_layers, use_kernels); args->policy->to(torch::kCUDA); - // Use create_graph factory (minibatch_segments=0 since not used for inference) - // num_action_heads=1 for discrete action space - args->graph = create_graph(batch, input_size, 0, 0, num_layers, hidden_size, 1, num_action_heads, args->policy.get()); - args->graph.obs = torch::randn({batch, input_size}, torch::dtype(DTYPE).device(torch::kCUDA)); + // Create tensors directly + args->obs = torch::randn({batch, input_size}, torch::dtype(DTYPE).device(torch::kCUDA)); + args->state = args->policy->initial_state(batch, torch::kCUDA); + args->actions = torch::zeros({batch, num_action_heads}, torch::dtype(torch::kFloat64).device(torch::kCUDA)); + args->logprobs = torch::zeros({batch}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); + args->values = torch::zeros({batch}, torch::dtype(DTYPE).device(torch::kCUDA)); args->rng_offset = torch::zeros({1}, torch::dtype(torch::kInt64).device(torch::kCUDA)); // Create act_sizes tensor: for discrete, single entry with total action count @@ -1659,7 +2206,36 @@ void free_forwardcallargs(ForwardCallArgs* args) { } void run_forward_call(ForwardCallArgs* args) { - forward_call(args->graph, args->policy.get(), args->use_kernels, args->seed, args->rng_offset, args->act_sizes, args->act_sizes_cpu); + torch::NoGradGuard no_grad; + + // Run policy forward + auto [logits, value, state_out] = args->policy->forward(args->obs, args->state); + + // Sample actions + if (args->use_kernels) { + sample_logits(logits, value, args->actions, args->logprobs, + args->values, args->act_sizes, args->seed, args->rng_offset); + } else { + int num_action_heads = args->num_atns; + logits = torch::nan_to_num(logits, 1e-8, 1e-8, 1e-8); + auto 
split_logits = torch::split(logits, c10::IntArrayRef(args->act_sizes_cpu.data_ptr(), num_action_heads), 1); + std::vector actions_vec; + std::vector logprobs_vec; + for (int i = 0; i < num_action_heads; i++) { + Tensor head_logits = split_logits[i]; + Tensor log_probs = torch::log_softmax(head_logits, 1); + Tensor action = at::multinomial(log_probs.exp(), 1, true); + Tensor logprob = log_probs.gather(1, action); + actions_vec.push_back(action); + logprobs_vec.push_back(logprob); + } + args->actions.copy_(torch::cat(actions_vec, 1).to(torch::kFloat64), false); + args->logprobs.copy_(torch::cat(logprobs_vec, 1).sum(1), false); + args->values.copy_(value.flatten(), false); + } + + // Update state + args->state.copy_(state_out, false); } #endif @@ -1696,81 +2272,7 @@ void profile_forwardcall(int batch, int input_size, int hidden_size, int num_atn } // ============================================================================ -// rollout_copy_call profiling - using RolloutBuf, GraphBuf, EnvBuf -// ============================================================================ - -#ifdef USE_TORCH - -typedef struct { - RolloutBuf rollouts; - GraphBuf graph; - EnvBuf env; - int horizon; - int num_envs; - int num_buffers; - int h; // current timestep - int buf; // current buffer index -} RolloutCopyArgs; - -RolloutCopyArgs* create_rolloutcopyargs(int horizon, int num_envs, int num_buffers, int input_size, int num_action_heads = 1) { - // num_action_heads = number of action heads (1 for discrete, >1 for multidiscrete) - RolloutCopyArgs* args = new RolloutCopyArgs(); - args->horizon = horizon; - args->num_envs = num_envs; - args->num_buffers = num_buffers; - args->h = 0; - args->buf = 0; - - int block_size = num_envs / num_buffers; - - // Use factory functions - args->rollouts = create_rollouts(horizon, num_envs, input_size, num_action_heads); - args->env = create_env(num_envs, input_size); - - // Create minimal graph for rollout (only rollout tensors needed, use dummy policy for state) - auto opts = torch::TensorOptions().dtype(DTYPE).device(torch::kCUDA); - args->graph.obs = torch::randn({block_size, input_size}, opts); - args->graph.actions = torch::randint(0, 4, {block_size, num_action_heads}, torch::dtype(torch::kFloat64).device(torch::kCUDA)); - args->graph.logprobs = torch::randn({block_size}, opts); - args->graph.value = torch::randn({block_size}, opts); - - return args; -} - -void free_rolloutcopyargs(RolloutCopyArgs* args) { - delete args; -} - -void run_rollout_copy_call(RolloutCopyArgs* args) { - rollout_copy_call(args->rollouts, args->env, args->graph, args->num_envs, args->num_buffers, args->h, args->buf); -} - -#endif - -void profile_rolloutcopycall(int horizon, int num_envs, int num_buffers, int input_size) { -#ifdef USE_TORCH - int block_size = num_envs / num_buffers; - printf("rollout_copy_call (H=%d, envs=%d, buffers=%d, block=%d)\n", - horizon, num_envs, num_buffers, block_size); - - RolloutCopyArgs* args = create_rolloutcopyargs(horizon, num_envs, num_buffers, input_size); - - float copy_ms = profile_kernel((kernel_fn)run_rollout_copy_call, args, "rollout_copy"); - print_timing("\tcopy ops", copy_ms, block_size); - - float copy_graph_ms = profile_graph((kernel_fn)run_rollout_copy_call, args, "rollout_copy_graph"); - print_timing("\tcopy (graph)", copy_graph_ms, block_size); - - free_rolloutcopyargs(args); - - printf("\n"); -#else - printf("rollout_copy_call: requires USE_TORCH\n\n"); -#endif -} - -// ============================================================================ -// 
train_forward_call profiling - using GraphBuf +// train_forward_call profiling - using TrainGraph // ============================================================================ #ifdef USE_TORCH @@ -1778,7 +2280,7 @@ void profile_rolloutcopycall(int horizon, int num_envs, int num_buffers, int inp typedef struct { std::shared_ptr policy; torch::optim::Muon* muon; - GraphBuf graph; + TrainGraph train_buf; HypersT hypers; Tensor adv_mean; Tensor adv_std; @@ -1789,9 +2291,8 @@ typedef struct { TrainForwardArgs* create_trainforwardargs(int segments, int horizon, int input_size, int hidden_size, int act_n, int num_layers, bool use_kernels) { - // act_n = total action space size (decoder output size) - // For discrete: num_action_heads=1, act_sizes=[act_n] int num_action_heads = 1; // Using discrete for profiling + int expansion_factor = 1; TrainForwardArgs* args = new TrainForwardArgs(); args->use_kernels = use_kernels; @@ -1808,25 +2309,26 @@ TrainForwardArgs* create_trainforwardargs(int segments, int horizon, int input_s // Create policy with default encoder/decoder auto enc = std::make_shared(input_size, hidden_size); auto dec = std::make_shared(hidden_size, act_n); - args->policy = std::make_shared(enc, dec, input_size, act_n, hidden_size, 1, num_layers, use_kernels); + args->policy = std::make_shared(enc, dec, input_size, act_n, hidden_size, expansion_factor, num_layers, use_kernels); args->policy->to(torch::kCUDA); + args->policy->to(DTYPE); // Create Muon optimizer args->muon = new torch::optim::Muon(args->policy->parameters(), torch::optim::MuonOptions(0.0003).momentum(0.95).eps(1e-8)); - // Use create_graph factory (batch=0 since not used for training) - // num_action_heads=1 for discrete action space - args->graph = create_graph(0, input_size, segments, horizon, num_layers, hidden_size, 1, num_action_heads, args->policy.get()); + // Use create_train_graph factory + args->train_buf = pufferlib::create_train_graph(segments, horizon, input_size, + num_layers, hidden_size, expansion_factor, num_action_heads); // Initialize mb_* tensors with test data for training - args->graph.mb_obs = torch::randn({segments, horizon, input_size}, torch::dtype(DTYPE).device(torch::kCUDA)); - args->graph.mb_actions = torch::randint(0, act_n, {segments, horizon, num_action_heads}, torch::dtype(torch::kInt64).device(torch::kCUDA)); - args->graph.mb_logprobs = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)) * 0.1f - 2.0f; // ~log probs - args->graph.mb_advantages = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)); - args->graph.mb_values = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)); - args->graph.mb_returns = args->graph.mb_advantages + args->graph.mb_values; - args->graph.mb_prio = torch::ones({segments, 1}, torch::dtype(DTYPE).device(torch::kCUDA)); + args->train_buf.mb_obs = torch::randn({segments, horizon, input_size}, torch::dtype(DTYPE).device(torch::kCUDA)); + args->train_buf.mb_actions = torch::randint(0, act_n, {segments, horizon, num_action_heads}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + args->train_buf.mb_logprobs = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)) * 0.1f - 2.0f; + args->train_buf.mb_advantages = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)); + args->train_buf.mb_values = torch::randn({segments, horizon}, torch::dtype(DTYPE).device(torch::kCUDA)); + args->train_buf.mb_returns = args->train_buf.mb_advantages + 
args->train_buf.mb_values; + args->train_buf.mb_prio = torch::ones({segments, 1}, torch::dtype(DTYPE).device(torch::kCUDA)); // Adv normalization tensors args->adv_mean = torch::zeros({1}, torch::dtype(DTYPE).device(torch::kCUDA)); @@ -1844,7 +2346,8 @@ void free_trainforwardargs(TrainForwardArgs* args) { } void run_train_forward_call(TrainForwardArgs* args) { - train_forward_call(args->graph, args->policy.get(), args->muon, args->hypers, args->adv_mean, args->adv_std, args->act_sizes_cpu, args->use_kernels); + pufferlib::train_forward_call(args->train_buf, args->policy.get(), args->muon, + args->hypers, args->adv_mean, args->adv_std, args->act_sizes_cpu, args->use_kernels); } #endif @@ -1884,72 +2387,41 @@ void profile_trainforwardcall(int segments, int horizon, int input_size, } // ============================================================================ -// Environment speed test (breakout) +// Environment speed test - uses static linking (compile with specific env) // ============================================================================ -// Function pointers for env interface (loaded dynamically) -static create_environments_fn profile_create_envs = nullptr; -static create_threads_fn profile_create_threads = nullptr; -static vec_reset_fn profile_vec_reset = nullptr; -static vec_send_fn profile_vec_send = nullptr; -static vec_recv_fn profile_vec_recv = nullptr; -static vec_close_fn profile_vec_close = nullptr; +#ifdef USE_STATIC_ENV + +#include "pufferlib/extensions/static_envbinding.h" + +// Empty callback for OMP test (no-op, just testing env stepping speed) +static void empty_net_callback(void* ctx, int buf, int t) { + (void)ctx; (void)buf; (void)t; +} + +static void empty_thread_init(void* ctx, int buf) { + (void)ctx; (void)buf; +} typedef struct { - VecEnv* vec; + StaticVec* vec; int num_envs; int num_buffers; int num_threads; int horizon; - int obs_n; - int act_n; + int obs_size; + int num_atns; } EnvSpeedArgs; EnvSpeedArgs* create_envspeedargs(int total_agents, int num_buffers, int num_threads, int horizon) { - // Load drive.so dynamically - void* handle = dlopen("./drive.so", RTLD_NOW); - if (!handle) { - fprintf(stderr, "dlopen error: %s\n", dlerror()); - fprintf(stderr, "Make sure to build drive first: ./scripts/build_vec.sh drive\n"); - return nullptr; - } - dlerror(); - - // Load function pointers - profile_create_envs = (create_environments_fn)dlsym(handle, "create_environments"); - profile_create_threads = (create_threads_fn)dlsym(handle, "create_threads"); - profile_vec_reset = (vec_reset_fn)dlsym(handle, "vec_reset"); - profile_vec_send = (vec_send_fn)dlsym(handle, "vec_send"); - profile_vec_recv = (vec_recv_fn)dlsym(handle, "vec_recv"); - profile_vec_close = (vec_close_fn)dlsym(handle, "vec_close"); - int obs_n = *(int*)dlsym(handle, "OBS_N"); - int num_atns = *(int*)dlsym(handle, "NUM_ATNS_EXPORT"); - - const char* dlsym_error = dlerror(); - if (dlsym_error) { - fprintf(stderr, "dlsym error: %s\n", dlsym_error); - dlclose(handle); - return nullptr; - } - // Create vec_kwargs with total_agents and num_buffers Dict* vec_kwargs = create_dict(8); dict_set(vec_kwargs, "total_agents", (double)total_agents); dict_set(vec_kwargs, "num_buffers", (double)num_buffers); - // Create env_kwargs for drive - Dict* env_kwargs = create_dict(32); - dict_set(env_kwargs, "human_agent_idx", 0); - dict_set(env_kwargs, "reward_vehicle_collision", -0.5); - dict_set(env_kwargs, "reward_offroad_collision", -0.2); - dict_set(env_kwargs, "spawn_immunity_timer", 50); - dict_set(env_kwargs, 
"reward_goal_post_respawn", 0.25); - dict_set(env_kwargs, "reward_vehicle_collision_post_respawn", -0.5); - dict_set(env_kwargs, "num_maps", 10000); - - // Create env_kwargs for breakout - /* + // Create env_kwargs - loaded from config in real usage, use defaults here Dict* env_kwargs = create_dict(32); + // Breakout defaults dict_set(env_kwargs, "frameskip", 4); dict_set(env_kwargs, "width", 576); dict_set(env_kwargs, "height", 330); @@ -1965,23 +2437,22 @@ EnvSpeedArgs* create_envspeedargs(int total_agents, int num_buffers, int num_thr dict_set(env_kwargs, "max_ball_speed", 448); dict_set(env_kwargs, "paddle_speed", 620); dict_set(env_kwargs, "continuous", 0); - */ - // Create environments with new signature - VecEnv* vec = profile_create_envs(num_buffers, true, 0, vec_kwargs, env_kwargs); + // Create environments using static binding + StaticVec* vec = create_static_vec(total_agents, num_buffers, vec_kwargs, env_kwargs); if (!vec) { fprintf(stderr, "Failed to create environments\n"); return nullptr; } - // Create threads int num_envs = vec->size; - int block_size = num_envs / num_threads; - if (block_size < 1) block_size = 1; - profile_create_threads(vec, num_threads, block_size); + printf("Created %d envs for %d total_agents\n", num_envs, total_agents); + + // Create threads for OMP stepping + create_static_threads(vec, num_threads, horizon, nullptr, empty_net_callback, empty_thread_init); // Reset - profile_vec_reset(vec); + static_vec_reset(vec); cudaDeviceSynchronize(); EnvSpeedArgs* args = (EnvSpeedArgs*)calloc(1, sizeof(EnvSpeedArgs)); @@ -1990,31 +2461,20 @@ EnvSpeedArgs* create_envspeedargs(int total_agents, int num_buffers, int num_thr args->num_buffers = num_buffers; args->num_threads = num_threads; args->horizon = horizon; - args->obs_n = obs_n; - args->act_n = num_atns; + args->obs_size = get_obs_size(); + args->num_atns = get_num_atns(); return args; } void free_envspeedargs(EnvSpeedArgs* args) { - if (args && args->vec) { - profile_vec_close(args->vec); - } + // Note: no static_vec_close yet, just free the args free(args); } -// Run full rollout iteration: iterate through all buffers * horizon +// Run full rollout using OMP threading void run_env_rollout(EnvSpeedArgs* args) { - int num_buffers = args->num_buffers; - int horizon = args->horizon; - VecEnv* vec = args->vec; - - for (int i = 0; i < num_buffers * horizon; ++i) { - int buf = i % num_buffers; - profile_vec_recv(vec, buf); - // In real usage, policy forward would happen here (async on GPU) - profile_vec_send(vec, buf); - } + static_vec_omp_step(args->vec); } float profile_env_rollout(EnvSpeedArgs* args, const char* name) { @@ -2024,7 +2484,7 @@ float profile_env_rollout(EnvSpeedArgs* args, const char* name) { // Warmup auto start_time = std::chrono::steady_clock::now(); - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 10; ++i) { run_env_rollout(args); cudaDeviceSynchronize(); auto now = std::chrono::steady_clock::now(); @@ -2059,7 +2519,7 @@ float profile_env_rollout(EnvSpeedArgs* args, const char* name) { } void profile_envspeed(int total_agents, int num_buffers, int num_threads, int horizon) { - printf("env_speed (total_agents=%d, buffers=%d, threads=%d, horizon=%d)\n", + printf("env_speed_static (total_agents=%d, buffers=%d, threads=%d, horizon=%d)\n", total_agents, num_buffers, num_threads, horizon); EnvSpeedArgs* args = create_envspeedargs(total_agents, num_buffers, num_threads, horizon); @@ -2068,9 +2528,9 @@ void profile_envspeed(int total_agents, int num_buffers, int num_threads, int ho return; } - 
printf("\tnum_envs=%d, obs_n=%d, num_atns=%d\n", args->num_envs, args->obs_n, args->act_n); + printf("\tnum_envs=%d, obs_size=%d, num_atns=%d\n", args->num_envs, args->obs_size, args->num_atns); - // Profile full rollout (num_buffers * horizon steps) + // Profile full rollout (horizon steps per OMP call) float rollout_ms = profile_env_rollout(args, "env_rollout"); int total_steps = total_agents * horizon; printf("\trollout time: %.2f ms (%d steps)\n", rollout_ms, total_steps); @@ -2083,13 +2543,18 @@ void profile_envspeed(int total_agents, int num_buffers, int num_threads, int ho printf("\n"); } +#endif // USE_STATIC_ENV + void print_usage(const char* prog) { printf("Usage: %s \n", prog); printf(" kernels - Individual kernel profiling (no nsys needed)\n"); +#ifdef USE_TORCH printf(" forwardcall - Inference forward pass\n"); printf(" trainforward - Training forward + backward + optimizer\n"); - printf(" rolloutcopy - Rollout buffer copy operations\n"); - printf(" envspeed - Environment step throughput\n"); +#endif +#ifdef USE_STATIC_ENV + printf(" envspeed - Environment step throughput (static linked)\n"); +#endif printf(" all - Run all profiles\n"); } @@ -2105,30 +2570,37 @@ int main(int argc, char** argv) { // Using typical breakout settings: INPUT_SIZE=96, H=128, A=4 if (strcmp(profile, "kernels") == 0 || strcmp(profile, "all") == 0) { - // profile_mingrugate(BR, H); - // profile_logcoeffsandvalues(BT, T, H); - // profile_logcumsumexp(BT, T, H); - // profile_fusedscan(BT, T, H); - //profile_samplelogits(BR, A); + profile_mingrugate(BR, H); + profile_logcoeffsandvalues(BT, T, H); + profile_logcumsumexp(BT, T, H); + profile_fusedscan(BT, T, H); + profile_samplelogits(BR, A); profile_ppoloss(BT, T, A); + + // FCMax: simple FC -> Max (no intermediate layer) + // Drive encoder dimensions: partner (B, 63, 7) -> 128, road (B, 200, 13) -> 128 + profile_fcmax(BR, 63, 7, 128); // partner encoder + profile_fcmax(BR, 200, 13, 128); // road encoder + + // FCReluFCMax: FC -> ReLU -> FC -> Max (for comparison) + profile_fcrelufcmax(BR, 63, 7, 128, 128); // partner encoder + profile_fcrelufcmax(BR, 200, 13, 128, 128); // road encoder } +#ifdef USE_TORCH if (strcmp(profile, "forwardcall") == 0 || strcmp(profile, "all") == 0) { profile_forwardcall(BR, INPUT_SIZE, H, A, 1); } - if (strcmp(profile, "trainforward") == 0 || strcmp(profile, "all") == 0) { profile_trainforwardcall(BT, T, INPUT_SIZE, H, A, 1); } +#endif - if (strcmp(profile, "rolloutcopy") == 0 || strcmp(profile, "all") == 0) { - profile_rolloutcopycall(T, BR, 1, INPUT_SIZE); - } - +#ifdef USE_STATIC_ENV if (strcmp(profile, "envspeed") == 0 || strcmp(profile, "all") == 0) { - // total_agents=8192, num_buffers=2, num_threads=8, horizon=64 - profile_envspeed(BUF*BR, BUF, 8, T); + profile_envspeed(BUF*BR, BUF, 16, T); } +#endif return 0; } diff --git a/pufferlib/config/ocean/breakout.ini b/pufferlib/config/ocean/breakout.ini index 054586d17..81a76da6a 100644 --- a/pufferlib/config/ocean/breakout.ini +++ b/pufferlib/config/ocean/breakout.ini @@ -72,7 +72,7 @@ mean = 1 scale = auto [train] -total_timesteps = 120_000_000 +total_timesteps = 100_000_000 adam_beta1 = 0.8946507418260217 adam_beta2 = 0.9 adam_eps = 0.0001 diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index e96daaf87..084ce57a2 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -6,7 +6,7 @@ rnn_name = Recurrent [vec] total_agents = 8192 -num_buffers = 2 +num_buffers = 8 [policy] input_size = 64 @@ -30,16 +30,11 @@ 
num_maps = 10000 [train] total_timesteps = 2_000_000_000 -#learning_rate = 0.02 -#gamma = 0.985 anneal_lr = True -batch_size = 745472 -minibatch_size = 11648 -max_minibatch_size = 11648 -#minibatch_size = 32768 +batch_size = auto +minibatch_size = 32768 num_minibatches = 16 -bptt_horizon = 91 -#bptt_horizon = 64 +bptt_horizon = 64 adam_beta1 = 0.9 adam_beta2 = 0.999 adam_eps = 1e-8 diff --git a/pufferlib/extensions/bindings.cpp b/pufferlib/extensions/bindings.cpp index 843eb4aa1..1b4144ce2 100644 --- a/pufferlib/extensions/bindings.cpp +++ b/pufferlib/extensions/bindings.cpp @@ -26,13 +26,11 @@ Tensor initial_state(pybind11::object pufferl_obj, int64_t batch_size, torch::De } void python_vec_recv(pybind11::object pufferl_obj, int buf) { - auto& pufferl = pufferl_obj.cast(); - pufferl.env_exports->vec_recv(pufferl.vec, buf, pufferl.vec->streams[buf]); + // Not used in static/OMP path } void python_vec_send(pybind11::object pufferl_obj, int buf) { - auto& pufferl = pufferl_obj.cast(); - pufferl.env_exports->vec_send(pufferl.vec, buf, pufferl.vec->streams[buf]); + // Not used in static/OMP path } torch::autograd::tensor_list env_buffers(pybind11::object pufferl_obj) { @@ -44,7 +42,7 @@ void rollouts(pybind11::object pufferl_obj) { PuffeRL& pufferl = pufferl_obj.cast(); pybind11::gil_scoped_release no_gil; if (pufferl.hypers.use_omp) { - pufferl.env_exports->vec_omp_step(pufferl.vec); + static_vec_omp_step(pufferl.vec); } else { rollouts_impl(pufferl); } @@ -151,6 +149,8 @@ TORCH_LIBRARY(_C, m) { m.def("log_coeffs_and_values(Tensor gate, Tensor hidden) -> (Tensor, Tensor)"); m.def("fused_scan(Tensor combined, Tensor state) -> (Tensor, Tensor)"); m.def("fused_ppo_loss(Tensor logits, Tensor values, Tensor actions, Tensor old_logprobs, Tensor advantages, Tensor prio, Tensor values, Tensor returns, Tensor adv_mean, Tensor adv_std, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef) -> Tensor"); + m.def("fc_relu_fc_max(Tensor x, Tensor W1, Tensor b1, Tensor W2, Tensor b2) -> Tensor"); + m.def("fc_max(Tensor x, Tensor W, Tensor b) -> Tensor"); m.def("policy_forward(Tensor obs, Tensor state) -> (Tensor, Tensor, Tensor)"); } @@ -165,6 +165,10 @@ PYBIND11_MODULE(_C, m) { m.def("log_coeffs_and_values", &log_coeffs_and_values); m.def("fused_scan", &fused_scan); m.def("fused_ppo_loss", &fused_ppo_loss); + m.def("fc_relu_fc_max", &fc_relu_fc_max); + m.def("fc_relu_fc_max_cpp", &fc_relu_fc_max_cpp); + m.def("fc_max", &fc_max); + m.def("fc_max_cpp", &fc_max_cpp); m.def("sample_logits", &sample_logits); m.def("python_vec_recv", &python_vec_recv); m.def("python_vec_send", &python_vec_send); diff --git a/pufferlib/extensions/cuda/kernels.cu b/pufferlib/extensions/cuda/kernels.cu index f75d6c559..b9608aded 100644 --- a/pufferlib/extensions/cuda/kernels.cu +++ b/pufferlib/extensions/cuda/kernels.cu @@ -2384,6 +2384,355 @@ void launch_sample_logits( } } +// ============================================================================= +// FCMax: Fused FC -> Max kernel +// Input: x (B, N, D_in), W (D_out, D_in), b (D_out) +// Output: out (B, D_out) = max_over_N(x @ W.T + b) +// Each thread computes one (b, d_out) output element +// N-fold memory bandwidth reduction vs separate FC + Max kernels +// ============================================================================= + +template +__global__ void fc_max_forward_kernel( + T* __restrict__ out, // (B, D_out) + int* __restrict__ argmax_indices, // (B, D_out) - which N produced the max + const T* __restrict__ x, // (B, N, D_in) + const T* 
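// ---------------------------------------------------------------------------
// Sketch (illustration only): how the fc_max / fc_relu_fc_max schemas
// registered in TORCH_LIBRARY(_C, m) above are typically paired with a backend
// kernel through the dispatcher. Whether this repo routes the ops this way is
// an assumption; the diff only shows the schema definitions and the pybind11
// bindings.
TORCH_LIBRARY_IMPL(_C, CUDA, m) {
    m.impl("fc_max", &fc_max);                    // CUDA-tensor inputs dispatch here
    m.impl("fc_relu_fc_max", &fc_relu_fc_max);
}
// ---------------------------------------------------------------------------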
__restrict__ W, // (D_out, D_in) + const T* __restrict__ b, // (D_out) + int B, int N, int D_in, int D_out +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * D_out) return; + + int batch = idx / D_out; + int d_out = idx % D_out; + + float bias = float(b[d_out]); + float max_val = -INFINITY; + int argmax_n = 0; + + // Iterate over all N points, compute FC output, track max + for (int n = 0; n < N; n++) { + float val = bias; + for (int di = 0; di < D_in; di++) { + val += float(x[batch * N * D_in + n * D_in + di]) * float(W[d_out * D_in + di]); + } + if (val > max_val) { + max_val = val; + argmax_n = n; + } + } + + out[idx] = T(max_val); + argmax_indices[idx] = argmax_n; +} + +template +__global__ void fc_max_backward_kernel( + T* __restrict__ grad_x, // (B, N, D_in) + T* __restrict__ grad_W, // (D_out, D_in) + T* __restrict__ grad_b, // (D_out) + const T* __restrict__ grad_out, // (B, D_out) + const T* __restrict__ x, // (B, N, D_in) + const T* __restrict__ W, // (D_out, D_in) + const int* __restrict__ argmax_indices, // (B, D_out) + int B, int N, int D_in, int D_out +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * D_out) return; + + int batch = idx / D_out; + int d_out = idx % D_out; + + float g_out = float(grad_out[idx]); + int argmax_n = argmax_indices[idx]; + + // grad_b[d_out] += g_out + atomicAdd(reinterpret_cast(&grad_b[d_out]), g_out); + + // Backprop through FC at argmax position only + for (int di = 0; di < D_in; di++) { + int x_idx = batch * N * D_in + argmax_n * D_in + di; + int w_idx = d_out * D_in + di; + + // grad_W[d_out, di] += g_out * x[batch, argmax_n, di] + atomicAdd(reinterpret_cast(&grad_W[w_idx]), g_out * float(x[x_idx])); + + // grad_x[batch, argmax_n, di] += g_out * W[d_out, di] + atomicAdd(reinterpret_cast(&grad_x[x_idx]), g_out * float(W[w_idx])); + } +} + +template +void launch_fc_max_forward( + T* out, int* argmax_indices, + const T* x, const T* W, const T* b, + int B, int N, int D_in, int D_out, + cudaStream_t stream +) { + int total = B * D_out; + int grid = grid_size(total); + fc_max_forward_kernel<<>>( + out, argmax_indices, x, W, b, B, N, D_in, D_out); +} + +template +void launch_fc_max_backward( + T* grad_x, T* grad_W, T* grad_b, + const T* grad_out, const T* x, const T* W, + const int* argmax_indices, + int B, int N, int D_in, int D_out, + cudaStream_t stream +) { + int total = B * D_out; + int grid = grid_size(total); + fc_max_backward_kernel<<>>( + grad_x, grad_W, grad_b, grad_out, x, W, argmax_indices, B, N, D_in, D_out); +} + +// Non-templated wrappers +void launch_fc_max_forward_float( + float* out, int* argmax_indices, + const float* x, const float* W, const float* b, + int B, int N, int D_in, int D_out, cudaStream_t stream +) { + launch_fc_max_forward(out, argmax_indices, x, W, b, B, N, D_in, D_out, stream); +} + +void launch_fc_max_backward_float( + float* grad_x, float* grad_W, float* grad_b, + const float* grad_out, const float* x, const float* W, + const int* argmax_indices, + int B, int N, int D_in, int D_out, cudaStream_t stream +) { + launch_fc_max_backward(grad_x, grad_W, grad_b, grad_out, x, W, argmax_indices, B, N, D_in, D_out, stream); +} + +// ============================================================================= +// FCReluFCMax: Fused FC -> ReLU -> FC -> Max kernel for Drive encoder +// Avoids materializing intermediate buffers (B, N, D_mid) and (B, N, D_out) +// Input: x (B, N, D_in), W1 (D_in, D_mid), b1 (D_mid), W2 (D_mid, D_out), b2 (D_out) +// FC operations applied pointwise to 
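// ---------------------------------------------------------------------------
// Sketch (illustration only): the launch configuration assumed by the
// launch_fc_max_* wrappers above. BLOCK_SIZE and grid_size() are defined
// elsewhere in kernels.cu; the conventional ceiling-division form is assumed
// here, giving one thread per (batch, d_out) output element.
static inline int grid_size_sketch(int n, int block = 256) {
    return (n + block - 1) / block;   // ceil(n / block)
}
// e.g. B=4096, D_out=128 -> 524288 elements -> 2048 blocks of 256 threads:
//   fc_max_forward_kernel<float><<<grid_size_sketch(4096 * 128), 256, 0, stream>>>(
//       out, argmax_indices, x, W, b, B, N, D_in, D_out);
// ---------------------------------------------------------------------------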
each of N points, then max over N dimension +// Output: out (B, D_out) = max_over_N(FC2(ReLU(FC1(x)))) +// ============================================================================= + +// Optimized kernel: one block per batch element, threads cooperate on FC1 +// D_out threads per block, each computes one output element +// Two-pass: first find argmax, then recompute FC1 at argmax for backward +template +__global__ void fc_relu_fc_max_forward_kernel( + T* __restrict__ out, // (B, D_out) + int* __restrict__ argmax_indices, // (B, D_out) - which N produced the max + float* __restrict__ fc1_at_argmax, // (B, D_out, D_mid) - FC1 ReLU output at argmax for backward + const T* __restrict__ x, // (B, N, D_in) + const T* __restrict__ W1, // (D_mid, D_in) - transposed for x @ W1.T + b1 + const T* __restrict__ b1, // (D_mid) + const T* __restrict__ W2, // (D_out, D_mid) - transposed for fc1 @ W2.T + b2 + const T* __restrict__ b2, // (D_out) + int B, + int N, + int D_in, + int D_mid, + int D_out +) { + int b = blockIdx.x; + if (b >= B) return; + + int tid = threadIdx.x; // 0..D_out-1 + + // Shared memory for cooperative FC1 computation + extern __shared__ float shared_mem[]; + float* fc1_shared = shared_mem; // D_mid floats + float* x_shared = shared_mem + D_mid; // D_in floats + + // Each thread tracks its own max for its d_out + float max_val = -INFINITY; + int argmax_n = 0; + + float my_b2 = (tid < D_out) ? float(b2[tid]) : 0.0f; + + // Pass 1: Iterate over all N points to find max and argmax + for (int n = 0; n < N; n++) { + // Cooperatively load x[b, n, :] into shared memory + if (tid < D_in) { + x_shared[tid] = float(x[b * N * D_in + n * D_in + tid]); + } + __syncthreads(); + + // Threads cooperate to compute FC1: first D_mid threads each compute one element + if (tid < D_mid) { + float fc1_val = float(b1[tid]); + for (int di = 0; di < D_in; di++) { + fc1_val += x_shared[di] * float(W1[tid * D_in + di]); + } + fc1_shared[tid] = fmaxf(fc1_val, 0.0f); // ReLU + } + __syncthreads(); + + // Each thread computes FC2 for its d_out using shared FC1 output + if (tid < D_out) { + float fc2_val = my_b2; + for (int dm = 0; dm < D_mid; dm++) { + fc2_val += fc1_shared[dm] * float(W2[tid * D_mid + dm]); + } + + if (fc2_val > max_val) { + max_val = fc2_val; + argmax_n = n; + } + } + __syncthreads(); + } + + // Write max output and argmax + if (tid < D_out) { + int out_idx = b * D_out + tid; + out[out_idx] = T(max_val); + argmax_indices[out_idx] = argmax_n; + + // Pass 2: Each thread independently computes FC1 at its argmax_n + // This has some redundant computation but avoids serialization + for (int dm = 0; dm < D_mid; dm++) { + float fc1_val = float(b1[dm]); + for (int di = 0; di < D_in; di++) { + fc1_val += float(x[b * N * D_in + argmax_n * D_in + di]) * float(W1[dm * D_in + di]); + } + fc1_at_argmax[b * D_out * D_mid + tid * D_mid + dm] = fmaxf(fc1_val, 0.0f); + } + } +} + +template +__global__ void fc_relu_fc_max_backward_kernel( + T* __restrict__ grad_x, // (B, N, D_in) - accumulated + T* __restrict__ grad_W1, // (D_mid, D_in) - accumulated + T* __restrict__ grad_b1, // (D_mid) - accumulated + T* __restrict__ grad_W2, // (D_out, D_mid) - accumulated + T* __restrict__ grad_b2, // (D_out) - accumulated + const T* __restrict__ grad_out, // (B, D_out) + const T* __restrict__ x, // (B, N, D_in) + const T* __restrict__ W1, // (D_mid, D_in) + const T* __restrict__ W2, // (D_out, D_mid) + const int* __restrict__ argmax_indices, // (B, D_out) + const float* __restrict__ fc1_at_argmax, // (B, D_out, D_mid) + 
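// Note (illustration only): the cooperative forward kernel above implicitly
// assumes blockDim.x >= D_in, D_mid and D_out, since a single pass of
// "if (tid < D_in)" / "if (tid < D_mid)" / "if (tid < D_out)" must cover each
// dimension. That holds for the Drive encoder shapes profiled in this diff
// (D_in = 7 or 13, D_mid = D_out = 128, so 128 threads per block), and the
// dynamic shared memory request is (D_mid + D_in) * sizeof(float), i.e. 540 or
// 564 bytes per block.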
int B, + int N, + int D_in, + int D_mid, + int D_out +) { + // Each thread handles one (b, d_out) gradient + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * D_out) return; + + int b = idx / D_out; + int d_out = idx % D_out; + + float g_out = float(grad_out[idx]); + int argmax_n = argmax_indices[idx]; + + // grad_b2[d_out] += g_out + atomicAdd(reinterpret_cast(&grad_b2[d_out]), g_out); + + // Backprop through FC2 and ReLU + for (int dm = 0; dm < D_mid; dm++) { + int w2_idx = d_out * D_mid + dm; + float fc1_relu_val = fc1_at_argmax[b * D_out * D_mid + d_out * D_mid + dm]; + + // grad_W2[d_out, dm] += g_out * fc1_relu[dm] + atomicAdd(reinterpret_cast(&grad_W2[w2_idx]), g_out * fc1_relu_val); + + // grad through ReLU: passes if fc1_relu > 0 + float grad_fc1 = (fc1_relu_val > 0.0f) ? (g_out * float(W2[w2_idx])) : 0.0f; + + // grad_b1[dm] += grad_fc1 + atomicAdd(reinterpret_cast(&grad_b1[dm]), grad_fc1); + + // Backprop through FC1 + for (int di = 0; di < D_in; di++) { + int x_idx = b * N * D_in + argmax_n * D_in + di; + int w1_idx = dm * D_in + di; + + // grad_W1[dm, di] += grad_fc1 * x[b, argmax_n, di] + atomicAdd(reinterpret_cast(&grad_W1[w1_idx]), grad_fc1 * float(x[x_idx])); + + // grad_x[b, argmax_n, di] += grad_fc1 * W1[dm, di] + atomicAdd(reinterpret_cast(&grad_x[x_idx]), grad_fc1 * float(W1[w1_idx])); + } + } +} + +template +void launch_fc_relu_fc_max_forward( + T* out, + int* argmax_indices, + float* fc1_at_argmax, + const T* x, + const T* W1, + const T* b1, + const T* W2, + const T* b2, + int B, + int N, + int D_in, + int D_mid, + int D_out, + cudaStream_t stream +) { + // One block per batch element + // Need max(D_mid, D_out) threads: D_mid for cooperative FC1, D_out for FC2 outputs + int threads = (D_mid > D_out) ? D_mid : D_out; + // Shared memory: fc1_shared (D_mid) + x_shared (D_in) + size_t shared_mem_size = (D_mid + D_in) * sizeof(float); + + fc_relu_fc_max_forward_kernel<<>>( + out, argmax_indices, fc1_at_argmax, + x, W1, b1, W2, b2, + B, N, D_in, D_mid, D_out + ); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "fc_relu_fc_max_forward kernel error: %s\n", cudaGetErrorString(err)); + } +} + +template +void launch_fc_relu_fc_max_backward( + T* grad_x, + T* grad_W1, + T* grad_b1, + T* grad_W2, + T* grad_b2, + const T* grad_out, + const T* x, + const T* W1, + const T* W2, + const int* argmax_indices, + const float* fc1_at_argmax, + int B, + int N, + int D_in, + int D_mid, + int D_out, + cudaStream_t stream +) { + int total = B * D_out; + int grid = grid_size(total); + + fc_relu_fc_max_backward_kernel<<>>( + grad_x, grad_W1, grad_b1, grad_W2, grad_b2, + grad_out, x, W1, W2, + argmax_indices, fc1_at_argmax, + B, N, D_in, D_mid, D_out + ); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "fc_relu_fc_max_backward kernel error: %s\n", cudaGetErrorString(err)); + } +} + // Non-templated wrappers for float void launch_mingru_gate_inference_float(float* out, float* next_state, const float* combined, const float* state_in, int H, int B, cudaStream_t stream) { launch_mingru_gate_inference(out, next_state, combined, state_in, H, B, stream); @@ -2421,6 +2770,12 @@ void launch_ppo_loss_backward_float(float* grad_logits, float* grad_values_pred, void launch_sample_logits_float(double* actions, float* logprobs, float* value_out, const float* logits, const float* value, const int* act_sizes, uint64_t seed, const int64_t* offset_ptr, int num_atns, int B, int logits_stride, int value_stride, 
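// --- Aside, not part of the patch: the launcher above assumes one block per batch element,
// max(D_mid, D_out) threads, (D_mid + D_in) floats of dynamic shared memory, and D_in no larger
// than the block size (otherwise the cooperative x_shared load misses features). A hypothetical
// host-side check of that geometry is sketched below; the helper name and the 48 KB bound are
// assumptions, not part of this change.

#include <stdio.h>
#include <stdbool.h>

// Hypothetical check of the geometry fc_relu_fc_max_forward_kernel relies on.
static bool check_fc_relu_fc_max_geometry(int D_in, int D_mid, int D_out) {
    int threads = (D_mid > D_out) ? D_mid : D_out;          // block size chosen by the launcher
    size_t smem = (size_t)(D_mid + D_in) * sizeof(float);   // fc1_shared + x_shared
    if (D_in > threads) {
        fprintf(stderr, "D_in=%d > %d threads: x_shared load would be incomplete\n", D_in, threads);
        return false;
    }
    if (threads > 1024) {
        fprintf(stderr, "threads=%d exceeds the CUDA per-block limit of 1024\n", threads);
        return false;
    }
    if (smem > 48 * 1024) {  // conservative default per-block shared-memory limit
        fprintf(stderr, "shared memory %zu B exceeds the 48 KB default limit\n", smem);
        return false;
    }
    return true;  // e.g. the Drive encoder shapes (D_in=13, D_mid=D_out=128) pass easily
}
// --- end aside ---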
cudaStream_t stream) { launch_sample_logits(actions, logprobs, value_out, logits, value, act_sizes, seed, offset_ptr, num_atns, B, logits_stride, value_stride, stream); } +void launch_fc_relu_fc_max_forward_float(float* out, int* argmax_indices, float* fc1_at_argmax, const float* x, const float* W1, const float* b1, const float* W2, const float* b2, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream) { + launch_fc_relu_fc_max_forward(out, argmax_indices, fc1_at_argmax, x, W1, b1, W2, b2, B, N, D_in, D_mid, D_out, stream); +} +void launch_fc_relu_fc_max_backward_float(float* grad_x, float* grad_W1, float* grad_b1, float* grad_W2, float* grad_b2, const float* grad_out, const float* x, const float* W1, const float* W2, const int* argmax_indices, const float* fc1_at_argmax, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream) { + launch_fc_relu_fc_max_backward(grad_x, grad_W1, grad_b1, grad_W2, grad_b2, grad_out, x, W1, W2, argmax_indices, fc1_at_argmax, B, N, D_in, D_mid, D_out, stream); +} // Non-templated wrappers for BFloat16 void launch_mingru_gate_inference_bf16(at::BFloat16* out, at::BFloat16* next_state, const at::BFloat16* combined, const at::BFloat16* state_in, int H, int B, cudaStream_t stream) { @@ -2473,6 +2828,12 @@ void launch_ppo_loss_backward_bf16(at::BFloat16* grad_logits, at::BFloat16* grad void launch_sample_logits_bf16(double* actions, at::BFloat16* logprobs, at::BFloat16* value_out, const at::BFloat16* logits, const at::BFloat16* value, const int* act_sizes, uint64_t seed, const int64_t* offset_ptr, int num_atns, int B, int logits_stride, int value_stride, cudaStream_t stream) { launch_sample_logits(actions, logprobs, value_out, logits, value, act_sizes, seed, offset_ptr, num_atns, B, logits_stride, value_stride, stream); } +void launch_fc_relu_fc_max_forward_bf16(at::BFloat16* out, int* argmax_indices, float* fc1_at_argmax, const at::BFloat16* x, const at::BFloat16* W1, const at::BFloat16* b1, const at::BFloat16* W2, const at::BFloat16* b2, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream) { + launch_fc_relu_fc_max_forward(out, argmax_indices, fc1_at_argmax, x, W1, b1, W2, b2, B, N, D_in, D_mid, D_out, stream); +} +void launch_fc_relu_fc_max_backward_bf16(at::BFloat16* grad_x, at::BFloat16* grad_W1, at::BFloat16* grad_b1, at::BFloat16* grad_W2, at::BFloat16* grad_b2, const at::BFloat16* grad_out, const at::BFloat16* x, const at::BFloat16* W1, const at::BFloat16* W2, const int* argmax_indices, const float* fc1_at_argmax, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream) { + launch_fc_relu_fc_max_backward(grad_x, grad_W1, grad_b1, grad_W2, grad_b2, grad_out, x, W1, W2, argmax_indices, fc1_at_argmax, B, N, D_in, D_mid, D_out, stream); +} void launch_ppo_loss_forward_optimized_float(float* loss_output, double* saved_for_backward, float* ratio_out, float* newvalue_out, const float* logits, const float* values_pred, const int64_t* actions, const float* old_logprobs, const float* advantages, const float* prio, const float* values, const float* returns, const float* adv_mean, const float* adv_var, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef, int T_seq, int A, int N, int logits_stride_n, int logits_stride_t, int logits_stride_a, int values_stride_n, int values_stride_t, cudaStream_t stream) { launch_ppo_loss_forward_optimized(loss_output, saved_for_backward, ratio_out, newvalue_out, logits, values_pred, actions, old_logprobs, advantages, prio, values, returns, adv_mean, adv_var, clip_coef, 
vf_clip_coef, vf_coef, ent_coef, T_seq, A, N, logits_stride_n, logits_stride_t, logits_stride_a, values_stride_n, values_stride_t, stream); diff --git a/pufferlib/extensions/cuda/kernels.h b/pufferlib/extensions/cuda/kernels.h index 3faca5186..2aa8fa5a7 100644 --- a/pufferlib/extensions/cuda/kernels.h +++ b/pufferlib/extensions/cuda/kernels.h @@ -25,6 +25,8 @@ void launch_ppo_loss_backward_float(float* grad_logits, float* grad_values_pred, void launch_ppo_loss_forward_optimized_float(float* loss_output, double* saved_for_backward, float* ratio_out, float* newvalue_out, const float* logits, const float* values_pred, const int64_t* actions, const float* old_logprobs, const float* advantages, const float* prio, const float* values, const float* returns, const float* adv_mean, const float* adv_var, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef, int T_seq, int A, int N, int logits_stride_n, int logits_stride_t, int logits_stride_a, int values_stride_n, int values_stride_t, cudaStream_t stream); void launch_ppo_loss_backward_optimized_float(float* grad_logits, float* grad_values_pred, const float* grad_loss, const float* logits, const float* values_pred, const int64_t* actions, const float* old_logprobs, const float* advantages, const float* prio, const float* values, const float* returns, const float* adv_mean, const float* adv_var, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef, int T_seq, int A, int N, int logits_stride_n, int logits_stride_t, int logits_stride_a, int values_stride_n, int values_stride_t, cudaStream_t stream); void launch_sample_logits_float(double* actions, float* logprobs, float* value_out, const float* logits, const float* value, const int* act_sizes, uint64_t seed, const int64_t* offset_ptr, int num_atns, int B, int logits_stride, int value_stride, cudaStream_t stream); +void launch_fc_relu_fc_max_forward_float(float* out, int* argmax_indices, float* fc1_at_argmax, const float* x, const float* W1, const float* b1, const float* W2, const float* b2, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream); +void launch_fc_relu_fc_max_backward_float(float* grad_x, float* grad_W1, float* grad_b1, float* grad_W2, float* grad_b2, const float* grad_out, const float* x, const float* W1, const float* W2, const int* argmax_indices, const float* fc1_at_argmax, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream); // BFloat16 wrappers void launch_mingru_gate_inference_bf16(at::BFloat16* out, at::BFloat16* next_state, const at::BFloat16* combined, const at::BFloat16* state_in, int H, int B, cudaStream_t stream); @@ -43,5 +45,11 @@ void launch_ppo_loss_backward_bf16(at::BFloat16* grad_logits, at::BFloat16* grad void launch_ppo_loss_forward_optimized_bf16(float* loss_output, double* saved_for_backward, at::BFloat16* ratio_out, at::BFloat16* newvalue_out, const at::BFloat16* logits, const at::BFloat16* values_pred, const int64_t* actions, const at::BFloat16* old_logprobs, const at::BFloat16* advantages, const at::BFloat16* prio, const at::BFloat16* values, const at::BFloat16* returns, const float* adv_mean, const float* adv_var, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef, int T_seq, int A, int N, int logits_stride_n, int logits_stride_t, int logits_stride_a, int values_stride_n, int values_stride_t, cudaStream_t stream); void launch_ppo_loss_backward_optimized_bf16(at::BFloat16* grad_logits, at::BFloat16* grad_values_pred, const float* grad_loss, const at::BFloat16* logits, const at::BFloat16* 
values_pred, const int64_t* actions, const at::BFloat16* old_logprobs, const at::BFloat16* advantages, const at::BFloat16* prio, const at::BFloat16* values, const at::BFloat16* returns, const float* adv_mean, const float* adv_var, float clip_coef, float vf_clip_coef, float vf_coef, float ent_coef, int T_seq, int A, int N, int logits_stride_n, int logits_stride_t, int logits_stride_a, int values_stride_n, int values_stride_t, cudaStream_t stream); void launch_sample_logits_bf16(double* actions, at::BFloat16* logprobs, at::BFloat16* value_out, const at::BFloat16* logits, const at::BFloat16* value, const int* act_sizes, uint64_t seed, const int64_t* offset_ptr, int num_atns, int B, int logits_stride, int value_stride, cudaStream_t stream); +void launch_fc_relu_fc_max_forward_bf16(at::BFloat16* out, int* argmax_indices, float* fc1_at_argmax, const at::BFloat16* x, const at::BFloat16* W1, const at::BFloat16* b1, const at::BFloat16* W2, const at::BFloat16* b2, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream); +void launch_fc_relu_fc_max_backward_bf16(at::BFloat16* grad_x, at::BFloat16* grad_W1, at::BFloat16* grad_b1, at::BFloat16* grad_W2, at::BFloat16* grad_b2, const at::BFloat16* grad_out, const at::BFloat16* x, const at::BFloat16* W1, const at::BFloat16* W2, const int* argmax_indices, const float* fc1_at_argmax, int B, int N, int D_in, int D_mid, int D_out, cudaStream_t stream); + +// FCMax: FC -> Max (no intermediate layer) +void launch_fc_max_forward_float(float* out, int* argmax_indices, const float* x, const float* W, const float* b, int B, int N, int D_in, int D_out, cudaStream_t stream); +void launch_fc_max_backward_float(float* grad_x, float* grad_W, float* grad_b, const float* grad_out, const float* x, const float* W, const int* argmax_indices, int B, int N, int D_in, int D_out, cudaStream_t stream); #endif // PUFFERLIB_KERNELS_H diff --git a/pufferlib/extensions/env_binding.c b/pufferlib/extensions/env_binding.c new file mode 100644 index 000000000..63eaf6d90 --- /dev/null +++ b/pufferlib/extensions/env_binding.c @@ -0,0 +1,298 @@ +// static_envbinding.c - Template for static env binding +// Include this AFTER defining: Env, OBS_SIZE, NUM_ATNS, my_init, my_log, c_step, c_reset + +#include +#include +#include + +#include "env_binding.h" +#include "binding.h" + +// Forward declare CUDA types and functions to avoid conflicts with raylib's float3 +typedef int cudaError_t; +typedef int cudaMemcpyKind; +#define cudaSuccess 0 +#define cudaMemcpyHostToDevice 1 +#define cudaMemcpyDeviceToHost 2 +#define cudaHostAllocPortable 1 +#define cudaStreamNonBlocking 1 + +extern cudaError_t cudaHostAlloc(void**, size_t, unsigned int); +extern cudaError_t cudaMalloc(void**, size_t); +extern cudaError_t cudaMemcpy(void*, const void*, size_t, cudaMemcpyKind); +extern cudaError_t cudaMemcpyAsync(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t); +extern cudaError_t cudaMemset(void*, int, size_t); +extern cudaError_t cudaFree(void*); +extern cudaError_t cudaFreeHost(void*); +extern cudaError_t cudaSetDevice(int); +extern cudaError_t cudaDeviceSynchronize(void); +extern cudaError_t cudaStreamSynchronize(cudaStream_t); +extern cudaError_t cudaStreamCreateWithFlags(cudaStream_t*, unsigned int); +extern cudaError_t cudaStreamQuery(cudaStream_t); +extern const char* cudaGetErrorString(cudaError_t); + +#define OMP_WAITING 5 +#define OMP_RUNNING 6 + +struct StaticThreading { + atomic_int* buffer_states; + int num_threads; + int num_buffers; + pthread_t* threads; +}; + +typedef struct 
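// --- Aside, not part of the patch: because this file forward-declares the CUDA runtime instead of
// including cuda_runtime.h (to dodge the raylib float3 clash), the allocation calls below run
// unchecked. A minimal error wrapper compatible with the local int typedef for cudaError_t might
// look like this; the macro name is hypothetical and it assumes stdio.h/stdlib.h are included.

#define STATIC_ENV_CHECK(call)                                         \
    do {                                                               \
        cudaError_t _err = (call);                                     \
        if (_err != cudaSuccess) {                                     \
            fprintf(stderr, "%s failed: %s\n", #call,                  \
                    cudaGetErrorString(_err));                         \
            exit(1);                                                   \
        }                                                              \
    } while (0)

// Example use: STATIC_ENV_CHECK(cudaMalloc((void**)&vec->gpu_rewards, total_agents * sizeof(float)));
// --- end aside ---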
StaticOMPArg { + StaticVec* vec; + int buf; + int horizon; + void* ctx; + net_callback_fn net_callback; + thread_init_fn thread_init; +} StaticOMPArg; + +// OMP thread manager +static void* static_omp_threadmanager(void* arg) { + StaticOMPArg* worker_arg = (StaticOMPArg*)arg; + StaticVec* vec = worker_arg->vec; + StaticThreading* threading = vec->threading; + int buf = worker_arg->buf; + int horizon = worker_arg->horizon; + void* ctx = worker_arg->ctx; + net_callback_fn net_callback = worker_arg->net_callback; + thread_init_fn thread_init = worker_arg->thread_init; + + if (thread_init != NULL) { + thread_init(ctx, buf); + } + + int agents_per_buffer = vec->agents_per_buffer; + int agent_start = buf * agents_per_buffer; + int env_start = vec->buffer_env_starts[buf]; + int env_count = vec->buffer_env_counts[buf]; + atomic_int* buffer_states = threading->buffer_states; + int num_workers = threading->num_threads / vec->buffers; + if (num_workers < 1) num_workers = 1; + + Env* envs = (Env*)vec->envs; + + while (1) { + while (atomic_load(&buffer_states[buf]) != OMP_RUNNING) {} + cudaStream_t stream = vec->streams[buf]; + + for (int t = 0; t < horizon; t++) { + net_callback(ctx, buf, t); + + cudaMemcpyAsync( + &vec->actions[agent_start * NUM_ATNS], + &vec->gpu_actions[agent_start * NUM_ATNS], + agents_per_buffer * NUM_ATNS * sizeof(double), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + #pragma omp parallel for schedule(static) num_threads(num_workers) + for (int i = env_start; i < env_start + env_count; i++) { + c_step(&envs[i]); + } + + cudaMemcpyAsync( + &vec->gpu_observations[agent_start * OBS_SIZE], + &vec->observations[agent_start * OBS_SIZE], + agents_per_buffer * OBS_SIZE * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &vec->gpu_rewards[agent_start], + &vec->rewards[agent_start], + agents_per_buffer * sizeof(float), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync( + &vec->gpu_terminals[agent_start], + &vec->terminals[agent_start], + agents_per_buffer * sizeof(float), + cudaMemcpyHostToDevice, stream); + } + cudaStreamSynchronize(stream); + atomic_store(&buffer_states[buf], OMP_WAITING); + } +} + +void static_vec_omp_step(StaticVec* vec) { + StaticThreading* threading = vec->threading; + for (int buf = 0; buf < vec->buffers; buf++) { + atomic_store(&threading->buffer_states[buf], OMP_RUNNING); + } + for (int buf = 0; buf < vec->buffers; buf++) { + while (atomic_load(&threading->buffer_states[buf]) != OMP_WAITING) {} + } +} + +// Optional: Initialize all envs at once (for shared state, variable agents per env, etc.) 
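// --- Aside, not part of the patch: a minimal host-side driver sketch for this binding, assuming
// only the declarations in env_binding.h. dummy_net (which would normally run the policy for the
// given buffer and leave actions in gpu_actions) and the kwargs values are illustrative.

#include "env_binding.h"

static void dummy_net(void* ctx, int buf, int t) {
    // Placeholder: a real callback enqueues inference for buffer `buf` at step `t`
    // so actions sit in vec->gpu_actions before the D2H copy in the worker loop.
    (void)ctx; (void)buf; (void)t;
}

int main(void) {
    Dict* vec_kwargs = create_dict(8);
    dict_set(vec_kwargs, "total_agents", 4096);   // example sizes
    dict_set(vec_kwargs, "num_buffers", 2);
    Dict* env_kwargs = create_dict(8);

    StaticVec* vec = create_static_vec(4096, 2, vec_kwargs, env_kwargs);
    static_vec_reset(vec);
    create_static_threads(vec, /*num_threads=*/16, /*horizon=*/128,
                          /*ctx=*/vec, dummy_net, /*thread_init=*/NULL);

    for (int iter = 0; iter < 10; iter++) {
        static_vec_omp_step(vec);  // each call runs `horizon` env steps on every buffer
    }
    return 0;  // sketch only: no teardown of threads or streams
}
// --- end aside ---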
+// Default implementation creates one env per agent using my_init +#ifndef MY_VEC_INIT +Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_counts, + Dict* vec_kwargs, Dict* env_kwargs) { + int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; + int num_buffers = (int)dict_get(vec_kwargs, "num_buffers")->value; + int agents_per_buffer = total_agents / num_buffers; + + // Default: one env per agent + Env* envs = (Env*)calloc(total_agents, sizeof(Env)); + for (int b = 0; b < num_buffers; b++) { + buffer_env_starts[b] = b * agents_per_buffer; + buffer_env_counts[b] = agents_per_buffer; + } + + for (int i = 0; i < total_agents; i++) { + srand(i); + my_init(&envs[i], env_kwargs); + } + + *num_envs_out = total_agents; + return envs; +} +#endif + +StaticVec* create_static_vec(int total_agents, int num_buffers, Dict* vec_kwargs, Dict* env_kwargs) { + StaticVec* vec = (StaticVec*)calloc(1, sizeof(StaticVec)); + vec->total_agents = total_agents; + vec->buffers = num_buffers; + vec->agents_per_buffer = total_agents / num_buffers; + vec->obs_size = OBS_SIZE; + vec->num_atns = NUM_ATNS; + + vec->buffer_env_starts = (int*)calloc(num_buffers, sizeof(int)); + vec->buffer_env_counts = (int*)calloc(num_buffers, sizeof(int)); + + // Let my_vec_init allocate and initialize envs, fill buffer info + int num_envs = 0; + vec->envs = my_vec_init(&num_envs, vec->buffer_env_starts, vec->buffer_env_counts, + vec_kwargs, env_kwargs); + vec->size = num_envs; + + cudaHostAlloc((void**)&vec->observations, total_agents * OBS_SIZE * sizeof(float), cudaHostAllocPortable); + cudaHostAlloc((void**)&vec->actions, total_agents * NUM_ATNS * sizeof(double), cudaHostAllocPortable); + cudaHostAlloc((void**)&vec->rewards, total_agents * sizeof(float), cudaHostAllocPortable); + cudaHostAlloc((void**)&vec->terminals, total_agents * sizeof(float), cudaHostAllocPortable); + + cudaMalloc((void**)&vec->gpu_observations, total_agents * OBS_SIZE * sizeof(float)); + cudaMalloc((void**)&vec->gpu_actions, total_agents * NUM_ATNS * sizeof(double)); + cudaMalloc((void**)&vec->gpu_rewards, total_agents * sizeof(float)); + cudaMalloc((void**)&vec->gpu_terminals, total_agents * sizeof(float)); + + cudaMemset(vec->gpu_observations, 0, total_agents * OBS_SIZE * sizeof(float)); + cudaMemset(vec->gpu_actions, 0, total_agents * NUM_ATNS * sizeof(double)); + cudaMemset(vec->gpu_rewards, 0, total_agents * sizeof(float)); + cudaMemset(vec->gpu_terminals, 0, total_agents * sizeof(float)); + + // Streams allocated here, created in create_static_threads + vec->streams = (cudaStream_t*)calloc(num_buffers, sizeof(cudaStream_t)); + + // Assign pointers to envs based on buffer layout + Env* envs = (Env*)vec->envs; + for (int buf = 0; buf < num_buffers; buf++) { + int buf_start = buf * vec->agents_per_buffer; + int buf_agent = 0; + int env_start = vec->buffer_env_starts[buf]; + int env_count = vec->buffer_env_counts[buf]; + + for (int e = 0; e < env_count; e++) { + Env* env = &envs[env_start + e]; + int slot = buf_start + buf_agent; + env->observations = vec->observations + slot * OBS_SIZE; + env->actions = vec->actions + slot * NUM_ATNS; + env->rewards = vec->rewards + slot; + env->terminals = vec->terminals + slot; + buf_agent += env->num_agents; + } + } + + return vec; +} + +void static_vec_reset(StaticVec* vec) { + Env* envs = (Env*)vec->envs; + for (int i = 0; i < vec->size; i++) { + c_reset(&envs[i]); + } + cudaMemcpy(vec->gpu_observations, vec->observations, + vec->total_agents * OBS_SIZE * sizeof(float), 
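// --- Aside, not part of the patch: a custom my_vec_init that packs multi-agent envs must keep each
// buffer at or below agents_per_buffer, since create_static_vec slices host/GPU pointers by that
// stride. A hypothetical validation helper (name and reporting are illustrative) is sketched below.

static int validate_buffer_layout(StaticVec* vec) {
    Env* envs = (Env*)vec->envs;
    for (int buf = 0; buf < vec->buffers; buf++) {
        int agents = 0;
        int start = vec->buffer_env_starts[buf];
        for (int e = 0; e < vec->buffer_env_counts[buf]; e++) {
            agents += envs[start + e].num_agents;
        }
        if (agents > vec->agents_per_buffer) {
            fprintf(stderr, "buffer %d holds %d agents, limit %d\n",
                    buf, agents, vec->agents_per_buffer);
            return 0;  // slots would spill into the next buffer
        }
    }
    return 1;
}
// --- end aside ---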
cudaMemcpyHostToDevice); + cudaMemcpy(vec->gpu_rewards, vec->rewards, + vec->total_agents * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(vec->gpu_terminals, vec->terminals, + vec->total_agents * sizeof(float), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); +} + +void create_static_threads(StaticVec* vec, int num_threads, int horizon, + void* ctx, net_callback_fn net_callback, thread_init_fn thread_init) { + vec->threading = (StaticThreading*)calloc(1, sizeof(StaticThreading)); + vec->threading->num_threads = num_threads; + vec->threading->num_buffers = vec->buffers; + vec->threading->buffer_states = (atomic_int*)calloc(vec->buffers, sizeof(atomic_int)); + vec->threading->threads = (pthread_t*)calloc(vec->buffers, sizeof(pthread_t)); + + // Create CUDA streams here, not in create_static_vec + for (int i = 0; i < vec->buffers; i++) { + cudaStreamCreateWithFlags(&vec->streams[i], cudaStreamNonBlocking); + } + + StaticOMPArg* args = (StaticOMPArg*)calloc(vec->buffers, sizeof(StaticOMPArg)); + for (int i = 0; i < vec->buffers; i++) { + args[i].vec = vec; + args[i].buf = i; + args[i].horizon = horizon; + args[i].ctx = ctx; + args[i].net_callback = net_callback; + args[i].thread_init = thread_init; + pthread_create(&vec->threading->threads[i], NULL, static_omp_threadmanager, &args[i]); + } +} + +void static_vec_log(StaticVec* vec, Dict* out) { + Env* envs = (Env*)vec->envs; + Log aggregate = {0}; + for (int i = 0; i < vec->size; i++) { + aggregate.perf += envs[i].log.perf; + aggregate.score += envs[i].log.score; + aggregate.episode_return += envs[i].log.episode_return; + aggregate.episode_length += envs[i].log.episode_length; + aggregate.n += envs[i].log.n; + memset(&envs[i].log, 0, sizeof(Log)); + } + if (aggregate.n > 0) { + float n = aggregate.n; + aggregate.perf /= n; + aggregate.score /= n; + aggregate.episode_return /= n; + aggregate.episode_length /= n; + dict_set(out, "n", n); + my_log(&aggregate, out); + } +} + +int get_obs_size(void) { return OBS_SIZE; } +int get_num_atns(void) { return NUM_ATNS; } +static int _act_sizes[] = ACT_SIZES; +int* get_act_sizes(void) { return _act_sizes; } + +// Optional shared state functions - default implementations +#ifndef MY_SHARED +void* my_shared(void* env, Dict* kwargs) { + return NULL; +} +#endif + +#ifndef MY_SHARED_CLOSE +void my_shared_close(void* env) {} +#endif + +#ifndef MY_GET +void* my_get(void* env, Dict* out) { + return NULL; +} +#endif + +#ifndef MY_PUT +int my_put(void* env, Dict* kwargs) { + return 0; +} +#endif diff --git a/pufferlib/extensions/env_binding.h b/pufferlib/extensions/env_binding.h index ead65525f..00ddea84c 100644 --- a/pufferlib/extensions/env_binding.h +++ b/pufferlib/extensions/env_binding.h @@ -1,806 +1,125 @@ -#include -#include -#include -#include -#include -#include -#include +// static_envbinding.h - Shared types for static env linking +// Included by both env .c files and pufferlib.cpp -#include "vecenv.h" -#include +#pragma once -// Forward declare CUDA types and functions to avoid conflicts with raylib's float3 -typedef int cudaError_t; -typedef int cudaMemcpyKind; -#define cudaSuccess 0 -#define cudaMemcpyHostToDevice 1 -#define cudaMemcpyDeviceToHost 2 -#define cudaHostAllocPortable 1 -#define cudaStreamNonBlocking 1 +#include +#include +#include +#include -extern cudaError_t cudaHostAlloc(void**, size_t, unsigned int); -extern cudaError_t cudaMalloc(void**, size_t); -extern cudaError_t cudaMemcpy(void*, const void*, size_t, cudaMemcpyKind); -extern cudaError_t cudaMemcpyAsync(void*, const void*, 
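// --- Aside, not part of the patch: a skeletal example of what an environment .c file defines
// before including env_binding.c. The Env/Log field layout is inferred from how the static_vec_*
// functions above use them; the toy init/step/reset/log bodies are purely illustrative.

#define OBS_SIZE 8
#define NUM_ATNS 1
#define ACT_SIZES {5}

typedef struct { float perf, score, episode_return, episode_length, n; } Log;

typedef struct {
    float* observations;   // points into vec->observations for this env's slot
    double* actions;       // points into vec->actions
    float* rewards;        // points into vec->rewards
    float* terminals;      // points into vec->terminals
    int num_agents;        // how many agent slots this env occupies
    Log log;
} Env;

void my_init(Env* env, Dict* args) { (void)args; env->num_agents = 1; }
void my_log(Log* log, Dict* out)   { dict_set(out, "score", log->score); }
void c_reset(Env* env) { for (int i = 0; i < OBS_SIZE; i++) env->observations[i] = 0.0f; }
void c_step(Env* env)  { env->rewards[0] = 0.0f; env->terminals[0] = 0.0f; }

// #include "env_binding.c"  // pulled in last, after the definitions above
// --- end aside ---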
size_t, cudaMemcpyKind, cudaStream_t); -extern cudaError_t cudaMemset(void*, int, size_t); -extern cudaError_t cudaFree(void*); -extern cudaError_t cudaFreeHost(void*); -extern cudaError_t cudaSetDevice(int); -extern cudaError_t cudaDeviceSynchronize(void); -extern cudaError_t cudaStreamSynchronize(cudaStream_t); -extern cudaError_t cudaStreamCreateWithFlags(cudaStream_t*, unsigned int); -extern cudaError_t cudaStreamQuery(cudaStream_t); -extern const char* cudaGetErrorString(cudaError_t); +#ifdef __cplusplus +extern "C" { +#endif +// Type constants #define FLOAT 1 #define INT 2 #define UNSIGNED_CHAR 3 #define DOUBLE 4 +#define CHAR 5 -#if OBS_TYPE == FLOAT - #define OBS_DTYPE float -#elif OBS_TYPE == INT - #define OBS_DTYPE int -#elif OBS_TYPE == UNSIGNED_CHAR - #define OBS_DTYPE unsigned char -#elif OBS_TYPE == DOUBLE - #define OBS_DTYPE double -#elif OBS_TYPE == CHAR - #define OBS_DTYPE char -#endif - -#define CHECK_CUDA(call) \ - do { \ - cudaError_t err = (call); \ - if (err != cudaSuccess) { \ - fprintf(stderr, "CUDA error in %s: %s (error %d)\n", \ - #call, cudaGetErrorString(err), (int)err); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - -#define EXPORT __attribute__((visibility("default"))) - -EXPORT const int OBS_N = OBS_SIZE; -EXPORT const int NUM_ATNS_EXPORT = NUM_ATNS; -EXPORT const int ACT_SIZES_EXPORT[NUM_ATNS] = ACT_SIZES; -EXPORT const int OBS_T = OBS_TYPE; -EXPORT const int ACT_T = ACT_TYPE; -EXPORT const size_t ENV_SIZE = sizeof(Env); - -// Direct step on all envs - for benchmarking vec overhead -EXPORT void vec_step_direct(VecEnv* vec) { - for (int i = 0; i < vec->size; i++) { - c_step(&vec->envs[i]); - } -} - -#define INIT 0 -#define OBS_READY_ON_CPU 1 -#define OBS_READY_ON_GPU 2 -#define ATN_READY_ON_GPU 3 -#define ATN_READY_ON_CPU 4 -#define OMP_WAITING 5 -#define OMP_RUNNING 6 - -typedef struct Threading { - atomic_long* completed; - long start_index; - long end_index; - int num_threads; - pthread_cond_t wake_cond; - pthread_mutex_t wake_mutex; - pthread_cond_t all_done_cond; - pthread_mutex_t all_done_mutex; - pthread_t* threads; - atomic_int* buffer_states; - int block_size; - int num_envs; - int num_buffers; - bool use_gpu; - int test_idx; - long min_expected; - int iters; -} Threading; - -typedef struct WorkerArg { - VecEnv* vec; - int idx; -} WorkerArg; - -// Callback function types for OMP threading -typedef void (*net_callback_fn)(void* ctx, int buf, int t); -typedef void (*thread_init_fn)(void* ctx, int buf); - -typedef struct OMPWorkerArg { - VecEnv* vec; - int buf; - int horizon; - void* ctx; - net_callback_fn net_callback; - thread_init_fn thread_init; -} OMPWorkerArg; - -// Forward declarations for env-specific functions supplied by user -void my_log(Log* log, Dict* out); -void my_init(Env* env, Dict* args); +// Dict types +typedef struct { + const char* key; + double value; + void* ptr; +} DictItem; -// Optional: Initialize all envs at once (for shared state, etc.) 
-// Allocates and returns Env* array, sets *num_envs_out -// vec_kwargs contains: total_agents, num_buffers -// env_kwargs contains env-specific config -// Default implementation allocates max possible envs and loops over my_init -Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs); -#ifndef MY_VEC_INIT -Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs) { - int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; - // Allocate max possible envs (1 agent per env worst case) - Env* envs = (Env*)calloc(total_agents, sizeof(Env)); +typedef struct { + DictItem* items; + int size; + int capacity; +} Dict; - int num_envs = 0; - int agents_created = 0; - while (agents_created < total_agents) { - srand(num_envs); - my_init(&envs[num_envs], env_kwargs); - agents_created += envs[num_envs].num_agents; - num_envs++; - } - // Shrink to actual size needed - envs = (Env*)realloc(envs, num_envs * sizeof(Env)); - *num_envs_out = num_envs; - return envs; +static inline Dict* create_dict(int capacity) { + Dict* dict = (Dict*)calloc(1, sizeof(Dict)); + dict->capacity = capacity; + dict->items = (DictItem*)calloc(capacity, sizeof(DictItem)); + return dict; } -#endif - -void* my_shared(Env* env, Dict* kwargs); -#ifndef MY_SHARED -void* my_shared(Env* env, Dict* kwargs) { - return NULL; -} -#endif - -void my_shared_close(Env* env); -#ifndef MY_SHARED_CLOSE -void my_shared_close(Env* env) {} -#endif - -void* my_get(Env* env, Dict* out); -#ifndef MY_GET -void* my_get(Env* env, Dict* out) { - return NULL; -} -#endif - -int my_put(Env* env, Dict* kwargs); -#ifndef MY_PUT -int my_put(Env* env, Dict* kwargs) { - return 0; -} -#endif - -void update_buffer_state(Threading* threading, int buf, int val) { - atomic_int* states = threading->buffer_states; - int old_val = atomic_load(&states[buf]); - atomic_store(&states[buf], val); - //printf("Updated vecenv %d buf %d from %d to %d \n", threading->test_idx, buf, old_val, val); -} - -static void* c_threadstep(void* arg) -{ - WorkerArg* worker_arg = (WorkerArg*)arg; - VecEnv* vec = worker_arg->vec; - Threading* threading = vec->threading; - int block_size = threading->block_size; - int num_envs = vec->size; - atomic_long* completed = &threading->completed[worker_arg->idx]; - long end = 0; - long block_start = 0; - while (1) { - pthread_mutex_lock(&threading->wake_mutex); - while (threading->start_index >= threading->end_index) { - atomic_store(completed, threading->start_index); - //printf("Min completed %d on thread %d. 
end %d test idx %d\n", threading->start_index, worker_arg->idx, threading->end_index, threading->test_idx); - pthread_cond_wait(&threading->wake_cond, &threading->wake_mutex); +static inline DictItem* dict_get_unsafe(Dict* dict, const char* key) { + for (int i = 0; i < dict->size; i++) { + if (strcmp(dict->items[i].key, key) == 0) { + return &dict->items[i]; } - - long start = threading->start_index; - long end = threading->start_index + block_size; - if (end > threading->end_index) { - end = threading->end_index; - } - threading->start_index = end; - pthread_mutex_unlock(&threading->wake_mutex); - - for (long i=start; i= num_envs) { - fprintf(stderr, "BOUNDS ERROR: idx=%d, num_envs=%d, i=%ld, start=%ld, end=%ld\n", - idx, num_envs, i, start, end); - exit(1); - } - c_step(&vec->envs[idx]); - } - atomic_store(completed, end); } return NULL; } -static void* c_threadmanager(void* arg) { - VecEnv* vec = (VecEnv*)arg; - Threading* threading = vec->threading; - - int agents_per_buffer = vec->total_agents / vec->buffers; - atomic_int* buffer_states = threading->buffer_states; - long iters = 0; - int curr_buf = 0; - long min_expected = 0; - - while (1) { - for (int buf=0; buf < vec->buffers; buf++) { - int state = atomic_load(&buffer_states[buf]); - bool cuda_ready = !threading->use_gpu || cudaStreamQuery(vec->streams[buf]) == cudaSuccess; - if (state == ATN_READY_ON_GPU && cuda_ready) { - update_buffer_state(threading, buf, ATN_READY_ON_CPU); - int num_envs = vec->buffer_env_counts[buf]; - pthread_mutex_lock(&threading->wake_mutex); - threading->end_index += num_envs; - min_expected += num_envs; - pthread_cond_broadcast(&threading->wake_cond); - pthread_mutex_unlock(&threading->wake_mutex); - } - - if (buf != curr_buf) { - continue; - } - - threading->min_expected = min_expected; - long min_completed = LONG_MAX; - for (int i=0; inum_threads; i++) { - long completed = atomic_load(threading->completed + i); - if (completed < min_completed) { - min_completed = completed; - } - } - if (min_completed < min_expected) { - continue; - } - - if (state == ATN_READY_ON_CPU) { - curr_buf = (curr_buf + 1) % vec->buffers; - iters++; - threading->iters = iters; - - int start = buf * agents_per_buffer; - - if (threading->use_gpu) { - cudaMemcpyAsync( - &((OBS_DTYPE*)vec->gpu_observations)[start*OBS_SIZE], - &((OBS_DTYPE*)vec->observations)[start*OBS_SIZE], - agents_per_buffer*OBS_SIZE*sizeof(OBS_DTYPE), - cudaMemcpyHostToDevice, - vec->streams[buf] - ); - cudaMemcpyAsync( - &vec->gpu_rewards[start], - &vec->rewards[start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - vec->streams[buf] - ); - cudaMemcpyAsync( - &vec->gpu_terminals[start], - &vec->terminals[start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - vec->streams[buf] - ); - } - update_buffer_state(threading, buf, OBS_READY_ON_CPU); - } - } - } -} - -EXPORT void omp_minimal_vecstep(VecEnv* vec) { - int num_workers = vec->threading->num_threads; - int num_envs = vec->size; - #pragma omp parallel for schedule(static) num_threads(num_workers) - for (int i=0; ienvs[i]); - } -} - -static void* omp_threadmanager(void* arg) { - OMPWorkerArg* worker_arg = (OMPWorkerArg*)arg; - VecEnv* vec = worker_arg->vec; - Threading* threading = vec->threading; - int buf = worker_arg->buf; - int horizon = worker_arg->horizon; - void* ctx = worker_arg->ctx; - net_callback_fn net_callback = worker_arg->net_callback; - thread_init_fn thread_init = worker_arg->thread_init; - - assert(net_callback != NULL && "omp_threadmanager: net_callback is 
NULL"); - - // Initialize thread-local state (e.g., CUDA stream) once per thread - if (thread_init != NULL) { - thread_init(ctx, buf); - } - - int agents_per_buffer = vec->total_agents / vec->buffers; - int agent_start = buf * agents_per_buffer; - int env_start = vec->buffer_env_starts[buf]; - int env_count = vec->buffer_env_counts[buf]; - atomic_int* buffer_states = threading->buffer_states; - int num_workers = threading->num_threads / vec->buffers; - - while (1) { - // Wait for start signal - while (atomic_load(&buffer_states[buf]) != OMP_RUNNING) {} - cudaStream_t stream = vec->streams[buf]; - - for (int t=0; t CPU (same stream as policy, no sync needed before) - if (threading->use_gpu) { - cudaMemcpyAsync( - &vec->actions[agent_start*NUM_ATNS], - &vec->gpu_actions[agent_start*NUM_ATNS], - agents_per_buffer*NUM_ATNS*sizeof(double), - cudaMemcpyDeviceToHost, - stream - ); - // Sync before CPU stepping - cudaStreamSynchronize(stream); - } - - #pragma omp parallel for schedule(static) num_threads(num_workers) - for (int i = env_start; i < env_start + env_count; i++) { - c_step(&vec->envs[i]); - } - - // Transfer obs/rewards/terminals CPU -> GPU (async, no sync needed after) - if (threading->use_gpu) { - cudaMemcpyAsync( - &((OBS_DTYPE*)vec->gpu_observations)[agent_start*OBS_SIZE], - &((OBS_DTYPE*)vec->observations)[agent_start*OBS_SIZE], - agents_per_buffer*OBS_SIZE*sizeof(OBS_DTYPE), - cudaMemcpyHostToDevice, - stream - ); - cudaMemcpyAsync( - &vec->gpu_rewards[agent_start], - &vec->rewards[agent_start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - stream - ); - cudaMemcpyAsync( - &vec->gpu_terminals[agent_start], - &vec->terminals[agent_start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - stream - ); - } - //cudaStreamSynchronize(stream); - //cudaDeviceSynchronize(); - } - cudaStreamSynchronize(stream); - - // Signal done - atomic_store(&buffer_states[buf], OMP_WAITING); - } -} - -EXPORT void vec_omp_step(VecEnv* vec) { - Threading* threading = vec->threading; - - // Signal all buffers to start - for (int buf = 0; buf < vec->buffers; buf++) { - atomic_store(&threading->buffer_states[buf], OMP_RUNNING); - } - - // Wait for all buffers to finish - for (int buf = 0; buf < vec->buffers; buf++) { - while (atomic_load(&threading->buffer_states[buf]) != OMP_WAITING) {} - } +static inline DictItem* dict_get(Dict* dict, const char* key) { + DictItem* item = dict_get_unsafe(dict, key); + if (item == NULL) printf("dict_get failed to find key: %s\n", key); + assert(item != NULL); + return item; } -EXPORT VecEnv* create_environments(int buffers, bool use_gpu, int test_idx, Dict* vec_kwargs, Dict* env_kwargs) { - // my_vec_init allocates envs and determines how many are needed - int num_envs = 0; - Env* envs = my_vec_init(&num_envs, vec_kwargs, env_kwargs); - - VecEnv* vec = (VecEnv*)calloc(1, sizeof(VecEnv)); - vec->envs = envs; - vec->size = num_envs; - vec->buffers = buffers; - vec->threading = calloc(1, sizeof(Threading)); - vec->threading->use_gpu = use_gpu; - vec->threading->test_idx = test_idx; - - // Get total_agents from vec config - this is the padded total - int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; - int agents_per_buffer = total_agents / buffers; - vec->total_agents = total_agents; - vec->agents_per_buffer = agents_per_buffer; - - // Allocate buffer tracking arrays - vec->buffer_env_starts = (int*)calloc(buffers, sizeof(int)); - vec->buffer_env_counts = (int*)calloc(buffers, sizeof(int)); - - // Assign envs to buffers and 
validate - int current_buf = 0; - int current_buf_agents = 0; - vec->buffer_env_starts[0] = 0; - - for (int i = 0; i < num_envs; i++) { - int env_agents = envs[i].num_agents; - - // Check if adding this env exceeds buffer limit - if (current_buf_agents + env_agents > agents_per_buffer) { - if (current_buf >= buffers - 1) { - fprintf(stderr, "ERROR: Env %d with %d agents overruns last buffer (has %d, limit %d)\n", - i, env_agents, current_buf_agents, agents_per_buffer); - assert(0 && "my_vec_init created too many agents for buffer capacity"); - } - current_buf++; - vec->buffer_env_starts[current_buf] = i; - current_buf_agents = 0; - } - - vec->buffer_env_counts[current_buf]++; - current_buf_agents += env_agents; - } - - // Allocate memory for total_agents (includes padding) - if (use_gpu) { - cudaSetDevice(0); - CHECK_CUDA(cudaHostAlloc((void**)&vec->observations, total_agents*OBS_SIZE*sizeof(OBS_DTYPE), cudaHostAllocPortable)); - CHECK_CUDA(cudaHostAlloc((void**)&vec->actions, total_agents*NUM_ATNS*sizeof(double), cudaHostAllocPortable)); - CHECK_CUDA(cudaHostAlloc((void**)&vec->rewards, total_agents*sizeof(float), cudaHostAllocPortable)); - CHECK_CUDA(cudaHostAlloc((void**)&vec->terminals, total_agents*sizeof(float), cudaHostAllocPortable)); - CHECK_CUDA(cudaHostAlloc((void**)&vec->mask, total_agents*sizeof(float), cudaHostAllocPortable)); - } else { - vec->observations = calloc(total_agents*OBS_SIZE, sizeof(OBS_DTYPE)); - vec->actions = calloc(total_agents*NUM_ATNS, sizeof(double)); - vec->rewards = calloc(total_agents, sizeof(float)); - vec->terminals = calloc(total_agents, sizeof(float)); - vec->mask = calloc(total_agents, sizeof(float)); - } - - memset(vec->observations, 0, total_agents*OBS_SIZE*sizeof(OBS_DTYPE)); - memset(vec->actions, 0, total_agents*NUM_ATNS*sizeof(double)); - memset(vec->rewards, 0, total_agents*sizeof(float)); - memset(vec->terminals, 0, total_agents*sizeof(float)); - memset(vec->mask, 0, total_agents*sizeof(float)); - - if (use_gpu) { - CHECK_CUDA(cudaMalloc((void**)&vec->gpu_observations, total_agents*OBS_SIZE*sizeof(OBS_DTYPE))); - CHECK_CUDA(cudaMalloc((void**)&vec->gpu_actions, total_agents*NUM_ATNS*sizeof(double))); - CHECK_CUDA(cudaMalloc((void**)&vec->gpu_rewards, total_agents*sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&vec->gpu_terminals, total_agents*sizeof(float))); - CHECK_CUDA(cudaMalloc((void**)&vec->gpu_mask, total_agents*sizeof(float))); - cudaMemset(vec->gpu_observations, 0, total_agents*OBS_SIZE*sizeof(OBS_DTYPE)); - cudaMemset(vec->gpu_actions, 0, total_agents*NUM_ATNS*sizeof(double)); - cudaMemset(vec->gpu_rewards, 0, total_agents*sizeof(float)); - cudaMemset(vec->gpu_terminals, 0, total_agents*sizeof(float)); - cudaMemset(vec->gpu_mask, 0, total_agents*sizeof(float)); - } else { - vec->gpu_observations = vec->observations; - vec->gpu_actions = vec->actions; - vec->gpu_rewards = vec->rewards; - vec->gpu_terminals = vec->terminals; - vec->gpu_mask = vec->mask; - } - - // Assign env pointers and set mask for real agents - // Agents are laid out per-buffer with padding at end of each buffer - for (int buf = 0; buf < buffers; buf++) { - int buf_start = buf * agents_per_buffer; - int buf_agent = 0; - int env_start = vec->buffer_env_starts[buf]; - int env_count = vec->buffer_env_counts[buf]; - - for (int e = 0; e < env_count; e++) { - Env* env = &envs[env_start + e]; - int slot = buf_start + buf_agent; - env->observations = (OBS_DTYPE*)vec->observations + slot*OBS_SIZE; - env->actions = vec->actions + slot*NUM_ATNS; - env->rewards = 
vec->rewards + slot; - env->terminals = vec->terminals + slot; - - // Set mask to 1.0 for real agents - for (int a = 0; a < env->num_agents; a++) { - vec->mask[slot + a] = 1.0f; - } - buf_agent += env->num_agents; - } - // Remaining slots in buffer are padding (mask stays 0.0) - } - - // Copy mask to GPU - if (use_gpu) { - cudaMemcpy(vec->gpu_mask, vec->mask, total_agents*sizeof(float), cudaMemcpyHostToDevice); - } - - return vec; -} - -EXPORT void create_threads(VecEnv* vec, int threads, int block_size, bool use_omp, void* ctx, net_callback_fn net_callback, thread_init_fn thread_init, int horizon) { - Threading* threading = vec->threading; - threading->num_threads = threads; - threading->block_size = block_size; - threading->completed = (atomic_long*)calloc(threads, sizeof(atomic_long)); - threading->buffer_states = (atomic_int*)calloc(vec->buffers, sizeof(atomic_int)); - threading->num_envs = vec->size; - threading->num_buffers = vec->buffers; - - vec->streams = (cudaStream_t*)calloc(vec->buffers, sizeof(cudaStream_t)); - if (threading->use_gpu) { - for (int i = 0; i < vec->buffers; i++) { - cudaStreamCreateWithFlags(&vec->streams[i], cudaStreamNonBlocking); - } - } - - if (use_omp) { - OMPWorkerArg* worker_args = (OMPWorkerArg*)calloc(vec->buffers, sizeof(OMPWorkerArg)); - threading->threads = (pthread_t*)calloc(vec->buffers, sizeof(pthread_t)); - assert(threading->threads != NULL && "create_vecenv failed to allocate memory for threads\n"); - - for (int i = 0; i < vec->buffers; i++) { - OMPWorkerArg* arg = &worker_args[i]; - arg->ctx = ctx; - arg->net_callback = net_callback; - arg->thread_init = thread_init; - arg->horizon = horizon; - arg->vec = vec; - arg->buf = i; - - int err = pthread_create(&threading->threads[i], NULL, omp_threadmanager, (void*)(arg)); - assert(err == 0 && "create_vecenv failed to create thread\n"); - } - } else { - if (threads == 0) { - return; - } - - WorkerArg* worker_args = (WorkerArg*)calloc(threads, sizeof(WorkerArg)); - - threading->threads = (pthread_t*)calloc(threads + 1, sizeof(pthread_t)); - assert(threading->threads != NULL && "create_vecenv failed to allocate memory for threads\n"); - assert(pthread_cond_init(&threading->wake_cond, NULL) == 0 && "create_vecenv failed to initialize wake_cond\n"); - assert(pthread_mutex_init(&threading->wake_mutex, NULL) == 0 && "create_vecenv failed to initialize wake_mutex\n"); - //atomic_store(&threading->end_index, 0); - //atomic_store(&threading->work_index, 0); - - for (int i = 0; i < threads; i++) { - WorkerArg* arg = &worker_args[i]; - arg->vec = vec; - arg->idx = i; - - int err = pthread_create(&threading->threads[i], NULL, c_threadstep, (void*)(arg)); - assert(err == 0 && "create_vecenv failed to create thread\n"); - } - - // Last thread manages host device syncs - int err = pthread_create(&threading->threads[threads], NULL, c_threadmanager, (void*)(vec)); - assert(err == 0 && "create_vecenv failed to create manager thread\n"); - } -} - -EXPORT Env* env_init(OBS_DTYPE* observations, double* actions, float* rewards, - float* terminals, int seed, Dict* kwargs) { - Env* env = (Env*)calloc(1, sizeof(Env)); - assert(env != NULL && "env_init failed to allocated memory\n"); - - // TODO: Types can vary - env->observations = observations; - env->actions = actions; - env->rewards = rewards; - env->terminals = terminals; - - srand(seed); - my_init(env, kwargs); - return env; -} - -EXPORT void vec_reset(VecEnv* vec) { - for (int i = 0; i < vec->size; i++) { - Env* env = &vec->envs[i]; - c_reset(env); - } - - Threading* 
threading = vec->threading; - if (threading->use_gpu) { - cudaMemcpy( - vec->gpu_observations, - vec->observations, - vec->total_agents*OBS_SIZE*sizeof(OBS_DTYPE), - cudaMemcpyHostToDevice - ); - cudaMemcpy( - vec->gpu_rewards, - vec->rewards, - vec->total_agents*sizeof(float), - cudaMemcpyHostToDevice - ); - cudaMemcpy( - vec->gpu_terminals, - vec->terminals, - vec->total_agents*sizeof(float), - cudaMemcpyHostToDevice - ); - cudaDeviceSynchronize(); - } - if (threading->num_threads > 0) { - for (int buf=0; buf < vec->buffers; buf++) { - update_buffer_state(threading, buf, OBS_READY_ON_CPU); - } - } -} - -EXPORT void vec_send(VecEnv* vec, int buffer, cudaStream_t stream) { - int env_start = vec->buffer_env_starts[buffer]; - int env_count = vec->buffer_env_counts[buffer]; - int agents_per_buffer = vec->agents_per_buffer; - int start = buffer * agents_per_buffer; - - Threading* threading = vec->threading; - - // Single threaded - if (threading->num_threads == 0) { - - if (threading->use_gpu) { - cudaStreamSynchronize(stream); - cudaMemcpyAsync( - &vec->actions[start*NUM_ATNS], - &vec->gpu_actions[start*NUM_ATNS], - agents_per_buffer*NUM_ATNS*sizeof(double), - cudaMemcpyDeviceToHost, - stream - ); - cudaStreamSynchronize(stream); - } - - for (int i = env_start; i < env_start + env_count; i++) { - Env* env = &vec->envs[i]; - c_step(env); - } - if (threading->use_gpu) { - cudaMemcpyAsync( - &((OBS_DTYPE*)vec->gpu_observations)[start*OBS_SIZE], - &((OBS_DTYPE*)vec->observations)[start*OBS_SIZE], - agents_per_buffer*OBS_SIZE*sizeof(OBS_DTYPE), - cudaMemcpyHostToDevice, - stream - ); - cudaMemcpyAsync( - &vec->gpu_rewards[start], - &vec->rewards[start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - stream - ); - cudaMemcpyAsync( - &vec->gpu_terminals[start], - &vec->terminals[start], - agents_per_buffer*sizeof(float), - cudaMemcpyHostToDevice, - stream - ); - cudaStreamSynchronize(stream); - } - } else { - if (threading->use_gpu) { - cudaMemcpyAsync( - &vec->actions[start*NUM_ATNS], - &vec->gpu_actions[start*NUM_ATNS], - agents_per_buffer*NUM_ATNS*sizeof(double), - cudaMemcpyDeviceToHost, - vec->streams[buffer] - ); - } - - atomic_int* buffer_states = threading->buffer_states; - update_buffer_state(threading, buffer, ATN_READY_ON_GPU); - } -} - -EXPORT void vec_recv(VecEnv* vec, int buffer, cudaStream_t stream) { - if (vec->threading->use_gpu) { - cudaStreamSynchronize(stream); - } - - Threading* threading = vec->threading; - - if (threading->num_threads > 0) { - atomic_int* buffer_states = threading->buffer_states; - while (atomic_load(&buffer_states[buffer]) != OBS_READY_ON_CPU) {} - if (threading->use_gpu) { - cudaStreamSynchronize(vec->streams[buffer]); - } - update_buffer_state(vec->threading, buffer, OBS_READY_ON_GPU); - } -} - -EXPORT void vec_step(VecEnv* vec, int buffer, cudaStream_t stream) { - vec_send(vec, buffer, stream); - vec_recv(vec, buffer, stream); -} - -EXPORT void env_close(Env* env) { - c_close(env); - free(env); -} - -EXPORT void vec_close(VecEnv* vec) { - for (int i = 0; i < vec->size; i++) { - Env* env = &vec->envs[i]; - c_close(env); - } - free(vec->envs); -} - -EXPORT void vec_render(VecEnv* vec, int env_idx) { - Env* env = &vec->envs[env_idx]; - c_render(env); -} - -EXPORT void vec_log(VecEnv* vec, Dict* out) { - Log aggregate = {0}; - int num_keys = sizeof(Log) / sizeof(float); - for (int i = 0; i < vec->size; i++) { - Env* env = &vec->envs[i]; - for (int j = 0; j < num_keys; j++) { - ((float*)&aggregate)[j] += ((float*)&env->log)[j]; - 
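// --- Aside, not part of the patch: usage note on the lookup helpers in this header.
// dict_get_unsafe returns NULL for a missing key, while dict_get prints the key and asserts,
// so the former suits optional config. dict_set is defined just below; the key names are made up.

static void dict_usage_example(void) {
    Dict* kwargs = create_dict(4);
    dict_set(kwargs, "total_agents", 1024);

    int total_agents = (int)dict_get(kwargs, "total_agents")->value;  // required: asserts if missing

    DictItem* maybe_seed = dict_get_unsafe(kwargs, "seed");           // optional: NULL if missing
    int seed = (maybe_seed != NULL) ? (int)maybe_seed->value : 0;
    (void)total_agents; (void)seed;
}
// --- end aside ---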
((float*)&env->log)[j] = 0.0f; - } - } - - if (aggregate.n == 0.0f) { +static inline void dict_set(Dict* dict, const char* key, double value) { + assert(dict->size < dict->capacity); + DictItem* item = dict_get_unsafe(dict, key); + if (item != NULL) { + item->value = value; return; } + dict->items[dict->size].key = key; + dict->items[dict->size].value = value; + dict->size++; +} + +// Forward declare CUDA stream type +typedef struct CUstream_st* cudaStream_t; + +// Threading state +typedef struct StaticThreading StaticThreading; + +// Generic VecEnv - envs is void* to be type-agnostic +typedef struct StaticVec { + void* envs; + int size; + int total_agents; + int buffers; + int agents_per_buffer; + int* buffer_env_starts; + int* buffer_env_counts; + float* observations; + double* actions; + float* rewards; + float* terminals; + float* gpu_observations; + double* gpu_actions; + float* gpu_rewards; + float* gpu_terminals; + cudaStream_t* streams; + StaticThreading* threading; + int obs_size; + int num_atns; +} StaticVec; + +// Callback types +typedef void (*net_callback_fn)(void* ctx, int buf, int t); +typedef void (*thread_init_fn)(void* ctx, int buf); +typedef void (*step_fn)(void* env); - // Average - float n = aggregate.n; - for (int i = 0; i < num_keys; i++) { - ((float*)&aggregate)[i] /= n; - } +// Functions implemented by env's static library +StaticVec* create_static_vec(int total_agents, int num_buffers, Dict* vec_kwargs, Dict* env_kwargs); +void static_vec_reset(StaticVec* vec); +void static_vec_log(StaticVec* vec, Dict* out); +void create_static_threads(StaticVec* vec, int num_threads, int horizon, + void* ctx, net_callback_fn net_callback, thread_init_fn thread_init); +void static_vec_omp_step(StaticVec* vec); - // User populates dict - dict_set(out, "n", n); - my_log(&aggregate, out); -} +// Env info +int get_obs_size(void); +int get_num_atns(void); +int* get_act_sizes(void); -// Single dlsym entry point - returns struct with all exports -EXPORT EnvExports* get_env_exports(void) { - static EnvExports exports = {0}; - static int initialized = 0; - if (!initialized) { - exports.create_environments = create_environments; - exports.create_threads = create_threads; - exports.env_init = env_init; - exports.vec_reset = vec_reset; - exports.vec_step = vec_step; - exports.vec_send = vec_send; - exports.vec_recv = vec_recv; - exports.vec_omp_step = vec_omp_step; - exports.env_close = env_close; - exports.vec_close = vec_close; - exports.vec_log = vec_log; - exports.vec_render = vec_render; - exports.obs_n = OBS_N; - exports.num_atns = NUM_ATNS_EXPORT; - exports.act_sizes = (int*)ACT_SIZES_EXPORT; - exports.obs_type = OBS_T; - exports.act_type = ACT_T; - initialized = 1; - } - return &exports; +// Optional shared state functions +void* my_shared(void* env, Dict* kwargs); +void my_shared_close(void* env); +void* my_get(void* env, Dict* out); +int my_put(void* env, Dict* kwargs); + +#ifdef __cplusplus } +#endif diff --git a/pufferlib/extensions/models.cpp b/pufferlib/extensions/models.cpp index 35aae40dd..bf7644314 100644 --- a/pufferlib/extensions/models.cpp +++ b/pufferlib/extensions/models.cpp @@ -432,62 +432,87 @@ class NMMO3Decoder : public Decoder { }; // Drive encoder: ego/partner/road encoders with max pooling +// Two modes: +// use_fused_kernel=true: FC -> Max (fused kernel, no intermediate layer) +// use_fused_kernel=false: Linear -> LayerNorm -> Linear -> Max (original torch) class DriveEncoder : public Encoder { public: - // Ego encoder: Linear -> LayerNorm -> Linear + // Ego 
encoder: Linear -> ReLU -> Linear (no max pooling, single point) torch::nn::Linear ego_linear1{nullptr}; - torch::nn::LayerNorm ego_norm{nullptr}; torch::nn::Linear ego_linear2{nullptr}; - // Road encoder: Linear -> LayerNorm -> Linear + + // Road encoder weights - fused mode: single FC layer + Tensor road_W{nullptr}; + Tensor road_b{nullptr}; + // Road encoder modules - torch mode: Linear -> LayerNorm -> Linear torch::nn::Linear road_linear1{nullptr}; - torch::nn::LayerNorm road_norm{nullptr}; + torch::nn::LayerNorm road_ln{nullptr}; torch::nn::Linear road_linear2{nullptr}; - // Partner encoder: Linear -> LayerNorm -> Linear + + // Partner encoder weights - fused mode: single FC layer + Tensor partner_W{nullptr}; + Tensor partner_b{nullptr}; + // Partner encoder modules - torch mode: Linear -> LayerNorm -> Linear torch::nn::Linear partner_linear1{nullptr}; - torch::nn::LayerNorm partner_norm{nullptr}; + torch::nn::LayerNorm partner_ln{nullptr}; torch::nn::Linear partner_linear2{nullptr}; + // Shared embedding torch::nn::Linear shared_linear{nullptr}; int input_size; int hidden_size; + bool use_fused_kernel; + + DriveEncoder(int64_t input_size, int64_t hidden_size, bool use_fused_kernel = true) + : input_size(128), hidden_size(hidden_size), use_fused_kernel(use_fused_kernel) { - DriveEncoder(int64_t input_size, int64_t hidden_size) - : input_size(128), hidden_size(hidden_size) { - // Ego encoder: 7 -> 128 -> 128 + // Ego encoder: 7 -> 128 -> 128 (Linear -> ReLU -> Linear) ego_linear1 = register_module("ego_linear1", torch::nn::Linear( torch::nn::LinearOptions(7, 128).bias(true))); torch::nn::init::orthogonal_(ego_linear1->weight, std::sqrt(2.0)); torch::nn::init::constant_(ego_linear1->bias, 0.0); - ego_norm = register_module("ego_norm", torch::nn::LayerNorm( - torch::nn::LayerNormOptions({128}))); ego_linear2 = register_module("ego_linear2", torch::nn::Linear( torch::nn::LinearOptions(128, 128).bias(true))); torch::nn::init::orthogonal_(ego_linear2->weight, std::sqrt(2.0)); torch::nn::init::constant_(ego_linear2->bias, 0.0); - // Road encoder: 13 -> 128 -> 128 (6 continuous + 7 one-hot) - road_linear1 = register_module("road_linear1", torch::nn::Linear( - torch::nn::LinearOptions(13, 128).bias(true))); - torch::nn::init::orthogonal_(road_linear1->weight, std::sqrt(2.0)); - torch::nn::init::constant_(road_linear1->bias, 0.0); - road_norm = register_module("road_norm", torch::nn::LayerNorm( - torch::nn::LayerNormOptions({128}))); - road_linear2 = register_module("road_linear2", torch::nn::Linear( - torch::nn::LinearOptions(128, 128).bias(true))); - torch::nn::init::orthogonal_(road_linear2->weight, std::sqrt(2.0)); - torch::nn::init::constant_(road_linear2->bias, 0.0); - - // Partner encoder: 7 -> 128 -> 128 - partner_linear1 = register_module("partner_linear1", torch::nn::Linear( - torch::nn::LinearOptions(7, 128).bias(true))); - torch::nn::init::orthogonal_(partner_linear1->weight, std::sqrt(2.0)); - torch::nn::init::constant_(partner_linear1->bias, 0.0); - partner_norm = register_module("partner_norm", torch::nn::LayerNorm( - torch::nn::LayerNormOptions({128}))); - partner_linear2 = register_module("partner_linear2", torch::nn::Linear( - torch::nn::LinearOptions(128, 128).bias(true))); - torch::nn::init::orthogonal_(partner_linear2->weight, std::sqrt(2.0)); - torch::nn::init::constant_(partner_linear2->bias, 0.0); + if (use_fused_kernel) { + // Fused mode: single FC -> Max (no intermediate layer) + // Road: 13 -> 128 (6 continuous + 7 one-hot) + road_W = register_parameter("road_W", 
torch::empty({128, 13})); + road_b = register_parameter("road_b", torch::zeros({128})); + torch::nn::init::orthogonal_(road_W, std::sqrt(2.0)); + + // Partner: 7 -> 128 + partner_W = register_parameter("partner_W", torch::empty({128, 7})); + partner_b = register_parameter("partner_b", torch::zeros({128})); + torch::nn::init::orthogonal_(partner_W, std::sqrt(2.0)); + } else { + // Torch mode: Linear -> LayerNorm -> Linear -> Max + // Road: 13 -> 128 -> 128 + road_linear1 = register_module("road_linear1", torch::nn::Linear( + torch::nn::LinearOptions(13, 128).bias(true))); + torch::nn::init::orthogonal_(road_linear1->weight, std::sqrt(2.0)); + torch::nn::init::constant_(road_linear1->bias, 0.0); + road_ln = register_module("road_ln", torch::nn::LayerNorm( + torch::nn::LayerNormOptions({128}))); + road_linear2 = register_module("road_linear2", torch::nn::Linear( + torch::nn::LinearOptions(128, 128).bias(true))); + torch::nn::init::orthogonal_(road_linear2->weight, std::sqrt(2.0)); + torch::nn::init::constant_(road_linear2->bias, 0.0); + + // Partner: 7 -> 128 -> 128 + partner_linear1 = register_module("partner_linear1", torch::nn::Linear( + torch::nn::LinearOptions(7, 128).bias(true))); + torch::nn::init::orthogonal_(partner_linear1->weight, std::sqrt(2.0)); + torch::nn::init::constant_(partner_linear1->bias, 0.0); + partner_ln = register_module("partner_ln", torch::nn::LayerNorm( + torch::nn::LayerNormOptions({128}))); + partner_linear2 = register_module("partner_linear2", torch::nn::Linear( + torch::nn::LinearOptions(128, 128).bias(true))); + torch::nn::init::orthogonal_(partner_linear2->weight, std::sqrt(2.0)); + torch::nn::init::constant_(partner_linear2->bias, 0.0); + } // Shared embedding: 3*128 -> hidden_size shared_linear = register_module("shared_linear", torch::nn::Linear( @@ -505,22 +530,41 @@ class DriveEncoder : public Encoder { Tensor partner_obs = x.narrow(1, 7, 63*7); Tensor road_obs = x.narrow(1, 7 + 63*7, 200*7); - // Ego encoding - Tensor ego_features = ego_linear2->forward(ego_norm->forward(ego_linear1->forward(ego_obs))); + // Ego encoding: Linear -> ReLU -> Linear (single point, no max) + Tensor ego_features = ego_linear2->forward(torch::relu(ego_linear1->forward(ego_obs))); - // Partner encoding with max pooling - Tensor partner_objects = partner_obs.view({B, 63, 7}); - Tensor partner_enc = partner_linear2->forward(partner_norm->forward(partner_linear1->forward(partner_objects))); - Tensor partner_features = std::get<0>(partner_enc.max(1)); // max pool over 63 objects + // Partner encoding + Tensor partner_objects = partner_obs.view({B, 63, 7}).contiguous(); + Tensor partner_features; + if (use_fused_kernel) { + // Fused FC -> Max kernel + partner_features = fc_max(partner_objects, partner_W, partner_b); + } else { + // Torch: Linear -> LayerNorm -> Linear -> Max + auto h = partner_linear1->forward(partner_objects); // (B, 63, 128) + h = partner_ln->forward(h); + h = partner_linear2->forward(h); // (B, 63, 128) + partner_features = std::get<0>(h.max(1)); // (B, 128) + } - // Road encoding with one-hot and max pooling + // Road encoding with one-hot Tensor road_objects = road_obs.view({B, 200, 7}); Tensor road_continuous = road_objects.narrow(2, 0, 6); Tensor road_categorical = road_objects.narrow(2, 6, 1).squeeze(2); Tensor road_onehot = torch::one_hot(road_categorical.to(torch::kInt64), 7).to(torch::kFloat32); - Tensor road_combined = torch::cat({road_continuous, road_onehot}, 2); // (B, 200, 13) - Tensor road_enc = 
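// --- Aside, not part of the patch: the fused fc_max used in both branches here is intended to match
// a plain Linear followed by a max over the object dimension, up to float accumulation order.
// A minimal check with the partner-branch shapes is sketched below; the tolerances and random
// inputs are illustrative.

void check_fc_max_against_reference() {
    torch::Tensor x = torch::randn({8, 63, 7}, torch::kCUDA);    // (B, N, D_in)
    torch::Tensor W = torch::randn({128, 7}, torch::kCUDA);      // (D_out, D_in)
    torch::Tensor b = torch::zeros({128}, torch::kCUDA);

    torch::Tensor fused = fc_max(x.contiguous(), W, b);                          // (8, 128)
    torch::Tensor reference = std::get<0>((torch::matmul(x, W.t()) + b).max(1)); // (8, 128)
    TORCH_CHECK(torch::allclose(fused, reference, /*rtol=*/1e-4, /*atol=*/1e-5),
                "fc_max disagrees with the unfused reference");
}
// --- end aside ---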
road_linear2->forward(road_norm->forward(road_linear1->forward(road_combined))); - Tensor road_features = std::get<0>(road_enc.max(1)); // max pool over 200 objects + Tensor road_combined = torch::cat({road_continuous, road_onehot}, 2).contiguous(); // (B, 200, 13) + + Tensor road_features; + if (use_fused_kernel) { + // Fused FC -> Max kernel + road_features = fc_max(road_combined, road_W, road_b); + } else { + // Torch: Linear -> LayerNorm -> Linear -> Max + auto h = road_linear1->forward(road_combined); // (B, 200, 128) + h = road_ln->forward(h); + h = road_linear2->forward(h); // (B, 200, 128) + road_features = std::get<0>(h.max(1)); // (B, 128) + } // Concatenate and shared embedding: GELU -> Linear -> ReLU Tensor concat_features = torch::cat({ego_features, road_features, partner_features}, 1); diff --git a/pufferlib/extensions/modules.cpp b/pufferlib/extensions/modules.cpp index 0b2f6c4e7..f011b86d1 100644 --- a/pufferlib/extensions/modules.cpp +++ b/pufferlib/extensions/modules.cpp @@ -1415,4 +1415,297 @@ std::vector sample_logits_cpp( return {actions, sampled_logprobs}; } +// ============================================================================= +// FCReluFCMax: Fused FC -> ReLU -> FC -> Max autograd function +// Input: x (B, N, D_in) - batch of N points, each with D_in features +// W1 (D_mid, D_in), b1 (D_mid) - first linear layer (applied as x @ W1.T + b1) +// W2 (D_out, D_mid), b2 (D_out) - second linear layer +// Output: (B, D_out) - max over N dimension after FC -> ReLU -> FC +// ============================================================================= + +class FCReluFCMaxFunction : public torch::autograd::Function { +public: + static torch::autograd::tensor_list forward( + torch::autograd::AutogradContext* ctx, + torch::Tensor x, // (B, N, D_in) + torch::Tensor W1, // (D_mid, D_in) + torch::Tensor b1, // (D_mid) + torch::Tensor W2, // (D_out, D_mid) + torch::Tensor b2 // (D_out) + ) { + TORCH_CHECK(x.is_cuda(), "x must be on CUDA"); + TORCH_CHECK(W1.is_cuda(), "W1 must be on CUDA"); + TORCH_CHECK(W2.is_cuda(), "W2 must be on CUDA"); + TORCH_CHECK(x.dim() == 3, "x must be (B, N, D_in)"); + TORCH_CHECK(W1.dim() == 2, "W1 must be (D_mid, D_in)"); + TORCH_CHECK(W2.dim() == 2, "W2 must be (D_out, D_mid)"); + TORCH_CHECK(x.is_contiguous(), "x must be contiguous"); + TORCH_CHECK(W1.is_contiguous(), "W1 must be contiguous"); + TORCH_CHECK(W2.is_contiguous(), "W2 must be contiguous"); + + auto dtype = x.dtype(); + auto device = x.device(); + auto B = x.size(0); + auto N = x.size(1); + auto D_in = x.size(2); + auto D_mid = W1.size(0); + auto D_out = W2.size(0); + + TORCH_CHECK(W1.size(1) == D_in, "W1 must be (D_mid, D_in)"); + TORCH_CHECK(W2.size(1) == D_mid, "W2 must be (D_out, D_mid)"); + TORCH_CHECK(b1.size(0) == D_mid, "b1 must be (D_mid)"); + TORCH_CHECK(b2.size(0) == D_out, "b2 must be (D_out)"); + + auto out = torch::empty({B, D_out}, x.options()); + auto argmax_indices = torch::empty({B, D_out}, torch::TensorOptions().dtype(torch::kInt32).device(device)); + auto fc1_at_argmax = torch::empty({B, D_out, D_mid}, torch::TensorOptions().dtype(torch::kFloat32).device(device)); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (dtype == torch::kFloat32) { + launch_fc_relu_fc_max_forward_float( + out.data_ptr(), + argmax_indices.data_ptr(), + fc1_at_argmax.data_ptr(), + x.data_ptr(), + W1.data_ptr(), + b1.data_ptr(), + W2.data_ptr(), + b2.data_ptr(), + static_cast(B), + static_cast(N), + static_cast(D_in), + static_cast(D_mid), + static_cast(D_out), + stream 
+ ); + } else if (dtype == torch::kBFloat16) { + launch_fc_relu_fc_max_forward_bf16( + out.data_ptr(), + argmax_indices.data_ptr(), + fc1_at_argmax.data_ptr(), + x.data_ptr(), + W1.data_ptr(), + b1.data_ptr(), + W2.data_ptr(), + b2.data_ptr(), + static_cast(B), + static_cast(N), + static_cast(D_in), + static_cast(D_mid), + static_cast(D_out), + stream + ); + } else { + TORCH_CHECK(false, "Unsupported dtype. Only float32 and bfloat16 supported."); + } + + ctx->save_for_backward({x, W1, b1, W2, b2, argmax_indices, fc1_at_argmax}); + + return {out}; + } + + static torch::autograd::tensor_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::tensor_list grad_outputs + ) { + auto saved = ctx->get_saved_variables(); + auto x = saved[0]; + auto W1 = saved[1]; + auto b1 = saved[2]; + auto W2 = saved[3]; + auto b2 = saved[4]; + auto argmax_indices = saved[5]; + auto fc1_at_argmax = saved[6]; + + auto grad_out = grad_outputs[0].contiguous(); + auto dtype = x.dtype(); + + auto B = x.size(0); + auto N = x.size(1); + auto D_in = x.size(2); + auto D_mid = W1.size(0); + auto D_out = W2.size(0); + + // Initialize gradients to zero (backward kernel uses atomicAdd) + auto grad_x = torch::zeros_like(x); + auto grad_W1 = torch::zeros_like(W1); + auto grad_b1 = torch::zeros_like(b1); + auto grad_W2 = torch::zeros_like(W2); + auto grad_b2 = torch::zeros_like(b2); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (dtype == torch::kFloat32) { + launch_fc_relu_fc_max_backward_float( + grad_x.data_ptr(), + grad_W1.data_ptr(), + grad_b1.data_ptr(), + grad_W2.data_ptr(), + grad_b2.data_ptr(), + grad_out.data_ptr(), + x.data_ptr(), + W1.data_ptr(), + W2.data_ptr(), + argmax_indices.data_ptr(), + fc1_at_argmax.data_ptr(), + static_cast(B), + static_cast(N), + static_cast(D_in), + static_cast(D_mid), + static_cast(D_out), + stream + ); + } else if (dtype == torch::kBFloat16) { + launch_fc_relu_fc_max_backward_bf16( + grad_x.data_ptr(), + grad_W1.data_ptr(), + grad_b1.data_ptr(), + grad_W2.data_ptr(), + grad_b2.data_ptr(), + grad_out.data_ptr(), + x.data_ptr(), + W1.data_ptr(), + W2.data_ptr(), + argmax_indices.data_ptr(), + fc1_at_argmax.data_ptr(), + static_cast(B), + static_cast(N), + static_cast(D_in), + static_cast(D_mid), + static_cast(D_out), + stream + ); + } else { + TORCH_CHECK(false, "Unsupported dtype in backward"); + } + + return {grad_x, grad_W1, grad_b1, grad_W2, grad_b2}; + } +}; + +// Named entrypoint: fc_relu_fc_max(x, W1, b1, W2, b2) -> out +torch::Tensor fc_relu_fc_max( + torch::Tensor x, + torch::Tensor W1, + torch::Tensor b1, + torch::Tensor W2, + torch::Tensor b2 +) { + return FCReluFCMaxFunction::apply(x, W1, b1, W2, b2)[0]; +} + +// Reference implementation for testing +torch::Tensor fc_relu_fc_max_cpp( + torch::Tensor x, // (B, N, D_in) + torch::Tensor W1, // (D_mid, D_in) + torch::Tensor b1, // (D_mid) + torch::Tensor W2, // (D_out, D_mid) + torch::Tensor b2 // (D_out) +) { + // FC1: x @ W1.T + b1 -> (B, N, D_mid) + auto fc1 = torch::addmm(b1, x.flatten(0, 1), W1.t()).view({x.size(0), x.size(1), -1}); + // ReLU + auto relu_out = torch::relu(fc1); + // FC2: relu_out @ W2.T + b2 -> (B, N, D_out) + auto fc2 = torch::addmm(b2, relu_out.flatten(0, 1), W2.t()).view({x.size(0), x.size(1), -1}); + // Max over N dimension + return std::get<0>(fc2.max(1)); +} + +// ============================================================================= +// FCMax: Simple FC -> Max (no intermediate ReLU layer) +// 
=============================================================================
+
+class FCMaxFunction : public torch::autograd::Function<FCMaxFunction> {
+public:
+    static torch::autograd::tensor_list forward(
+        torch::autograd::AutogradContext* ctx,
+        torch::Tensor x,   // (B, N, D_in)
+        torch::Tensor W,   // (D_out, D_in)
+        torch::Tensor b    // (D_out)
+    ) {
+        int B = x.size(0);
+        int N = x.size(1);
+        int D_in = x.size(2);
+        int D_out = W.size(0);
+
+        auto out = torch::empty({B, D_out}, x.options());
+        auto argmax = torch::empty({B, D_out}, torch::dtype(torch::kInt32).device(x.device()));
+
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+        // Dispatch covers bf16, but only the float32 branch launches a kernel here
+        AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "fc_max_forward", [&] {
+            if constexpr (std::is_same_v<scalar_t, float>) {
+                launch_fc_max_forward_float(
+                    out.data_ptr<float>(),
+                    argmax.data_ptr<int>(),
+                    x.data_ptr<float>(),
+                    W.data_ptr<float>(),
+                    b.data_ptr<float>(),
+                    B, N, D_in, D_out, stream);
+            }
+        });
+
+        ctx->save_for_backward({x, W, argmax});
+        ctx->saved_data["B"] = B;
+        ctx->saved_data["N"] = N;
+        ctx->saved_data["D_in"] = D_in;
+        ctx->saved_data["D_out"] = D_out;
+
+        return {out, argmax};
+    }
+
+    static torch::autograd::tensor_list backward(
+        torch::autograd::AutogradContext* ctx,
+        torch::autograd::tensor_list grad_outputs
+    ) {
+        auto saved = ctx->get_saved_variables();
+        auto x = saved[0];
+        auto W = saved[1];
+        auto argmax = saved[2];
+        auto grad_out = grad_outputs[0];
+
+        int B = ctx->saved_data["B"].toInt();
+        int N = ctx->saved_data["N"].toInt();
+        int D_in = ctx->saved_data["D_in"].toInt();
+        int D_out = ctx->saved_data["D_out"].toInt();
+
+        auto grad_x = torch::zeros_like(x);
+        auto grad_W = torch::zeros_like(W);
+        auto grad_b = torch::zeros({D_out}, x.options());
+
+        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+        AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "fc_max_backward", [&] {
+            if constexpr (std::is_same_v<scalar_t, float>) {
+                launch_fc_max_backward_float(
+                    grad_x.data_ptr<float>(),
+                    grad_W.data_ptr<float>(),
+                    grad_b.data_ptr<float>(),
+                    grad_out.data_ptr<float>(),
+                    x.data_ptr<float>(),
+                    W.data_ptr<float>(),
+                    argmax.data_ptr<int>(),
+                    B, N, D_in, D_out, stream);
+            }
+        });
+
+        return {grad_x, grad_W, grad_b};
+    }
+};
+
+// Named entrypoint: fc_max(x, W, b) -> out
+torch::Tensor fc_max(torch::Tensor x, torch::Tensor W, torch::Tensor b) {
+    return FCMaxFunction::apply(x, W, b)[0];
+}
+
+// Reference implementation for testing
+torch::Tensor fc_max_cpp(torch::Tensor x, torch::Tensor W, torch::Tensor b) {
+    // FC: x @ W.T + b -> (B, N, D_out)
+    auto fc = torch::addmm(b, x.flatten(0, 1), W.t()).view({x.size(0), x.size(1), -1});
+    // Max over N dimension
+    return std::get<0>(fc.max(1));
+}
+
 #endif // PUFFERLIB_MODULES_CPP
diff --git a/pufferlib/extensions/muon.cpp b/pufferlib/extensions/muon.cpp
index ee1148eff..87e15a168 100644
--- a/pufferlib/extensions/muon.cpp
+++ b/pufferlib/extensions/muon.cpp
@@ -110,58 +110,61 @@ Tensor Muon::step(LossClosure closure) {
     at::AutoGradMode enable_grad(true);
     loss = closure();
   }
+
   for (auto& group : param_groups_) {
-    for (auto& p : group.params()) {
-      if (!p.grad().defined()) {
-        continue;
-      }
-      const auto& grad = p.grad();
-      TORCH_CHECK(!grad.is_sparse(), "Muon does not support sparse gradients");
-      auto param_state = state_.find(p.unsafeGetTensorImpl());
-      auto& options = static_cast<MuonOptions&>(group.options());
-
-      // Perform stepweight decay
-      /*
-      if (options.weight_decay() != 0) {
-        p.mul_(1 - options.lr() * options.weight_decay());
+    auto& options = static_cast<MuonOptions&>(group.options());
+    auto momentum_coef = options.momentum();
+    auto
weight_decay = options.weight_decay(); + + // Fast path: use contiguous buffers + if (weight_buffer.defined()) { + // Initialize momentum buffer lazily to match weight_buffer + if (!momentum_buffer.defined()) { + momentum_buffer = torch::zeros_like(weight_buffer); } - */ - - // State initialization - if (param_state == state_.end()) { - auto state = std::make_unique(); - state->step(0); - state->momentum_buffer(torch::zeros_like(p, MemoryFormat::Preserve)); - state_[p.unsafeGetTensorImpl()] = std::move(state); - } - - auto& state = - static_cast(*state_[p.unsafeGetTensorImpl()]); - auto& buf = state.momentum_buffer(); - auto& momentum = options.momentum(); - auto weight_decay = options.weight_decay(); - state.step(state.step() + 1); - - // Nesterov momentum. Do not use EMA - buf.mul_(momentum); - buf.add_(grad); - grad.add_(buf*momentum); - - torch::Tensor update = grad.clone(); + // Build full-size grad tensor (zeros for unused params) + Tensor all_grads = torch::zeros_like(weight_buffer); + int64_t offset = 0; + for (auto& p : group.params()) { + int64_t size = p.numel(); + if (p.grad().defined()) { + all_grads.narrow(0, offset, size).copy_(p.grad().flatten()); + } + offset += size; + } - if (grad.dim() >= 2) { - auto G = update.view({update.size(0), -1}); - update = _zeropower_via_newtonschulz(G); // original has hardcoded steps and eps - double ratio = (double)update.size(-2) / (double)update.size(-1); - double scale = std::sqrt(std::max(1.0, ratio)); // Matches heavyball and Keller - update.mul_(scale); + // Batched Nesterov momentum (one mul_, one add_ each) + momentum_buffer.mul_(momentum_coef); + momentum_buffer.add_(all_grads); + all_grads.add_(momentum_buffer, momentum_coef); + + // Newton-Schulz per-param and build full-size update tensor + Tensor all_updates = torch::zeros_like(weight_buffer); + offset = 0; + for (auto& p : group.params()) { + int64_t size = p.numel(); + if (p.grad().defined()) { + Tensor update = all_grads.narrow(0, offset, size).view(p.sizes()); + + if (p.dim() >= 2) { + auto G = update.view({update.size(0), -1}); + update = _zeropower_via_newtonschulz(G); + double ratio = (double)update.size(-2) / (double)update.size(-1); + double scale = std::sqrt(std::max(1.0, ratio)); + update.mul_(scale); + } + + all_updates.narrow(0, offset, size).copy_(update.flatten()); + } + offset += size; } - if (options.weight_decay() != 0) { - p.mul_(1 - lr * weight_decay); - } - p.sub_(lr*update.view(p.sizes())); + // Single batched param update (one mul_, one sub_) + if (weight_decay != 0) { + weight_buffer.mul_(1 - lr * weight_decay); + } + weight_buffer.sub_(all_updates * lr); } } return loss; diff --git a/pufferlib/extensions/muon.h b/pufferlib/extensions/muon.h index cefbfc9e2..60e6b4b9b 100644 --- a/pufferlib/extensions/muon.h +++ b/pufferlib/extensions/muon.h @@ -47,6 +47,9 @@ struct TORCH_API MuonParamState class TORCH_API Muon : public Optimizer { public: torch::Tensor lr; + torch::Tensor weight_buffer; // Contiguous weight buffer for batched updates + torch::Tensor momentum_buffer; // Contiguous momentum buffer + explicit Muon( const std::vector& param_groups, MuonOptions defaults = {}) @@ -63,6 +66,40 @@ class TORCH_API Muon : public Optimizer { explicit Muon(std::vector params, MuonOptions defaults = {}) : Muon({OptimizerParamGroup(std::move(params))}, std::move(defaults)) {} + // Create contiguous weight buffer from params for batched updates + void init_contiguous_weights() { + torch::NoGradGuard no_grad; + auto& params = param_groups_[0].params(); + + // Count 
total size + int64_t total_size = 0; + for (auto& p : params) { + total_size += p.numel(); + } + + // Allocate single contiguous buffer + auto device = params[0].device(); + weight_buffer = torch::zeros({total_size}, + torch::dtype(torch::kFloat32).device(device)); + weight_buffer.set_requires_grad(true); + + // Copy params into buffer and replace with views + int64_t offset = 0; + for (auto& p : params) { + int64_t size = p.numel(); + auto shape = p.sizes().vec(); + + // Copy current values into buffer + weight_buffer.narrow(0, offset, size).copy_(p.flatten()); + + // Replace param data with view into buffer + torch::Tensor view = weight_buffer.narrow(0, offset, size).view(shape); + p.set_data(view); + + offset += size; + } + } + torch::Tensor step(LossClosure closure = nullptr) override; //void save(serialize::OutputArchive& archive) const override; //void load(serialize::InputArchive& archive) override; diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp index 99ff2343a..fde51f350 100644 --- a/pufferlib/extensions/pufferlib.cpp +++ b/pufferlib/extensions/pufferlib.cpp @@ -13,7 +13,6 @@ #include #include -#include "vecenv.h" #include #include "muon.h" @@ -27,6 +26,7 @@ #include #include +#include "env_binding.h" typedef torch::Tensor Tensor; @@ -43,10 +43,6 @@ namespace pufferlib { // Model classes are in models.cpp #include "models.cpp" -// Function pointer type for loading exports -typedef EnvExports* (*get_env_exports_fn)(void); - - torch::Dtype to_torch_dtype(int dtype) { if (dtype == FLOAT) { return torch::kFloat32; @@ -64,54 +60,40 @@ torch::Dtype to_torch_dtype(int dtype) { return torch::kFloat32; } -// Torch is stupid. Had to clip out a redundant cuda sync. +// Fast clip_grad_norm_ for contiguous weights +// Cats all grads for one-shot norm computation, then scales each grad void clip_grad_norm_( const std::vector& parameters, - double max_norm, - double norm_type = 2.0 + double max_norm ) { - std::vector params_with_grad; + // Collect flattened grads + std::vector flat_grads; + flat_grads.reserve(parameters.size()); for (const auto& param : parameters) { auto& grad = param.grad(); if (grad.defined()) { - params_with_grad.push_back(param); + flat_grads.push_back(grad.flatten()); } } - if (params_with_grad.empty()) { + if (flat_grads.empty()) { return; } - Tensor total_norm_tensor; - if (norm_type == std::numeric_limits::infinity()) { - std::vector norms; - norms.reserve(params_with_grad.size()); + // Single cat + norm (avoids per-param norm calls) + Tensor all_grads = torch::cat(flat_grads); + Tensor total_norm = all_grads.norm(2); - for (const auto& param : params_with_grad) { - norms.emplace_back(param.grad().data().abs().max()); - } - total_norm_tensor = - (norms.size() == 1) ? norms[0] : torch::max(torch::stack(norms)); - } else if (norm_type == 0) { - total_norm_tensor = - torch::full({}, static_cast(params_with_grad.size())); - } else { - std::vector norms; - norms.reserve(params_with_grad.size()); - - for (const auto& param : params_with_grad) { - norms.emplace_back(param.grad().data().norm(norm_type)); - } - total_norm_tensor = - (norms.size() == 1) ? 
norms[0] : torch::stack(norms).norm(norm_type); - } + // Compute clip coefficient + Tensor clip_coef = torch::clamp_max(max_norm / (total_norm + 1e-6), 1.0); - Tensor clip_coef = max_norm / (total_norm_tensor + 1e-6); - Tensor clip_coef_clamped = - torch::clamp(clip_coef, std::nullopt /* min */, 1.0 /* max */); - for (auto& param : params_with_grad) { - param.grad().data().mul_(clip_coef_clamped); + // Scale each grad in-place + for (const auto& param : parameters) { + auto& grad = param.grad(); + if (grad.defined()) { + grad.mul_(clip_coef); + } } } @@ -130,45 +112,24 @@ typedef struct { Tensor terminals; } EnvBuf; -std::tuple +std::tuple create_environments(int num_buffers, int total_agents, const std::string& env_name, Dict* vec_kwargs, Dict* env_kwargs, EnvBuf& env) { - std::string name = env_name; - if (name.rfind("puffer_", 0) == 0) { - name = name.substr(7); - } - std::string so_path = "./" + name + ".so"; - void* handle = dlopen(so_path.c_str(), RTLD_NOW); - if (!handle) { - fprintf(stderr, "dlopen error: %s\n", dlerror()); - exit(1); - } - dlerror(); - - // Single dlsym call to get all exports - get_env_exports_fn get_exports = (get_env_exports_fn)dlsym(handle, "get_env_exports"); - const char* dlsym_error = dlerror(); - if (dlsym_error) { - fprintf(stderr, "dlsym error: %s\n", dlsym_error); - dlclose(handle); - exit(1); - } - EnvExports* env_exports = get_exports(); - - VecEnv* vec = env_exports->create_environments(num_buffers, true, 0, vec_kwargs, env_kwargs); + StaticVec* vec = create_static_vec(total_agents, num_buffers, vec_kwargs, env_kwargs); printf("DEBUG create_environments: vec->size=%d, vec->total_agents=%d\n", vec->size, vec->total_agents); - auto obs_dtype = to_torch_dtype(env_exports->obs_type); + int obs_size = get_obs_size(); + int num_atns = get_num_atns(); - env.obs = torch::from_blob(vec->gpu_observations, {total_agents, env_exports->obs_n}, torch::dtype(obs_dtype).device(torch::kCUDA)); - env.actions = torch::from_blob(vec->gpu_actions, {total_agents, env_exports->num_atns}, torch::dtype(torch::kFloat64).device(torch::kCUDA)); + env.obs = torch::from_blob(vec->gpu_observations, {total_agents, obs_size}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); + env.actions = torch::from_blob(vec->gpu_actions, {total_agents, num_atns}, torch::dtype(torch::kFloat64).device(torch::kCUDA)); env.rewards = torch::from_blob(vec->gpu_rewards, {total_agents}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); env.terminals = torch::from_blob(vec->gpu_terminals, {total_agents}, torch::dtype(torch::kFloat32).device(torch::kCUDA)); // Create act_sizes tensor on CUDA (needed for sample_logits kernel) - Tensor act_sizes = torch::from_blob(env_exports->act_sizes, {env_exports->num_atns}, torch::dtype(torch::kInt32)).to(torch::kCUDA); + Tensor act_sizes = torch::from_blob(get_act_sizes(), {num_atns}, torch::dtype(torch::kInt32)).to(torch::kCUDA); - return std::make_tuple(env_exports, vec, act_sizes); + return std::make_tuple(vec, act_sizes); } typedef struct { @@ -274,9 +235,8 @@ typedef struct { typedef struct { PolicyMinGRU* policy; - VecEnv* vec; + StaticVec* vec; torch::optim::Muon* muon; - EnvExports* env_exports; HypersT hypers; std::vector buffer_states; // Per-buffer states for contiguous access RolloutBuf rollouts; @@ -296,7 +256,7 @@ typedef struct { Dict* log_environments_impl(PuffeRL& pufferl) { Dict* out = create_dict(32); - pufferl.env_exports->vec_log(pufferl.vec, out); + static_vec_log(pufferl.vec, out); return out; } @@ -468,8 +428,9 @@ void 
train_forward_call(TrainGraph& graph, PolicyMinGRU* policy, muon->zero_grad(); } -// Capture -void capture_graph(at::cuda::CUDAGraph* graph, std::function func) { +// Capture with shared memory pool +void capture_graph(at::cuda::CUDAGraph* graph, std::function func, + at::cuda::MempoolId_t pool) { /* Checklist for avoiding diabolical capture bugs: * 1. Don't start separate streams before tracing (i.e. env gpu buffers) * 2. Make sure input/output buffer pointers don't change @@ -489,7 +450,7 @@ void capture_graph(at::cuda::CUDAGraph* graph, std::function func) { auto cap_stream = at::cuda::getStreamFromPool(); at::cuda::setCurrentCUDAStream(cap_stream); - graph->capture_begin(); + graph->capture_begin(pool); func(); graph->capture_end(); cap_stream.synchronize(); @@ -513,11 +474,11 @@ inline void profile_end(bool enable) { } void env_recv(PuffeRL& pufferl, int buf) { - pufferl.env_exports->vec_recv(pufferl.vec, buf, pufferl.vec->streams[buf]); + // Not used in static/OMP path } void env_send(PuffeRL& pufferl, int buf) { - pufferl.env_exports->vec_send(pufferl.vec, buf, pufferl.vec->streams[buf]); + // Not used in static/OMP path } void compute_advantage(RolloutBuf& rollouts, Tensor& advantages, HypersT& hypers) { @@ -577,11 +538,10 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s // act_sizes: 1D tensor of action space sizes per head // num_action_heads: number of action heads (for MultiDiscrete) // act_n: sum of action space sizes (decoder output dim) - auto [env_exports, vec, act_sizes] = create_environments(hypers.num_buffers, hypers.total_agents, env_name, vec_kwargs, env_kwargs, pufferl->env); + auto [vec, act_sizes] = create_environments(hypers.num_buffers, hypers.total_agents, env_name, vec_kwargs, env_kwargs, pufferl->env); int num_action_heads = pufferl->env.actions.size(1); int act_n = act_sizes.sum().item(); - pufferl->env_exports = env_exports; pufferl->vec = vec; pufferl->act_sizes = act_sizes; pufferl->act_sizes_cpu = act_sizes.cpu().to(torch::kInt64).contiguous(); @@ -623,6 +583,8 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s float eps = hypers.eps; pufferl->muon = new torch::optim::Muon(policy->parameters(), torch::optim::MuonOptions(lr).momentum(beta1).eps(eps)); + pufferl->muon->init_contiguous_weights(); + printf("DEBUG: Contiguous weight buffer: %ld elements\n", pufferl->muon->weight_buffer.numel()); // Allocate buffers int segments = hypers.segments; @@ -652,13 +614,16 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s pufferl->train_cudagraph = at::cuda::CUDAGraph(); auto* p = pufferl.get(); + auto train_pool = at::cuda::graph_pool_handle(); capture_graph(&pufferl->train_cudagraph, [p]() { train_forward_call(p->train_buf, p->policy, p->muon, p->hypers, p->adv_mean, p->adv_std, p->act_sizes_cpu, p->hypers.kernels); - }); + }, train_pool); // Fused rollout cudagraphs: [horizon][num_buffers] // Each graph does input copy + forward + output copy in one shot + // Use shared memory pool to reduce memory usage across graphs + auto rollout_pool = at::cuda::graph_pool_handle(); pufferl->fused_rollout_cudagraphs.resize(horizon); for (int h = 0; h < horizon; ++h) { pufferl->fused_rollout_cudagraphs[h].resize(num_buffers); @@ -666,35 +631,27 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s pufferl->fused_rollout_cudagraphs[h][b] = at::cuda::CUDAGraph(); capture_graph(&pufferl->fused_rollout_cudagraphs[h][b], [p, h, b]() { fused_rollout_step(*p, h, b); - }); + }, rollout_pool); } } } - // FAILS IF DONE AFTER 
CREATE_ENVIRONMENTS - // Try num_threads=0 to disable threading for debugging - int num_threads = 16; - int block_size = vec->size / 16; - if (vec->size < num_threads) { - num_threads = vec->size; - } - if (block_size < 1) { - block_size = 1; - } - if (hypers.use_omp) { - pufferl->env_exports->create_threads(vec, num_threads, block_size, true, pufferl.get(), net_callback_wrapper, thread_init_wrapper, horizon); - // Create PyTorch-managed streams and replace vec->streams with their raw cudaStream_t // This ensures PyTorch properly recognizes the streams for all operations for (int i = 0; i < num_buffers; i++) { pufferl->torch_streams.push_back(at::cuda::getStreamFromPool(false)); vec->streams[i] = pufferl->torch_streams[i].stream(); } - } else { - pufferl->env_exports->create_threads(vec, num_threads, block_size, false, nullptr, nullptr, nullptr, 0); + + + // Static breakout - OMP only + int num_threads = 16; + if (hypers.use_omp) { + create_static_threads(vec, num_threads, horizon, pufferl.get(), net_callback_wrapper, thread_init_wrapper); + } - pufferl->env_exports->vec_reset(vec); + static_vec_reset(vec); return pufferl; } diff --git a/pufferlib/extensions/static_drive.c b/pufferlib/extensions/static_drive.c new file mode 100644 index 000000000..4e97d1d50 --- /dev/null +++ b/pufferlib/extensions/static_drive.c @@ -0,0 +1,138 @@ +// static_drive.c - Drive env with static env binding +// Compiled with clang into libstatic_drive.a + +#include + +// Include header first to get Dict and other types +#include "static_envbinding.h" + +// Drive config - must define before including static_envbinding.c +#define OBS_SIZE 1848 +#define NUM_ATNS 2 +#define ACT_SIZES {7, 13} + +// Include drive env +#include "../ocean/drive/drive.h" + +// Define Env type +#define Env Drive + +// Custom vec init for variable agents per map +#define MY_VEC_INIT +Env* my_vec_init(int* num_envs_out, int* buffer_env_starts, int* buffer_env_counts, + Dict* vec_kwargs, Dict* env_kwargs) { + int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; + int num_buffers = (int)dict_get(vec_kwargs, "num_buffers")->value; + int num_maps = (int)dict_get(env_kwargs, "num_maps")->value; + + int agents_per_buffer = total_agents / num_buffers; + + // Get config from env_kwargs + float reward_vehicle_collision = dict_get(env_kwargs, "reward_vehicle_collision")->value; + float reward_offroad_collision = dict_get(env_kwargs, "reward_offroad_collision")->value; + float reward_goal_post_respawn = dict_get(env_kwargs, "reward_goal_post_respawn")->value; + float reward_vehicle_collision_post_respawn = dict_get(env_kwargs, "reward_vehicle_collision_post_respawn")->value; + int spawn_immunity_timer = (int)dict_get(env_kwargs, "spawn_immunity_timer")->value; + int human_agent_idx = (int)dict_get(env_kwargs, "human_agent_idx")->value; + + // Allocate max possible envs (1 agent per env worst case) + Env* envs = (Env*)calloc(total_agents, sizeof(Env)); + + int num_envs = 0; + int current_buffer = 0; + int current_buffer_agents = 0; + buffer_env_starts[0] = 0; + + while (current_buffer < num_buffers) { + // Seed srand with current loop index over envs + srand(num_envs); + int map_id = rand() % num_maps; + + char map_file[100]; + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); + + // Initialize env struct + Env* env = &envs[num_envs]; + memset(env, 0, sizeof(Env)); + + // Set config + env->map_name = strdup(map_file); + env->human_agent_idx = human_agent_idx; + env->reward_vehicle_collision = reward_vehicle_collision; + 
env->reward_offroad_collision = reward_offroad_collision; + env->reward_goal_post_respawn = reward_goal_post_respawn; + env->reward_vehicle_collision_post_respawn = reward_vehicle_collision_post_respawn; + env->spawn_immunity_timer = spawn_immunity_timer; + env->num_agents = 0; // Let init determine via set_active_agents + + // Call init (loads map, sets active agents, etc.) + init(env); + + int map_agent_count = env->active_agent_count; + + // Check if map fits in current buffer + if (current_buffer_agents + map_agent_count > agents_per_buffer) { + // Doesn't fit - close env and move to next buffer (padding) + c_close(env); + free(env->map_name); + memset(env, 0, sizeof(Env)); + + buffer_env_counts[current_buffer] = num_envs - buffer_env_starts[current_buffer]; + current_buffer++; + if (current_buffer < num_buffers) { + buffer_env_starts[current_buffer] = num_envs; + } + current_buffer_agents = 0; + continue; + } + + // Map fits + env->num_agents = map_agent_count; + current_buffer_agents += map_agent_count; + num_envs++; + } + + // Fill in last buffer count + if (current_buffer < num_buffers) { + buffer_env_counts[current_buffer] = num_envs - buffer_env_starts[current_buffer]; + } + + // Shrink to actual size needed + envs = (Env*)realloc(envs, num_envs * sizeof(Env)); + *num_envs_out = num_envs; + return envs; +} + +// Env-specific init (used for single env creation, not vec) +static void my_init(Env* env, Dict* kwargs) { + env->human_agent_idx = (int)dict_get(kwargs, "human_agent_idx")->value; + env->reward_vehicle_collision = dict_get(kwargs, "reward_vehicle_collision")->value; + env->reward_offroad_collision = dict_get(kwargs, "reward_offroad_collision")->value; + env->reward_goal_post_respawn = dict_get(kwargs, "reward_goal_post_respawn")->value; + env->reward_vehicle_collision_post_respawn = dict_get(kwargs, "reward_vehicle_collision_post_respawn")->value; + env->spawn_immunity_timer = (int)dict_get(kwargs, "spawn_immunity_timer")->value; + int map_id = (int)dict_get(kwargs, "map_id")->value; + int max_agents = (int)dict_get(kwargs, "max_agents")->value; + + char map_file[100]; + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); + env->num_agents = max_agents; + env->map_name = strdup(map_file); + init(env); +} + +// Env-specific log +static void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "offroad_rate", log->offroad_rate); + dict_set(out, "collision_rate", log->collision_rate); + dict_set(out, "dnf_rate", log->dnf_rate); + dict_set(out, "completion_rate", log->completion_rate); + dict_set(out, "clean_collision_rate", log->clean_collision_rate); +} + +// Include the template implementation +#include "static_envbinding.c" diff --git a/pufferlib/ocean/asteroids/binding.c b/pufferlib/ocean/asteroids/binding.h similarity index 100% rename from pufferlib/ocean/asteroids/binding.c rename to pufferlib/ocean/asteroids/binding.h diff --git a/pufferlib/ocean/battle/binding.c b/pufferlib/ocean/battle/binding.h similarity index 100% rename from pufferlib/ocean/battle/binding.c rename to pufferlib/ocean/battle/binding.h diff --git a/pufferlib/ocean/blastar/binding.c b/pufferlib/ocean/blastar/binding.c deleted file mode 100644 index 03e318a12..000000000 --- a/pufferlib/ocean/blastar/binding.c +++ /dev/null @@ -1,23 +0,0 @@ -#include "blastar.h" -#define Env Blastar -#include 
"../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->num_obs = unpack(kwargs, "num_obs"); - init(env, env->num_obs); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "lives", log->lives); - assign_to_dict(dict, "vertical_closeness_rew", log->vertical_closeness_rew); - assign_to_dict(dict, "fired_bullet_rew", log->fired_bullet_rew); - assign_to_dict(dict, "kill_streak", log->kill_streak); - assign_to_dict(dict, "hit_enemy_with_bullet_rew", log->hit_enemy_with_bullet_rew); - assign_to_dict(dict, "avg_score_difference", log->avg_score_difference); - return 0; -} diff --git a/pufferlib/ocean/blastar/binding.h b/pufferlib/ocean/blastar/binding.h new file mode 100644 index 000000000..e8147fcbd --- /dev/null +++ b/pufferlib/ocean/blastar/binding.h @@ -0,0 +1,28 @@ +#include "blastar.h" +#define OBS_SIZE 10 +#define NUM_ATNS 1 +#define ACT_SIZES {6} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Blastar +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + int num_obs = dict_get(kwargs, "num_obs")->value; + init(env, num_obs); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "lives", log->lives); + dict_set(out, "vertical_closeness_rew", log->vertical_closeness_rew); + dict_set(out, "fired_bullet_rew", log->fired_bullet_rew); + dict_set(out, "kill_streak", log->kill_streak); + dict_set(out, "hit_enemy_with_bullet_rew", log->hit_enemy_with_bullet_rew); + dict_set(out, "avg_score_difference", log->avg_score_difference); +} diff --git a/pufferlib/ocean/boids/binding.c b/pufferlib/ocean/boids/binding.h similarity index 100% rename from pufferlib/ocean/boids/binding.c rename to pufferlib/ocean/boids/binding.h diff --git a/pufferlib/ocean/breakout/binding.c b/pufferlib/ocean/breakout/binding.c deleted file mode 100644 index adf766778..000000000 --- a/pufferlib/ocean/breakout/binding.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "breakout.h" - -#define Env Breakout -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->frameskip = unpack(kwargs, "frameskip"); - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->initial_paddle_width = unpack(kwargs, "paddle_width"); - env->paddle_height = unpack(kwargs, "paddle_height"); - env->ball_width = unpack(kwargs, "ball_width"); - env->ball_height = unpack(kwargs, "ball_height"); - env->brick_width = unpack(kwargs, "brick_width"); - env->brick_height = unpack(kwargs, "brick_height"); - env->brick_rows = unpack(kwargs, "brick_rows"); - env->brick_cols = unpack(kwargs, "brick_cols"); - env->initial_ball_speed = unpack(kwargs, "initial_ball_speed"); - env->max_ball_speed = unpack(kwargs, "max_ball_speed"); - env->paddle_speed = unpack(kwargs, "paddle_speed"); - env->continuous = unpack(kwargs, "continuous"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, 
"episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/extensions/breakout.c b/pufferlib/ocean/breakout/binding.h similarity index 97% rename from pufferlib/extensions/breakout.c rename to pufferlib/ocean/breakout/binding.h index 815164c3f..7114fa38f 100644 --- a/pufferlib/extensions/breakout.c +++ b/pufferlib/ocean/breakout/binding.h @@ -1,4 +1,4 @@ -#include "../ocean/breakout/breakout.h" +#include "breakout.h" #define OBS_SIZE 118 #define NUM_ATNS 1 #define ACT_SIZES {3} diff --git a/pufferlib/ocean/cartpole/binding.c b/pufferlib/ocean/cartpole/binding.c deleted file mode 100644 index 52cdbf1c6..000000000 --- a/pufferlib/ocean/cartpole/binding.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "cartpole.h" -#define Env Cartpole -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->cart_mass = unpack(kwargs, "cart_mass"); - env->pole_mass = unpack(kwargs, "pole_mass"); - env->pole_length = unpack(kwargs, "pole_length"); - env->gravity = unpack(kwargs, "gravity"); - env->force_mag = unpack(kwargs, "force_mag"); - env->tau = unpack(kwargs, "dt"); - env->continuous = unpack(kwargs, "continuous"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "x_threshold_termination", log->x_threshold_termination); - assign_to_dict(dict, "pole_angle_termination", log->pole_angle_termination); - assign_to_dict(dict, "max_steps_termination", log->max_steps_termination); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/cartpole/binding.h b/pufferlib/ocean/cartpole/binding.h new file mode 100644 index 000000000..ad231b7ee --- /dev/null +++ b/pufferlib/ocean/cartpole/binding.h @@ -0,0 +1,31 @@ +#include "cartpole.h" +#define OBS_SIZE 4 +#define NUM_ATNS 1 +#define ACT_SIZES {2} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Cartpole +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->cart_mass = dict_get(kwargs, "cart_mass")->value; + env->pole_mass = dict_get(kwargs, "pole_mass")->value; + env->pole_length = dict_get(kwargs, "pole_length")->value; + env->gravity = dict_get(kwargs, "gravity")->value; + env->force_mag = dict_get(kwargs, "force_mag")->value; + env->tau = dict_get(kwargs, "dt")->value; + env->continuous = dict_get(kwargs, "continuous")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "score", log->score); + dict_set(out, "perf", log->perf); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "x_threshold_termination", log->x_threshold_termination); + dict_set(out, "pole_angle_termination", log->pole_angle_termination); + dict_set(out, "max_steps_termination", log->max_steps_termination); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/chain_mdp/binding.c b/pufferlib/ocean/chain_mdp/binding.h similarity index 100% rename from pufferlib/ocean/chain_mdp/binding.c rename to pufferlib/ocean/chain_mdp/binding.h diff --git a/pufferlib/ocean/checkers/binding.c b/pufferlib/ocean/checkers/binding.h similarity index 100% rename from pufferlib/ocean/checkers/binding.c rename to pufferlib/ocean/checkers/binding.h diff --git a/pufferlib/ocean/connect4/binding.c b/pufferlib/ocean/connect4/binding.c deleted file mode 100644 index de8d7877a..000000000 --- a/pufferlib/ocean/connect4/binding.c +++ 
/dev/null @@ -1,17 +0,0 @@ -#include "connect4.h" -#define Env CConnect4 -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/connect4/binding.h b/pufferlib/ocean/connect4/binding.h new file mode 100644 index 000000000..030d42f6d --- /dev/null +++ b/pufferlib/ocean/connect4/binding.h @@ -0,0 +1,22 @@ +#include "connect4.h" +#define OBS_SIZE 42 +#define NUM_ATNS 1 +#define ACT_SIZES {7} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env CConnect4 +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/convert/binding.c b/pufferlib/ocean/convert/binding.c deleted file mode 100644 index e53f6e943..000000000 --- a/pufferlib/ocean/convert/binding.c +++ /dev/null @@ -1,22 +0,0 @@ -#include "convert.h" - -#define Env Convert -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->num_agents = unpack(kwargs, "num_agents"); - env->num_factories = unpack(kwargs, "num_factories"); - env->num_resources = unpack(kwargs, "num_resources"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/convert/binding.h b/pufferlib/ocean/convert/binding.h new file mode 100644 index 000000000..0895af7ee --- /dev/null +++ b/pufferlib/ocean/convert/binding.h @@ -0,0 +1,25 @@ +#include "convert.h" +#define OBS_SIZE 28 +#define NUM_ATNS 2 +#define ACT_SIZES {9, 5} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Convert +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->num_factories = dict_get(kwargs, "num_factories")->value; + env->num_resources = dict_get(kwargs, "num_resources")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/convert_circle/binding.c b/pufferlib/ocean/convert_circle/binding.h similarity index 100% rename from pufferlib/ocean/convert_circle/binding.c rename to pufferlib/ocean/convert_circle/binding.h diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c deleted file mode 100644 index fac309eeb..000000000 --- a/pufferlib/ocean/drive/binding.c +++ /dev/null @@ -1,169 +0,0 @@ -#include "drive.h" 
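The binding.c -> binding.h conversions in this change all follow one static-binding convention: compile-time OBS_SIZE / NUM_ATNS / ACT_SIZES / OBS_TYPE / ACT_TYPE macros, a `#define Env`, an `#include "env_binding.h"`, and Dict-based my_init/my_log hooks. Below is a minimal sketch of a new binding written against that convention, assuming only the Dict, dict_get, and dict_set interface that env_binding.h provides elsewhere in this patch; the "pong" env, its Pong struct, and its width/height fields are hypothetical and not part of the change.

#include "pong.h"        // hypothetical env header defining Pong, Log, init()
#define OBS_SIZE 8       // floats per agent observation
#define NUM_ATNS 1       // one discrete action head
#define ACT_SIZES {3}    // head sizes: noop / up / down
#define OBS_TYPE FLOAT
#define ACT_TYPE DOUBLE

#define Env Pong
#include "env_binding.h" // shared static-binding template (Dict, dict_get, dict_set)

void my_init(Env* env, Dict* kwargs) {
    env->num_agents = 1;
    env->width = dict_get(kwargs, "width")->value;    // hypothetical config fields
    env->height = dict_get(kwargs, "height")->value;
    init(env);
}

void my_log(Log* log, Dict* out) {
    dict_set(out, "perf", log->perf);
    dict_set(out, "score", log->score);
    dict_set(out, "episode_return", log->episode_return);
    dict_set(out, "episode_length", log->episode_length);
}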
-#define Env Drive -#define MY_SHARED -#define MY_PUT -#include "../env_binding.h" - -static int my_put(Env* env, PyObject* args, PyObject* kwargs) { - PyObject* obs = PyDict_GetItemString(kwargs, "observations"); - if (!PyObject_TypeCheck(obs, &PyArray_Type)) { - PyErr_SetString(PyExc_TypeError, "Observations must be a NumPy array"); - return 1; - } - PyArrayObject* observations = (PyArrayObject*)obs; - if (!PyArray_ISCONTIGUOUS(observations)) { - PyErr_SetString(PyExc_ValueError, "Observations must be contiguous"); - return 1; - } - env->observations = PyArray_DATA(observations); - - PyObject* act = PyDict_GetItemString(kwargs, "actions"); - if (!PyObject_TypeCheck(act, &PyArray_Type)) { - PyErr_SetString(PyExc_TypeError, "Actions must be a NumPy array"); - return 1; - } - PyArrayObject* actions = (PyArrayObject*)act; - if (!PyArray_ISCONTIGUOUS(actions)) { - PyErr_SetString(PyExc_ValueError, "Actions must be contiguous"); - return 1; - } - env->actions = PyArray_DATA(actions); - if (PyArray_ITEMSIZE(actions) == sizeof(double)) { - PyErr_SetString(PyExc_ValueError, "Action tensor passed as float64 (pass np.float32 buffer)"); - return 1; - } - - PyObject* rew = PyDict_GetItemString(kwargs, "rewards"); - if (!PyObject_TypeCheck(rew, &PyArray_Type)) { - PyErr_SetString(PyExc_TypeError, "Rewards must be a NumPy array"); - return 1; - } - PyArrayObject* rewards = (PyArrayObject*)rew; - if (!PyArray_ISCONTIGUOUS(rewards)) { - PyErr_SetString(PyExc_ValueError, "Rewards must be contiguous"); - return 1; - } - if (PyArray_NDIM(rewards) != 1) { - PyErr_SetString(PyExc_ValueError, "Rewards must be 1D"); - return 1; - } - env->rewards = PyArray_DATA(rewards); - - PyObject* term = PyDict_GetItemString(kwargs, "terminals"); - if (!PyObject_TypeCheck(term, &PyArray_Type)) { - PyErr_SetString(PyExc_TypeError, "Terminals must be a NumPy array"); - return 1; - } - PyArrayObject* terminals = (PyArrayObject*)term; - if (!PyArray_ISCONTIGUOUS(terminals)) { - PyErr_SetString(PyExc_ValueError, "Terminals must be contiguous"); - return 1; - } - if (PyArray_NDIM(terminals) != 1) { - PyErr_SetString(PyExc_ValueError, "Terminals must be 1D"); - return 1; - } - env->terminals = PyArray_DATA(terminals); - return 0; -} - -static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { - int num_agents = unpack(kwargs, "num_agents"); - int num_maps = unpack(kwargs, "num_maps"); - clock_gettime(CLOCK_REALTIME, &ts); - srand(ts.tv_nsec); - int total_agent_count = 0; - int env_count = 0; - int max_envs = num_agents; - PyObject* agent_offsets = PyList_New(max_envs+1); - PyObject* map_ids = PyList_New(max_envs); - // getting env count - while(total_agent_count < num_agents && env_count < max_envs){ - char map_file[100]; - int map_id = rand() % num_maps; - Drive* env = calloc(1, sizeof(Drive)); - sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); - env->entities = load_map_binary(map_file, env); - set_active_agents(env); - // Store map_id - PyObject* map_id_obj = PyLong_FromLong(map_id); - PyList_SetItem(map_ids, env_count, map_id_obj); - // Store agent offset - PyObject* offset = PyLong_FromLong(total_agent_count); - PyList_SetItem(agent_offsets, env_count, offset); - total_agent_count += env->active_agent_count; - env_count++; - for(int j=0;jnum_entities;j++) { - free_entity(&env->entities[j]); - } - free(env->entities); - free(env->active_agent_indices); - free(env->static_car_indices); - free(env->expert_static_car_indices); - free(env); - } - if(total_agent_count >= num_agents){ - 
total_agent_count = num_agents; - } - PyObject* final_total_agent_count = PyLong_FromLong(total_agent_count); - PyList_SetItem(agent_offsets, env_count, final_total_agent_count); - PyObject* final_env_count = PyLong_FromLong(env_count); - // resize lists - PyObject* resized_agent_offsets = PyList_GetSlice(agent_offsets, 0, env_count + 1); - PyObject* resized_map_ids = PyList_GetSlice(map_ids, 0, env_count); - // - //Py_DECREF(agent_offsets); - //Py_DECREF(map_ids); - // create a tuple - PyObject* tuple = PyTuple_New(3); - PyTuple_SetItem(tuple, 0, resized_agent_offsets); - PyTuple_SetItem(tuple, 1, resized_map_ids); - PyTuple_SetItem(tuple, 2, final_env_count); - return tuple; - - //Py_DECREF(num); - /* - for(int i = 0;ihuman_agent_idx = unpack(kwargs, "human_agent_idx"); - env->reward_vehicle_collision = unpack(kwargs, "reward_vehicle_collision"); - env->reward_offroad_collision = unpack(kwargs, "reward_offroad_collision"); - env->reward_goal_post_respawn = unpack(kwargs, "reward_goal_post_respawn"); - env->reward_vehicle_collision_post_respawn = unpack(kwargs, "reward_vehicle_collision_post_respawn"); - env->spawn_immunity_timer = unpack(kwargs, "spawn_immunity_timer"); - int map_id = unpack(kwargs, "map_id"); - int max_agents = unpack(kwargs, "max_agents"); - - char map_file[100]; - sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); - env->num_agents = max_agents; - env->map_name = strdup(map_file); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "offroad_rate", log->offroad_rate); - assign_to_dict(dict, "collision_rate", log->collision_rate); - assign_to_dict(dict, "dnf_rate", log->dnf_rate); - assign_to_dict(dict, "n", log->n); - assign_to_dict(dict, "completion_rate", log->completion_rate); - assign_to_dict(dict, "clean_collision_rate", log->clean_collision_rate); - return 0; -} diff --git a/pufferlib/ocean/drive/binding.h b/pufferlib/ocean/drive/binding.h new file mode 100644 index 000000000..fe42b1034 --- /dev/null +++ b/pufferlib/ocean/drive/binding.h @@ -0,0 +1,117 @@ +#include "drive.h" +#define OBS_SIZE 1848 +#define NUM_ATNS 2 +#define ACT_SIZES {7, 13} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define MY_VEC_INIT +#define Env Drive +#include "env_binding.h" + +// Test version: find first map with 8 agents and fill buffer with copies +Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs) { + int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; + int num_buffers = (int)dict_get(vec_kwargs, "num_buffers")->value; + int num_maps = (int)dict_get(env_kwargs, "num_maps")->value; + + int agents_per_buffer = total_agents / num_buffers; + + // Get config from env_kwargs + float reward_vehicle_collision = dict_get(env_kwargs, "reward_vehicle_collision")->value; + float reward_offroad_collision = dict_get(env_kwargs, "reward_offroad_collision")->value; + float reward_goal_post_respawn = dict_get(env_kwargs, "reward_goal_post_respawn")->value; + float reward_vehicle_collision_post_respawn = dict_get(env_kwargs, "reward_vehicle_collision_post_respawn")->value; + int spawn_immunity_timer = (int)dict_get(env_kwargs, "spawn_immunity_timer")->value; + int human_agent_idx = (int)dict_get(env_kwargs, "human_agent_idx")->value; + + // Find first map with exactly 8 agents + 
int target_map_id = -1; + for (int map_id = 0; map_id < num_maps; map_id++) { + char map_file[100]; + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); + + Env temp_env = {0}; + temp_env.map_name = map_file; + temp_env.num_agents = 0; + init(&temp_env); + + int agent_count = temp_env.active_agent_count; + int total_agent_count = temp_env.num_agents; + c_close(&temp_env); + + if (agent_count == 8) { + target_map_id = map_id; + printf("Found map %d with 8 active agents and %d total agents\n", map_id, total_agent_count); + break; + } + } + + if (target_map_id < 0) { + printf("ERROR: No map found with exactly 8 agents\n"); + *num_envs_out = 0; + return NULL; + } + + // Calculate how many envs we need (8 agents per env) + int envs_per_buffer = agents_per_buffer / 8; + int total_envs = envs_per_buffer * num_buffers; + + Env* envs = (Env*)calloc(total_envs, sizeof(Env)); + + char map_file[100]; + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", target_map_id); + + for (int i = 0; i < total_envs; i++) { + Env* env = &envs[i]; + memset(env, 0, sizeof(Env)); + + env->map_name = strdup(map_file); + env->human_agent_idx = human_agent_idx; + env->reward_vehicle_collision = reward_vehicle_collision; + env->reward_offroad_collision = reward_offroad_collision; + env->reward_goal_post_respawn = reward_goal_post_respawn; + env->reward_vehicle_collision_post_respawn = reward_vehicle_collision_post_respawn; + env->spawn_immunity_timer = spawn_immunity_timer; + env->num_agents = 0; + + init(env); + env->num_agents = env->active_agent_count; + } + + printf("Created %d envs with %d agents each (%d total agents)\n", + total_envs, 8, total_envs * 8); + + *num_envs_out = total_envs; + return envs; +} + +void my_init(Env* env, Dict* kwargs) { + env->human_agent_idx = dict_get(kwargs, "human_agent_idx")->value; + env->reward_vehicle_collision = dict_get(kwargs, "reward_vehicle_collision")->value; + env->reward_offroad_collision = dict_get(kwargs, "reward_offroad_collision")->value; + env->reward_goal_post_respawn = dict_get(kwargs, "reward_goal_post_respawn")->value; + env->reward_vehicle_collision_post_respawn = dict_get(kwargs, "reward_vehicle_collision_post_respawn")->value; + env->spawn_immunity_timer = dict_get(kwargs, "spawn_immunity_timer")->value; + int map_id = dict_get(kwargs, "map_id")->value; + int max_agents = dict_get(kwargs, "max_agents")->value; + + char map_file[100]; + sprintf(map_file, "resources/drive/binaries/map_%03d.bin", map_id); + env->num_agents = max_agents; + env->map_name = strdup(map_file); + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "offroad_rate", log->offroad_rate); + dict_set(out, "collision_rate", log->collision_rate); + dict_set(out, "dnf_rate", log->dnf_rate); + dict_set(out, "n", log->n); + dict_set(out, "completion_rate", log->completion_rate); + dict_set(out, "clean_collision_rate", log->clean_collision_rate); +} diff --git a/pufferlib/ocean/drone/binding.c b/pufferlib/ocean/drone/binding.c deleted file mode 100644 index ed5475ea9..000000000 --- a/pufferlib/ocean/drone/binding.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "drone.h" -#include "render.h" - -#define Env DroneEnv -#include "../env_binding.h" - -static int my_init(Env *env, PyObject *args, PyObject *kwargs) { - env->num_agents = unpack(kwargs, "num_agents"); - env->max_rings = 
unpack(kwargs, "max_rings"); - init(env); - return 0; -} - -static int my_log(PyObject *dict, Log *log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "rings_passed", log->rings_passed); - assign_to_dict(dict, "ring_collisions", log->ring_collision); - assign_to_dict(dict, "collision_rate", log->collision_rate); - assign_to_dict(dict, "oob", log->oob); - assign_to_dict(dict, "timeout", log->timeout); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/drone/binding.h b/pufferlib/ocean/drone/binding.h new file mode 100644 index 000000000..438cf365e --- /dev/null +++ b/pufferlib/ocean/drone/binding.h @@ -0,0 +1,28 @@ +#include "drone.h" +#define OBS_SIZE 26 +#define NUM_ATNS 4 +#define ACT_SIZES {1, 1, 1, 1} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env DroneEnv +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->max_rings = dict_get(kwargs, "max_rings")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "rings_passed", log->rings_passed); + dict_set(out, "ring_collision", log->ring_collision); + dict_set(out, "collision_rate", log->collision_rate); + dict_set(out, "oob", log->oob); + dict_set(out, "timeout", log->timeout); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/enduro/binding.c b/pufferlib/ocean/enduro/binding.c deleted file mode 100644 index 04919d12d..000000000 --- a/pufferlib/ocean/enduro/binding.c +++ /dev/null @@ -1,40 +0,0 @@ -#include "enduro.h" -#include -#define Env Enduro -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->car_width = unpack(kwargs, "car_width"); - env->car_height = unpack(kwargs, "car_height"); - env->max_enemies = unpack(kwargs, "max_enemies"); - env->continuous = unpack(kwargs, "continuous"); - - PyObject* seed_val = PyDict_GetItemString(kwargs, "seed"); - if (seed_val) { - env->seed = unpack(kwargs, "seed"); - // Initialize the RNG state with the seed - env->rng_state = env->seed; - } - - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "reward", log->reward); - assign_to_dict(dict, "step_rew_car_passed_no_crash", log->step_rew_car_passed_no_crash); - assign_to_dict(dict, "crashed_penalty", log->crashed_penalty); - assign_to_dict(dict, "passed_cars", log->passed_cars); - assign_to_dict(dict, "passed_by_enemy", log->passed_by_enemy); - assign_to_dict(dict, "cars_to_pass", log->cars_to_pass); - assign_to_dict(dict, "days_completed", log->days_completed); - assign_to_dict(dict, "days_failed", log->days_failed); - assign_to_dict(dict, "collisions_player_vs_car", log->collisions_player_vs_car); - assign_to_dict(dict, "collisions_player_vs_road", log->collisions_player_vs_road); - return 0; -} diff --git a/pufferlib/ocean/enduro/binding.h 
b/pufferlib/ocean/enduro/binding.h new file mode 100644 index 000000000..d19824f95 --- /dev/null +++ b/pufferlib/ocean/enduro/binding.h @@ -0,0 +1,37 @@ +#include "enduro.h" +#define OBS_SIZE 68 +#define NUM_ATNS 1 +#define ACT_SIZES {9} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Enduro +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->car_width = dict_get(kwargs, "car_width")->value; + env->car_height = dict_get(kwargs, "car_height")->value; + env->max_enemies = dict_get(kwargs, "max_enemies")->value; + env->continuous = dict_get(kwargs, "continuous")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "reward", log->reward); + dict_set(out, "step_rew_car_passed_no_crash", log->step_rew_car_passed_no_crash); + dict_set(out, "crashed_penalty", log->crashed_penalty); + dict_set(out, "passed_cars", log->passed_cars); + dict_set(out, "passed_by_enemy", log->passed_by_enemy); + dict_set(out, "cars_to_pass", log->cars_to_pass); + dict_set(out, "days_completed", log->days_completed); + dict_set(out, "days_failed", log->days_failed); + dict_set(out, "collisions_player_vs_car", log->collisions_player_vs_car); + dict_set(out, "collisions_player_vs_road", log->collisions_player_vs_road); +} diff --git a/pufferlib/ocean/freeway/binding.c b/pufferlib/ocean/freeway/binding.c deleted file mode 100644 index 42eea05bb..000000000 --- a/pufferlib/ocean/freeway/binding.c +++ /dev/null @@ -1,31 +0,0 @@ -#include "freeway.h" - -#define Env Freeway -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->frameskip = unpack(kwargs, "frameskip"); - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->player_width = unpack(kwargs, "player_width"); - env->player_height = unpack(kwargs, "player_height"); - env->car_width = unpack(kwargs, "car_width"); - env->car_height = unpack(kwargs, "car_height"); - env->lane_size = unpack(kwargs, "lane_size"); - env->level = unpack(kwargs, "level"); - env->difficulty = unpack(kwargs, "difficulty"); - env->use_dense_rewards = unpack(kwargs, "use_dense_rewards"); - env->env_randomization = unpack(kwargs, "env_randomization"); - env->enable_human_player = unpack(kwargs, "enable_human_player"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "up_action_frac", log->up_action_frac); - assign_to_dict(dict, "hits", log->hits); - return 0; -} diff --git a/pufferlib/ocean/freeway/binding.h b/pufferlib/ocean/freeway/binding.h new file mode 100644 index 000000000..209c6cd33 --- /dev/null +++ b/pufferlib/ocean/freeway/binding.h @@ -0,0 +1,36 @@ +#include "freeway.h" +#define OBS_SIZE 34 +#define NUM_ATNS 1 +#define ACT_SIZES {3} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Freeway +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->frameskip = dict_get(kwargs, "frameskip")->value; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, 
"height")->value; + env->player_width = dict_get(kwargs, "player_width")->value; + env->player_height = dict_get(kwargs, "player_height")->value; + env->car_width = dict_get(kwargs, "car_width")->value; + env->car_height = dict_get(kwargs, "car_height")->value; + env->lane_size = dict_get(kwargs, "lane_size")->value; + env->difficulty = dict_get(kwargs, "difficulty")->value; + env->level = dict_get(kwargs, "level")->value; + env->enable_human_player = dict_get(kwargs, "enable_human_player")->value; + env->env_randomization = dict_get(kwargs, "env_randomization")->value; + env->use_dense_rewards = dict_get(kwargs, "use_dense_rewards")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "up_action_frac", log->up_action_frac); + dict_set(out, "hits", log->hits); +} diff --git a/pufferlib/ocean/g2048/binding.c b/pufferlib/ocean/g2048/binding.c deleted file mode 100644 index e9c6952b8..000000000 --- a/pufferlib/ocean/g2048/binding.c +++ /dev/null @@ -1,23 +0,0 @@ -#include "g2048.h" - -#define Env Game -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->scaffolding_ratio = unpack(kwargs, "scaffolding_ratio"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "merge_score", log->merge_score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "lifetime_max_tile", log->lifetime_max_tile); - assign_to_dict(dict, "reached_32768", log->reached_32768); - assign_to_dict(dict, "reached_65536", log->reached_65536); - assign_to_dict(dict, "reached_131072", log->reached_131072); - return 0; -} \ No newline at end of file diff --git a/pufferlib/ocean/g2048/binding.h b/pufferlib/ocean/g2048/binding.h new file mode 100644 index 000000000..c44c0e81a --- /dev/null +++ b/pufferlib/ocean/g2048/binding.h @@ -0,0 +1,27 @@ +#include "g2048.h" +#define OBS_SIZE 16 +#define NUM_ATNS 1 +#define ACT_SIZES {4} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE INT + +#define Env Game +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->scaffolding_ratio = dict_get(kwargs, "scaffolding_ratio")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "merge_score", log->merge_score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "lifetime_max_tile", log->lifetime_max_tile); + dict_set(out, "reached_32768", log->reached_32768); + dict_set(out, "reached_65536", log->reached_65536); + dict_set(out, "reached_131072", log->reached_131072); +} diff --git a/pufferlib/ocean/go/binding.c b/pufferlib/ocean/go/binding.c deleted file mode 100644 index 272c0ba41..000000000 --- a/pufferlib/ocean/go/binding.c +++ /dev/null @@ -1,33 +0,0 @@ -#include "go.h" -#define Env CGo -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->grid_size = unpack(kwargs, "grid_size"); - env->board_width = 
unpack(kwargs, "board_width"); - env->board_height = unpack(kwargs, "board_height"); - env->grid_square_size = unpack(kwargs, "grid_square_size"); - env->moves_made = unpack(kwargs, "moves_made"); - env->komi = unpack(kwargs, "komi"); - env->score = unpack(kwargs, "score"); - env->last_capture_position = unpack(kwargs, "last_capture_position"); - env->reward_move_pass = unpack(kwargs, "reward_move_pass"); - env->reward_move_invalid = unpack(kwargs, "reward_move_invalid"); - env->reward_move_valid = unpack(kwargs, "reward_move_valid"); - env->reward_player_capture = unpack(kwargs, "reward_player_capture"); - env->reward_opponent_capture = unpack(kwargs, "reward_opponent_capture"); - - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/go/binding.h b/pufferlib/ocean/go/binding.h new file mode 100644 index 000000000..6bfa06335 --- /dev/null +++ b/pufferlib/ocean/go/binding.h @@ -0,0 +1,37 @@ +#include "go.h" +#define OBS_SIZE 100 +#define NUM_ATNS 1 +#define ACT_SIZES {50} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env CGo +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->grid_size = dict_get(kwargs, "grid_size")->value; + env->board_width = dict_get(kwargs, "board_width")->value; + env->board_height = dict_get(kwargs, "board_height")->value; + env->grid_square_size = dict_get(kwargs, "grid_square_size")->value; + env->moves_made = dict_get(kwargs, "moves_made")->value; + env->komi = dict_get(kwargs, "komi")->value; + env->score = dict_get(kwargs, "score")->value; + env->last_capture_position = dict_get(kwargs, "last_capture_position")->value; + env->reward_move_pass = dict_get(kwargs, "reward_move_pass")->value; + env->reward_move_invalid = dict_get(kwargs, "reward_move_invalid")->value; + env->reward_move_valid = dict_get(kwargs, "reward_move_valid")->value; + env->reward_player_capture = dict_get(kwargs, "reward_player_capture")->value; + env->reward_opponent_capture = dict_get(kwargs, "reward_opponent_capture")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/grid/binding.c b/pufferlib/ocean/grid/binding.c deleted file mode 100644 index 449a8a587..000000000 --- a/pufferlib/ocean/grid/binding.c +++ /dev/null @@ -1,71 +0,0 @@ -#include "grid.h" - -#define Env Grid -#define MY_SHARED -#include "../env_binding.h" - -static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { - int num_maps = unpack(kwargs, "num_maps"); - int max_size = unpack(kwargs, "max_size"); - int size = unpack(kwargs, "size"); - State* levels = calloc(num_maps, sizeof(State)); - - if (max_size <= 5) { - PyErr_SetString(PyExc_ValueError, "max_size must be >5"); - return NULL; - } - - // Temporary env used to gen maps - Grid env; - env.max_size = max_size; - init_grid(&env); - - srand(time(NULL)); - int start_seed = rand(); - for (int i = 0; i < num_maps; i++) { - int sz = 
size; - if (size == -1) { - sz = 5 + (rand() % (max_size-5)); - } - - if (sz % 2 == 0) { - sz -= 1; - } - - float difficulty = (float)rand()/(float)(RAND_MAX); - create_maze_level(&env, sz, sz, difficulty, start_seed + i); - init_state(&levels[i], max_size, 1); - get_state(&env, &levels[i]); - } - - return PyLong_FromVoidPtr(levels); -} - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->max_size = unpack(kwargs, "max_size"); - env->num_maps = unpack(kwargs, "num_maps"); - init_grid(env); - - PyObject* handle_obj = PyDict_GetItemString(kwargs, "state"); - if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) { - PyErr_SetString(PyExc_TypeError, "state handle must be an integer"); - return 1; - } - - State* levels = (State*)PyLong_AsVoidPtr(handle_obj); - if (!levels) { - PyErr_SetString(PyExc_ValueError, "Invalid state handle"); - return 1; - } - - env->levels = levels; - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/grid/binding.h b/pufferlib/ocean/grid/binding.h new file mode 100644 index 000000000..757f1a4d1 --- /dev/null +++ b/pufferlib/ocean/grid/binding.h @@ -0,0 +1,83 @@ +#include "grid.h" +#define OBS_SIZE 121 +#define NUM_ATNS 1 +#define ACT_SIZES {5} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE DOUBLE + +#define MY_VEC_INIT +#define Env Grid +#include "env_binding.h" + +Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs) { + int num_envs = (int)dict_get(vec_kwargs, "total_agents")->value; + + int max_size = (int)dict_get(env_kwargs, "max_size")->value; + int num_maps = (int)dict_get(env_kwargs, "num_maps")->value; + int map_size = (int)dict_get(env_kwargs, "map_size")->value; + + if (max_size <= 5) { + *num_envs_out = 0; + return NULL; + } + + // Generate maze levels (shared across all envs) + State* levels = calloc(num_maps, sizeof(State)); + + // Temporary env used to generate maps + Grid temp_env; + temp_env.max_size = max_size; + init_grid(&temp_env); + + srand(time(NULL)); + int start_seed = rand(); + for (int i = 0; i < num_maps; i++) { + int sz = map_size; + if (map_size == -1) { + sz = 5 + (rand() % (max_size - 5)); + } + + if (sz % 2 == 0) { + sz -= 1; + } + + float difficulty = (float)rand() / (float)(RAND_MAX); + create_maze_level(&temp_env, sz, sz, difficulty, start_seed + i); + init_state(&levels[i], max_size, 1); + get_state(&temp_env, &levels[i]); + } + + // Free temp env internal allocations + free(temp_env.grid); + free(temp_env.counts); + free(temp_env.agents); + + // Allocate all environments + Env* envs = (Env*)calloc(num_envs, sizeof(Env)); + + for (int i = 0; i < num_envs; i++) { + Env* env = &envs[i]; + env->max_size = max_size; + env->num_maps = num_maps; + env->num_agents = 1; + env->levels = levels; + init_grid(env); + } + + *num_envs_out = num_envs; + return envs; +} + +void my_init(Env* env, Dict* kwargs) { + env->max_size = (int)dict_get(kwargs, "max_size")->value; + env->num_maps = (int)dict_get(kwargs, "num_maps")->value; + env->num_agents = 1; + init_grid(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git 
a/pufferlib/ocean/impulse_wars/binding.c b/pufferlib/ocean/impulse_wars/binding.h similarity index 100% rename from pufferlib/ocean/impulse_wars/binding.c rename to pufferlib/ocean/impulse_wars/binding.h diff --git a/pufferlib/ocean/matsci/binding.c b/pufferlib/ocean/matsci/binding.h similarity index 100% rename from pufferlib/ocean/matsci/binding.c rename to pufferlib/ocean/matsci/binding.h diff --git a/pufferlib/ocean/memory/binding.c b/pufferlib/ocean/memory/binding.h similarity index 100% rename from pufferlib/ocean/memory/binding.c rename to pufferlib/ocean/memory/binding.h diff --git a/pufferlib/ocean/moba/binding.c b/pufferlib/ocean/moba/binding.c deleted file mode 100644 index bfc652881..000000000 --- a/pufferlib/ocean/moba/binding.c +++ /dev/null @@ -1,189 +0,0 @@ -#include "moba.h" - -#define Env MOBA -#define MY_SHARED -#define MY_SHARED_CLOSE -#include "../env_binding.h" - -static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) { - unsigned char* game_map_npy = read_file("resources/moba/game_map.npy"); - int* ai_path_buffer = calloc(3*8*128*128, sizeof(int)); - unsigned char* ai_paths = calloc(128*128*128*128, sizeof(unsigned char)); - for (int i = 0; i < 128*128*128*128; i++) { - ai_paths[i] = 255; - } - - PyObject* ai_path_buffer_handle = PyLong_FromVoidPtr(ai_path_buffer); - PyObject* ai_paths_handle = PyLong_FromVoidPtr(ai_paths); - PyObject* game_map_handle = PyLong_FromVoidPtr(game_map_npy); - PyObject* state = PyDict_New(); - PyDict_SetItemString(state, "ai_path_buffer", ai_path_buffer_handle); - PyDict_SetItemString(state, "ai_paths", ai_paths_handle); - PyDict_SetItemString(state, "game_map", game_map_handle); - return PyLong_FromVoidPtr(state); -} - -static PyObject* my_shared_close(PyObject* self, PyObject* args) { - PyObject* handle_obj = PyTuple_GetItem(args, 0); - if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) { - PyErr_SetString(PyExc_TypeError, "state handle must be an integer"); - return NULL; - } - - PyObject* state_dict = (PyObject*)PyLong_AsVoidPtr(handle_obj); - - PyObject* ai_path_buffer_handle = PyDict_GetItemString(state_dict, "ai_path_buffer"); - if (ai_path_buffer_handle == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'ai_path_buffer' not found in state"); - return NULL; - } - int* ai_path_buffer = (int*)PyLong_AsVoidPtr(ai_path_buffer_handle); - free(ai_path_buffer); - - PyObject* ai_paths_handle = PyDict_GetItemString(state_dict, "ai_paths"); - if (ai_paths_handle == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'ai_paths' not found in state"); - return NULL; - } - unsigned char* ai_paths = (unsigned char*)PyLong_AsVoidPtr(ai_paths_handle); - free(ai_paths); - - PyObject* game_map_handle = PyDict_GetItemString(state_dict, "game_map"); - if (game_map_handle == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'game_map' not found in state"); - return NULL; - } - unsigned char* game_map = (unsigned char*)PyLong_AsVoidPtr(game_map_handle); - free(game_map); - - Py_INCREF(Py_None); - return Py_None; -} - - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->vision_range = unpack(kwargs, "vision_range"); - env->agent_speed = unpack(kwargs, "agent_speed"); - env->discretize = unpack(kwargs, "discretize"); - env->reward_death = unpack(kwargs, "reward_death"); - env->reward_xp = unpack(kwargs, "reward_xp"); - env->reward_distance = unpack(kwargs, "reward_distance"); - env->reward_tower = unpack(kwargs, "reward_tower"); - env->script_opponents = unpack(kwargs, "script_opponents"); - - PyObject* handle_obj = 
PyDict_GetItemString(kwargs, "state"); - if (handle_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'state' not found in kwargs"); - return 1; - } - - // Check if handle_obj is a PyLong - if (!PyLong_Check(handle_obj)) { - PyErr_SetString(PyExc_TypeError, "state handle must be an integer"); - return 1; - } - - // Convert PyLong to PyObject* (state dictionary) - PyObject* state_dict = (PyObject*)PyLong_AsVoidPtr(handle_obj); - if (state_dict == NULL) { - PyErr_SetString(PyExc_ValueError, "Invalid state dictionary pointer"); - return 1; - } - - // Verify it’s a dictionary - if (!PyDict_Check(state_dict)) { - PyErr_SetString(PyExc_TypeError, "State pointer does not point to a dictionary"); - return 1; - } - - // Basic validation: check reference count - if (state_dict->ob_refcnt <= 0) { - PyErr_SetString(PyExc_RuntimeError, "State dictionary has invalid reference count"); - return 1; - } - - // Extract ai_path_buffer - PyObject* ai_path_buffer_obj = PyDict_GetItemString(state_dict, "ai_path_buffer"); - if (ai_path_buffer_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'ai_path_buffer' not found in state"); - return 1; - } - if (!PyLong_Check(ai_path_buffer_obj)) { - PyErr_SetString(PyExc_TypeError, "ai_path_buffer must be an integer"); - return 1; - } - env->ai_path_buffer = (int*)PyLong_AsVoidPtr(ai_path_buffer_obj); - if (env->ai_path_buffer == NULL) { - PyErr_SetString(PyExc_ValueError, "Invalid ai_path_buffer pointer"); - return 1; - } - - // Extract ai_paths - PyObject* ai_paths_obj = PyDict_GetItemString(state_dict, "ai_paths"); - if (ai_paths_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'ai_paths' not found in state"); - return 1; - } - if (!PyLong_Check(ai_paths_obj)) { - PyErr_SetString(PyExc_TypeError, "ai_paths must be an integer"); - return 1; - } - env->ai_paths = (unsigned char*)PyLong_AsVoidPtr(ai_paths_obj); - if (env->ai_paths == NULL) { - PyErr_SetString(PyExc_ValueError, "Invalid ai_paths pointer"); - return 1; - } - - // Extract game_map - PyObject* game_map_obj = PyDict_GetItemString(state_dict, "game_map"); - if (game_map_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'game_map' not found in state"); - return 1; - } - if (!PyLong_Check(game_map_obj)) { - PyErr_SetString(PyExc_TypeError, "game_map must be an integer"); - return 1; - } - unsigned char* game_map = (unsigned char*)PyLong_AsVoidPtr(game_map_obj); - if (game_map == NULL) { - PyErr_SetString(PyExc_ValueError, "Invalid game_map pointer"); - return 1; - } - - init_moba(env, game_map); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "radiant_victory", log->radiant_victory); - assign_to_dict(dict, "dire_victory", log->dire_victory); - assign_to_dict(dict, "radiant_level", log->radiant_level); - assign_to_dict(dict, "dire_level", log->dire_level); - assign_to_dict(dict, "radiant_towers_alive", log->radiant_towers_alive); - assign_to_dict(dict, "dire_towers_alive", log->dire_towers_alive); - - assign_to_dict(dict, "radiant_support_episode_return", log->radiant_support_episode_return); - assign_to_dict(dict, "radiant_support_reward_death", log->radiant_support_reward_death); - assign_to_dict(dict, "radiant_support_reward_xp", log->radiant_support_reward_xp); - assign_to_dict(dict, "radiant_support_reward_distance", 
log->radiant_support_reward_distance); - assign_to_dict(dict, "radiant_support_reward_tower", log->radiant_support_reward_tower); - assign_to_dict(dict, "radiant_support_level", log->radiant_support_level); - assign_to_dict(dict, "radiant_support_kills", log->radiant_support_kills); - assign_to_dict(dict, "radiant_support_deaths", log->radiant_support_deaths); - assign_to_dict(dict, "radiant_support_damage_dealt", log->radiant_support_damage_dealt); - assign_to_dict(dict, "radiant_support_damage_received", log->radiant_support_damage_received); - assign_to_dict(dict, "radiant_support_healing_dealt", log->radiant_support_healing_dealt); - assign_to_dict(dict, "radiant_support_healing_received", log->radiant_support_healing_received); - assign_to_dict(dict, "radiant_support_creeps_killed", log->radiant_support_creeps_killed); - assign_to_dict(dict, "radiant_support_neutrals_killed", log->radiant_support_neutrals_killed); - assign_to_dict(dict, "radiant_support_towers_killed", log->radiant_support_towers_killed); - assign_to_dict(dict, "radiant_support_usage_auto", log->radiant_support_usage_auto); - assign_to_dict(dict, "radiant_support_usage_q", log->radiant_support_usage_q); - assign_to_dict(dict, "radiant_support_usage_w", log->radiant_support_usage_w); - assign_to_dict(dict, "radiant_support_usage_e", log->radiant_support_usage_e); - return 0; -} diff --git a/pufferlib/ocean/moba/binding.h b/pufferlib/ocean/moba/binding.h new file mode 100644 index 000000000..774e247ac --- /dev/null +++ b/pufferlib/ocean/moba/binding.h @@ -0,0 +1,108 @@ +#include "moba.h" +#define OBS_SIZE 510 +#define NUM_ATNS 6 +#define ACT_SIZES {7, 7, 3, 2, 2, 2} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE DOUBLE + +#define MY_VEC_INIT +#define Env MOBA +#include "env_binding.h" + +Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs) { + int num_envs = (int)dict_get(vec_kwargs, "total_agents")->value; + + int vision_range = (int)dict_get(env_kwargs, "vision_range")->value; + float agent_speed = dict_get(env_kwargs, "agent_speed")->value; + int discretize = (int)dict_get(env_kwargs, "discretize")->value; + float reward_death = dict_get(env_kwargs, "reward_death")->value; + float reward_xp = dict_get(env_kwargs, "reward_xp")->value; + float reward_distance = dict_get(env_kwargs, "reward_distance")->value; + float reward_tower = dict_get(env_kwargs, "reward_tower")->value; + int script_opponents = (int)dict_get(env_kwargs, "script_opponents")->value; + + // Load shared game map data + unsigned char* game_map_npy = read_file("resources/moba/game_map.npy"); + if (game_map_npy == NULL) { + *num_envs_out = 0; + return NULL; + } + + // Create shared AI path data + int* ai_path_buffer = calloc(3*8*128*128, sizeof(int)); + unsigned char* ai_paths = calloc(128*128*128*128, sizeof(unsigned char)); + for (int i = 0; i < 128*128*128*128; i++) { + ai_paths[i] = 255; + } + + // Calculate agents per env based on script_opponents + int agents_per_env = script_opponents ? 
5 : 10; + int total_envs = num_envs / agents_per_env; + + Env* envs = (Env*)calloc(total_envs, sizeof(Env)); + + for (int i = 0; i < total_envs; i++) { + Env* env = &envs[i]; + env->num_agents = agents_per_env; + env->vision_range = vision_range; + env->agent_speed = agent_speed; + env->discretize = discretize; + env->reward_death = reward_death; + env->reward_xp = reward_xp; + env->reward_distance = reward_distance; + env->reward_tower = reward_tower; + env->script_opponents = script_opponents; + env->ai_path_buffer = ai_path_buffer; + env->ai_paths = ai_paths; + init_moba(env, game_map_npy); + } + + free(game_map_npy); + + *num_envs_out = total_envs; + return envs; +} + +void my_init(Env* env, Dict* kwargs) { + env->vision_range = dict_get(kwargs, "vision_range")->value; + env->agent_speed = dict_get(kwargs, "agent_speed")->value; + env->discretize = dict_get(kwargs, "discretize")->value; + env->reward_death = dict_get(kwargs, "reward_death")->value; + env->reward_xp = dict_get(kwargs, "reward_xp")->value; + env->reward_distance = dict_get(kwargs, "reward_distance")->value; + env->reward_tower = dict_get(kwargs, "reward_tower")->value; + env->script_opponents = dict_get(kwargs, "script_opponents")->value; + env->num_agents = env->script_opponents ? 5 : 10; +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "radiant_victory", log->radiant_victory); + dict_set(out, "dire_victory", log->dire_victory); + dict_set(out, "radiant_level", log->radiant_level); + dict_set(out, "dire_level", log->dire_level); + dict_set(out, "radiant_towers_alive", log->radiant_towers_alive); + dict_set(out, "dire_towers_alive", log->dire_towers_alive); + dict_set(out, "radiant_support_episode_return", log->radiant_support_episode_return); + dict_set(out, "radiant_support_reward_death", log->radiant_support_reward_death); + dict_set(out, "radiant_support_reward_xp", log->radiant_support_reward_xp); + dict_set(out, "radiant_support_reward_distance", log->radiant_support_reward_distance); + dict_set(out, "radiant_support_reward_tower", log->radiant_support_reward_tower); + dict_set(out, "radiant_support_level", log->radiant_support_level); + dict_set(out, "radiant_support_kills", log->radiant_support_kills); + dict_set(out, "radiant_support_deaths", log->radiant_support_deaths); + dict_set(out, "radiant_support_damage_dealt", log->radiant_support_damage_dealt); + dict_set(out, "radiant_support_damage_received", log->radiant_support_damage_received); + dict_set(out, "radiant_support_healing_dealt", log->radiant_support_healing_dealt); + dict_set(out, "radiant_support_healing_received", log->radiant_support_healing_received); + dict_set(out, "radiant_support_creeps_killed", log->radiant_support_creeps_killed); + dict_set(out, "radiant_support_neutrals_killed", log->radiant_support_neutrals_killed); + dict_set(out, "radiant_support_towers_killed", log->radiant_support_towers_killed); + dict_set(out, "radiant_support_usage_auto", log->radiant_support_usage_auto); + dict_set(out, "radiant_support_usage_q", log->radiant_support_usage_q); + dict_set(out, "radiant_support_usage_w", log->radiant_support_usage_w); + dict_set(out, "radiant_support_usage_e", log->radiant_support_usage_e); +} diff --git a/pufferlib/ocean/nmmo3/binding.c b/pufferlib/ocean/nmmo3/binding.c deleted file mode 100644 index b11d76bcc..000000000 --- 
a/pufferlib/ocean/nmmo3/binding.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "nmmo3.h" - -#define Env MMO -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->num_agents = unpack(kwargs, "num_agents"); - env->num_enemies = unpack(kwargs, "num_enemies"); - env->num_resources = unpack(kwargs, "num_resources"); - env->num_weapons = unpack(kwargs, "num_weapons"); - env->num_gems = unpack(kwargs, "num_gems"); - env->tiers = unpack(kwargs, "tiers"); - env->levels = unpack(kwargs, "levels"); - env->teleportitis_prob = unpack(kwargs, "teleportitis_prob"); - env->enemy_respawn_ticks = unpack(kwargs, "enemy_respawn_ticks"); - env->item_respawn_ticks = unpack(kwargs, "item_respawn_ticks"); - env->x_window = unpack(kwargs, "x_window"); - env->y_window = unpack(kwargs, "y_window"); - env->reward_combat_level = unpack(kwargs, "reward_combat_level"); - env->reward_prof_level = unpack(kwargs, "reward_prof_level"); - env->reward_item_level = unpack(kwargs, "reward_item_level"); - env->reward_market = unpack(kwargs, "reward_market"); - env->reward_death = unpack(kwargs, "reward_death"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "return_comb_lvl", log->return_comb_lvl); - assign_to_dict(dict, "return_prof_lvl", log->return_prof_lvl); - assign_to_dict(dict, "return_item_atk_lvl", log->return_item_atk_lvl); - assign_to_dict(dict, "return_item_def_lvl", log->return_item_def_lvl); - assign_to_dict(dict, "return_market_buy", log->return_market_buy); - assign_to_dict(dict, "return_market_sell", log->return_market_sell); - assign_to_dict(dict, "return_death", log->return_death); - assign_to_dict(dict, "min_comb_prof", log->min_comb_prof); - assign_to_dict(dict, "purchases", log->purchases); - assign_to_dict(dict, "sales", log->sales); - assign_to_dict(dict, "equip_attack", log->equip_attack); - assign_to_dict(dict, "equip_defense", log->equip_defense); - assign_to_dict(dict, "r", log->r); - assign_to_dict(dict, "c", log->c); - return 0; -} diff --git a/pufferlib/ocean/nmmo3/binding.h b/pufferlib/ocean/nmmo3/binding.h new file mode 100644 index 000000000..02a46a21d --- /dev/null +++ b/pufferlib/ocean/nmmo3/binding.h @@ -0,0 +1,53 @@ +#include "nmmo3.h" +#define OBS_SIZE 1707 +#define NUM_ATNS 1 +#define ACT_SIZES {26} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE DOUBLE + +#define Env MMO +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->num_enemies = dict_get(kwargs, "num_enemies")->value; + env->num_resources = dict_get(kwargs, "num_resources")->value; + env->num_weapons = dict_get(kwargs, "num_weapons")->value; + env->num_gems = dict_get(kwargs, "num_gems")->value; + env->tiers = dict_get(kwargs, "tiers")->value; + env->levels = dict_get(kwargs, "levels")->value; + env->teleportitis_prob = dict_get(kwargs, "teleportitis_prob")->value; + env->enemy_respawn_ticks = dict_get(kwargs, "enemy_respawn_ticks")->value; + env->item_respawn_ticks = dict_get(kwargs, "item_respawn_ticks")->value; + env->x_window = dict_get(kwargs, 
"x_window")->value; + env->y_window = dict_get(kwargs, "y_window")->value; + env->reward_combat_level = dict_get(kwargs, "reward_combat_level")->value; + env->reward_prof_level = dict_get(kwargs, "reward_prof_level")->value; + env->reward_item_level = dict_get(kwargs, "reward_item_level")->value; + env->reward_market = dict_get(kwargs, "reward_market")->value; + env->reward_death = dict_get(kwargs, "reward_death")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "return_comb_lvl", log->return_comb_lvl); + dict_set(out, "return_prof_lvl", log->return_prof_lvl); + dict_set(out, "return_item_atk_lvl", log->return_item_atk_lvl); + dict_set(out, "return_item_def_lvl", log->return_item_def_lvl); + dict_set(out, "return_market_buy", log->return_market_buy); + dict_set(out, "return_market_sell", log->return_market_sell); + dict_set(out, "return_death", log->return_death); + dict_set(out, "min_comb_prof", log->min_comb_prof); + dict_set(out, "purchases", log->purchases); + dict_set(out, "sales", log->sales); + dict_set(out, "equip_attack", log->equip_attack); + dict_set(out, "equip_defense", log->equip_defense); + dict_set(out, "r", log->r); + dict_set(out, "c", log->c); +} diff --git a/pufferlib/ocean/onestateworld/binding.c b/pufferlib/ocean/onestateworld/binding.h similarity index 100% rename from pufferlib/ocean/onestateworld/binding.c rename to pufferlib/ocean/onestateworld/binding.h diff --git a/pufferlib/ocean/onlyfish/binding.c b/pufferlib/ocean/onlyfish/binding.h similarity index 100% rename from pufferlib/ocean/onlyfish/binding.c rename to pufferlib/ocean/onlyfish/binding.h diff --git a/pufferlib/ocean/pacman/binding.c b/pufferlib/ocean/pacman/binding.c deleted file mode 100644 index 6bcf45e49..000000000 --- a/pufferlib/ocean/pacman/binding.c +++ /dev/null @@ -1,24 +0,0 @@ -#include "pacman.h" - -#define Env PacmanEnv -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->randomize_starting_position = unpack(kwargs, "randomize_starting_position"); - env->min_start_timeout = unpack(kwargs, "min_start_timeout"); - env->max_start_timeout = unpack(kwargs, "max_start_timeout"); - env->frightened_time = unpack(kwargs, "frightened_time"); - env->max_mode_changes = unpack(kwargs, "max_mode_changes"); - env->scatter_mode_length = unpack(kwargs, "scatter_mode_length"); - env->chase_mode_length = unpack(kwargs, "chase_mode_length"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/pacman/binding.h b/pufferlib/ocean/pacman/binding.h new file mode 100644 index 000000000..1196da4e2 --- /dev/null +++ b/pufferlib/ocean/pacman/binding.h @@ -0,0 +1,28 @@ +#include "pacman.h" +#define OBS_SIZE 291 +#define NUM_ATNS 1 +#define ACT_SIZES {4} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env PacmanEnv +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->randomize_starting_position = dict_get(kwargs, "randomize_starting_position")->value; + env->min_start_timeout = dict_get(kwargs, "min_start_timeout")->value; + 
env->max_start_timeout = dict_get(kwargs, "max_start_timeout")->value; + env->frightened_time = dict_get(kwargs, "frightened_time")->value; + env->max_mode_changes = dict_get(kwargs, "max_mode_changes")->value; + env->scatter_mode_length = dict_get(kwargs, "scatter_mode_length")->value; + env->chase_mode_length = dict_get(kwargs, "chase_mode_length")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/pong/binding.c b/pufferlib/ocean/pong/binding.c deleted file mode 100644 index 4a84ae1d7..000000000 --- a/pufferlib/ocean/pong/binding.c +++ /dev/null @@ -1,31 +0,0 @@ -#include "pong.h" - -#define Env Pong -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->paddle_width = unpack(kwargs, "paddle_width"); - env->paddle_height = unpack(kwargs, "paddle_height"); - env->ball_width = unpack(kwargs, "ball_width"); - env->ball_height = unpack(kwargs, "ball_height"); - env->paddle_speed = unpack(kwargs, "paddle_speed"); - env->ball_initial_speed_x = unpack(kwargs, "ball_initial_speed_x"); - env->ball_initial_speed_y = unpack(kwargs, "ball_initial_speed_y"); - env->ball_max_speed_y = unpack(kwargs, "ball_max_speed_y"); - env->ball_speed_y_increment = unpack(kwargs, "ball_speed_y_increment"); - env->max_score = unpack(kwargs, "max_score"); - env->frameskip = unpack(kwargs, "frameskip"); - env->continuous = unpack(kwargs, "continuous"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/pong/binding.h b/pufferlib/ocean/pong/binding.h new file mode 100644 index 000000000..f1ef8c363 --- /dev/null +++ b/pufferlib/ocean/pong/binding.h @@ -0,0 +1,35 @@ +#include "pong.h" +#define OBS_SIZE 8 +#define NUM_ATNS 1 +#define ACT_SIZES {3} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Pong +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->paddle_width = dict_get(kwargs, "paddle_width")->value; + env->paddle_height = dict_get(kwargs, "paddle_height")->value; + env->ball_width = dict_get(kwargs, "ball_width")->value; + env->ball_height = dict_get(kwargs, "ball_height")->value; + env->paddle_speed = dict_get(kwargs, "paddle_speed")->value; + env->ball_initial_speed_x = dict_get(kwargs, "ball_initial_speed_x")->value; + env->ball_initial_speed_y = dict_get(kwargs, "ball_initial_speed_y")->value; + env->ball_max_speed_y = dict_get(kwargs, "ball_max_speed_y")->value; + env->ball_speed_y_increment = dict_get(kwargs, "ball_speed_y_increment")->value; + env->max_score = dict_get(kwargs, "max_score")->value; + env->frameskip = dict_get(kwargs, "frameskip")->value; + env->continuous = dict_get(kwargs, "continuous")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, 
"episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/rware/binding.c b/pufferlib/ocean/rware/binding.c deleted file mode 100644 index 3cf1d5d14..000000000 --- a/pufferlib/ocean/rware/binding.c +++ /dev/null @@ -1,24 +0,0 @@ -#include "rware.h" - -#define Env CRware -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->map_choice = unpack(kwargs, "map_choice"); - env->num_agents = unpack(kwargs, "num_agents"); - env->num_requested_shelves = unpack(kwargs, "num_requested_shelves"); - env->grid_square_size = unpack(kwargs, "grid_square_size"); - env->human_agent_idx = unpack(kwargs, "human_agent_idx"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/rware/binding.h b/pufferlib/ocean/rware/binding.h new file mode 100644 index 000000000..324e87bf6 --- /dev/null +++ b/pufferlib/ocean/rware/binding.h @@ -0,0 +1,27 @@ +#include "rware.h" +#define OBS_SIZE 27 +#define NUM_ATNS 1 +#define ACT_SIZES {5} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env CRware +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->map_choice = dict_get(kwargs, "map_choice")->value; + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->num_requested_shelves = dict_get(kwargs, "num_requested_shelves")->value; + env->grid_square_size = dict_get(kwargs, "grid_square_size")->value; + env->human_agent_idx = dict_get(kwargs, "human_agent_idx")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/shared_pool/binding.c b/pufferlib/ocean/shared_pool/binding.h similarity index 100% rename from pufferlib/ocean/shared_pool/binding.c rename to pufferlib/ocean/shared_pool/binding.h diff --git a/pufferlib/ocean/slimevolley/binding.c b/pufferlib/ocean/slimevolley/binding.c deleted file mode 100644 index 3d294935e..000000000 --- a/pufferlib/ocean/slimevolley/binding.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "slimevolley.h" - -#define Env SlimeVolley -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->num_agents = unpack(kwargs, "num_agents"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/slimevolley/binding.h b/pufferlib/ocean/slimevolley/binding.h new file mode 100644 index 000000000..7decf34cd --- /dev/null +++ b/pufferlib/ocean/slimevolley/binding.h @@ -0,0 +1,21 @@ +#include "slimevolley.h" +#define OBS_SIZE 12 +#define NUM_ATNS 3 +#define ACT_SIZES {2, 2, 2} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env SlimeVolley +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { 
+ env->num_agents = dict_get(kwargs, "num_agents")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/snake/binding.c b/pufferlib/ocean/snake/binding.c deleted file mode 100644 index 5400edf17..000000000 --- a/pufferlib/ocean/snake/binding.c +++ /dev/null @@ -1,29 +0,0 @@ -#include "snake.h" - -#define Env CSnake -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->num_agents = unpack(kwargs, "num_agents"); - env->vision = unpack(kwargs, "vision"); - env->leave_corpse_on_death = unpack(kwargs, "leave_corpse_on_death"); - env->food = unpack(kwargs, "num_food"); - env->reward_food = unpack(kwargs, "reward_food"); - env->reward_corpse = unpack(kwargs, "reward_corpse"); - env->reward_death = unpack(kwargs, "reward_death"); - env->max_snake_length = unpack(kwargs, "max_snake_length"); - env->cell_size = unpack(kwargs, "cell_size"); - init_csnake(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/snake/binding.h b/pufferlib/ocean/snake/binding.h new file mode 100644 index 000000000..b60f89059 --- /dev/null +++ b/pufferlib/ocean/snake/binding.h @@ -0,0 +1,32 @@ +#include "snake.h" +#define OBS_SIZE 121 +#define NUM_ATNS 1 +#define ACT_SIZES {4} +#define OBS_TYPE CHAR +#define ACT_TYPE DOUBLE + +#define Env CSnake +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->vision = dict_get(kwargs, "vision")->value; + env->leave_corpse_on_death = dict_get(kwargs, "leave_corpse_on_death")->value; + env->food = dict_get(kwargs, "num_food")->value; + env->reward_food = dict_get(kwargs, "reward_food")->value; + env->reward_corpse = dict_get(kwargs, "reward_corpse")->value; + env->reward_death = dict_get(kwargs, "reward_death")->value; + env->max_snake_length = dict_get(kwargs, "max_snake_length")->value; + env->cell_size = dict_get(kwargs, "cell_size")->value; + init_csnake(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/squared/binding.c b/pufferlib/ocean/squared/binding.c deleted file mode 100644 index be9dfade2..000000000 --- a/pufferlib/ocean/squared/binding.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "squared.h" - -#define Env Squared -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->size = unpack(kwargs, "size"); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", 
log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/squared/binding.h b/pufferlib/ocean/squared/binding.h new file mode 100644 index 000000000..6823681b0 --- /dev/null +++ b/pufferlib/ocean/squared/binding.h @@ -0,0 +1,22 @@ +#include "squared.h" +#define OBS_SIZE 121 +#define NUM_ATNS 1 +#define ACT_SIZES {5} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE DOUBLE + +#define Env Squared +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->size = dict_get(kwargs, "size")->value; + c_reset(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/tactical/binding.c b/pufferlib/ocean/tactical/binding.h similarity index 100% rename from pufferlib/ocean/tactical/binding.c rename to pufferlib/ocean/tactical/binding.h diff --git a/pufferlib/ocean/target/binding.c b/pufferlib/ocean/target/binding.c deleted file mode 100644 index f8e7bde78..000000000 --- a/pufferlib/ocean/target/binding.c +++ /dev/null @@ -1,21 +0,0 @@ -#include "target.h" - -#define Env Target -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->num_agents = unpack(kwargs, "num_agents"); - env->num_goals = unpack(kwargs, "num_goals"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/target/binding.h b/pufferlib/ocean/target/binding.h new file mode 100644 index 000000000..20b6136ef --- /dev/null +++ b/pufferlib/ocean/target/binding.h @@ -0,0 +1,24 @@ +#include "target.h" +#define OBS_SIZE 28 +#define NUM_ATNS 2 +#define ACT_SIZES {9, 5} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Target +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->num_goals = dict_get(kwargs, "num_goals")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/template/binding.c b/pufferlib/ocean/template/binding.h similarity index 100% rename from pufferlib/ocean/template/binding.c rename to pufferlib/ocean/template/binding.h diff --git a/pufferlib/ocean/terraform/binding.c b/pufferlib/ocean/terraform/binding.c deleted file mode 100644 index 1c88f2aab..000000000 --- a/pufferlib/ocean/terraform/binding.c +++ /dev/null @@ -1,22 +0,0 @@ -#include "terraform.h" - -#define Env Terraform -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->size = unpack(kwargs, "size"); - env->num_agents = unpack(kwargs, "num_agents"); - env->reward_scale = unpack(kwargs, "reward_scale"); - env->reset_frequency = unpack(kwargs, "reset_frequency"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - 
assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "quadrant_progress", log->quadrant_progress); - return 0; -} diff --git a/pufferlib/ocean/terraform/binding.h b/pufferlib/ocean/terraform/binding.h new file mode 100644 index 000000000..5c0fe0980 --- /dev/null +++ b/pufferlib/ocean/terraform/binding.h @@ -0,0 +1,25 @@ +#include "terraform.h" +#define OBS_SIZE 319 +#define NUM_ATNS 3 +#define ACT_SIZES {5, 5, 3} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Terraform +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->size = dict_get(kwargs, "size")->value; + env->reset_frequency = dict_get(kwargs, "reset_frequency")->value; + env->reward_scale = dict_get(kwargs, "reward_scale")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "quadrant_progress", log->quadrant_progress); +} diff --git a/pufferlib/ocean/tetris/binding.c b/pufferlib/ocean/tetris/binding.c deleted file mode 100644 index 213b25842..000000000 --- a/pufferlib/ocean/tetris/binding.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "tetris.h" - -#define Env Tetris -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->n_rows = unpack(kwargs, "n_rows"); - env->n_cols = unpack(kwargs, "n_cols"); - env->use_deck_obs = unpack(kwargs, "use_deck_obs"); - env->n_noise_obs = unpack(kwargs, "n_noise_obs"); - env->n_init_garbage = unpack(kwargs, "n_init_garbage"); - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "ep_length", log->ep_length); - assign_to_dict(dict, "ep_return", log->ep_return); - assign_to_dict(dict, "avg_combo", log->avg_combo); - assign_to_dict(dict, "lines_deleted", log->lines_deleted); - assign_to_dict(dict, "game_level", log->game_level); - assign_to_dict(dict, "ticks_per_line", log->ticks_per_line); - - // assign_to_dict(dict, "atn_frac_soft_drop", log->atn_frac_soft_drop); - assign_to_dict(dict, "atn_frac_hard_drop", log->atn_frac_hard_drop); - assign_to_dict(dict, "atn_frac_rotate", log->atn_frac_rotate); - assign_to_dict(dict, "atn_frac_hold", log->atn_frac_hold); - - return 0; -} \ No newline at end of file diff --git a/pufferlib/ocean/tetris/binding.h b/pufferlib/ocean/tetris/binding.h new file mode 100644 index 000000000..aa37b403f --- /dev/null +++ b/pufferlib/ocean/tetris/binding.h @@ -0,0 +1,34 @@ +#include "tetris.h" +#define OBS_SIZE 234 +#define NUM_ATNS 1 +#define ACT_SIZES {7} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env Tetris +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->n_cols = dict_get(kwargs, "n_cols")->value; + env->n_rows = dict_get(kwargs, "n_rows")->value; + env->use_deck_obs = dict_get(kwargs, "use_deck_obs")->value; + env->n_noise_obs = dict_get(kwargs, "n_noise_obs")->value; + env->n_init_garbage = dict_get(kwargs, "n_init_garbage")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", 
log->score);
+    dict_set(out, "episode_return", log->episode_return);
+    dict_set(out, "episode_length", log->episode_length);
+    dict_set(out, "lines_deleted", log->lines_deleted);
+    dict_set(out, "avg_combo", log->avg_combo);
+    dict_set(out, "atn_frac_soft_drop", log->atn_frac_soft_drop);
+    dict_set(out, "atn_frac_hard_drop", log->atn_frac_hard_drop);
+    dict_set(out, "atn_frac_rotate", log->atn_frac_rotate);
+    dict_set(out, "atn_frac_hold", log->atn_frac_hold);
+    dict_set(out, "game_level", log->game_level);
+    dict_set(out, "ticks_per_line", log->ticks_per_line);
+}
diff --git a/pufferlib/ocean/tetris/tetris.h b/pufferlib/ocean/tetris/tetris.h
index f94c92224..76d6b7221 100644
--- a/pufferlib/ocean/tetris/tetris.h
+++ b/pufferlib/ocean/tetris/tetris.h
@@ -45,8 +45,8 @@ const float REWARD_COMBO[5] = {0, 0.1, 0.3, 0.5, 1.0};
 typedef struct Log {
     float perf;
     float score;
-    float ep_length;
-    float ep_return;
+    float episode_length;
+    float episode_return;
     float lines_deleted;
     float avg_combo;
     float atn_frac_soft_drop;
@@ -101,7 +101,7 @@ typedef struct Tetris {
     int cur_tetromino_col;
     int cur_tetromino_rot;

-    float ep_return;
+    float episode_return;
     int lines_deleted;
     int count_combos;
     int game_level;
@@ -149,8 +149,8 @@ void free_allocated(Tetris *env) {
 void add_log(Tetris *env) {
     env->log.score += env->score;
     env->log.perf += env->score / ((float)PERSONAL_BEST);
-    env->log.ep_length += env->tick;
-    env->log.ep_return += env->ep_return;
+    env->log.episode_length += env->tick;
+    env->log.episode_return += env->episode_return;
     env->log.lines_deleted += env->lines_deleted;
     env->log.avg_combo += env->count_combos > 0 ? ((float)env->lines_deleted) / ((float)env->count_combos) : 1.0f;
     env->log.atn_frac_hard_drop += env->atn_count_hard_drop / ((float)env->tick);
@@ -426,7 +426,7 @@ void c_reset(Tetris *env) {
     env->tick_garbage = 0;
     env->can_swap = 1;

-    env->ep_return = 0.0;
+    env->episode_return = 0.0;
     env->count_combos = 0;
     env->lines_deleted = 0;
     env->atn_count_hard_drop = 0;
@@ -478,7 +478,7 @@ void place_tetromino(Tetris *env) {
     env->lines_deleted += lines_deleted;
     env->score += SCORE_COMBO[lines_deleted];
     env->rewards[0] += REWARD_COMBO[lines_deleted];
-    env->ep_return += REWARD_COMBO[lines_deleted];
+    env->episode_return += REWARD_COMBO[lines_deleted];

     // These determine the game difficulty. Consider making them args.
     env->game_level = 1 + env->lines_deleted / LINES_PER_LEVEL;
@@ -506,7 +506,7 @@ void c_step(Tetris *env) {
             env->cur_tetromino_col -= 1;
         } else {
             env->rewards[0] += REWARD_INVALID_ACTION;
-            env->ep_return += REWARD_INVALID_ACTION;
+            env->episode_return += REWARD_INVALID_ACTION;
         }
     }
     if (action == ACTION_RIGHT) {
@@ -514,7 +514,7 @@
             env->cur_tetromino_col += 1;
         } else {
             env->rewards[0] += REWARD_INVALID_ACTION;
-            env->ep_return += REWARD_INVALID_ACTION;
+            env->episode_return += REWARD_INVALID_ACTION;
         }
     }
     if (action == ACTION_ROTATE) {
@@ -522,10 +522,10 @@
         if (can_rotate(env)) {
             env->cur_tetromino_rot = (env->cur_tetromino_rot + 1) % NUM_ROTATIONS;
             env->rewards[0] += REWARD_ROTATE;
-            env->ep_return += REWARD_ROTATE;
+            env->episode_return += REWARD_ROTATE;
         } else {
             env->rewards[0] += REWARD_INVALID_ACTION;
-            env->ep_return += REWARD_INVALID_ACTION;
+            env->episode_return += REWARD_INVALID_ACTION;
         }
     }
     if (action == ACTION_SOFT_DROP) {
@@ -534,10 +534,10 @@
             env->cur_tetromino_row += 1;
             env->score += SCORE_SOFT_DROP;
             // env->rewards[0] += REWARD_SOFT_DROP;
-            // env->ep_return += REWARD_SOFT_DROP;
+            // env->episode_return += REWARD_SOFT_DROP;
         } else {
             env->rewards[0] += REWARD_INVALID_ACTION;
-            env->ep_return += REWARD_INVALID_ACTION;
+            env->episode_return += REWARD_INVALID_ACTION;
         }
     }
     if (action == ACTION_HOLD) {
@@ -561,7 +561,7 @@
             }
         } else {
             env->rewards[0] += REWARD_INVALID_ACTION;
-            env->ep_return += REWARD_INVALID_ACTION;
+            env->episode_return += REWARD_INVALID_ACTION;
         }
     }
     if (action == ACTION_HARD_DROP) {
@@ -570,7 +570,7 @@
             env->cur_tetromino_row += 1;
             // NOTE: this seems to be a super effective reward trick
             env->rewards[0] += REWARD_HARD_DROP;
-            env->ep_return += REWARD_HARD_DROP;
+            env->episode_return += REWARD_HARD_DROP;
         }
         env->score += SCORE_HARD_DROP;
         place_tetromino(env);
diff --git a/pufferlib/ocean/tmaze/binding.c b/pufferlib/ocean/tmaze/binding.h
similarity index 100%
rename from pufferlib/ocean/tmaze/binding.c
rename to pufferlib/ocean/tmaze/binding.h
diff --git a/pufferlib/ocean/tower_climb/binding.c b/pufferlib/ocean/tower_climb/binding.c
deleted file mode 100644
index 184f97325..000000000
--- a/pufferlib/ocean/tower_climb/binding.c
+++ /dev/null
@@ -1,159 +0,0 @@
-#include
-
-#include "tower_climb.h"
-
-#define Env CTowerClimb
-#define MY_SHARED
-
-static PyObject* py_generate_one_map(PyObject* self, PyObject* args);
-#define MY_METHODS {"generate_one_map", py_generate_one_map, METH_VARARGS, "Generate one tower climb map."}
-
-#include "../env_binding.h"
-
-static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) {
-    const char* path = "resources/tower_climb/maps.bin";
-    int num_maps = 0;
-
-    Level* levels = load_levels_from_file(&num_maps, path);
-    if (levels == NULL) {
-        PyErr_SetString(PyExc_IOError, "Failed to load maps from maps.bin.
Did you run './tower_climb' to pregenerate them?"); - return NULL; - } - - PuzzleState* puzzle_states = calloc(num_maps, sizeof(PuzzleState)); - - for (int i = 0; i < num_maps; i++) { - init_puzzle_state(&puzzle_states[i]); - levelToPuzzleState(&levels[i], &puzzle_states[i]); - } - - PyObject* levels_handle = PyLong_FromVoidPtr(levels); - PyObject* puzzles_handle = PyLong_FromVoidPtr(puzzle_states); - PyObject* num_maps_obj = PyLong_FromLong(num_maps); - PyObject* state = PyDict_New(); - PyDict_SetItemString(state, "levels", levels_handle); - PyDict_SetItemString(state, "puzzles", puzzles_handle); - PyDict_SetItemString(state, "num_maps", num_maps_obj); - return PyLong_FromVoidPtr(state); -} - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->reward_climb_row = unpack(kwargs, "reward_climb_row"); - env->reward_fall_row = unpack(kwargs, "reward_fall_row"); - env->reward_illegal_move = unpack(kwargs, "reward_illegal_move"); - env->reward_move_block = unpack(kwargs, "reward_move_block"); - init(env); - - PyObject* handle_obj = PyDict_GetItemString(kwargs, "state"); - if (handle_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'state' not found in kwargs"); - return 1; - } - - // Check if handle_obj is a PyLong - if (!PyLong_Check(handle_obj)) { - PyErr_SetString(PyExc_TypeError, "state handle must be an integer"); - return 1; - } - - // Convert PyLong to PyObject* (state dictionary) - PyObject* state_dict = (PyObject*)PyLong_AsVoidPtr(handle_obj); - if (state_dict == NULL) { - PyErr_SetString(PyExc_ValueError, "Invalid state dictionary pointer"); - return 1; - } - - // Verify it’s a dictionary - if (!PyDict_Check(state_dict)) { - PyErr_SetString(PyExc_TypeError, "State pointer does not point to a dictionary"); - return 1; - } - - // Basic validation: check reference count - if (state_dict->ob_refcnt <= 0) { - PyErr_SetString(PyExc_RuntimeError, "State dictionary has invalid reference count"); - return 1; - } - - PyObject* levels_obj = PyDict_GetItemString(state_dict, "levels"); - if (levels_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'levels' not found in state"); - return 1; - } - if (!PyLong_Check(levels_obj)) { - PyErr_SetString(PyExc_TypeError, "levels must be an integer"); - return 1; - } - env->all_levels = (Level*)PyLong_AsVoidPtr(levels_obj); - - PyObject* num_maps_obj = PyDict_GetItemString(state_dict, "num_maps"); - if (num_maps_obj == NULL) { - PyErr_SetString(PyExc_KeyError, "Key 'num_maps' not found in state"); - return 1; - } - if (!PyLong_Check(num_maps_obj)) { - PyErr_SetString(PyExc_TypeError, "'num_maps' must be an integer"); - return 1; - } - if (env->all_levels != NULL) { - env->num_maps = PyLong_AsLong(num_maps_obj); - } - - PyObject* puzzles_obj = PyDict_GetItemString(state_dict, "puzzles"); - if (!PyObject_TypeCheck(puzzles_obj, &PyLong_Type)) { - PyErr_SetString(PyExc_TypeError, "puzzles handle must be an integer"); - return 1; - } - PuzzleState* puzzles = (PuzzleState*)PyLong_AsVoidPtr(puzzles_obj); - if (!puzzles) { - PyErr_SetString(PyExc_ValueError, "Invalid puzzles handle"); - return 1; - } - env->all_puzzles = puzzles; - - return 0; -} - -static PyObject* py_generate_one_map(PyObject* self, PyObject* args) { - int seed; - if (!PyArg_ParseTuple(args, "i", &seed)) { - return NULL; // PyArg_ParseTuple sets the error - } - - Level level; - init_level(&level); - - // Generation parameters from generate_maps.py - int goal_height = 5 + (seed % 4); - int min_moves = 10; - int max_moves = 30; - - cy_init_random_level(&level, 
goal_height, max_moves, min_moves, seed); - - // Package the map data into a Python tuple - PyObject* map_data_obj = PyBytes_FromStringAndSize((const char*)level.map, BLOCK_BYTES); - if (map_data_obj == NULL) { - free(level.map); - return NULL; - } - - PyObject* result_tuple = Py_BuildValue( - "Oiiiiii", - map_data_obj, - level.rows, level.cols, level.size, - level.total_length, level.goal_location, level.spawn_location - ); - - Py_DECREF(map_data_obj); - free(level.map); - - return result_tuple; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/tower_climb/binding.h b/pufferlib/ocean/tower_climb/binding.h new file mode 100644 index 000000000..10a7c3ff3 --- /dev/null +++ b/pufferlib/ocean/tower_climb/binding.h @@ -0,0 +1,68 @@ +#include "tower_climb.h" +#define OBS_SIZE 228 +#define NUM_ATNS 1 +#define ACT_SIZES {6} +#define OBS_TYPE UNSIGNED_CHAR +#define ACT_TYPE DOUBLE + +#define MY_VEC_INIT +#define Env CTowerClimb +#include "env_binding.h" + +Env* my_vec_init(int* num_envs_out, Dict* vec_kwargs, Dict* env_kwargs) { + int num_envs = (int)dict_get(vec_kwargs, "total_agents")->value; + + float reward_climb_row = dict_get(env_kwargs, "reward_climb_row")->value; + float reward_fall_row = dict_get(env_kwargs, "reward_fall_row")->value; + float reward_illegal_move = dict_get(env_kwargs, "reward_illegal_move")->value; + float reward_move_block = dict_get(env_kwargs, "reward_move_block")->value; + + const char* path = "resources/tower_climb/maps.bin"; + int num_maps = 0; + + Level* levels = load_levels_from_file(&num_maps, path); + if (levels == NULL) { + *num_envs_out = 0; + return NULL; + } + + PuzzleState* puzzle_states = calloc(num_maps, sizeof(PuzzleState)); + for (int i = 0; i < num_maps; i++) { + init_puzzle_state(&puzzle_states[i]); + levelToPuzzleState(&levels[i], &puzzle_states[i]); + } + + Env* envs = (Env*)calloc(num_envs, sizeof(Env)); + + for (int i = 0; i < num_envs; i++) { + Env* env = &envs[i]; + env->num_agents = 1; + env->reward_climb_row = reward_climb_row; + env->reward_fall_row = reward_fall_row; + env->reward_illegal_move = reward_illegal_move; + env->reward_move_block = reward_move_block; + env->all_levels = levels; + env->all_puzzles = puzzle_states; + env->num_maps = num_maps; + init(env); + } + + *num_envs_out = num_envs; + return envs; +} + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->reward_climb_row = dict_get(kwargs, "reward_climb_row")->value; + env->reward_fall_row = dict_get(kwargs, "reward_fall_row")->value; + env->reward_illegal_move = dict_get(kwargs, "reward_illegal_move")->value; + env->reward_move_block = dict_get(kwargs, "reward_move_block")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/ocean/trash_pickup/binding.c b/pufferlib/ocean/trash_pickup/binding.c deleted file mode 100644 index 4e23da1b3..000000000 --- a/pufferlib/ocean/trash_pickup/binding.c +++ /dev/null @@ -1,24 +0,0 @@ -#include "trash_pickup.h" - -#define Env CTrashPickupEnv -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->num_agents 
= unpack(kwargs, "num_agents"); - env->grid_size = unpack(kwargs, "grid_size"); - env->num_trash = unpack(kwargs, "num_trash"); - env->num_bins = unpack(kwargs, "num_bins"); - env->max_steps = unpack(kwargs, "max_steps"); - env->agent_sight_range = unpack(kwargs, "agent_sight_range"); - initialize_env(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "trash_collected", log->trash_collected); - return 0; -} diff --git a/pufferlib/ocean/trash_pickup/binding.h b/pufferlib/ocean/trash_pickup/binding.h new file mode 100644 index 000000000..c476e8bd1 --- /dev/null +++ b/pufferlib/ocean/trash_pickup/binding.h @@ -0,0 +1,27 @@ +#include "trash_pickup.h" +#define OBS_SIZE 605 +#define NUM_ATNS 1 +#define ACT_SIZES {4} +#define OBS_TYPE CHAR +#define ACT_TYPE DOUBLE + +#define Env CTrashPickupEnv +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = dict_get(kwargs, "num_agents")->value; + env->grid_size = dict_get(kwargs, "grid_size")->value; + env->num_trash = dict_get(kwargs, "num_trash")->value; + env->num_bins = dict_get(kwargs, "num_bins")->value; + env->max_steps = dict_get(kwargs, "max_steps")->value; + env->agent_sight_range = dict_get(kwargs, "agent_sight_range")->value; + initialize_env(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "trash_collected", log->trash_collected); +} diff --git a/pufferlib/ocean/tripletriad/binding.c b/pufferlib/ocean/tripletriad/binding.c deleted file mode 100644 index 4c725d313..000000000 --- a/pufferlib/ocean/tripletriad/binding.c +++ /dev/null @@ -1,22 +0,0 @@ -#include "tripletriad.h" - -#define Env CTripleTriad -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->card_width = unpack(kwargs, "card_width"); - env->card_height = unpack(kwargs, "card_height"); - init_ctripletriad(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - assign_to_dict(dict, "n", log->n); - return 0; -} diff --git a/pufferlib/ocean/tripletriad/binding.h b/pufferlib/ocean/tripletriad/binding.h new file mode 100644 index 000000000..b2813744d --- /dev/null +++ b/pufferlib/ocean/tripletriad/binding.h @@ -0,0 +1,26 @@ +#include "tripletriad.h" +#define OBS_SIZE 114 +#define NUM_ATNS 1 +#define ACT_SIZES {14} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env CTripleTriad +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->card_width = dict_get(kwargs, "card_width")->value; + env->card_height = dict_get(kwargs, "card_height")->value; + init_ctripletriad(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, 
"episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "n", log->n); +} diff --git a/pufferlib/ocean/whisker_racer/binding.c b/pufferlib/ocean/whisker_racer/binding.c deleted file mode 100644 index 88582daaf..000000000 --- a/pufferlib/ocean/whisker_racer/binding.c +++ /dev/null @@ -1,49 +0,0 @@ -#include "whisker_racer.h" - -#define Env WhiskerRacer -#include "../env_binding.h" - -static int my_init(Env* env, PyObject* args, PyObject* kwargs) { - env->frameskip = unpack(kwargs, "frameskip"); - env->width = unpack(kwargs, "width"); - env->height = unpack(kwargs, "height"); - env->llw_ang = unpack(kwargs, "llw_ang"); - env->flw_ang = unpack(kwargs, "flw_ang"); - env->frw_ang = unpack(kwargs, "frw_ang"); - env->rrw_ang = unpack(kwargs, "rrw_ang"); - env->max_whisker_length = unpack(kwargs, "max_whisker_length"); - env->turn_pi_frac = unpack(kwargs, "turn_pi_frac"); - env->maxv = unpack(kwargs, "maxv"); - env->render = unpack(kwargs, "render"); - env->continuous = unpack(kwargs, "continuous"); - env->reward_yellow = unpack(kwargs, "reward_yellow"); - env->reward_green = unpack(kwargs, "reward_green"); - env->gamma = unpack(kwargs, "gamma"); - env->track_width = unpack(kwargs, "track_width"); - env->num_radial_sectors = unpack(kwargs, "num_radial_sectors"); - env->num_points = unpack(kwargs, "num_points"); - env->bezier_resolution = unpack(kwargs, "bezier_resolution"); - env->turn_pi_frac = unpack(kwargs, "turn_pi_frac"); - env->w_ang = unpack(kwargs, "w_ang"); - env->corner_thresh = unpack(kwargs, "corner_thresh"); - env->ftmp1 = unpack(kwargs, "ftmp1"); - env->ftmp2 = unpack(kwargs, "ftmp2"); - env->ftmp3 = unpack(kwargs, "ftmp3"); - env->ftmp4 = unpack(kwargs, "ftmp4"); - env->mode7 = unpack(kwargs, "mode7"); - env->render_many = unpack(kwargs, "render_many"); - env->rng = unpack(kwargs, "rng"); - env->method = unpack(kwargs, "method"); - env->i = unpack(kwargs, "i"); - - init(env); - return 0; -} - -static int my_log(PyObject* dict, Log* log) { - assign_to_dict(dict, "perf", log->perf); - assign_to_dict(dict, "score", log->score); - assign_to_dict(dict, "episode_return", log->episode_return); - assign_to_dict(dict, "episode_length", log->episode_length); - return 0; -} diff --git a/pufferlib/ocean/whisker_racer/binding.h b/pufferlib/ocean/whisker_racer/binding.h new file mode 100644 index 000000000..31517aa4d --- /dev/null +++ b/pufferlib/ocean/whisker_racer/binding.h @@ -0,0 +1,51 @@ +#include "whisker_racer.h" +#define OBS_SIZE 3 +#define NUM_ATNS 1 +#define ACT_SIZES {3} +#define OBS_TYPE FLOAT +#define ACT_TYPE DOUBLE + +#define Env WhiskerRacer +#include "env_binding.h" + +void my_init(Env* env, Dict* kwargs) { + env->num_agents = 1; + env->frameskip = dict_get(kwargs, "frameskip")->value; + env->width = dict_get(kwargs, "width")->value; + env->height = dict_get(kwargs, "height")->value; + env->llw_ang = dict_get(kwargs, "llw_ang")->value; + env->flw_ang = dict_get(kwargs, "flw_ang")->value; + env->frw_ang = dict_get(kwargs, "frw_ang")->value; + env->rrw_ang = dict_get(kwargs, "rrw_ang")->value; + env->max_whisker_length = dict_get(kwargs, "max_whisker_length")->value; + env->turn_pi_frac = dict_get(kwargs, "turn_pi_frac")->value; + env->maxv = dict_get(kwargs, "maxv")->value; + env->render = dict_get(kwargs, "render")->value; + env->continuous = dict_get(kwargs, "continuous")->value; + env->reward_yellow = dict_get(kwargs, "reward_yellow")->value; + env->reward_green = dict_get(kwargs, "reward_green")->value; + env->gamma = 
dict_get(kwargs, "gamma")->value; + env->track_width = dict_get(kwargs, "track_width")->value; + env->num_radial_sectors = dict_get(kwargs, "num_radial_sectors")->value; + env->num_points = dict_get(kwargs, "num_points")->value; + env->bezier_resolution = dict_get(kwargs, "bezier_resolution")->value; + env->w_ang = dict_get(kwargs, "w_ang")->value; + env->corner_thresh = dict_get(kwargs, "corner_thresh")->value; + env->ftmp1 = dict_get(kwargs, "ftmp1")->value; + env->ftmp2 = dict_get(kwargs, "ftmp2")->value; + env->ftmp3 = dict_get(kwargs, "ftmp3")->value; + env->ftmp4 = dict_get(kwargs, "ftmp4")->value; + env->mode7 = dict_get(kwargs, "mode7")->value; + env->render_many = dict_get(kwargs, "render_many")->value; + env->rng = dict_get(kwargs, "rng")->value; + env->method = dict_get(kwargs, "method")->value; + env->i = dict_get(kwargs, "i")->value; + init(env); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); +} diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 83a4fdac3..fbfba87a3 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -155,7 +155,7 @@ def __init__(self, config, logger=None, verbose=True): config['cudagraphs'] = True config['kernels'] = True config['use_omp'] = True - config['num_buffers'] = 2 + config['num_buffers'] = config['num_buffers'] self.pufferl_cpp = _C.create_pufferl(config) self.observations = self.pufferl_cpp.rollouts.observations self.actions = self.pufferl_cpp.rollouts.actions diff --git a/scripts/build_envspeed.sh b/scripts/build_envspeed.sh index 878bee029..91096ce98 100644 --- a/scripts/build_envspeed.sh +++ b/scripts/build_envspeed.sh @@ -1,6 +1,22 @@ #!/bin/bash -# Usage: ./build_envspeed.sh +# Usage: ./scripts/build_envspeed.sh +# Example: ./scripts/build_envspeed.sh breakout + +if [ -z "$1" ]; then + echo "Usage: $0 " + echo "Example: $0 breakout" + exit 1 +fi + +ENV_NAME=$1 +STATIC_LIB="pufferlib/extensions/libstatic_${ENV_NAME}.a" + +if [ ! 
-f "$STATIC_LIB" ]; then + echo "Error: Static library not found: $STATIC_LIB" + echo "Build it first with: ./scripts/build_static_${ENV_NAME}.sh" + exit 1 +fi RAYLIB_NAME='raylib-5.5_linux_amd64' LINK_ARCHIVES="./$RAYLIB_NAME/lib/libraylib.a" @@ -10,14 +26,14 @@ FLAGS=( -I./$RAYLIB_NAME/include -I/usr/local/cuda/include -Ipufferlib/extensions - "pufferlib/extensions/test_envspeed.c" + "pufferlib/extensions/test_envspeed_static.c" "pufferlib/extensions/ini.c" - -o "test_envspeed" + "$STATIC_LIB" + -o "test_envspeed_static" $LINK_ARCHIVES -lGL -lm -lpthread - -ldl -L/usr/local/cuda/lib64 -lcudart -ferror-limit=3 -DPLATFORM_DESKTOP @@ -33,4 +49,4 @@ FLAGS=( -fPIC ) -clang -O2 -DNDEBUG -fopenmp ${FLAGS[@]} +clang -O2 -DNDEBUG -fopenmp -DENV_NAME=\"${ENV_NAME}\" ${FLAGS[@]} diff --git a/setup.py b/setup.py index 4ff2b486f..9a2c5abd9 100644 --- a/setup.py +++ b/setup.py @@ -120,12 +120,16 @@ def download_box2d(platform): extra_compile_args += [ '-O2', '-flto=auto', + '-fno-semantic-interposition', + '-fvisibility=hidden', ] extra_link_args += [ '-O2', ] cxx_args += [ - '-O', + '-O2', + '-fno-semantic-interposition', + '-Wno-c++11-narrowing', ] nvcc_args += [ '-O3', @@ -229,11 +233,13 @@ def run(self): class ProfilerBuildExt(build_ext): user_options = build_ext.user_options + [ ('no-torch', None, 'Build profiler without torch support'), + ('env=', None, 'Static env to link (e.g., breakout, drive)'), ] def initialize_options(self): super().initialize_options() self.no_torch = False + self.env = None def finalize_options(self): super().finalize_options() @@ -247,7 +253,7 @@ def run(self): out = 'profile_kernels' nvcc = cpp_ext._join_cuda_home('bin', 'nvcc') - arch = '-arch=sm_80' + arch = '-arch=sm_89' cmd = [nvcc, '-O3', arch, '-I.', src, '-o', out] @@ -255,7 +261,7 @@ def run(self): out = 'profile_kernels_torch' lib_paths = cpp_ext.library_paths() nvtx_lib_dir = os.path.join(cpp_ext.CUDA_HOME, 'lib64') - cmd = [nvcc, '-O3', arch, '-DUSE_TORCH', '-I.'] + cmd = [nvcc, '-O3', arch, '-DUSE_TORCH', '-I.', f'-I./{RAYLIB_NAME}/include', '-Ipufferlib/extensions'] cmd += ['-I' + sysconfig.get_path('include')] cmd += ['-I' + p for p in cpp_ext.include_paths()] cmd += ['-L' + p for p in lib_paths] @@ -263,6 +269,15 @@ def run(self): cmd += ['-Xlinker', '-rpath,' + ':'.join(lib_paths)] cmd += ['-Xlinker', '--no-as-needed'] cmd += ['-lc10', '-lc10_cuda', '-ltorch', '-ltorch_cpu', '-ltorch_cuda', '-lnvToolsExt', '-ldl'] + + # Add static env if specified + if self.env: + static_lib = f'pufferlib/extensions/libstatic_{self.env}.a' + if not os.path.exists(static_lib): + raise RuntimeError(f'Static library not found: {static_lib}\n' + f'Build it first with: python setup.py build_{self.env}') + cmd += ['-DUSE_STATIC_ENV', static_lib, f'./{RAYLIB_NAME}/lib/libraylib.a', '-lGL', '-lomp5'] + cmd += ['pufferlib/extensions/muon.cpp', 'pufferlib/extensions/cuda/advantage.cu', src, '-o', out] print(f'Building profiler: {" ".join(cmd)}') @@ -277,6 +292,60 @@ def run(self): "build_profiler": ProfilerBuildExt, } +# Static env builds: clang-compiled env + gcc/nvcc torch extension +# Discover envs by listing folders in pufferlib/ocean +OCEAN_DIR = 'pufferlib/ocean' +STATIC_ENVS = [ + name for name in os.listdir(OCEAN_DIR) + if os.path.isdir(os.path.join(OCEAN_DIR, name)) + and not name.startswith('__') + and os.path.exists(f'pufferlib/ocean/{name}/binding.h') +] + +def create_static_env_build_class(env_name): + """Create a build class that compiles env with clang and links with torch extension.""" + class 
StaticEnvBuildExt(cpp_extension.BuildExtension): + def run(self): + import subprocess + + # Step 1: Build static library with clang + # env_binding.c includes binding.h from the env's directory + env_binding_src = 'pufferlib/extensions/env_binding.c' + static_lib = f'pufferlib/extensions/libstatic_{env_name}.a' + static_obj = f'pufferlib/extensions/libstatic_{env_name}.o' + + clang_cmd = [ + 'clang', '-c', '-O2', '-DNDEBUG', + '-I.', '-Ipufferlib/extensions', f'-Ipufferlib/ocean/{env_name}', + f'-I./{RAYLIB_NAME}/include', '-I/usr/local/cuda/include', + '-DPLATFORM_DESKTOP', + '-fno-semantic-interposition', '-fvisibility=hidden', + '-fPIC', '-fopenmp', + env_binding_src, '-o', static_obj + ] + print(f'Building static env: {" ".join(clang_cmd)}') + subprocess.check_call(clang_cmd) + + ar_cmd = ['ar', 'rcs', static_lib, static_obj] + print(f'Creating static library: {" ".join(ar_cmd)}') + subprocess.check_call(ar_cmd) + + # Step 2: Build torch extension linked against this env's static lib + # Filter to only the pufferlib._C extension + self.extensions = [e for e in self.extensions if e.name == 'pufferlib._C'] + + # Update extra_objects to use this env's static lib + for ext in self.extensions: + ext.extra_objects = [RAYLIB_A, static_lib] + + super().run() + + return StaticEnvBuildExt + +# Add build_ for static-linked envs +for env_name in STATIC_ENVS: + cmdclass[f"build_{env_name}"] = create_static_env_build_class(env_name) + if not NO_OCEAN: def create_env_build_class(full_name): class EnvBuildExt(build_ext): @@ -285,10 +354,10 @@ def run(self): super().run() return EnvBuildExt - # Add a build_ command for each env + # Add a build__so command for each env (dynamic .so build) for c_ext in c_extensions: env_name = c_ext.name.split('.')[-2] - cmdclass[f"build_{env_name}"] = create_env_build_class(c_ext.name) + cmdclass[f"build_{env_name}_so"] = create_env_build_class(c_ext.name) # Check if CUDA compiler is available. You need cuda dev, not just runtime. @@ -311,6 +380,8 @@ def run(self): extension = CppExtension import torch + # Note: Use build_ (e.g. build_breakout, build_drive) to build with static env linking + # build_torch alone won't link any env - it's for the training code only torch_extensions = [ extension( "pufferlib._C", @@ -322,7 +393,7 @@ def run(self): }, extra_link_args=extra_link_args, extra_objects=[RAYLIB_A], - libraries=[nvtx_lib], + libraries=[nvtx_lib, 'omp5'], library_dirs=[nvtx_lib_dir], ), ]
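Editor's note: the recurring change in this diff is the migration of each Ocean env from a Python C-API binding.c (unpack / assign_to_dict against PyObject kwargs) to a plain-C binding.h built around the Dict helpers declared by env_binding.h. The sketch below is a minimal, hypothetical binding.h for an env named "myenv"; the MyEnv struct, its fields, the kwarg names, and the OBS_SIZE/ACT_SIZES values are placeholders for illustration and are not part of this PR. It only assumes the Dict, dict_get, and dict_set helpers and the my_init/my_log hooks exactly as they are used by the tower_climb, trash_pickup, tripletriad, and whisker_racer bindings above.

// Hypothetical binding.h following the Dict-based pattern introduced in this PR.
// Sizes and kwarg names are placeholders, not real PufferLib values.
#include "myenv.h"
#define OBS_SIZE 16
#define NUM_ATNS 1
#define ACT_SIZES {4}
#define OBS_TYPE FLOAT
#define ACT_TYPE DOUBLE

#define Env MyEnv
#include "env_binding.h"

void my_init(Env* env, Dict* kwargs) {
    env->num_agents = 1;
    // Each kwarg is read as a float via dict_get(...)->value, as in the bindings above.
    env->size = dict_get(kwargs, "size")->value;
    env->reward_step = dict_get(kwargs, "reward_step")->value;
    init(env);  // env-specific C initializer, analogous to init()/initialize_env() above
}

void my_log(Log* log, Dict* out) {
    // Export per-episode stats; keys mirror the other envs in this PR.
    dict_set(out, "perf", log->perf);
    dict_set(out, "score", log->score);
    dict_set(out, "episode_return", log->episode_return);
    dict_set(out, "episode_length", log->episode_length);
}

Once a binding.h exists, the setup.py changes in this diff pick the env up automatically: STATIC_ENVS is built by listing pufferlib/ocean folders that contain a binding.h, and each one gets a build_<env> command that compiles env_binding.c with clang into pufferlib/extensions/libstatic_<env>.a and links that archive into the pufferlib._C torch extension. The previous per-env dynamic builds remain available under the renamed build_<env>_so commands. The same static archive is what ./scripts/build_envspeed.sh <env> expects when producing test_envspeed_static, and what build_profiler --env=<env> links into the profiler together with -DUSE_STATIC_ENV.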